## Data Preprocessing

In [1]:
#Importing the Required Packages
import pandas as pd
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline
import pickle as pkl

In [2]:
# Load the cleaned Amazon sales workbook into a DataFrame and preview it
source_path = 'amazon_sales_cleaned.xlsx'
df = pd.read_excel(source_path)
df

Unnamed: 0,Category,Title,Price,Rating,Review Count
0,Electronics,Apple AirTag,9.77,4.6,384
1,Electronics,USB C Charger iPhone Charger Fast Charging 3Pa...,9.99,4.7,218
2,Electronics,20W USB C Fast Charger Compatible with iPhone ...,7.99,4.4,1890
3,Electronics,SAMSUNG Galaxy Tab A9+ Tablet 11” 64GB Android...,169.99,4.6,3866
4,Electronics,"iPhone Charger Cable,3Pack (MFi Certified) Dat...",19.77,4.7,139
...,...,...,...,...,...
523,Art,Prismacolor,53.39,4.6,517
524,Art,The Subtle Art of Not Giving a F*ck: A Counter...,0.00,4.5,146310
525,Art,"Goodyking Paint Your Own Cat Lamp Kit, DIY Cat...",9.99,4.5,1105
526,Art,The Arts: A Visual Encyclopedia (DK Children's...,16.10,4.8,1400


In [3]:
# Remove the free-text product title; the model below is trained on the
# remaining columns only.
# errors='ignore' keeps this cell re-runnable: without it, executing the cell a
# second time raises KeyError because 'Title' has already been removed from df.
df = df.drop(columns=['Title'], errors='ignore')

In [4]:
#Encoding the Categorical Data
# LabelEncoder is already imported in the first cell, so it is not imported
# again here (keeps every import in one place).
# fit_transform replaces each category name with an integer code; the fitted
# `encoder` retains the name-to-code mapping (see encoder.classes_), which is
# needed to translate a category name into the code the model expects.
encoder = LabelEncoder()
df['Category'] = encoder.fit_transform(df['Category'])
df

Unnamed: 0,Category,Price,Rating,Review Count
0,5,9.77,4.6,384
1,5,9.99,4.7,218
2,5,7.99,4.4,1890
3,5,169.99,4.6,3866
4,5,19.77,4.7,139
...,...,...,...,...
523,1,53.39,4.6,517
524,1,0.00,4.5,146310
525,1,9.99,4.5,1105
526,1,16.10,4.8,1400


## Splitting the Data and Pipeline Creation

In [5]:
# Target is the price; every other remaining column is used as a feature
y = df['Price']
x = df.drop(columns=['Price'])

# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
)

In [6]:
#Using GradientBoosting Regressor
# The rest of the notebook refers to the model object by the name `Pipeline`,
# which shadows the sklearn class of the same name. Building it from an aliased
# import keeps this cell re-runnable: with `Pipeline = Pipeline([...])` a second
# execution fails with "'Pipeline' object is not callable".
from sklearn.pipeline import Pipeline as _SklearnPipeline

Pipeline = _SklearnPipeline([
    # Standardise the features before they reach the regressor
    ('scaler', StandardScaler()),
    # Fixed seed so repeated runs of the notebook train the same model
    ('regressor', GradientBoostingRegressor(random_state=0)),
])

## Training and Testing the Model

In [7]:
# Fit the scaler and then the regressor on the training split only
Pipeline.fit(X=x_train, y=y_train)

In [8]:
# Predict prices for the held-out rows.
# NOTE(review): predictions are not constrained to be non-negative -- the
# recorded output below contains a negative price.
predicted_values = Pipeline.predict(X=x_test)
print(predicted_values)

[  6.0821415   44.76621553  25.7333832   56.22437968 409.46253388
  53.82166677 -11.37653098 122.91778615  43.58930427  32.2359814
  35.17305697  32.48342379  19.89469995  65.72064032  35.05314607
  50.05008147  32.2896838   12.18763002  24.83098026  22.46131417
  78.83498088  89.41418208  18.67874214  28.71517768 108.53936983
 111.35929346   5.24275213  43.22873706  36.7449537   57.96381318
  51.98846167  95.67982271  24.59609156   6.1721084  163.01519585
  73.67318312  70.1598919   62.2139208   56.22437968  86.23270845
  38.52937864  36.44726877  45.13777588 111.16814567  47.21897298
 104.57609325  32.67375243 161.30994622  29.55810791  37.06961926
  36.7449537   50.1081287   31.49238509  25.45591016  78.40272879
 192.97060604  40.46100042  35.17305697  26.33060525  10.51076603
  30.53563149  63.60831057 178.2169095   55.77401654  33.28463477
  42.15583937  62.32781948  21.98368466 112.49360007  31.95725894
  31.53104234  38.46835892  26.56546262 166.91965558  31.43915049
  34.058597

In [9]:
# For a regression pipeline, score() returns the coefficient of determination
# (R^2) on the given data, not a classification accuracy, so label it as such.
print('Model R^2 score =', Pipeline.score(x_test, y_test))

Model Accuracy = 0.3187936704571873


In [10]:
# Single-row example input, using the same feature columns the model was trained on.
# Category 5 is the integer code shown for 'Electronics' in the encoded table above.
sample_row = {
    'Category': [5],
    'Rating': [4.7],
    'Review Count': [218],
}
data = pd.DataFrame(sample_row)

predicted = Pipeline.predict(data)
print('Price =', predicted)

Price = [61.93171683]


## Saving the Model in a Pickle File

In [11]:
# Serialise the fitted scaler + regressor pipeline so it can be reloaded later
with open('model_pipeline.pkl', mode='wb') as model_file:
    pkl.dump(Pipeline, model_file)