In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Loading the dataset
df=pd.read_csv("train.csv")
df.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [3]:
# To check number of the duplicate records in the dataset
df.duplicated().sum()

0

In [4]:
# Dropping the duplicate records:
df=df.drop_duplicates(keep='first')

In [5]:
df.duplicated().sum()

0

In [6]:
# Splitting Dependent and Independent features:
target_column_name = 'Purchase'
drop_columns = [target_column_name,'User_ID','Product_ID']
X=df.drop(columns=drop_columns,axis=1)
y=df[target_column_name]

In [7]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 550068 entries, 0 to 550067
Data columns (total 9 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Gender                      550068 non-null  object 
 1   Age                         550068 non-null  object 
 2   Occupation                  550068 non-null  int64  
 3   City_Category               550068 non-null  object 
 4   Stay_In_Current_City_Years  550068 non-null  object 
 5   Marital_Status              550068 non-null  int64  
 6   Product_Category_1          550068 non-null  int64  
 7   Product_Category_2          376430 non-null  float64
 8   Product_Category_3          166821 non-null  float64
dtypes: float64(2), int64(3), object(4)
memory usage: 42.0+ MB


In [8]:
X['Occupation']=X['Occupation'].astype('object')
X['Marital_Status']=X['Marital_Status'].astype('object')
X['Product_Category_1']=X['Product_Category_1'].astype('object')
X['Product_Category_2']=X['Product_Category_2'].astype('object')
X['Product_Category_3']=X['Product_Category_3'].astype('object')

In [9]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 550068 entries, 0 to 550067
Data columns (total 9 columns):
 #   Column                      Non-Null Count   Dtype 
---  ------                      --------------   ----- 
 0   Gender                      550068 non-null  object
 1   Age                         550068 non-null  object
 2   Occupation                  550068 non-null  object
 3   City_Category               550068 non-null  object
 4   Stay_In_Current_City_Years  550068 non-null  object
 5   Marital_Status              550068 non-null  object
 6   Product_Category_1          550068 non-null  object
 7   Product_Category_2          376430 non-null  object
 8   Product_Category_3          166821 non-null  object
dtypes: object(9)
memory usage: 42.0+ MB


In [10]:
# To check number of missing values in the dataset
X.isnull().sum()

Gender                             0
Age                                0
Occupation                         0
City_Category                      0
Stay_In_Current_City_Years         0
Marital_Status                     0
Product_Category_1                 0
Product_Category_2            173638
Product_Category_3            383247
dtype: int64

In [11]:
X['Product_Category_2'].unique()

array([nan, 6.0, 14.0, 2.0, 8.0, 15.0, 16.0, 11.0, 5.0, 3.0, 4.0, 12.0,
       9.0, 10.0, 17.0, 13.0, 7.0, 18.0], dtype=object)

In [12]:
X['Product_Category_3'].unique()

array([nan, 14.0, 17.0, 5.0, 4.0, 16.0, 15.0, 8.0, 9.0, 13.0, 6.0, 12.0,
       3.0, 18.0, 11.0, 10.0], dtype=object)

In [13]:
# Train Test Split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=42)

In [14]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((412551, 9), (137517, 9), (412551,), (137517,))

In [15]:
#Pipeline Creation:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder,StandardScaler

# Define the categorical and numerical columns
categorical_cols = ['Gender','Age','Occupation','City_Category','Stay_In_Current_City_Years','Marital_Status','Product_Category_1','Product_Category_2','Product_Category_3']
# Define the custom ranking for each ordinal variable
age_categories = ['0-17', '18-25', '26-35', '36-45', '46-50', '51-55', '55+']
occupation_categories = list(range(21))
city_categories = ['A', 'B', 'C']
stay_years_categories = ['0', '1', '2', '3', '4+']
marital_status_categories = [0, 1]
product_categories = list(range(1, 21))
product_categories.append(None)  # Adding 'nan' as a category

# Categorical Pipeline
cat_pipeline = Pipeline(
     steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinalencoder', OrdinalEncoder(categories=[['M', 'F'], age_categories, occupation_categories, city_categories, stay_years_categories, marital_status_categories, product_categories, product_categories, product_categories])),
        ('scaler', StandardScaler())
      ]
  )

preprocessor = ColumnTransformer([
                ('cat_pipeline', cat_pipeline, categorical_cols)
            ])


In [16]:
# Data Transformation
X_train=preprocessor.fit_transform(X_train)
X_test=preprocessor.transform(X_test)

# Model Training and Model Evaluation

In [17]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor,GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

In [18]:
linear=LinearRegression()
lasso=Lasso()
ridge=Ridge()
elastic=ElasticNet()
decisiontree=DecisionTreeRegressor()
randomforest=RandomForestRegressor()
adaboost=AdaBoostRegressor()
gboost=GradientBoostingRegressor()

In [19]:
import time
models=[linear,lasso,ridge,elastic,decisiontree,randomforest,adaboost,gboost]
result=pd.DataFrame(columns=['Model Name','RMSE'])

def build_and_evaluate_models(X_train,X_test,y_train,y_test,models):
  for model in models:
    start=time.time()
    trained_model=model.fit(X_train,y_train)
    end=time.time()
    fit_time=end-start

    start=time.time()
    y_pred=trained_model.predict(X_test)
    end=time.time()
    fit_time=end-start

    mse=mean_squared_error(y_test,y_pred)
    rmse=np.sqrt(mse)

    result.loc[len(result)]=[model.__class__.__name__,rmse]
  return result

result = build_and_evaluate_models(X_train,X_test,y_train,y_test,models)

In [20]:
result

Unnamed: 0,Model Name,RMSE
0,LinearRegression,4678.611633
1,Lasso,4678.610917
2,Ridge,4678.611617
3,ElasticNet,4708.826459
4,DecisionTreeRegressor,3325.699523
5,RandomForestRegressor,3054.349638
6,AdaBoostRegressor,3866.72123
7,GradientBoostingRegressor,2993.335865


# Hyperparameter Tuning Using GradientBoosting Regressor

In [21]:
# Hyperparamter Tuning Using # Hyperparameter Tuning Using GradientBoosting Regression:
# GridSearch CV
from sklearn.model_selection import GridSearchCV
parameter={
 'alpha':[0.01,1,10],
 'learning_rate':[0.01,1,10],
 'n_estimators':[100,200,300]
}

In [22]:
regressor=GradientBoostingRegressor()
reg=GridSearchCV(regressor,param_grid=parameter,cv=5,scoring='neg_root_mean_squared_error',verbose=3)
reg.fit(X_train,y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV 1/5] END alpha=0.01, learning_rate=0.01, n_estimators=100;, score=-3815.704 total time=  35.9s
[CV 2/5] END alpha=0.01, learning_rate=0.01, n_estimators=100;, score=-3812.913 total time=  36.7s
[CV 3/5] END alpha=0.01, learning_rate=0.01, n_estimators=100;, score=-3812.065 total time=  45.4s
[CV 4/5] END alpha=0.01, learning_rate=0.01, n_estimators=100;, score=-3817.248 total time=  37.0s
[CV 5/5] END alpha=0.01, learning_rate=0.01, n_estimators=100;, score=-3835.522 total time=  37.7s
[CV 1/5] END alpha=0.01, learning_rate=0.01, n_estimators=200;, score=-3396.726 total time= 1.2min
[CV 2/5] END alpha=0.01, learning_rate=0.01, n_estimators=200;, score=-3388.064 total time= 1.2min
[CV 3/5] END alpha=0.01, learning_rate=0.01, n_estimators=200;, score=-3453.835 total time= 1.2min
[CV 4/5] END alpha=0.01, learning_rate=0.01, n_estimators=200;, score=-3417.256 total time= 1.2min
[CV 5/5] END alpha=0.01, learning_rate=0.01, n_

In [23]:
reg.best_params_

{'alpha': 0.01, 'learning_rate': 1, 'n_estimators': 300}

In [24]:
reg.best_estimator_

In [25]:
reg.best_score_

-2908.3123485483243

In [28]:
reg.best_estimator_.predict([X_test[10]])

array([4074.45735966])

In [29]:
y_pred=reg.best_estimator_.predict(X_test)
mse=mean_squared_error(y_test,y_pred)
rmse=np.sqrt(mse)

In [30]:
rmse

2924.1750358464274