In [30]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [31]:
df = pd.read_csv("content_based_recommendation_dataset.csv")
df = df.drop_duplicates()

In [32]:
df.shape

(1018, 13)

In [33]:
df.columns

Index(['Number of clicks on similar products',
       'Number of similar products purchased so far',
       'Average rating given to similar products', 'Gender',
       'Median purchasing price (in rupees)', 'Rating of the product',
       'Brand of the product', 'Customer review sentiment score (overall)',
       'Price of the product', 'Holiday', 'Season', 'Geographical locations',
       'Probability for the product to be recommended to the person'],
      dtype='object')

In [34]:
df = df[["Number of similar products purchased so far" , 
         "Rating of the product" , "Gender" , "Median purchasing price (in rupees)" ,  
         "Price of the product" , "Holiday" , "Season" , "Probability for the product to be recommended to the person"]]

In [35]:
df.head()

Unnamed: 0,Number of similar products purchased so far,Rating of the product,Gender,Median purchasing price (in rupees),Price of the product,Holiday,Season,Probability for the product to be recommended to the person
0,4,4.5,male,500,200,No,winter,0.9
1,2,3.2,female,3000,300,Yes,monsoon,0.2
2,10,4.8,female,600,1000,No,spring,0.7
3,1,2.5,female,100,300,No,summer,0.1
4,5,4.3,male,2000,700,Yes,winter,0.8


In [36]:
df.isnull().sum()

Number of similar products purchased so far                    0
Rating of the product                                          0
Gender                                                         0
Median purchasing price (in rupees)                            0
Price of the product                                           0
Holiday                                                        0
Season                                                         0
Probability for the product to be recommended to the person    0
dtype: int64

In [37]:
# Label Encoding
replacement_mapping = {
    "male" : 1,
    "female" : 0,
    "Yes" : 1,
    "No" : 0
}

df["Gender"] = df["Gender"].replace(replacement_mapping)
df["Holiday"] = df["Holiday"].replace(replacement_mapping)

In [38]:
df.isnull().sum()

Number of similar products purchased so far                    0
Rating of the product                                          0
Gender                                                         0
Median purchasing price (in rupees)                            0
Price of the product                                           0
Holiday                                                        0
Season                                                         0
Probability for the product to be recommended to the person    0
dtype: int64

In [39]:
# One Hot Encoding on Season

one_hot_encoded = pd.get_dummies(df['Season'], prefix='')
one_hot_encoded = one_hot_encoded.astype(int)
df = pd.concat([df, one_hot_encoded], axis=1)
df.drop("Season", axis=1, inplace=True)

In [40]:
df.isnull().sum()

Number of similar products purchased so far                    0
Rating of the product                                          0
Gender                                                         0
Median purchasing price (in rupees)                            0
Price of the product                                           0
Holiday                                                        0
Probability for the product to be recommended to the person    0
_monsoon                                                       0
_spring                                                        0
_summer                                                        0
_winter                                                        0
dtype: int64

In [41]:
df.reset_index(inplace = True)

In [42]:
from sklearn.preprocessing import StandardScaler

# Columns to be scaled
columns_to_scale = ['Median purchasing price (in rupees)', 'Price of the product']

scaler = StandardScaler()
for target_column in columns_to_scale:
    scaled_data = scaler.fit_transform(df[[target_column]])
    scaled_df = pd.DataFrame(scaled_data, columns=[target_column])
    scaled_df.reset_index(inplace = True)
    df[target_column] = scaled_df[target_column]

In [43]:
scaled_df["Price of the product"]

0      -0.533843
1      -0.486805
2      -0.157540
3      -0.486805
4      -0.298654
          ...   
1013   -0.486805
1014   -0.345692
1015   -0.204578
1016   -0.392730
1017   -0.345692
Name: Price of the product, Length: 1018, dtype: float64

In [44]:
df.isnull().sum()

index                                                          0
Number of similar products purchased so far                    0
Rating of the product                                          0
Gender                                                         0
Median purchasing price (in rupees)                            0
Price of the product                                           0
Holiday                                                        0
Probability for the product to be recommended to the person    0
_monsoon                                                       0
_spring                                                        0
_summer                                                        0
_winter                                                        0
dtype: int64

In [45]:
df.head()

Unnamed: 0,index,Number of similar products purchased so far,Rating of the product,Gender,Median purchasing price (in rupees),Price of the product,Holiday,Probability for the product to be recommended to the person,_monsoon,_spring,_summer,_winter
0,0,4,4.5,1,-0.780803,-0.533843,0,0.9,0,0,0,1
1,1,2,3.2,0,-0.018569,-0.486805,1,0.2,1,0,0,0
2,2,10,4.8,0,-0.750314,-0.15754,0,0.7,0,1,0,0
3,3,1,2.5,0,-0.902761,-0.486805,0,0.1,0,0,1,0
4,4,5,4.3,1,-0.323463,-0.298654,1,0.8,0,0,0,1


In [46]:
df.shape

(1018, 12)

In [47]:
df.isnull().sum()

index                                                          0
Number of similar products purchased so far                    0
Rating of the product                                          0
Gender                                                         0
Median purchasing price (in rupees)                            0
Price of the product                                           0
Holiday                                                        0
Probability for the product to be recommended to the person    0
_monsoon                                                       0
_spring                                                        0
_summer                                                        0
_winter                                                        0
dtype: int64

In [48]:
for column in df.columns:
    print(column)

index
Number of similar products purchased so far
Rating of the product
Gender
Median purchasing price (in rupees)
Price of the product
Holiday
Probability for the product to be recommended to the person
_monsoon
_spring
_summer
_winter


In [49]:
df.columns

Index(['index', 'Number of similar products purchased so far',
       'Rating of the product', 'Gender',
       'Median purchasing price (in rupees)', 'Price of the product',
       'Holiday',
       'Probability for the product to be recommended to the person',
       '_monsoon', '_spring', '_summer', '_winter'],
      dtype='object')

In [50]:
df.isnull().sum()

index                                                          0
Number of similar products purchased so far                    0
Rating of the product                                          0
Gender                                                         0
Median purchasing price (in rupees)                            0
Price of the product                                           0
Holiday                                                        0
Probability for the product to be recommended to the person    0
_monsoon                                                       0
_spring                                                        0
_summer                                                        0
_winter                                                        0
dtype: int64

In [51]:
x = df[['Number of similar products purchased so far', 'Rating of the product',
       'Gender', 'Holiday',
       '_monsoon', '_spring', '_summer', '_winter',
       'Median purchasing price (in rupees)', 'Price of the product']]
Y = df[['Probability for the product to be recommended to the person']]

In [52]:
from sklearn.model_selection import train_test_split

x_train, x_test, Y_train, Y_test = train_test_split(x, Y, test_size=0.2, random_state=42)

In [53]:
import time
from sklearn.metrics import mean_squared_error

def evaluation(model, x_train, x_test, y_train, y_test):
    # Training data evaluation
    y_train_pred = model.predict(x_train)
    train_rmse = mean_squared_error(y_train, y_train_pred, squared=False)
    train_mse = mean_squared_error(y_train, y_train_pred)
    
    # Testing data evaluation
    y_test_pred = model.predict(x_test)
    test_rmse = mean_squared_error(y_test, y_test_pred, squared=False)
    test_mse = mean_squared_error(y_test, y_test_pred)
    
    # Inference time for a single row sample from x_train
    start_time = time.time()
    _ = model.predict([x_train.iloc[0]])
    end_time = time.time()
    inference_time = end_time - start_time
    
    return {
        'Training RMSE': train_rmse,
        'Training MSE': train_mse,
        'Testing RMSE': test_rmse,
        'Testing MSE': test_mse,
        'Inference Time': inference_time
    }

In [54]:
# Linear Regression

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

model = LinearRegression()
model.fit(x_train, Y_train)

outputs = evaluation(model , x_train, x_test, Y_train, Y_test)
outputs

{'Training RMSE': 0.1288270765376349,
 'Training MSE': 0.016596415649233635,
 'Testing RMSE': 0.12318871651898092,
 'Testing MSE': 0.015175459877593844,
 'Inference Time': 0.0}

In [55]:
# Support Vector Regressor
from sklearn.svm import SVR

for type in ['linear' , 'poly' , 'rbf' , 'sigmoid']:
    print(type)
    model = SVR(kernel = type)
    model.fit(x_train, Y_train)

    outputs = evaluation(model , x_train, x_test, Y_train, Y_test)
    print(outputs)

linear
{'Training RMSE': 0.12919978660114836, 'Training MSE': 0.016692584857782276, 'Testing RMSE': 0.12385997559140319, 'Testing MSE': 0.015341293553502994, 'Inference Time': 0.0}
poly
{'Training RMSE': 0.13138242716646126, 'Training MSE': 0.0172613421681505, 'Testing RMSE': 0.126600582682952, 'Testing MSE': 0.01602770753566296, 'Inference Time': 0.0}
rbf
{'Training RMSE': 0.11874037651621795, 'Training MSE': 0.014099277015213204, 'Testing RMSE': 0.11847773374698899, 'Testing MSE': 0.014036973393822412, 'Inference Time': 0.0}
sigmoid
{'Training RMSE': 21.27648299991708, 'Training MSE': 452.6887288457604, 'Testing RMSE': 22.750477454012433, 'Testing MSE': 517.584224385528, 'Inference Time': 0.0}


In [56]:
# Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=100, random_state=42)

model.fit(x_train, Y_train)

outputs = evaluation(model , x_train, x_test, Y_train, Y_test)
outputs

{'Training RMSE': 0.036789484165126986,
 'Training MSE': 0.001353466145136129,
 'Testing RMSE': 0.08449280171912574,
 'Testing MSE': 0.007139033542347496,
 'Inference Time': 0.015626907348632812}