In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import xgboost as xgb

In [2]:
# Load the dataset
df = pd.read_csv('BlackFridaySales.csv')

In [3]:
# Data Preprocessing
# Encode categorical columns
le = LabelEncoder()
df['Age'] = le.fit_transform(df['Age'])
df['Gender'] = le.fit_transform(df['Gender'])
df['City_Category'] = le.fit_transform(df['City_Category'])

In [4]:
# Convert categorical variable State_In_Current_Years into dummy/indicator variables
df = pd.get_dummies(df, columns=['Stay_In_Current_City_Years'])

In [5]:
# Fill missing values in Product_Category_2 and Product_Category_3
df['Product_Category_2'].fillna(df['Product_Category_2'].mode()[0], inplace=True)
df['Product_Category_3'].fillna(df['Product_Category_3'].mode()[0], inplace=True)

In [6]:
# Split dataset into training and testing sets
X = df.drop(['Purchase'], axis=1)
y = df['Purchase']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Modeling Phase
# Linear Regression
X_train_num = X_train.select_dtypes(include=['int64', 'float64'])
y_train_num = y_train

lr_model = LinearRegression()
lr_model.fit(X_train_num, y_train_num)
y_pred_lr = lr_model.predict(X_test.select_dtypes(include=['int64', 'float64']))
rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr))
print("RMSE (Linear Regression): ", rmse_lr)

RMSE (Linear Regression):  4691.664962947095


In [10]:
# Decision Tree Regressor
X_train_num = X_train.select_dtypes(include=['int64', 'float64'])
y_train_num = y_train

dt_model = DecisionTreeRegressor()
dt_model.fit(X_train_num, y_train_num)
y_pred_dt = dt_model.predict(X_test.select_dtypes(include=['int64', 'float64']))
rmse_dt = np.sqrt(mean_squared_error(y_test, y_pred_dt))
print("RMSE (Decision Tree Regressor): ", rmse_dt)

RMSE (Decision Tree Regressor):  3414.440506170272


In [12]:
# Random Forest Regressor
X_train_num = X_train.select_dtypes(include=['int64', 'float64'])
y_train_num = y_train

rf_model = RandomForestRegressor()
rf_model.fit(X_train_num, y_train_num)
y_pred_rf = rf_model.predict(X_test.select_dtypes(include=['int64', 'float64']))
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
print("RMSE (Random Forest Regressor): ", rmse_rf)

RMSE (Random Forest Regressor):  3037.3780795970165


In [14]:
# XGBOOST Regressor
X_train_num = X_train.select_dtypes(include=['int64', 'float64'])
y_train_num = y_train

xgb_model = xgb.XGBRegressor()
xgb_model.fit(X_train_num, y_train_num)
y_pred_xgb = xgb_model.predict(X_test.select_dtypes(include=['int64', 'float64']))
rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
print("RMSE (XGBOOST Regressor): ", rmse_xgb)

RMSE (XGBOOST Regressor):  2877.37991364064


In [15]:
# Conclusion
print("Best performer: XGBOOST Regressor with RMSE score of ", rmse_xgb)

Best performer: XGBOOST Regressor with RMSE score of  2877.37991364064
