In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
black_df = pd.read_csv("BlackFridaySales.csv")

In [2]:
black_df

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969
...,...,...,...,...,...,...,...,...,...,...,...,...
550063,1006033,P00372445,M,51-55,13,B,1,1,20,,,368
550064,1006035,P00375436,F,26-35,1,C,3,0,20,,,371
550065,1006036,P00375436,F,26-35,15,B,4+,1,20,,,137
550066,1006038,P00375436,F,55+,1,C,2,0,20,,,365


In [3]:
black_df.drop(["User_ID", "Product_ID"], axis=1, inplace=True)

In [4]:
black_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 10 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Gender                      550068 non-null  object 
 1   Age                         550068 non-null  object 
 2   Occupation                  550068 non-null  int64  
 3   City_Category               550068 non-null  object 
 4   Stay_In_Current_City_Years  550068 non-null  object 
 5   Marital_Status              550068 non-null  int64  
 6   Product_Category_1          550068 non-null  int64  
 7   Product_Category_2          376430 non-null  float64
 8   Product_Category_3          166821 non-null  float64
 9   Purchase                    550068 non-null  int64  
dtypes: float64(2), int64(4), object(4)
memory usage: 42.0+ MB


In [5]:
data = {
    "attribute": ["Product_Category_2", "Product_Category_3"],
    "% of missing values": [round(black_df["Product_Category_2"].isnull().sum() / black_df.shape[0] * 100, 2),
                            round(black_df["Product_Category_3"].isnull().sum() / black_df.shape[0] * 100, 2)]
}
missing_values = pd.DataFrame(data=data)
missing_values

Unnamed: 0,attribute,% of missing values
0,Product_Category_2,31.57
1,Product_Category_3,69.67


In [6]:
black_df.drop("Product_Category_3", axis=1, inplace=True)

In [7]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="median")
product_Category_2_imputed = imputer.fit_transform(black_df[["Product_Category_2"]])
black_df["Product_Category_2"] = product_Category_2_imputed

In [8]:
q1 = black_df['Purchase'].quantile(0.25)
q3 = black_df['Purchase'].quantile(0.75)
iqr = q3 - q1

# Set the lower and upper bounds for outliers
low = q1 - 1.5 * iqr
high = q3 + 1.5 * iqr

# Remove outliers from the DataFrame
black_df = black_df.loc[(black_df['Purchase'] > low) & (black_df['Purchase'] < high)]

# Reset the index of the cleaned DataFrame
black_df.reset_index(drop=True, inplace=True)

In [9]:
black_df

Unnamed: 0,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Purchase
0,F,0-17,10,A,2,0,3,9.0,8370
1,F,0-17,10,A,2,0,1,6.0,15200
2,F,0-17,10,A,2,0,12,9.0,1422
3,F,0-17,10,A,2,0,12,14.0,1057
4,M,55+,16,C,4+,0,8,9.0,7969
...,...,...,...,...,...,...,...,...,...
547386,M,51-55,13,B,1,1,20,9.0,368
547387,F,26-35,1,C,3,0,20,9.0,371
547388,F,26-35,15,B,4+,1,20,9.0,137
547389,F,55+,1,C,2,0,20,9.0,365


In [10]:
for feature in black_df:
    if len(black_df[feature].unique()) < 25:
        print(feature, black_df[feature].unique())

Gender ['F' 'M']
Age ['0-17' '55+' '26-35' '46-50' '51-55' '36-45' '18-25']
Occupation [10 16 15  7 20  9  1 12 17  0  3  4 11  8 19  2 18  5 14 13  6]
City_Category ['A' 'C' 'B']
Stay_In_Current_City_Years ['2' '4+' '3' '1' '0']
Marital_Status [0 1]
Product_Category_1 [ 3  1 12  8  5  4  2  6 14 11 13 15  7 16 18 10 17  9 20 19]
Product_Category_2 [ 9.  6. 14.  2.  8. 15. 16. 11.  5.  3.  4. 12. 10. 17. 13.  7. 18.]


In [11]:
ordered_attr = ["Age", "Stay_In_Current_City_Years"]
unordered_attr = ['Gender', 'Occupation', 'City_Category', 'Marital_Status', 'Product_Category_1', 'Product_Category_2']

In [12]:
from sklearn.preprocessing import OrdinalEncoder

# function to encode the ordered attributes in the Dataframe
def encodeOrderedAttributes(df, ordered_attr):
    # encode the attributes
    ord_encoder = OrdinalEncoder()
    cat_encoded = ord_encoder.fit_transform(df[ordered_attr])

    # return the new dataframe with encoded attributes
    return pd.DataFrame(cat_encoded, columns=ordered_attr)

# example
encodeOrderedAttributes(black_df, ordered_attr)

Unnamed: 0,Age,Stay_In_Current_City_Years
0,0.0,2.0
1,0.0,2.0
2,0.0,2.0
3,0.0,2.0
4,6.0,4.0
...,...,...
547386,5.0,1.0
547387,2.0,3.0
547388,2.0,4.0
547389,6.0,2.0


In [13]:
from sklearn.preprocessing import OneHotEncoder


# function to encode the uordered attributes in the Dataframe
def encodeUnorderedAttributes(df, unordered_attr):
    # encode the attributes
    hot_encoder = OneHotEncoder()
    cat_encoded = hot_encoder.fit_transform(df[unordered_attr])
    cat_encoded = cat_encoded.toarray()

    # return new dataframe with encoded attributes
    # create the new columns list
    columns = []
    for attribute, category in zip(unordered_attr, hot_encoder.categories_):
        attributeCategories = [f'{attribute}({str(cat)})' for cat in category]
        columns.extend(attributeCategories)

    # the dataframe
    return pd.DataFrame(data=cat_encoded, columns=columns)


# example
encodeUnorderedAttributes(black_df, unordered_attr)

Unnamed: 0,Gender(F),Gender(M),Occupation(0),Occupation(1),Occupation(2),Occupation(3),Occupation(4),Occupation(5),Occupation(6),Occupation(7),...,Product_Category_2(9.0),Product_Category_2(10.0),Product_Category_2(11.0),Product_Category_2(12.0),Product_Category_2(13.0),Product_Category_2(14.0),Product_Category_2(15.0),Product_Category_2(16.0),Product_Category_2(17.0),Product_Category_2(18.0)
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
547386,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
547387,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
547388,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
547389,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
def encode(df, ordered_attr, unordered_attr):
    ordered_cat = encodeOrderedAttributes(df, ordered_attr)
    unordered_cat = encodeUnorderedAttributes(df, unordered_attr)
    return pd.concat([ordered_cat, unordered_cat], axis=1)

# example
encode(black_df, ordered_attr, unordered_attr)

Unnamed: 0,Age,Stay_In_Current_City_Years,Gender(F),Gender(M),Occupation(0),Occupation(1),Occupation(2),Occupation(3),Occupation(4),Occupation(5),...,Product_Category_2(9.0),Product_Category_2(10.0),Product_Category_2(11.0),Product_Category_2(12.0),Product_Category_2(13.0),Product_Category_2(14.0),Product_Category_2(15.0),Product_Category_2(16.0),Product_Category_2(17.0),Product_Category_2(18.0)
0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,6.0,4.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
547386,5.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
547387,2.0,3.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
547388,2.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
547389,6.0,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
df = encode(black_df, ordered_attr, unordered_attr)
df.head()

Unnamed: 0,Age,Stay_In_Current_City_Years,Gender(F),Gender(M),Occupation(0),Occupation(1),Occupation(2),Occupation(3),Occupation(4),Occupation(5),...,Product_Category_2(9.0),Product_Category_2(10.0),Product_Category_2(11.0),Product_Category_2(12.0),Product_Category_2(13.0),Product_Category_2(14.0),Product_Category_2(15.0),Product_Category_2(16.0),Product_Category_2(17.0),Product_Category_2(18.0)
0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,6.0,4.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
from sklearn.model_selection import train_test_split

X = df
y = black_df["Purchase"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [17]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((492651, 67), (54740, 67), (492651,), (54740,))

In [18]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X_train, y_train)

from sklearn.metrics import mean_squared_error

y_train_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)
train_MSE = mean_squared_error(y_train, y_train_pred)
test_MSE = mean_squared_error(y_test, y_test_pred)

lr_train_RMSE = np.sqrt(train_MSE)
lr_test_RMSE = np.sqrt(test_MSE)


print(f'train RMSE: {lr_train_RMSE}\ntest RMSE: {lr_test_RMSE}')

train RMSE: 2984.644790900303
test RMSE: 2988.2143626283964


In [19]:
from sklearn.linear_model import Ridge, Lasso

ridge = Ridge()
lasso = Lasso()
ridge = Ridge()
ridge.fit(X_train, y_train)

y_train_pred = ridge.predict(X_train)
y_test_pred = ridge.predict(X_test)
ridge_train_RMSE = np.sqrt(mean_squared_error(y_train, y_train_pred))
ridge_test_RMSE = np.sqrt(mean_squared_error(y_test, y_test_pred))


print(f'train RMSE: {ridge_train_RMSE}\ntest RMSE: {ridge_test_RMSE}')

train RMSE: 2984.59347990721
test RMSE: 2988.2100628153257


In [20]:
lasso = Lasso()
lasso.fit(X_train, y_train)

y_train_pred = lasso.predict(X_train)
y_test_pred = lasso.predict(X_test)
lasso_train_RMSE = np.sqrt(mean_squared_error(y_train, y_train_pred))
lasso_test_RMSE = np.sqrt(mean_squared_error(y_test, y_test_pred))

print(f'train RMSE: {lasso_train_RMSE}\ntest RMSE: {lasso_test_RMSE}')

train RMSE: 2985.806714901427
test RMSE: 2989.896005879615


In [21]:
from sklearn.ensemble import RandomForestRegressor

forest = RandomForestRegressor()
X_train_samples = X_train.sample(50000, random_state=42)
y_train_samples = y_train.sample(50000, random_state=42)
forest.fit(X_train_samples, y_train_samples)

y_train_pred = forest.predict(X_train)
y_test_pred = forest.predict(X_test)
forest_train_RMSE = np.sqrt(mean_squared_error(y_train, y_train_pred))
forest_test_RMSE = np.sqrt(mean_squared_error(y_test, y_test_pred))


print(f'train RMSE: {forest_train_RMSE}\ntest RMSE: {forest_test_RMSE}')

train RMSE: 3078.637187755367
test RMSE: 3175.0118562247453


In [22]:
from sklearn.ensemble import GradientBoostingRegressor

gbr = GradientBoostingRegressor()
X_train_samples = X_train.sample(50000, random_state=42)
y_train_samples = y_train.sample(50000, random_state=42)
gbr.fit(X_train_samples, y_train_samples)

y_train_pred = gbr.predict(X_train)
y_test_pred = gbr.predict(X_test)
gbr_train_RMSE = np.sqrt(mean_squared_error(y_train, y_train_pred))
gbr_test_RMSE = np.sqrt(mean_squared_error(y_test, y_test_pred))


print(f'train RMSE: {gbr_train_RMSE}\ntest RMSE: {gbr_test_RMSE}')

train RMSE: 3011.5946286548287
test RMSE: 3017.992181009852


In [23]:
from xgboost import XGBRegressor

xgb = XGBRegressor(booster='gbtree', n_estimator=300, learning_rate=0.45, reg_lambda=1, reg_alpha=0.05)
xgb.fit(X_train, y_train)

y_train_pred = xgb.predict(X_train)
y_test_pred = xgb.predict(X_test)
xgb_train_RMSE = np.sqrt(mean_squared_error(y_train, y_train_pred))
xgb_test_RMSE = np.sqrt(mean_squared_error(y_test, y_test_pred))


print(f'train RMSE: {xgb_train_RMSE}\ntest RMSE: {xgb_test_RMSE}')

Parameters: { "n_estimator" } are not used.

train RMSE: 2859.8959845762806
test RMSE: 2906.483405728298


In [24]:
models_evaluations = {
    'Train_RMSE': [lr_train_RMSE, ridge_train_RMSE, lasso_train_RMSE, forest_train_RMSE, gbr_train_RMSE, xgb_train_RMSE],
    'Test_RMSE': [lr_test_RMSE, ridge_test_RMSE, lasso_test_RMSE, forest_test_RMSE, gbr_test_RMSE, xgb_test_RMSE]
}
models = ['Linear regression', 'Ridge regression', 'Lasso Regression', 'Random forest', 'GradientBoosting regression', 'XGBoost regression']

models_comparaison_df = pd.DataFrame(data=models_evaluations, index=models)
models_comparaison_df

Unnamed: 0,Train_RMSE,Test_RMSE
Linear regression,2984.644791,2988.214363
Ridge regression,2984.59348,2988.210063
Lasso Regression,2985.806715,2989.896006
Random forest,3078.637188,3175.011856
GradientBoosting regression,3011.594629,3017.992181
XGBoost regression,2859.895985,2906.483406
