In [1]:
import pandas as pd
sales = pd.read_csv('Black Friday/train.csv')
# sales.head()

### Data Preprocessing

In [2]:
sales.shape

(550068, 12)

In [3]:
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 12 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   User_ID                     550068 non-null  int64  
 1   Product_ID                  550068 non-null  object 
 2   Gender                      550068 non-null  object 
 3   Age                         550068 non-null  object 
 4   Occupation                  550068 non-null  int64  
 5   City_Category               550068 non-null  object 
 6   Stay_In_Current_City_Years  550068 non-null  object 
 7   Marital_Status              550068 non-null  int64  
 8   Product_Category_1          550068 non-null  int64  
 9   Product_Category_2          376430 non-null  float64
 10  Product_Category_3          166821 non-null  float64
 11  Purchase                    550068 non-null  int64  
dtypes: float64(2), int64(5), object(5)
memory usage: 50.4+ MB


In [4]:
import numpy as np
np.sort(sales.Product_Category_1.unique())

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20], dtype=int64)

In [5]:
np.sort(sales.Product_Category_2.unique())

array([ 2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13., 14.,
       15., 16., 17., 18., nan])

In [6]:
np.sort(sales.Product_Category_3.unique())

array([ 3.,  4.,  5.,  6.,  8.,  9., 10., 11., 12., 13., 14., 15., 16.,
       17., 18., nan])

In [7]:
df2 = sales.copy(deep=True)

In [8]:
df2.shape

(550068, 12)

In [9]:
df2['Product_Category_1'] = df2['Product_Category_1'].astype('str')
df2['Product_Category_2'] = df2['Product_Category_2'].astype('str')
df2['Product_Category_3'] = df2['Product_Category_3'].astype('str')

In [10]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 12 columns):
 #   Column                      Non-Null Count   Dtype 
---  ------                      --------------   ----- 
 0   User_ID                     550068 non-null  int64 
 1   Product_ID                  550068 non-null  object
 2   Gender                      550068 non-null  object
 3   Age                         550068 non-null  object
 4   Occupation                  550068 non-null  int64 
 5   City_Category               550068 non-null  object
 6   Stay_In_Current_City_Years  550068 non-null  object
 7   Marital_Status              550068 non-null  int64 
 8   Product_Category_1          550068 non-null  object
 9   Product_Category_2          550068 non-null  object
 10  Product_Category_3          550068 non-null  object
 11  Purchase                    550068 non-null  int64 
dtypes: int64(4), object(8)
memory usage: 50.4+ MB


In [16]:
from sklearn.impute import SimpleImputer
simpImp = SimpleImputer(strategy='constant', fill_value='unknown')

df2[['Product_Category_2', 'Product_Category_3']] = simpImp.fit_transform(df2[['Product_Category_2', 'Product_Category_3']])

In [17]:
X, y = df2.drop(columns=['User_ID', 'Product_ID', 'Purchase']), df2['Purchase']

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=24)

In [19]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer, make_column_selector
ohe = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore').set_output(transform='pandas')

ct = make_column_transformer(
        (ohe, make_column_selector(dtype_include=object)),
        ('passthrough', make_column_selector(dtype_exclude=object)),
        verbose_feature_names_out=False).set_output(transform='pandas')

X_train = ct.fit_transform(X_train)
X_test = ct.transform(X_test)

In [20]:
X_train.shape

(385047, 66)

### XGBRegressor

In [26]:
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
xgbr = XGBRegressor(random_state=24)
xgbr.fit(X_train, y_train)
y_pred = xgbr.predict(X_test)
r2_score(y_test, y_pred)

0.6696193075028694

### LightRegressor

In [27]:
from lightgbm  import LGBMRegressor
lgbmr = LGBMRegressor(random_state=24)
lgbmr.fit(X_train, y_train)
y_pred = lgbmr.predict(X_test)
r2_score(y_test, y_pred)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005373 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 151
[LightGBM] [Info] Number of data points in the train set: 385047, number of used features: 66
[LightGBM] [Info] Start training from score 9261.376115


0.6667137526960094

### CatBoost

In [33]:
from catboost import CatBoostRegressor
cbr = CatBoostRegressor(random_state=24)
cbr.fit(X_train, y_train)
y_pred = cbr.predict(X_test)
r2_score(y_test, y_pred)

Learning rate set to 0.10488
0:	learn: 4761.5497120	total: 216ms	remaining: 3m 35s
1:	learn: 4541.9642367	total: 241ms	remaining: 2m
2:	learn: 4355.4050321	total: 265ms	remaining: 1m 27s
3:	learn: 4199.1302019	total: 289ms	remaining: 1m 11s
4:	learn: 4067.3160291	total: 313ms	remaining: 1m 2s
5:	learn: 3955.9924044	total: 336ms	remaining: 55.7s
6:	learn: 3861.4557199	total: 360ms	remaining: 51.1s
7:	learn: 3782.2383557	total: 383ms	remaining: 47.4s
8:	learn: 3716.6909875	total: 406ms	remaining: 44.7s
9:	learn: 3649.0446807	total: 430ms	remaining: 42.5s
10:	learn: 3598.9693886	total: 455ms	remaining: 40.9s
11:	learn: 3542.3729183	total: 483ms	remaining: 39.7s
12:	learn: 3500.1213197	total: 505ms	remaining: 38.3s
13:	learn: 3465.1645588	total: 528ms	remaining: 37.2s
14:	learn: 3422.2842843	total: 554ms	remaining: 36.4s
15:	learn: 3388.0612558	total: 578ms	remaining: 35.6s
16:	learn: 3361.3200190	total: 601ms	remaining: 34.7s
17:	learn: 3336.5950831	total: 623ms	remaining: 34s
18:	learn: 

0.6737123946086798

### Inferencing

In [34]:
X_train, y_train = ct.fit_transform(X), y
X_test = pd.read_csv('Black Friday/test.csv')
X_test.fillna(0, inplace=True)

In [35]:
X_test['Product_Category_1'] = X_test['Product_Category_1'].astype('str')
X_test['Product_Category_2'] = X_test['Product_Category_2'].astype('str')
X_test['Product_Category_3'] = X_test['Product_Category_3'].astype('str')

simpImp = SimpleImputer(strategy='constant', fill_value='unknown')
X_test[['Product_Category_2', 'Product_Category_3']] = simpImp.fit_transform(X_test[['Product_Category_2', 'Product_Category_3']])

X_test = ct.transform(X_test)



In [39]:
cbr.fit(X_train, y_train)
y_pred = cbr.predict(X_test)

Learning rate set to 0.11096
0:	learn: 4747.7179154	total: 37.1ms	remaining: 37.1s
1:	learn: 4518.2263793	total: 71.8ms	remaining: 35.8s
2:	learn: 4325.0207346	total: 105ms	remaining: 34.9s
3:	learn: 4165.3308675	total: 139ms	remaining: 34.6s
4:	learn: 4031.9104739	total: 178ms	remaining: 35.4s
5:	learn: 3919.6258418	total: 218ms	remaining: 36.1s
6:	learn: 3833.4604018	total: 252ms	remaining: 35.8s
7:	learn: 3746.0601510	total: 286ms	remaining: 35.5s
8:	learn: 3679.8606414	total: 319ms	remaining: 35.2s
9:	learn: 3625.1671941	total: 352ms	remaining: 34.8s
10:	learn: 3574.4270545	total: 384ms	remaining: 34.5s
11:	learn: 3517.1459798	total: 424ms	remaining: 34.9s
12:	learn: 3469.5995998	total: 456ms	remaining: 34.6s
13:	learn: 3433.1099393	total: 485ms	remaining: 34.2s
14:	learn: 3397.7793420	total: 518ms	remaining: 34s
15:	learn: 3370.9289225	total: 548ms	remaining: 33.7s
16:	learn: 3342.8655922	total: 581ms	remaining: 33.6s
17:	learn: 3317.0456116	total: 613ms	remaining: 33.4s
18:	learn

In [41]:
submit = pd.read_csv('Black Friday/sample_submission.csv')
submit.head()

Unnamed: 0,Purchase,User_ID,Product_ID
0,100,1000004,P00128942
1,100,1000009,P00113442
2,100,1000010,P00288442
3,100,1000010,P00145342
4,100,1000011,P00053842


In [42]:
y_pred.shape

(233599,)

In [43]:
submit['Purchase'] = y_pred

In [44]:
submit.to_csv('submit.csv', index=False)

In [46]:
submit.shape

(233599, 3)