In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import math
from sklearn.impute import SimpleImputer
from scipy import stats
%matplotlib inline
plt.rcParams['figure.figsize'] = (8,6)

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

#### Data Preprocessing

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 12 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   User_ID                     550068 non-null  int64  
 1   Product_ID                  550068 non-null  object 
 2   Gender                      550068 non-null  object 
 3   Age                         550068 non-null  object 
 4   Occupation                  550068 non-null  int64  
 5   City_Category               550068 non-null  object 
 6   Stay_In_Current_City_Years  550068 non-null  object 
 7   Marital_Status              550068 non-null  int64  
 8   Product_Category_1          550068 non-null  int64  
 9   Product_Category_2          376430 non-null  float64
 10  Product_Category_3          166821 non-null  float64
 11  Purchase                    550068 non-null  int64  
dtypes: float64(2), int64(5), object(5)
memory usage: 50.4+ MB


In [4]:
train.describe()

Unnamed: 0,User_ID,Occupation,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
count,550068.0,550068.0,550068.0,550068.0,376430.0,166821.0,550068.0
mean,1003029.0,8.076707,0.409653,5.40427,9.842329,12.668243,9263.968713
std,1727.592,6.52266,0.49177,3.936211,5.08659,4.125338,5023.065394
min,1000001.0,0.0,0.0,1.0,2.0,3.0,12.0
25%,1001516.0,2.0,0.0,1.0,5.0,9.0,5823.0
50%,1003077.0,7.0,0.0,5.0,9.0,14.0,8047.0
75%,1004478.0,14.0,1.0,8.0,15.0,16.0,12054.0
max,1006040.0,20.0,1.0,20.0,18.0,18.0,23961.0


In [5]:
train.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [6]:
test.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3
0,1000004,P00128942,M,46-50,7,B,2,1,1,11.0,
1,1000009,P00113442,M,26-35,17,C,0,0,3,5.0,
2,1000010,P00288442,F,36-45,1,B,4+,1,5,14.0,
3,1000010,P00145342,F,36-45,1,B,4+,1,4,9.0,
4,1000011,P00053842,F,26-35,1,C,1,0,4,5.0,12.0


In [7]:
def randomly_pick(data):
    container = []
    for i in data:
        data = re.findall(re.compile("\d+"), i)
        if len(data) > 1:
            if int(data[0]) != 0:
                rang = round(np.random.choice(np.linspace(int(data[0]),int(data[1]),4)))
                container.append(int(rang))
            else:
                container.append(int(data[1]))
        else:
            container.append(int(data[0]))
    return container

In [8]:
def check_null(cols, df):
    for i in cols:
        print(f"{i} --> {df[i].isnull().any(axis=0)} : total == {df[i].isnull().sum()}")

In [9]:
def remove_symbol(df):
    mode = re.compile("\d+")
    for i in range(0, len(df)):
        df.loc[i, "Stay_In_Current_City_Years"] = int(re.findall(mode, df.loc[i, "Stay_In_Current_City_Years"])[0])

In [10]:
def change_null(df):
    return df.replace(np.nan,0)
def fill_null(df, col):
    imputer = SimpleImputer(strategy="most_frequent")
    return imputer.fit_transform(df[col].values.reshape(-1,1))

In [11]:
check_null([*train.columns], train)

User_ID --> False : total == 0
Product_ID --> False : total == 0
Gender --> False : total == 0
Age --> False : total == 0
Occupation --> False : total == 0
City_Category --> False : total == 0
Stay_In_Current_City_Years --> False : total == 0
Marital_Status --> False : total == 0
Product_Category_1 --> False : total == 0
Product_Category_2 --> True : total == 173638
Product_Category_3 --> True : total == 383247
Purchase --> False : total == 0


In [12]:
check_null([*test.columns], test)

User_ID --> False : total == 0
Product_ID --> False : total == 0
Gender --> False : total == 0
Age --> False : total == 0
Occupation --> False : total == 0
City_Category --> False : total == 0
Stay_In_Current_City_Years --> False : total == 0
Marital_Status --> False : total == 0
Product_Category_1 --> False : total == 0
Product_Category_2 --> True : total == 72344
Product_Category_3 --> True : total == 162562


In [13]:
# train = change_null(train)
# test = change_null(test)
train["Product_Category_2"] = fill_null(train, "Product_Category_2")
train["Product_Category_3"] = fill_null(train, "Product_Category_3")
test["Product_Category_2"] = fill_null(test, "Product_Category_2")
test["Product_Category_3"] = fill_null(test, "Product_Category_3")

In [14]:
from sklearn.preprocessing import LabelEncoder
age_encoder = LabelEncoder()
train["Age"] = age_encoder.fit_transform(train["Age"])
test["Age"] = age_encoder.transform(test["Age"])

In [15]:
train.drop(["User_ID","Product_ID"], axis=1, inplace=True)
test.drop(["User_ID","Product_ID"], axis=1, inplace=True)

In [16]:
remove_symbol(train)
remove_symbol(test)

In [17]:
train["Stay_In_Current_City_Years"] = train["Stay_In_Current_City_Years"].astype(np.int32)
test["Stay_In_Current_City_Years"] = test["Stay_In_Current_City_Years"].astype(np.int32)

In [18]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import make_column_transformer
le = LabelEncoder()
train["Gender"] = le.fit_transform(train["Gender"])
test["Gender"] = le.transform(test["Gender"])
ct1 = make_column_transformer((OneHotEncoder(),["City_Category"]))
train_dummpy = ct1.fit_transform(train)
test_dummy = ct1.transform(test)

In [19]:
train_dummpy = pd.DataFrame(train_dummpy, columns=[*ct1.get_feature_names_out()])
test_dummy   = pd.DataFrame(test_dummy, columns=[*ct1.get_feature_names_out()])
train = pd.concat([train, train_dummpy], axis=1)
test  = pd.concat([test, test_dummy], axis=1)

In [20]:
# dummy variables trap 
train.drop(["City_Category","onehotencoder__City_Category_A"],axis=1,inplace=True)
test.drop(["City_Category","onehotencoder__City_Category_A"],axis=1,inplace=True)

In [21]:
X_train = train.drop("Purchase", axis=1)
y_train = train["Purchase"]
X_train = X_train.iloc[0:len(X_train)//4,:]
y_train = y_train[0:len(y_train)//4]
test = test.iloc[0:len(test)//4,:]

In [22]:
X_train.head()

Unnamed: 0,Gender,Age,Occupation,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,onehotencoder__City_Category_B,onehotencoder__City_Category_C
0,0,0,10,2,0,3,8.0,16.0,0.0,0.0
1,0,0,10,2,0,1,6.0,14.0,0.0,0.0
2,0,0,10,2,0,12,8.0,16.0,0.0,0.0
3,0,0,10,2,0,12,14.0,16.0,0.0,0.0
4,1,6,16,4,0,8,8.0,16.0,0.0,1.0


In [23]:
test.head()

Unnamed: 0,Gender,Age,Occupation,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,onehotencoder__City_Category_B,onehotencoder__City_Category_C
0,1,4,7,2,1,1,11.0,16.0,1.0,0.0
1,1,2,17,0,0,3,5.0,16.0,0.0,1.0
2,0,3,1,4,1,5,14.0,16.0,1.0,0.0
3,0,3,1,4,1,4,9.0,16.0,1.0,0.0
4,0,2,1,1,0,4,5.0,12.0,0.0,1.0


#### LinearRegression 

In [24]:
from sklearn.linear_model import LinearRegression
mlr_model = LinearRegression()
mlr_model.fit(X_train,y_train)

LinearRegression()

In [25]:
# Train accuracy
print(f"Score : {mlr_model.score(X_train,y_train)}")

Score : 0.11468494678835883


In [26]:
from sklearn.metrics import mean_squared_error, r2_score
y_train_pred = mlr_model.predict(X_train)
print(f"MSE :{mean_squared_error(y_train, y_train_pred)}")
r2 = r2_score(y_train, y_train_pred)
adjusted_r2 = 1 - (1-r2)*(len(y_train)-1)/(len(y_train)-X_train.shape[1]-1)
print(f"R-squired : {r2}")
print(f"Adjusted R-squired : {adjusted_r2}")

MSE :21967087.27141294
R-squired : 0.11468494678835883
Adjusted R-squired : 0.11462056304850665


In [39]:
x_test_pred = mlr_model.predict(test)
x_test_pred

array([10932.19529982, 10527.63340667,  8752.44483435, ...,
       10009.34260556,  8541.65613366, 11112.08794598])

#### SVR

In [29]:
from sklearn.preprocessing import StandardScaler
scale_X = StandardScaler()
scale_y = StandardScaler()
X_train_scaled = scale_X.fit_transform(X_train)
test_scaled = scale_X.transform(test)
y_train_scaled = scale_y.fit_transform(y_train.values.reshape(-1,1))

In [30]:
# from sklearn.svm import SVR
# svr_model = SVR(kernel='rbf')
# svr_model.fit(X_train,y_train)

#### DT

In [31]:
from sklearn.tree import DecisionTreeRegressor
dr_model = DecisionTreeRegressor()
dr_model.fit(X_train,y_train)

DecisionTreeRegressor()

In [32]:
# Train accuracy
print(f"Score : {dr_model.score(X_train,y_train)}")

Score : 0.8417434420454459


In [33]:
y_train_pred_dt = dr_model.predict(X_train)
print(f"MSE :{mean_squared_error(y_train, y_train_pred_dt)}")
r2DT = r2_score(y_train, y_train_pred_dt)
adjusted_r2DT = 1 - (1-r2DT)*(len(y_train)-1)/(len(y_train)-X_train.shape[1]-1)
print(f"R-squired : {r2DT}")
print(f"Adjusted R-squired : {adjusted_r2DT}")

MSE :3926777.9388249484
R-squired : 0.8417434420454459
Adjusted R-squired : 0.8417319329798085


In [34]:
X_test_pred_dt = dr_model.predict(test)
X_test_pred_dt

array([12126.        ,  5803.        ,  8169.        , ...,
       13609.16666667,  5344.6031746 , 15414.        ])

#### RFR

In [35]:
from sklearn.ensemble import RandomForestRegressor
rfr_model = RandomForestRegressor(n_estimators=100)
rfr_model.fit(X_train,y_train)

RandomForestRegressor()

In [36]:
# Train accuracy
print(f"Score : {rfr_model.score(X_train,y_train)}")

Score : 0.8222819814119554


In [37]:
y_train_pred_rfr = rfr_model.predict(X_train)
print(f"MSE :{mean_squared_error(y_train, y_train_pred_rfr)}")
r2RFR = r2_score(y_train, y_train_pred_rfr)
adjusted_r2RFR = 1 - (1-r2RFR)*(len(y_train)-1)/(len(y_train)-X_train.shape[1]-1)
print(f"R-squired : {r2RFR}")
print(f"Adjusted R-squired : {adjusted_r2RFR}")

MSE :4409669.992466393
R-squired : 0.8222819814119554
Adjusted R-squired : 0.822269057029122


In [40]:
X_test_pred_rfr = rfr_model.predict(test)
X_test_pred_rfr

array([15416.22630159, 10797.6105    ,  8128.51661555, ...,
       13439.06292569,  5360.64263139, 13896.988     ])