In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import math
%matplotlib inline
# Default figure size, as (width, height) in inches, for every plot in this notebook.
plt.rcParams['figure.figsize'] = (8,6)

In [2]:
# Load the train and test splits; both paths are relative to the working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

#### Data Preprocessing

In [3]:
# Column dtypes and non-null counts, used to spot columns with missing values.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 12 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   User_ID                     550068 non-null  int64  
 1   Product_ID                  550068 non-null  object 
 2   Gender                      550068 non-null  object 
 3   Age                         550068 non-null  object 
 4   Occupation                  550068 non-null  int64  
 5   City_Category               550068 non-null  object 
 6   Stay_In_Current_City_Years  550068 non-null  object 
 7   Marital_Status              550068 non-null  int64  
 8   Product_Category_1          550068 non-null  int64  
 9   Product_Category_2          376430 non-null  float64
 10  Product_Category_3          166821 non-null  float64
 11  Purchase                    550068 non-null  int64  
dtypes: float64(2), int64(5), object(5)
memory usage: 50.4+ MB


In [4]:
# Summary statistics for the numeric columns of the training frame.
train.describe()

Unnamed: 0,User_ID,Occupation,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
count,550068.0,550068.0,550068.0,550068.0,376430.0,166821.0,550068.0
mean,1003029.0,8.076707,0.409653,5.40427,9.842329,12.668243,9263.968713
std,1727.592,6.52266,0.49177,3.936211,5.08659,4.125338,5023.065394
min,1000001.0,0.0,0.0,1.0,2.0,3.0,12.0
25%,1001516.0,2.0,0.0,1.0,5.0,9.0,5823.0
50%,1003077.0,7.0,0.0,5.0,9.0,14.0,8047.0
75%,1004478.0,14.0,1.0,8.0,15.0,16.0,12054.0
max,1006040.0,20.0,1.0,20.0,18.0,18.0,23961.0


In [5]:
def ranomly_pick(data, rng=None):
    """Convert age-bracket labels into single integer ages.

    Each label is scanned for runs of digits and handled as follows:

    * one number (e.g. ``"55+"``): that number is used as-is;
    * two numbers whose lower bound is 0 (e.g. ``"0-17"``): the upper bound
      is used;
    * two numbers otherwise (e.g. ``"26-35"``): one of four evenly spaced
      points between the bounds (both inclusive) is drawn at random and
      rounded to an integer.

    Parameters
    ----------
    data : iterable of str
        Age labels to convert.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness for the bracket sampling. When omitted, NumPy's
        global random state is used, so results are only reproducible if the
        caller has seeded it (``np.random.seed(...)``).

    Returns
    -------
    list of int
        One age per input label, in input order.

    Raises
    ------
    IndexError
        If a label contains no digits.
    """
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    digits = re.compile(r"\d+")
    chooser = np.random if rng is None else rng
    ages = []
    for label in data:
        bounds = [int(token) for token in digits.findall(label)]
        if len(bounds) > 1:
            low, high = bounds[0], bounds[1]
            if low != 0:
                picked = chooser.choice(np.linspace(low, high, 4))
                ages.append(int(round(picked)))
            else:
                # A bracket starting at 0 is collapsed to its upper bound.
                ages.append(high)
        else:
            ages.append(bounds[0])
    return ages

In [6]:
def check_null(cols, df):
    """Print a one-line missing-value report for each requested column.

    For every name in ``cols`` the line shows whether ``df[name]`` contains
    any null value, followed by the number of nulls in that column.
    Nothing is returned.
    """
    for name in cols:
        null_mask = df[name].isnull()
        has_null = null_mask.any(axis=0)
        null_count = null_mask.sum()
        print(f"{name} --> {has_null} : total == {null_count}")

In [7]:
def remove_symbol(df):
    """Reduce ``Stay_In_Current_City_Years`` to its leading integer, in place.

    Every value is replaced by the first run of digits it contains, converted
    to an integer (e.g. ``"4+"`` becomes ``4``). The column is rewritten on
    ``df`` itself and nothing is returned.

    The conversion is vectorised and does not depend on the frame's index, so
    it works on frames whose index is not ``0..n-1``. Values are passed
    through ``str`` first, so running the function again on an already
    converted column is a no-op rather than an error.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``Stay_In_Current_City_Years`` column.

    Raises
    ------
    ValueError
        If some value contains no digits (the extracted result is missing and
        cannot be cast to an integer).
    """
    column = "Stay_In_Current_City_Years"
    first_digits = df[column].astype(str).str.extract(r"(\d+)", expand=False)
    df[column] = first_digits.astype(int)

In [8]:
# Missing-value report for every column of the training frame.
check_null(train.columns.tolist(), train)

User_ID --> False : total == 0
Product_ID --> False : total == 0
Gender --> False : total == 0
Age --> False : total == 0
Occupation --> False : total == 0
City_Category --> False : total == 0
Stay_In_Current_City_Years --> False : total == 0
Marital_Status --> False : total == 0
Product_Category_1 --> False : total == 0
Product_Category_2 --> True : total == 173638
Product_Category_3 --> True : total == 383247
Purchase --> False : total == 0


In [9]:
# Missing-value report for every column of the test frame.
check_null(test.columns.tolist(), test)

User_ID --> False : total == 0
Product_ID --> False : total == 0
Gender --> False : total == 0
Age --> False : total == 0
Occupation --> False : total == 0
City_Category --> False : total == 0
Stay_In_Current_City_Years --> False : total == 0
Marital_Status --> False : total == 0
Product_Category_1 --> False : total == 0
Product_Category_2 --> True : total == 72344
Product_Category_3 --> True : total == 162562


In [10]:
def change_null(df):
    """Return a new frame in which every NaN in ``df`` is replaced by 0.

    ``df`` itself is left unmodified.
    """
    filled = df.replace(to_replace=np.nan, value=0)
    return filled

In [11]:
# Replace every missing value with 0 in both splits.
train, test = change_null(train), change_null(test)

In [12]:
# Seed NumPy's global RNG so the randomly sampled ages (and every model
# trained on them) are identical on each Restart & Run All.
np.random.seed(42)
train_age = ranomly_pick(train["Age"])
test_age = ranomly_pick(test["Age"])
# Add the numeric ages as a new "Ages" column at position 4 of each frame.
train.insert(4, "Ages", train_age)
test.insert(4, "Ages", test_age)

In [13]:
# Remove the identifier columns and the raw Age labels from both splits.
unused_columns = ["User_ID", "Product_ID", "Age"]
for frame in (train, test):
    frame.drop(columns=unused_columns, inplace=True)

In [14]:
# Reduce Stay_In_Current_City_Years to its leading integer in both splits (in place).
for frame in (train, test):
    remove_symbol(frame)

In [15]:
# Store the cleaned years-in-city values as 32-bit integers in both splits.
stay_column = "Stay_In_Current_City_Years"
for frame in (train, test):
    frame[stay_column] = frame[stay_column].astype(np.int32)

In [16]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import make_column_transformer

# Encode Gender as integer codes. The encoder is fitted on the training split
# only and then reused for the test split, so both share the same mapping.
le = LabelEncoder()
train["Gender"] = le.fit_transform(train["Gender"])
test["Gender"] = le.transform(test["Gender"])

# One-hot encode City_Category. sparse_threshold=0 forces a dense ndarray:
# with the default threshold (0.3) the transformer switches to a SciPy sparse
# matrix as soon as the one-hot block's density falls below it, and the
# pd.DataFrame(...) construction that follows expects a dense array.
ct1 = make_column_transformer(
    (OneHotEncoder(), ["City_Category"]),
    sparse_threshold=0,
)
train_dummpy = ct1.fit_transform(train)
test_dummy = ct1.transform(test)

In [17]:
# Wrap the one-hot arrays in DataFrames that reuse the source frame's index.
# pd.concat(axis=1) aligns rows by index label, so a fresh default index would
# misalign rows (and introduce NaNs) if train/test ever carried any other index.
city_columns = list(ct1.get_feature_names_out())
train_dummpy = pd.DataFrame(train_dummpy, columns=city_columns, index=train.index)
test_dummy = pd.DataFrame(test_dummy, columns=city_columns, index=test.index)
train = pd.concat([train, train_dummpy], axis=1)
test = pd.concat([test, test_dummy], axis=1)

In [18]:
# Avoid the dummy-variable trap: drop the raw City_Category column together
# with one one-hot level (A), keeping B and C as the encoded features.
redundant_city_columns = ["City_Category", "onehotencoder__City_Category_A"]
for frame in (train, test):
    frame.drop(columns=redundant_city_columns, inplace=True)

In [23]:
# Separate the regression target (Purchase) from the feature matrix.
y_train = train["Purchase"]
X_train = train.drop(columns="Purchase")

In [20]:
# Preview the first rows of the fully encoded training frame.
train.head()

Unnamed: 0,Gender,Ages,Occupation,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase,onehotencoder__City_Category_B,onehotencoder__City_Category_C
0,0,17,10,2,0,3,0.0,0.0,8370,0.0,0.0
1,0,17,10,2,0,1,6.0,14.0,15200,0.0,0.0
2,0,17,10,2,0,12,0.0,0.0,1422,0.0,0.0
3,0,17,10,2,0,12,14.0,0.0,1057,0.0,0.0
4,1,55,16,4,0,8,0.0,0.0,7969,0.0,1.0


In [21]:
# Preview the first rows of the fully encoded test frame.
test.head()

Unnamed: 0,Gender,Ages,Occupation,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,onehotencoder__City_Category_B,onehotencoder__City_Category_C
0,1,49,7,2,1,1,11.0,0.0,1.0,0.0
1,1,32,17,0,0,3,5.0,0.0,0.0,1.0
2,0,42,1,4,1,5,14.0,0.0,1.0,0.0
3,0,36,1,4,1,4,9.0,0.0,1.0,0.0
4,0,32,1,1,0,4,5.0,12.0,0.0,1.0


#### Multiple Linear Regression (MLR)

In [24]:
from sklearn.linear_model import LinearRegression
# Fit a linear regression of Purchase on all encoded feature columns.
mlr_model = LinearRegression()
mlr_model.fit(X_train,y_train)

In [25]:
# R^2 (coefficient of determination) on the training data. This is what a
# regressor's .score() returns; it is not a classification accuracy.
print(f"Score :{mlr_model.score(X_train,y_train)}")

Score :0.15238424253709326


#### Support Vector Regression (SVR)

#### Random Forest Regression (RFR)