In [114]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')
df.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [4]:
df.columns

Index([u'User_ID', u'Product_ID', u'Gender', u'Age', u'Occupation',
       u'City_Category', u'Stay_In_Current_City_Years', u'Marital_Status',
       u'Product_Category_1', u'Product_Category_2', u'Product_Category_3',
       u'Purchase'],
      dtype='object')

In [6]:
df.isnull().any()

User_ID                       False
Product_ID                    False
Gender                        False
Age                           False
Occupation                    False
City_Category                 False
Stay_In_Current_City_Years    False
Marital_Status                False
Product_Category_1            False
Product_Category_2             True
Product_Category_3             True
Purchase                      False
dtype: bool

In [5]:
def ImputeMissingValues(df,col_names,fill_with):
    df[col_names] = df[col_names].fillna(fill_with)
    return df

In [7]:
df = ImputeMissingValues(df,['Product_Category_2','Product_Category_3'],0)
df_test = ImputeMissingValues(df,['Product_Category_2','Product_Category_3'],0)

In [100]:
df_mod = df.drop(['User_ID','Product_ID'],axis=1)
df_mod.head()

Unnamed: 0,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,0,0-17,10,A,2,0,3,0.0,0.0,8370
1,0,0-17,10,A,2,0,1,6.0,14.0,15200
2,0,0-17,10,A,2,0,12,0.0,0.0,1422
3,0,0-17,10,A,2,0,12,14.0,0.0,1057
4,1,55+,16,C,4+,0,8,0.0,0.0,7969


In [101]:
def LabelEncode(df,col_name):
    le = LabelEncoder()
    df[col_name] = le.fit_transform(df[col_name])
    return df

In [102]:
df_mod = LabelEncode(df_mod,'Gender')
df_mod = LabelEncode(df_mod,'City_Category')
df_mod['Stay_In_Current_City_Years'] = np.where(df_mod['Stay_In_Current_City_Years'] == '4+',4,df_mod['Stay_In_Current_City_Years'])

In [103]:
def convertDummies(df,column_name):
    df_temp = pd.get_dummies(df[column_name],prefix=column_name)
    df = pd.concat([df,df_temp],axis=1)
    df.drop(column_name,axis=1,inplace=True)
    return df

In [104]:
df_mod.head()

Unnamed: 0,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,0,0-17,10,0,2,0,3,0.0,0.0,8370
1,0,0-17,10,0,2,0,1,6.0,14.0,15200
2,0,0-17,10,0,2,0,12,0.0,0.0,1422
3,0,0-17,10,0,2,0,12,14.0,0.0,1057
4,1,55+,16,2,4,0,8,0.0,0.0,7969


In [105]:
df_mod = convertDummies(df_mod,'Gender')
df_mod = convertDummies(df_mod,'City_Category')
df_mod = convertDummies(df_mod,'Occupation')
df_mod = convertDummies(df_mod,'Age')
df_mod = convertDummies(df_mod,'Stay_In_Current_City_Years')
df_mod = convertDummies(df_mod,'Product_Category_1')

In [106]:
df_mod.columns

Index([u'Marital_Status', u'Product_Category_2', u'Product_Category_3',
       u'Purchase', u'Gender_0', u'Gender_1', u'City_Category_0',
       u'City_Category_1', u'City_Category_2', u'Occupation_0',
       u'Occupation_1', u'Occupation_2', u'Occupation_3', u'Occupation_4',
       u'Occupation_5', u'Occupation_6', u'Occupation_7', u'Occupation_8',
       u'Occupation_9', u'Occupation_10', u'Occupation_11', u'Occupation_12',
       u'Occupation_13', u'Occupation_14', u'Occupation_15', u'Occupation_16',
       u'Occupation_17', u'Occupation_18', u'Occupation_19', u'Occupation_20',
       u'Age_0-17', u'Age_18-25', u'Age_26-35', u'Age_36-45', u'Age_46-50',
       u'Age_51-55', u'Age_55+', u'Stay_In_Current_City_Years_4',
       u'Stay_In_Current_City_Years_0', u'Stay_In_Current_City_Years_1',
       u'Stay_In_Current_City_Years_2', u'Stay_In_Current_City_Years_3',
       u'Product_Category_1_1', u'Product_Category_1_2',
       u'Product_Category_1_3', u'Product_Category_1_4',
       u'P

In [108]:
df_mod.head()

Unnamed: 0,Marital_Status,Product_Category_2,Product_Category_3,Purchase,Gender_0,Gender_1,City_Category_0,City_Category_1,City_Category_2,Occupation_0,...,Product_Category_1_11,Product_Category_1_12,Product_Category_1_13,Product_Category_1_14,Product_Category_1_15,Product_Category_1_16,Product_Category_1_17,Product_Category_1_18,Product_Category_1_19,Product_Category_1_20
0,0,0.0,0.0,8370,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,6.0,14.0,15200,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0.0,0.0,1422,1,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,14.0,0.0,1057,1,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,0.0,0.0,7969,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [110]:
X = df_mod.drop(['Purchase'],axis=1)
y = df_mod['Purchase']

In [111]:
X.columns

Index([u'Marital_Status', u'Product_Category_2', u'Product_Category_3',
       u'Gender_0', u'Gender_1', u'City_Category_0', u'City_Category_1',
       u'City_Category_2', u'Occupation_0', u'Occupation_1', u'Occupation_2',
       u'Occupation_3', u'Occupation_4', u'Occupation_5', u'Occupation_6',
       u'Occupation_7', u'Occupation_8', u'Occupation_9', u'Occupation_10',
       u'Occupation_11', u'Occupation_12', u'Occupation_13', u'Occupation_14',
       u'Occupation_15', u'Occupation_16', u'Occupation_17', u'Occupation_18',
       u'Occupation_19', u'Occupation_20', u'Age_0-17', u'Age_18-25',
       u'Age_26-35', u'Age_36-45', u'Age_46-50', u'Age_51-55', u'Age_55+',
       u'Stay_In_Current_City_Years_4', u'Stay_In_Current_City_Years_0',
       u'Stay_In_Current_City_Years_1', u'Stay_In_Current_City_Years_2',
       u'Stay_In_Current_City_Years_3', u'Product_Category_1_1',
       u'Product_Category_1_2', u'Product_Category_1_3',
       u'Product_Category_1_4', u'Product_Category_1_5'

In [120]:
def runModel(model_name,X_train,X_test,y_train,y_test):
    model = model_name
    model.fit(X_train,y_train)
    ypred = model.predict(X_test)
    return r2_score(ypred,y_test),mean_squared_error(ypred,y_test)

In [121]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=9)

In [122]:
X_train.head().shape

(5, 61)

In [123]:
r2, mse = runModel(DecisionTreeRegressor(),X_train,X_test,y_train,y_test)

print 'R-Square: ',r2
print 'RMSE: ',np.sqrt(mse)

R-Square:  0.464068157705
RMSE:  3347.34609972


In [124]:
r2, mse = runModel(RandomForestRegressor(),X_train,X_test,y_train,y_test)

print 'R-Square: ',r2
print 'RMSE: ',np.sqrt(mse)

R-Square:  0.498971225776
RMSE:  3089.51947373


In [125]:
df.shape

(550068, 12)

In [126]:
X_train.shape

(385047, 61)

In [128]:
from sklearn.model_selection import cross_val_score
model = DecisionTreeRegressor()
scores = cross_val_score(model,X,y,scoring='neg_mean_squared_error',cv=5)
scores

array([-11161616.16859719, -11025364.94345075, -11022655.88929925,
       -11041797.71851856, -10938753.85554696])

In [130]:
scores = np.sqrt(scores * -1)
scores

array([ 3340.90050265,  3320.4464976 ,  3320.03853732,  3322.920059  ,
        3307.37869854])

In [131]:
scores.mean()

3322.336859022932