In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')
df.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [3]:
print df_test.shape
print df.shape

(233599, 11)
(550068, 12)


In [4]:
def ImputeMissingValues(df,col_names,fill_with):
    df[col_names] = df[col_names].fillna(fill_with)
    return df

In [5]:
df = ImputeMissingValues(df,['Product_Category_2','Product_Category_3'],0)
df_test = ImputeMissingValues(df_test,['Product_Category_2','Product_Category_3'],0)

In [6]:
df_test.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3
0,1000004,P00128942,M,46-50,7,B,2,1,1,11.0,0.0
1,1000009,P00113442,M,26-35,17,C,0,0,3,5.0,0.0
2,1000010,P00288442,F,36-45,1,B,4+,1,5,14.0,0.0
3,1000010,P00145342,F,36-45,1,B,4+,1,4,9.0,0.0
4,1000011,P00053842,F,26-35,1,C,1,0,4,5.0,12.0


In [7]:
df_mod = df.drop(['User_ID','Product_ID'],axis=1)
df_test_mod = df_test.drop(['User_ID','Product_ID'],axis=1)

#df_mod.head()

In [8]:
def LabelEncode(df,col_name):
    le = LabelEncoder()
    df[col_name] = le.fit_transform(df[col_name])
    return df

In [9]:
df_mod = LabelEncode(df_mod,'Gender')
df_mod = LabelEncode(df_mod,'City_Category')
df_mod['Stay_In_Current_City_Years'] = np.where(df_mod['Stay_In_Current_City_Years'] == '4+',4,df_mod['Stay_In_Current_City_Years'])


df_test_mod = LabelEncode(df_test_mod,'Gender')
df_test_mod = LabelEncode(df_test_mod,'City_Category')
df_test_mod['Stay_In_Current_City_Years'] = np.where(df_test_mod['Stay_In_Current_City_Years'] == '4+',4,df_test_mod['Stay_In_Current_City_Years'])


In [10]:
def convertDummies(df,column_name):
    df_temp = pd.get_dummies(df[column_name],prefix=column_name)
    df = pd.concat([df,df_temp],axis=1)
    df.drop(column_name,axis=1,inplace=True)
    return df

In [11]:
df_mod.head()

Unnamed: 0,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,0,0-17,10,0,2,0,3,0.0,0.0,8370
1,0,0-17,10,0,2,0,1,6.0,14.0,15200
2,0,0-17,10,0,2,0,12,0.0,0.0,1422
3,0,0-17,10,0,2,0,12,14.0,0.0,1057
4,1,55+,16,2,4,0,8,0.0,0.0,7969


In [12]:
#df_mod = convertDummies(df_mod,'Gender')
#df_mod = convertDummies(df_mod,'City_Category')
df_mod = convertDummies(df_mod,'Occupation')
df_mod = convertDummies(df_mod,'Age')
#df_mod = convertDummies(df_mod,'Stay_In_Current_City_Years')
df_mod = convertDummies(df_mod,'Product_Category_1')
df_mod = convertDummies(df_mod,'Product_Category_2')
df_mod = convertDummies(df_mod,'Product_Category_3')

In [13]:
#df_test_mod = convertDummies(df_test_mod,'Gender')
#df_test_mod = convertDummies(df_test_mod,'City_Category')
df_test_mod = convertDummies(df_test_mod,'Occupation')
df_test_mod = convertDummies(df_test_mod,'Age')
#df_test_mod = convertDummies(df_test_mod,'Stay_In_Current_City_Years')
df_test_mod = convertDummies(df_test_mod,'Product_Category_1')
df_test_mod = convertDummies(df_test_mod,'Product_Category_2')
df_test_mod = convertDummies(df_test_mod,'Product_Category_3')

In [14]:
df_test_mod.head()

Unnamed: 0,Gender,City_Category,Stay_In_Current_City_Years,Marital_Status,Occupation_0,Occupation_1,Occupation_2,Occupation_3,Occupation_4,Occupation_5,...,Product_Category_3_9.0,Product_Category_3_10.0,Product_Category_3_11.0,Product_Category_3_12.0,Product_Category_3_13.0,Product_Category_3_14.0,Product_Category_3_15.0,Product_Category_3_16.0,Product_Category_3_17.0,Product_Category_3_18.0
0,1,1,2,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,4,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,4,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,2,1,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [15]:
df_test_mod['Product_Category_1_19'] = 0
df_test_mod['Product_Category_1_20'] = 0

In [16]:
print len(df_mod.columns)
print len(df_test_mod.columns)

87
86


In [17]:
X_train = df_mod.drop(['Purchase'],axis=1)
y_train = df_mod['Purchase']

In [18]:
X_test = df_test_mod
#.drop(['Purchase'],axis=1)
#y_test = df_test_mod['Purchase']

In [19]:
def runModel(model_name,X_train,X_test,y_train):
    model = model_name
    model.fit(X_train,y_train)
    ypred = model.predict(X_test)
    return ypred
    #return r2_score(ypred,y_test),mean_squared_error(ypred,y_test)

In [20]:
#X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=9)

In [21]:
ypred = runModel(RandomForestRegressor(),X_train,X_test,y_train)
ypred

array([ 17503.45833333,  11248.1       ,   7654.06666667, ...,
        13779.06      ,  18906.7       ,   2471.66666667])

In [22]:
#r2, mse = runModel(DecisionTreeRegressor(),X_train,X_test,y_train,y_test)

#print 'R-Square: ',r2
#print 'RMSE: ',np.sqrt(mse)

In [23]:
#r2, mse = runModel(RandomForestRegressor(),X_train,X_test,y_train,y_test)

#print 'R-Square: ',r2
#print 'RMSE: ',np.sqrt(mse)

In [24]:
from sklearn.model_selection import cross_val_score
model = DecisionTreeRegressor()
scores = cross_val_score(RandomForestRegressor(),X,y,scoring='neg_mean_squared_error',cv=5)
scores

NameError: name 'X' is not defined

In [None]:
scores = np.sqrt(scores * -1)
scores.mean()

In [25]:
df_test['Purchase'] = ypred

In [26]:
df_output = df_test[['User_ID', 'Product_ID','Purchase']]

In [27]:
df_output.to_csv('submission_2.csv')

In [None]:
#Target is to make error under 2400