In [None]:
#Load Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error

# Load Data
# Read the training set; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer="train.csv")


In [None]:
# Print pandas' summary of the loaded frame (columns, non-null counts, dtypes).
data.info()

In [None]:
# Descriptive statistics (count, mean, std, quartiles, ...) using describe()'s defaults.
data.describe()

In [None]:
# (number of rows, number of columns) of the raw frame.
data.shape

In [None]:
# Peek at the first ten rows of the raw frame.
data.head(n=10)

In [None]:
# Column dtypes as inferred by read_csv.
data.dtypes

In [None]:
# Number of distinct values in the User_ID column.
data["User_ID"].nunique()

In [None]:
# Number of distinct values in the Product_ID column.
data["Product_ID"].nunique()

In [None]:
#%% Data visualizations

# Row count per Gender value. The column is named via `x=` because current
# seaborn releases treat the first positional argument as `data`, not `x`.
sns.countplot(x='Gender', data=data)

In [None]:
# Row count per Age value, with one bar per Gender inside each Age group.
# Columns are referenced by keyword (`x=`, `hue=`) because current seaborn
# releases treat the first positional argument as `data`, not `x`.
sns.countplot(x='Age', hue='Gender', data=data)


In [None]:
# Histograms of the frame's numeric columns.
# DataFrame.hist() always opens its own figure, so the size has to be passed
# to it directly; a preceding plt.figure(figsize=...) would only leave an
# empty figure behind and the histograms would use the default size.
data.hist(figsize=(15, 10))
plt.show()

In [None]:
# Row count for each Occupation value, most frequent first.
data["Occupation"].value_counts()

In [None]:
# Total Purchase for every (Occupation, Gender) pair.
var = data.groupby(['Occupation', 'Gender'])['Purchase'].sum()
# Move Gender to the columns -> one stacked bar per Occupation.
# The two colours assume Gender has exactly two levels -- TODO confirm.
var.unstack().plot.bar(stacked=True, color=['red', 'blue'])

In [None]:
# Total Purchase for every (Age, Gender) pair.
var = data.groupby(['Age', 'Gender'])['Purchase'].sum()
# Move Gender to the columns -> one stacked bar per Age value.
# The two colours assume Gender has exactly two levels -- TODO confirm.
var.unstack().plot.bar(stacked=True, color=['red', 'blue'])

In [None]:
# Total Purchase for every (City_Category, Age, Gender) combination.
var = data.groupby(['City_Category', 'Age', 'Gender'])['Purchase'].sum()
# unstack() moves the last key (Gender) to the columns, so each bar is one
# (City_Category, Age) pair stacked by Gender.
# The two colours assume Gender has exactly two levels -- TODO confirm.
var.unstack().plot.bar(stacked=True, color=['red', 'blue'])

In [None]:
# Share of total Purchase contributed by each City_Category, as a pie chart.
# Only the Purchase column is aggregated: summing the whole frame would also
# aggregate identifier and non-numeric columns that the chart never uses.
x_list = data.groupby('City_Category')['Purchase'].sum()
label_list = x_list.index

# Equal axis scaling keeps the pie circular.
plt.axis("equal")
plt.pie(x_list, labels=label_list, autopct="%1.1f%%")
plt.title("Citywise Purchase")
plt.show()

In [None]:
# Total Purchase for every (Product_Category_1, Product_Category_2) pair.
var = data.groupby(['Product_Category_1', 'Product_Category_2'])['Purchase'].sum()
# Product_Category_2 becomes the columns -> one stacked bar per Product_Category_1.
var.unstack().plot.bar(stacked=True)

In [None]:
# Correlation heatmap of the raw frame's numeric columns.
# Non-numeric columns are excluded explicitly: DataFrame.corr() no longer
# drops them silently in pandas 2.x and raises on string columns instead.
sns.heatmap(data.select_dtypes(include='number').corr())

In [None]:
# One-hot encode the listed columns. drop_first=True omits the first level of
# each column, so every encoded column yields (levels - 1) indicator columns.
df = pd.get_dummies(
    data,
    columns=[
        'Occupation',
        'City_Category',
        'Age',
        'Stay_In_Current_City_Years',
        'Gender',
    ],
    drop_first=True,
)
df.head()

In [None]:
# Remove the two identifier columns, replace every missing value with 0 and
# cast the whole frame to int (the indicator columns become plain 0/1).
df = (
    df.drop(columns=['User_ID', 'Product_ID'])
      .fillna(0)
      .astype(int)
)
df.head()  # indicator columns are 0/1; the remaining columns keep their own ranges

In [None]:
# Correlation heatmap of the encoded, all-integer frame on a 15x10 inch figure.
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(df.corr(), ax=ax)

In [None]:
# Visual check for remaining missing values: each cell of the heatmap is one
# value of df, coloured by whether it is null.
plt.figure(figsize=(10, 7))
sns.heatmap(
    data=df.isnull(),
    cmap='viridis',
    cbar=False,
    yticklabels=False,
)

In [None]:
# Column dtypes of the encoded frame.
df.dtypes

In [None]:
# Features: every column except the target.
X = df.drop(columns='Purchase')
# Target divided by 100000, so all errors computed from Y are in these
# rescaled units (intended to bring Y into roughly 0 - 1 -- TODO confirm the
# maximum Purchase is below 100000).
Y = df['Purchase'].div(100000)

# First hold out 25% as a validation set ...
X_1, X_val, Y_1, Y_val = train_test_split(X, Y, test_size=0.25, random_state=8)
# ... then split the remaining 75% into 70% train / 30% test.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_1, Y_1, test_size=0.3, random_state=8
)


In [None]:
# %% Select scoring method
# Cross-validation is scored with negative MSE; the model-comparison loops
# turn it into RMSE with np.sqrt(-score).
# ('neg_root_mean_squared_error' would yield RMSE directly, but it is not
# available in older scikit-learn releases, so it is not used here.)
scoring = 'neg_mean_squared_error'

In [None]:
# Baseline regressors, all with default hyper-parameters.
models = []
models.append(('LR', LinearRegression()))
models.append(('LASSO', Lasso()))
models.append(('EN', ElasticNet()))
models.append(('KNN', KNeighborsRegressor()))
# Seeded so that repeated runs of the notebook give the same tree.
models.append(('CART', DecisionTreeRegressor(random_state=5)))

results = []
names = []
# One seeded, shuffled 5-fold splitter shared by every model, so all models
# are scored on exactly the same folds.
kfold = KFold(n_splits=5, shuffle=True, random_state=5)
for name, model in models:
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    # `scoring` is negative MSE: flip the sign and take the root -> RMSE per fold.
    cv_results = np.sqrt(-cv_results)
    results.append(cv_results)
    names.append(name)
    # name: mean RMSE (std of RMSE across folds)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

In [None]:
#%% Compare Algorithms

# Box plot of the per-fold RMSE collected in `results`, one box per model.
# A single figure/axes pair is created and used for everything: creating the
# figure twice would put the title and tick labels on one figure and the
# boxes on another.
fig, ax = plt.subplots(figsize=(10, 7))
fig.suptitle('Algorithm Comparison')
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()

In [None]:
#%% KNN Algorithm tuning

# Candidate neighbourhood sizes for the grid search.
k_values = np.array([1,3,5,7,9,11,13,15,16,17,18,19,21])
param_grid = dict(n_neighbors=k_values)

model = KNeighborsRegressor()
kfold = KFold(n_splits=10, shuffle=True, random_state=5)

# The former `iid` argument is not passed: it has been removed from
# GridSearchCV and raises a TypeError on current scikit-learn releases.
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X_train, Y_train)

# `scoring` is negative MSE. Convert the per-fold scores to RMSE so the
# numbers are comparable with the model-comparison loops.
# fold_rmse has one row per fold and one column per candidate.
fold_rmse = np.sqrt(-np.array([
    grid_result.cv_results_['split%d_test_score' % fold]
    for fold in range(kfold.get_n_splits())
]))
means = fold_rmse.mean(axis=0)
stds = fold_rmse.std(axis=0)
params = grid_result.cv_results_['params']

# Best candidate as ranked by GridSearchCV, reported as mean RMSE.
print("Best: %f using %s" % (means[grid_result.best_index_], grid_result.best_params_))

# mean RMSE (std of RMSE across folds) for every candidate
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
# Ensemble regressors with default hyper-parameters. Each one is seeded so
# that repeated runs of the notebook reproduce the same scores (same seed as
# the GBM tuning cell).
ensembles = []
ensembles.append(('AB', AdaBoostRegressor(random_state=5)))
ensembles.append(('GBM', GradientBoostingRegressor(random_state=5)))
ensembles.append(('RF', RandomForestRegressor(random_state=5)))
ensembles.append(('ET', ExtraTreesRegressor(random_state=5)))

results = []
names = []
# One seeded, shuffled 10-fold splitter shared by every ensemble, so all of
# them are scored on exactly the same folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=5)
for name, model in ensembles:
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    # `scoring` is negative MSE: flip the sign and take the root -> RMSE per fold.
    cv_results = np.sqrt(-cv_results)
    results.append(cv_results)
    names.append(name)
    # name: mean RMSE (std of RMSE across folds)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

In [None]:
#%% Compare Algorithms
# Box plot of the per-fold RMSE collected in `results`, one box per ensemble.
# A single figure/axes pair is created and used for everything: creating the
# figure twice would put the title and tick labels on one figure and the
# boxes on another.
# NOTE(review): the title says "Scaled", but no scaling step is visible in
# this notebook -- confirm the wording.
fig, ax = plt.subplots(figsize=(10, 7))
fig.suptitle('Scaled Ensemble Algorithm Comparison')
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()

In [None]:
#%% Tune scaled GBM
# Candidate numbers of boosting stages for the grid search.
param_grid = dict(n_estimators=np.array([50,100,150,200,250,300,350,400,500,600,700,800]))

model = GradientBoostingRegressor(random_state=5)
kfold = KFold(n_splits=10, shuffle=True, random_state=5)
# The former `iid` argument is not passed: it has been removed from
# GridSearchCV and raises a TypeError on current scikit-learn releases.
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X_train, Y_train)

# `scoring` is negative MSE. The fitted GridSearchCV object itself cannot be
# negated or square-rooted, so the per-fold scores are converted to RMSE
# instead. fold_rmse has one row per fold and one column per candidate.
fold_rmse = np.sqrt(-np.array([
    grid_result.cv_results_['split%d_test_score' % fold]
    for fold in range(kfold.get_n_splits())
]))
means = fold_rmse.mean(axis=0)
stds = fold_rmse.std(axis=0)
params = grid_result.cv_results_['params']

# Best candidate as ranked by GridSearchCV, reported as mean RMSE.
print("Best: %f using %s" % (means[grid_result.best_index_], grid_result.best_params_))

# mean RMSE (std of RMSE across folds) for every candidate
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))