### Importing Libs & Data

In [87]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import GaussianNoise
from keras.layers import Dropout
from keras.wrappers.scikit_learn import KerasRegressor
from keras.regularizers import l1
from keras.regularizers import l2
from keras.callbacks import ReduceLROnPlateau
from keras.layers import BatchNormalization
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from plotnine import *
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from math import sqrt

In [4]:
XtrainLR = pd.read_pickle('XtrainLR');Ytrain = pd.read_pickle('Ytrain');XtestLR = pd.read_pickle('XtestLR')
XvalLR = pd.read_pickle('XvalLR');Yval = pd.read_pickle('Yval')

In [63]:
XtrainRF = pd.read_pickle('XtrainRF'); XtestRF = pd.read_pickle('XtestRF'); XvalRF = pd.read_pickle('XvalLR')

### Scaling numerical features of all datasets

In [5]:
XtrainLR[['purchase_amount', 'Cat_1_Y',\
       'Cat2Tot1', 'Cat2Tot2', 'Cat2Tot3', 'Cat2Tot4', 'Cat3TotA', 'Cat3TotB',\
       'AvgPurAmt', 'TimeSpent', 'CLV', 'numerical_2', 'avg_sales_lag3',\
       'avg_purchases_lag3', 'avg_purchases_lag6', 'active_months_lag6',\
       'avg_sales_lag12']]=preprocessing.scale(np.asarray(XtrainLR[['purchase_amount', 'Cat_1_Y',\
       'Cat2Tot1', 'Cat2Tot2', 'Cat2Tot3', 'Cat2Tot4', 'Cat3TotA', 'Cat3TotB',\
       'AvgPurAmt', 'TimeSpent', 'CLV', 'numerical_2', 'avg_sales_lag3',\
       'avg_purchases_lag3', 'avg_purchases_lag6', 'active_months_lag6',\
       'avg_sales_lag12']]))

In [6]:
XvalLR[['purchase_amount', 'Cat_1_Y',\
       'Cat2Tot1', 'Cat2Tot2', 'Cat2Tot3', 'Cat2Tot4', 'Cat3TotA', 'Cat3TotB',\
       'AvgPurAmt', 'TimeSpent', 'CLV', 'numerical_2', 'avg_sales_lag3',\
       'avg_purchases_lag3', 'avg_purchases_lag6', 'active_months_lag6',\
       'avg_sales_lag12']]=preprocessing.scale(np.asarray(XvalLR[['purchase_amount', 'Cat_1_Y',\
       'Cat2Tot1', 'Cat2Tot2', 'Cat2Tot3', 'Cat2Tot4', 'Cat3TotA', 'Cat3TotB',\
       'AvgPurAmt', 'TimeSpent', 'CLV', 'numerical_2', 'avg_sales_lag3',\
       'avg_purchases_lag3', 'avg_purchases_lag6', 'active_months_lag6',\
       'avg_sales_lag12']]))

In [None]:
XtestLR[['purchase_amount', 'Cat_1_Y',\
       'Cat2Tot1', 'Cat2Tot2', 'Cat2Tot3', 'Cat2Tot4', 'Cat3TotA', 'Cat3TotB',\
       'AvgPurAmt', 'TimeSpent', 'CLV', 'numerical_2', 'avg_sales_lag3',\
       'avg_purchases_lag3', 'avg_purchases_lag6', 'active_months_lag6',\
       'avg_sales_lag12']]=preprocessing.scale(np.asarray(XtestLR[['purchase_amount', 'Cat_1_Y',\
       'Cat2Tot1', 'Cat2Tot2', 'Cat2Tot3', 'Cat2Tot4', 'Cat3TotA', 'Cat3TotB',\
       'AvgPurAmt', 'TimeSpent', 'CLV', 'numerical_2', 'avg_sales_lag3',\
       'avg_purchases_lag3', 'avg_purchases_lag6', 'active_months_lag6',\
       'avg_sales_lag12']]))

### Using Grid Search to check for best Hyperparameters

In [7]:
# Fix random seed for reproducibility
seed = np.random.seed(2)

In [81]:
# define baseline model

def baseline_model():
    # create model
    model = Sequential()
    model.add(Dense(1000, input_dim=24, activation='relu'))
    model.add(Dense(500, activation='relu'))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(1))
    # Compile model
    model.compile(loss='mean_squared_error', optimizer='adam',metrics=['accuracy'])
    return model

# evaluate model with standardized dataset
estimator = KerasRegressor(build_fn=baseline_model, epochs=100, batch_size=10000, verbose=1)

In [None]:
kfold = KFold(n_splits=3, random_state=seed)
results = cross_val_score(estimator, Xtrain, Ytrain, cv=kfold)
print("Results: %.2f (%.2f) MSE" % (results.mean(), results.std()))

### Fitting best Model

In [8]:
np.random.seed(3)

In [42]:
reduceLR = ReduceLROnPlateau(monitor='loss',factor=0.1,verbose=1,patience=5)

In [None]:
model = Sequential()
model.add(Dense(500, input_dim=24, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(400, activation='relu',kernel_regularizer=l2(0.005)))
model.add(Dropout(0.2))
model.add(Dense(300, activation='relu'))
model.add(BatchNormalization())
model.add(Dense(200, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(150, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(75, activation='relu'))
model.add(Dense(50, activation='relu'))
model.add(Dense(30, activation='relu'))
model.add(Dense(1))
# 2. compile the network
model.compile(loss='mean_squared_error', optimizer='adam')
# 3. fit the network
history = model.fit(XtrainLR, Ytrain, epochs=250, batch_size=30000, callbacks=[reduceLR])

In [51]:
# 4. evaluate the network
losstrain = model.evaluate(XtrainLR,Ytrain)
lossval = model.evaluate(XvalLR, Yval)
print('Training RMSE:',sqrt(losstrain))
print('Validation RMSE:',sqrt(lossval))

Training RMSE: 1.6822908992183263
Validation RMSE: 1.687576354278096


In [45]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_92 (Dense)             (None, 500)               12500     
_________________________________________________________________
dense_93 (Dense)             (None, 400)               200400    
_________________________________________________________________
dense_94 (Dense)             (None, 300)               120300    
_________________________________________________________________
dense_95 (Dense)             (None, 200)               60200     
_________________________________________________________________
dense_96 (Dense)             (None, 150)               30150     
_________________________________________________________________
dense_97 (Dense)             (None, 100)               15100     
_________________________________________________________________
dense_98 (Dense)             (None, 75)                7575      
__________

In [92]:
# 5. make predictions
Ypred = model.predict(Xtest)

In [93]:
targets = pd.DataFrame(Ypred,columns=['target']); targets.to_csv('targets.csv')

In [62]:
# predictions = [float(round(x)) for x in probabilities]
# accuracy = numpy.mean(predictions == Y)
# print("Prediction Accuracy: %.2f%%" % (accuracy*100))

(123623, 1)

### Making feature dataset

This was an experiment done using just the 3 features present in the original training set. I ran a Neural Net on just those features to check if predictability was better.

In [6]:
Xtrain2 = Xtrain[['feature_1_2', 'feature_1_3', 'feature_1_4', 'feature_1_5','feature_2_2', 'feature_2_3', 'feature_3_1']]
Xtest2 = Xtest[['feature_1_2', 'feature_1_3', 'feature_1_4', 'feature_1_5','feature_2_2', 'feature_2_3', 'feature_3_1']]

### NN on features only dataset

In [15]:
np.random.seed(3)

model = Sequential()
model.add(Dense(150, input_dim=7, activation='relu'))
model.add(Dense(80, activation='relu'))
model.add(Dense(40, activation='relu'))
model.add(Dense(1))
# 2. compile the network
model.compile(loss='mean_squared_error', optimizer='adam', metrics = ['accuracy'])
# 3. fit the network
history = model.fit(Xtrain2, Ytrain, epochs=50, batch_size=15000)
# 4. evaluate the network
loss, accuracy = model.evaluate(Xtrain2, Ytrain)
print("\nLoss: %.2f, Accuracy: %.2f%%" % (loss, accuracy*100))

### Applying Random Forest on features only dataset

#### Importing fresh data, fixing issues again

In [76]:
Train = pd.read_pickle('NewTrain2') 

Train = Train[['first_active_month','feature_1','feature_2','feature_3','target']]

Xtrain = Train.loc[:,Train.columns!='target']; Ytrain = Train.loc[:,'target']

Xtrain.first_active_month = Xtrain.first_active_month.astype('category')

Xtrain,Xval,Ytrain,Yval=train_test_split(Xtrain,Ytrain,test_size=0.2)

Xtrain.first_active_month = Xtrain.first_active_month.cat.codes

Xtrain.drop(['first_active_month'],axis=1,inplace=True); Xval.drop(['first_active_month'],axis=1,inplace=True)

#### Running RF

In [161]:
RF = RandomForestRegressor(n_estimators=500, n_jobs=-1, min_samples_leaf=3, max_features=0.5)
model = RF.fit(Xtrain, Ytrain)

Ypred = RF.predict(Xval)

In [163]:
# RMSE 1st Check
MSE=mean_squared_error(Yval,Ypred)
RMSE=sqrt(MSE);RMSE

3.686474445851263

In [None]:
Test = pd.read_pickle('NewTest2')

In [156]:
FI = pd.Series(RF.feature_importances_,index=Xtrain.columns).sort_values(ascending=False);FI

feature_2    0.474581
feature_1    0.449307
feature_3    0.076112
dtype: float64