In [None]:
##########################################
# This notebook is used for intial data exploration for the capstone project
##########################################


In [1]:
import sklearn

print('The scikit-learn version is {}.'.format(sklearn.__version__))
# All code tested on 0.18.1, which is needed for sklearn.neural_network/MLPClassifier

The scikit-learn version is 0.18.1.


First, we import the data

In [None]:
import pandas as pd
import numpy as np
import glob
from IPython.display import display # Allows the use of display() for DataFrames


#data = pd.read_csv('data/transactions_200607.csv')  #import one file

#import all csvs (courtesy of http://stackoverflow.com/questions/20906474/import-multiple-csv-files-into-pandas-and-concatenate-into-one-dataframe)

path =r'data' # use your path
allFiles = glob.glob(path + "/tr*.csv")

#print allFiles
data = pd.DataFrame()
list_ = []
for file_ in allFiles:
    df = pd.read_csv(file_,index_col=None, header=0)
    list_.append(df)
data = pd.concat(list_)

#print(data.head(20))


print('Before DropNA',len(data))
data=data.dropna()
print('After DropNA',len(data))

#consider dropping these categories, but will try with them first.

#data = data[data.CUST_PRICE_SENSITIVITY != 'XX']
#data = data[data.CUST_LIFESTAGE != 'OT']
#print('After Uninteresting categories',len(data))
#list(data.columns.values)



Reformat the data into a wide dataset using Prod_code_20 as the lowest level of aggregation.  This is as recommended by Apeh et al. in Customer Profile Classification Using Transactional Data (https://core.ac.uk/download/pdf/4899037.pdf).  Then, to predict the basket, we just apply the value for the predicted customer.

In [None]:
#create customerprofiles.  Simplest version will be to sum spend in each category in PROD_CODE_20, keeping the target variable as well
#pivot code from http://stackoverflow.com/questions/41046766/using-and-graphing-the-results-of-a-crosstab-dataframe-in-python

data_cross=data.pivot_table(index='CUST_CODE', columns='PROD_CODE_20', values='SPEND', aggfunc=np.sum, fill_value=0)
data_cross.reset_index(level=['CUST_CODE'], inplace=True)

#group the variables that are unique to each basket
byCustomer=data.groupby(['CUST_CODE'])
targetsByCustomer=pd.DataFrame(byCustomer['CUST_LIFESTAGE', 'CUST_PRICE_SENSITIVITY'].first())
targetsByCustomer.reset_index(level=['CUST_CODE'], inplace=True)

sumsByCustomer=pd.DataFrame(byCustomer['SPEND'].sum())
sumsByCustomer.reset_index(level=['CUST_CODE'], inplace=True)

data_cross_day=data.pivot_table(index='CUST_CODE', columns='SHOP_WEEKDAY', values='SPEND', aggfunc=np.sum, fill_value=0)
data_cross_day.reset_index(level=['CUST_CODE'], inplace=True)
data_cross_hour=data.pivot_table(index='CUST_CODE', columns='SHOP_HOUR', values='SPEND', aggfunc=np.sum, fill_value=0)
data_cross_hour.reset_index(level=['CUST_CODE'], inplace=True)
data_cross_basketsize = data.pivot_table(index='CUST_CODE', columns='BASKET_SIZE', values='SPEND', aggfunc=len, fill_value=0)
data_cross_basketsize.reset_index(level=['CUST_CODE'], inplace=True)
data_cross_baskettype = data.pivot_table(index='CUST_CODE', columns='BASKET_TYPE', values='SPEND', aggfunc=len, fill_value=0)
data_cross_baskettype.reset_index(level=['CUST_CODE'], inplace=True)


data_cross = pd.merge(data_cross, data_cross_day, how='inner', on = 'CUST_CODE')
data_cross = pd.merge(data_cross, data_cross_hour, how='inner', on = 'CUST_CODE')
data_cross = pd.merge(data_cross, data_cross_basketsize, how='inner', on = 'CUST_CODE')
data_cross = pd.merge(data_cross, data_cross_baskettype, how='inner', on = 'CUST_CODE')

data_cross = pd.merge(data_cross, sumsByCustomer, how='inner', on = 'CUST_CODE')
data_cross = pd.merge(data_cross, targetsByCustomer, how='inner', on = 'CUST_CODE')

#reset the index to what it should be for the rest of the analysis
data_cross.set_index(['CUST_CODE'], inplace=True)

print(data_cross.head(5))
#list(data_cross.columns.values)
#print(data_cross.describe(include='all'))

In [None]:
print(data_cross_day.head())
print(data_cross_basketsize.head())
print(data_cross_baskettype.head())

# Some visualizations

In [None]:
import matplotlib.pyplot as plt  #http://pandas.pydata.org/pandas-docs/stable/visualization.html
%matplotlib inline

plt.figure()
plt.ylabel('# Customers')
#http://pbpython.com/simple-graphing-pandas.html
    
#plt.subplot(211)  
plt.suptitle('Life Stage Class Distribution')
lsPlot = data_cross.groupby(['CUST_LIFESTAGE'])['SPEND'].count().plot(kind='bar')
#plt.subplot(212)  

#plt.suptitle('Price Sensitivity Class Distribution')
#psPlot = data_cross.groupby(['CUST_PRICE_SENSITIVITY'])['SPEND'].count().plot(kind='bar')

plt.xlabel('Class')

# Split the data.

In [None]:
#create the training, testing split.  can't use sklearn.cross_validation.train_test_split since I have two targets
# could probably consider stratified sampling here

#first do a 70-30 split.  
train_X=data_cross.sample(frac=0.7,random_state=42)
test_X=data_cross.drop(train_X.index)

#pop off the classifiers
train_y = train_X[["CUST_LIFESTAGE","CUST_PRICE_SENSITIVITY"]] #potentially for use in multi-output decision trees
train_y_LS = train_X.pop("CUST_LIFESTAGE")
train_y_PS = train_X.pop("CUST_PRICE_SENSITIVITY")

test_y = test_X[["CUST_LIFESTAGE","CUST_PRICE_SENSITIVITY"]] #potentially for use in multi-output decision trees
test_y_LS = test_X.pop("CUST_LIFESTAGE")
test_y_PS = test_X.pop("CUST_PRICE_SENSITIVITY")


# Downsample the training so all strata are equal

In [None]:
#based on  https://www.datarobot.com/blog/classification-with-scikit-learn/

rng = np.random.RandomState(42)

#create a copy:
data_cross_for_balanced_PS = data_cross
data_cross_for_balanced_LS = data_cross

#first do a 70-30 split.  
balanced_PS_train_X=data_cross_for_balanced_PS.sample(frac=0.7,random_state=42)
balanced_PS_test_X=data_cross_for_balanced_PS.drop(balanced_PS_train_X.index)

#downsample the training data
print(balanced_PS_train_X.groupby(['CUST_PRICE_SENSITIVITY'])['SPEND'].count())
min_count_PS = min(balanced_PS_train_X.groupby(['CUST_PRICE_SENSITIVITY'])['SPEND'].count())
print (min_count_PS)
indices_LA = np.where(balanced_PS_train_X.CUST_PRICE_SENSITIVITY == 'LA')[0]
rng.shuffle(indices_LA)
balanced_PS_train_X = balanced_PS_train_X.drop(balanced_PS_train_X.index[indices_LA[min_count_PS:]])

indices_MM = np.where(balanced_PS_train_X.CUST_PRICE_SENSITIVITY == 'MM')[0]
rng.shuffle(indices_MM)
balanced_PS_train_X = balanced_PS_train_X.drop(balanced_PS_train_X.index[indices_MM[min_count_PS:]])

indices_UM = np.where(balanced_PS_train_X.CUST_PRICE_SENSITIVITY == 'UM')[0]
rng.shuffle(indices_UM)
balanced_PS_train_X = balanced_PS_train_X.drop(balanced_PS_train_X.index[indices_UM[min_count_PS:]])

indices_XX = np.where(balanced_PS_train_X.CUST_PRICE_SENSITIVITY == 'XX')[0]
rng.shuffle(indices_XX)
balanced_PS_train_X = balanced_PS_train_X.drop(balanced_PS_train_X.index[indices_XX[min_count_PS:]])

print('After downsampling', balanced_PS_train_X.groupby(['CUST_PRICE_SENSITIVITY'])['SPEND'].count())

#pop off the classifiers
balanced_PS_train_y_LS = balanced_PS_train_X.pop("CUST_LIFESTAGE") #don't need this
balanced_PS_train_y_PS = balanced_PS_train_X.pop("CUST_PRICE_SENSITIVITY")

balanced_PS_test_y_LS = balanced_PS_test_X.pop("CUST_LIFESTAGE") #don't need this
balanced_PS_test_y_PS = balanced_PS_test_X.pop("CUST_PRICE_SENSITIVITY")



#first do a 70-30 split.  
balanced_LS_train_X=data_cross_for_balanced_LS.sample(frac=0.7,random_state=42)
balanced_LS_test_X=data_cross_for_balanced_LS.drop(balanced_LS_train_X.index)

#downsample the training data
print(balanced_LS_train_X.groupby(['CUST_LIFESTAGE'])['SPEND'].count())
min_count_LS = min(balanced_LS_train_X.groupby(['CUST_LIFESTAGE'])['SPEND'].count())

indices_OA = np.where(balanced_LS_train_X.CUST_LIFESTAGE == 'OA')[0]
rng.shuffle(indices_OA)
balanced_LS_train_X = balanced_LS_train_X.drop(balanced_LS_train_X.index[indices_OA[min_count_LS:]])

indices_OF = np.where(balanced_LS_train_X.CUST_LIFESTAGE == 'OF')[0]
rng.shuffle(indices_OF)
balanced_LS_train_X = balanced_LS_train_X.drop(balanced_LS_train_X.index[indices_OF[min_count_LS:]])

indices_OT = np.where(balanced_LS_train_X.CUST_LIFESTAGE == 'OT')[0]
rng.shuffle(indices_OT)
balanced_LS_train_X = balanced_LS_train_X.drop(balanced_LS_train_X.index[indices_OT[min_count_LS:]])

indices_PE = np.where(balanced_LS_train_X.CUST_LIFESTAGE == 'PE')[0]
rng.shuffle(indices_PE)
balanced_LS_train_X = balanced_LS_train_X.drop(balanced_LS_train_X.index[indices_PE[min_count_LS:]])

indices_YA = np.where(balanced_LS_train_X.CUST_LIFESTAGE == 'YA')[0]
rng.shuffle(indices_YA)
balanced_LS_train_X = balanced_LS_train_X.drop(balanced_LS_train_X.index[indices_YA[min_count_LS:]])

indices_YF = np.where(balanced_LS_train_X.CUST_LIFESTAGE == 'YF')[0]
rng.shuffle(indices_YF)
balanced_LS_train_X = balanced_LS_train_X.drop(balanced_LS_train_X.index[indices_YF[min_count_LS:]])

print('After downsampling',balanced_LS_train_X.groupby(['CUST_LIFESTAGE'])['SPEND'].count())

#pop off the classifiers
balanced_LS_train_y_LS = balanced_LS_train_X.pop("CUST_LIFESTAGE")
balanced_LS_train_y_PS = balanced_LS_train_X.pop("CUST_PRICE_SENSITIVITY") #don't need this

balanced_LS_test_y_LS = balanced_LS_test_X.pop("CUST_LIFESTAGE")
balanced_LS_test_y_PS = balanced_LS_test_X.pop("CUST_PRICE_SENSITIVITY") #don't need this

# Upsample the training so all strata are equal

In [None]:
#upsampling for PS, per http://www.site.uottawa.ca/~nat/Courses/csi5388/Class-Imbalances.ppt
from sklearn.utils import resample
data_cross_upsampled_PS = data_cross

#first do a 70-30 split.  
upsampled_PS_train_X=data_cross_upsampled_PS.sample(frac=0.7,random_state=42)
upsampled_PS_test_X=data_cross_upsampled_PS.drop(upsampled_PS_train_X.index)

print(upsampled_PS_train_X.groupby(['CUST_PRICE_SENSITIVITY'])['SPEND'].count())
max_count_PS = max(upsampled_PS_train_X.groupby(['CUST_PRICE_SENSITIVITY'])['SPEND'].count())

subset_LA = upsampled_PS_train_X[upsampled_PS_train_X.CUST_PRICE_SENSITIVITY == 'LA']
up_subset_LA = resample(subset_LA, n_samples=max_count_PS-len(subset_LA))
subset_MM = upsampled_PS_train_X[upsampled_PS_train_X.CUST_PRICE_SENSITIVITY == 'MM']
up_subset_MM = resample(subset_MM, n_samples=max_count_PS-len(subset_MM))
subset_UM = upsampled_PS_train_X[upsampled_PS_train_X.CUST_PRICE_SENSITIVITY == 'UM']
up_subset_UM = resample(subset_UM, n_samples=max_count_PS-len(subset_UM))
subset_XX = upsampled_PS_train_X[upsampled_PS_train_X.CUST_PRICE_SENSITIVITY == 'XX']
up_subset_XX = resample(subset_XX, n_samples=max_count_PS-len(subset_XX))

upsampled_PS_train_X = pd.concat([subset_LA,up_subset_LA, subset_UM, up_subset_UM, 
                                     subset_MM, up_subset_MM, subset_XX, up_subset_XX])

print(upsampled_PS_train_X.groupby(['CUST_PRICE_SENSITIVITY'])['SPEND'].count())


#pop off the classifiers
upsampled_PS_train_y_LS = upsampled_PS_train_X.pop("CUST_LIFESTAGE") #don't need this
upsampled_PS_train_y_PS = upsampled_PS_train_X.pop("CUST_PRICE_SENSITIVITY") 
upsampled_PS_test_y_LS = upsampled_PS_test_X.pop("CUST_LIFESTAGE") #don't need this
upsampled_PS_test_y_PS = upsampled_PS_test_X.pop("CUST_PRICE_SENSITIVITY") 

#upsampling for LS
data_cross_upsampled_LS = data_cross

#first do a 70-30 split.  
upsampled_LS_train_X=data_cross_upsampled_LS.sample(frac=0.7,random_state=42)
upsampled_LS_test_X=data_cross_upsampled_LS.drop(upsampled_LS_train_X.index)

#upsample the training data

print(upsampled_LS_train_X.groupby(['CUST_LIFESTAGE'])['SPEND'].count())
max_count_LS = max(upsampled_LS_train_X.groupby(['CUST_LIFESTAGE'])['SPEND'].count())

subset_OA = upsampled_LS_train_X[upsampled_LS_train_X.CUST_LIFESTAGE == 'OA']
up_subset_OA = resample(subset_OA, n_samples=max_count_LS-len(subset_OA))
subset_OT = upsampled_LS_train_X[upsampled_LS_train_X.CUST_LIFESTAGE == 'OT']
up_subset_OT = resample(subset_OT, n_samples=max_count_LS-len(subset_OT))
subset_PE = upsampled_LS_train_X[upsampled_LS_train_X.CUST_LIFESTAGE == 'PE']
up_subset_PE = resample(subset_PE, n_samples=max_count_LS-len(subset_PE))
subset_YA = upsampled_LS_train_X[upsampled_LS_train_X.CUST_LIFESTAGE == 'YA']
up_subset_YA = resample(subset_YA, n_samples=max_count_LS-len(subset_YA))
subset_OF = upsampled_LS_train_X[upsampled_LS_train_X.CUST_LIFESTAGE == 'OF']
up_subset_OF = resample(subset_OF, n_samples=max_count_LS-len(subset_OF))
subset_YF = upsampled_LS_train_X[upsampled_LS_train_X.CUST_LIFESTAGE == 'YF']
up_subset_YF = resample(subset_YF, n_samples=max_count_LS-len(subset_YF))

upsampled_LS_train_X = pd.concat([up_subset_OA,up_subset_OF, up_subset_YA, up_subset_YF, up_subset_PE, up_subset_OT,
                                     subset_OA, subset_OF, subset_YA, subset_YF, subset_PE, subset_OT ])

print(upsampled_LS_train_X.groupby(['CUST_LIFESTAGE'])['SPEND'].count())

#pop off the classifiers
upsampled_LS_train_y_LS = upsampled_LS_train_X.pop("CUST_LIFESTAGE")
upsampled_LS_train_y_PS = upsampled_LS_train_X.pop("CUST_PRICE_SENSITIVITY") #don't need this

upsampled_LS_test_y_LS = upsampled_LS_test_X.pop("CUST_LIFESTAGE")
upsampled_LS_test_y_PS = upsampled_LS_test_X.pop("CUST_PRICE_SENSITIVITY") #don't need this






# Create some helper functions

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score


#Create a function to test classifiers and create simple output
def tryClassifier_PS (title,psClassifier, train_X, test_X, train_y_PS, test_y_PS):
    ps_pred_train = psClassifier.predict(train_X)
    ps_pred_test = psClassifier.predict(test_X)
    
    print(title)
    #print('Price Sensitivity Training Accuracy: ', psClassifier.score(train_X, train_y_PS))
    print('Price Sensitivity Training Accuracy: ', accuracy_score(train_y_PS, ps_pred_train))
    print('Price Sensitivity Training Precision: ', precision_score(train_y_PS, ps_pred_train, average='weighted'))
    print('Price Sensitivity Training Recall: ', recall_score(train_y_PS, ps_pred_train, average='weighted'))
    print('Price Sensitivity Training Confusion Matrix:')
    print(confusion_matrix(train_y_PS, ps_pred_train, labels=["LA", "MM", "UM","XX"]))
    #print('Price Sensitivity Testing Accuracy: ', psClassifier.score(test_X, test_y_PS))
    print('Price Sensitivity Testing Accuracy: ', accuracy_score(test_y_PS, ps_pred_test))
    print('Price Sensitivity Testing Precision: ', precision_score(test_y_PS, ps_pred_test, average='weighted'))
    print('Price Sensitivity Testing Recall: ', recall_score(test_y_PS, ps_pred_test, average='weighted'))
    print('Price Sensitivity Testing Confusion Matrix:')
    print(confusion_matrix(test_y_PS, ps_pred_test, labels=["LA", "MM", "UM","XX"]))


def tryClassifier_LS (title, lsClassifier, train_X, test_X, train_y_LS, test_y_LS):

    ls_pred_train = lsClassifier.predict(train_X)
    ls_pred_test = lsClassifier.predict(test_X)
    
    print(title)
    
    #print('Life Stage Training Accuracy: ', lsClassifier.score(train_X, train_y_LS))
    print('Life Stage Training Accuracy: ', accuracy_score(train_y_LS, ls_pred_train))
    print('Life Stage Training Precision: ', precision_score(train_y_LS, ls_pred_train, average='weighted'))
    print('Life Stage Training Recall: ', recall_score(train_y_LS, ls_pred_train, average='weighted'))
    print('Life Stage Training Confusion Matrix:')
    print(confusion_matrix(train_y_LS, ls_pred_train, labels=["OA", "OF", "OT","PE","YA","YF"]))
    #print('Life Stage Testing Accuracy: ', lsClassifier.score(test_X, test_y_LS))
    print('Life Stage Testing Accuracy: ', accuracy_score(test_y_LS, ls_pred_test))
    print('Life Stage Testing Precision: ', precision_score(test_y_LS, ls_pred_test, average='weighted'))
    print('Life Stage Testing Recall: ', recall_score(test_y_LS, ls_pred_test, average='weighted'))
    print('Life Stage Testing Confusion Matrix:')
    print(confusion_matrix(test_y_LS, ls_pred_test, labels=["OA", "OF", "OT","PE","YA","YF"]))

def tryClassifier_LS_OT (title, lsClassifier, train_X, test_X, train_y_LS, test_y_LS):
    ls_pred_train = lsClassifier.predict(train_X)
    ls_pred_test = lsClassifier.predict(test_X)
    
    print(title)
    #print('LifeStage Training Accuracy: ', lsClassifier.score(train_X, train_y_LS))
    print('Life Stage Training Accuracy: ', accuracy_score(train_y_LS, ls_pred_train))
    print('Life Stage Training Precision: ', precision_score(train_y_LS, ls_pred_train, average='weighted'))
    print('Life Stage Training Recall: ', recall_score(train_y_LS, ls_pred_train, average='weighted'))
    print('Life Stage Training Confusion Matrix:')
    print(confusion_matrix(train_y_LS, ls_pred_train, labels=["LS", "OT"]))
    
    #print('LifeStage Testing Accuracy: ', lsClassifier.score(test_X, test_y_LS))
    print('Life Stage Testing Accuracy: ', accuracy_score(test_y_LS, ls_pred_test))
    print('Life Stage Testing Precision: ', precision_score(test_y_LS, ls_pred_test, average='weighted'))
    print('Life Stage Testing Recall: ', recall_score(test_y_LS, ls_pred_test, average='weighted'))
    print('Life Stage Testing Confusion Matrix:')
    print(confusion_matrix(test_y_LS, ls_pred_test, labels=["LS", "OT"]))




#create an accuracy metric
def performance_metric(y_true, y_predict):
    """ Calculates and returns the performance score between 
        true and predicted values based on the metric chosen. """
    
    # TODO: Calculate the performance score between 'y_true' and 'y_predict'
    score = accuracy_score(y_true, y_predict)
    
    # Return the score
    return score
from sklearn.metrics import make_scorer
scoring_fnc = make_scorer(performance_metric)

# Business Logic Classifier

In [191]:
#print(test_X.head(10))

#Full Shop  Small Shop  Top Up  XX 

avgCustTrainSpend=train_X['SPEND'].mean()
stDevCustTrainSpend=train_X['SPEND'].std()

testBaskets=test_X[['Full Shop','Small Shop', 'Top Up', 'XX']]
mostFrequentBasketType=pd.DataFrame(testBaskets.idxmax(axis=1))
mostFrequentBasketType.columns = ['MostFrequentBasketType']
mostFrequentBasketType.reset_index(level=['CUST_CODE'], inplace=True)

testBasketSize=test_X[['S','M', 'L']]
mostFrequentBasketSize=pd.DataFrame(testBasketSize.idxmax(axis=1))
mostFrequentBasketSize.columns = ['MostFrequentBasketSize']
mostFrequentBasketSize.reset_index(level=['CUST_CODE'], inplace=True)


testWeekBaskets = test_X[1L]+test_X[7L]
testWeekendBaskets = test_X[2L]+test_X[3L]+test_X[4L]+test_X[5L]+test_X[6L]
weekendShopper = pd.DataFrame(testWeekBaskets<testWeekendBaskets)
weekendShopper.columns = ['weekendShopper']
weekendShopper.reset_index(level=['CUST_CODE'], inplace=True)

#print(weekendShopper)

test_X_Naive=test_X.copy()
test_X_Naive.reset_index(level=['CUST_CODE'], inplace=True)

test_X_Naive = pd.merge(test_X_Naive, mostFrequentBasketType, how='inner', on = 'CUST_CODE')
test_X_Naive = pd.merge(test_X_Naive, mostFrequentBasketSize, how='inner', on = 'CUST_CODE')
test_X_Naive = pd.merge(test_X_Naive, weekendShopper, how='inner', on = 'CUST_CODE')

#print(test_X_Naive.head(10))

#Below based on http://stackoverflow.com/questions/21733893/pandas-dataframe-add-a-field-based-on-multiple-if-statements

test_X_Naive['PS_Pred'] = 'MM'  #default value
test_X_Naive.loc[(test_X_Naive['SPEND']<avgCustTrainSpend-0.5*stDevCustTrainSpend) & (test_X_Naive['MostFrequentBasketType']=='Full Shop'), 'PS_Pred'] = 'LA'
test_X_Naive.loc[(test_X_Naive['SPEND']>avgCustTrainSpend+0.75*stDevCustTrainSpend) & (test_X_Naive['MostFrequentBasketType']<>'Full Shop'), 'PS_Pred'] = 'UM'

#randomly set 13.6% of the values to 'XX, based on http://stackoverflow.com/questions/31389481/numpy-replace-random-elements-in-an-array
#test_X_Naive.loc[np.random.random_sample((len(test_X)))<=0.136, 'PS_Pred'] = 'XX'


test_X_Naive['LS_Pred'] = 'PE'  #default value
test_X_Naive.loc[(test_X_Naive['weekendShopper']==True) & (test_X_Naive['MostFrequentBasketSize']=='L'), 'LS_Pred'] = 'OF'
test_X_Naive.loc[(test_X_Naive['weekendShopper']==True) & (test_X_Naive['MostFrequentBasketSize']=='M'), 'LS_Pred'] = 'YF'
test_X_Naive.loc[(test_X_Naive['weekendShopper']==True) & (test_X_Naive['MostFrequentBasketSize']=='S') & (test_X_Naive['MostFrequentBasketType']=='Small Shop'), 'LS_Pred'] = 'YA'
test_X_Naive.loc[(test_X_Naive['weekendShopper']==True) & (test_X_Naive['MostFrequentBasketSize']=='S') & (test_X_Naive['MostFrequentBasketType']<>'Small Shop'), 'LS_Pred'] = 'OA'
#randomly set 49.6% values to 'OT'
test_X_Naive.loc[np.random.random_sample((len(test_X)))<=0.496, 'LS_Pred'] = 'OT'


BL_Pred_PS = test_X_Naive.pop("PS_Pred")
print(confusion_matrix(test_y_PS, BL_Pred_PS, labels=["LA", "MM", "UM","XX"]))
print("Business Logic PS Testing Accuracy:",accuracy_score(test_y_PS,BL_Pred_PS))
print('Business Logic PS Testing Precision: ', precision_score(test_y_PS, BL_Pred_PS, average='weighted'))
print('Business Logic PS Testing Recall: ', recall_score(test_y_PS, BL_Pred_PS, average='weighted'))

BL_Pred_LS = test_X_Naive.pop("LS_Pred")
print(confusion_matrix(test_y_LS, BL_Pred_LS, labels=["OA", "OF", "OT","PE","YA","YF"]))
print("Business Logic LS Testing Accuracy:",accuracy_score(test_y_LS, BL_Pred_LS))
print('Business Logic LS Testing Precision: ', precision_score(test_y_LS, BL_Pred_LS, average='weighted'))
print('Business Logic LS Testing Recall: ', recall_score(test_y_LS, BL_Pred_LS, average='weighted'))


[[ 286 2746  190    0]
 [ 346 4158  471    0]
 [ 134 2830  390    0]
 [  10 1834    0    0]]
('Business Logic PS Testing Accuracy:', 0.36088092571855168)
('Business Logic PS Testing Precision: ', 0.31506449969866618)
('Business Logic PS Testing Recall: ', 0.36088092571855168)
[[   0  491  831   34   39  241]
 [   0  155  272   13    4   57]
 [   6 1654 3328  507  238  897]
 [   1  288  538   35   33  202]
 [   0  551  959   50   37  309]
 [   1  646  793   37   15  133]]
('Business Logic LS Testing Accuracy:', 0.27532661440836131)

  'precision', 'predicted', average, warn_for)



('Business Logic LS Testing Precision: ', 0.27401717499124545)
('Business Logic LS Testing Recall: ', 0.27532661440836131)


# try the classifiers

In [25]:
from sklearn.ensemble import RandomForestClassifier #using scikitlearn 0.17.1


rfc_ps = RandomForestClassifier(n_estimators = 10000,random_state=42, criterion="entropy"
                                , max_features = 10
                                #, max_leaf_nodes=1000
                                #, min_samples_leaf=20
                                , oob_score=True
                               )
rfc_ls = RandomForestClassifier(n_estimators = 10000,random_state=42, criterion="entropy"
                                , max_features = 10
                                #, max_leaf_nodes=1000
                                #, min_samples_leaf=20
                                , oob_score=True
                               )
#need to handle categorical variables either through pandas.getDummies or http://scikit-learn.org/dev/modules/generated/sklearn.preprocessing.OneHotEncoder.html#sklearn.preprocessing.OneHotEncoder
#train_X_num

#print(train_X_dummied.head(10))
#print(list(train_X_dummied.columns.values))

print ('Recall that running training data back through the random forest is not as good an indicator of performance as the OOB score')

print('Unbalanced:')
rfc_ps = rfc_ps.fit(train_X, train_y_PS)

print('Price Sensitivity OOB: ', rfc_ps.oob_score_)
tryClassifier_PS ("Random Forest",rfc_ps, train_X, test_X, train_y_PS, test_y_PS)

rfc_ls = rfc_ls.fit(train_X, train_y_LS)

print('Life Stage OOB: ', rfc_ls.oob_score_)
tryClassifier_LS ("Random Forest", rfc_ls, train_X, test_X, train_y_LS, test_y_LS)



rfc_balanced_ps = RandomForestClassifier(n_estimators = 10000,random_state=42, criterion="entropy"
                                , max_features = 10
                                #, max_leaf_nodes=1000
                                #, min_samples_leaf=20
                                , oob_score=True
                               )
rfc_balanced_ls = RandomForestClassifier(n_estimators = 10000,random_state=42, criterion="entropy"
                                , max_features = 10
                                #, max_leaf_nodes=1000
                                #, min_samples_leaf=20
                                , oob_score=True
                               )

print('Downsampled:')
rfc_balanced_ps = rfc_balanced_ps.fit(balanced_PS_train_X, balanced_PS_train_y_PS)

print('Price Sensitivity OOB: ', rfc_ps.oob_score_)
tryClassifier_PS ("Random Forest",rfc_balanced_ps, balanced_PS_train_X, test_X, balanced_PS_train_y_PS, test_y_PS)

rfc_balanced_ls = rfc_balanced_ls.fit(balanced_LS_train_X, balanced_LS_train_y_LS)

print('Life Stage OOB: ', rfc_ls.oob_score_)
tryClassifier_LS ("Random Forest", rfc_balanced_ls, balanced_LS_train_X, test_X, balanced_LS_train_y_LS, test_y_LS)


Recall that running training data back through the random forest is not as good an indicator of performance as the OOB score
Unbalanced:


KeyboardInterrupt: 

In [193]:
from sklearn.ensemble import RandomForestClassifier #using scikitlearn 0.17.1

rfc_upsampled_ps = RandomForestClassifier(n_estimators = 1000,random_state=42, criterion="entropy"
                                , max_features = 10
                                #, max_leaf_nodes=1000
                                #, min_samples_leaf=20
                                , oob_score=True
                               )
print('Upsampled:')

rfc_upsampled_ps = rfc_upsampled_ps.fit(upsampled_PS_train_X, upsampled_PS_train_y_PS)

print('Price Sensitivity OOB: ', rfc_upsampled_ps.oob_score_)
tryClassifier_PS ("Random Forest", rfc_upsampled_ps, upsampled_PS_train_X, test_X, upsampled_PS_train_y_PS, test_y_PS)


rfc_upsampled_ls = RandomForestClassifier(n_estimators = 1000,random_state=42, criterion="entropy"
                                , max_features = 10
                                #, max_leaf_nodes=1000
                                #, min_samples_leaf=20
                                , oob_score=True
                               )

rfc_upsampled_ls = rfc_upsampled_ls.fit(upsampled_LS_train_X, upsampled_LS_train_y_LS)

print('Life Stage OOB: ', rfc_upsampled_ls.oob_score_)
tryClassifier_LS ("Random Forest", rfc_upsampled_ls, upsampled_LS_train_X, test_X, upsampled_LS_train_y_LS, test_y_LS)




Upsampled:
('Price Sensitivity OOB: ', 0.81114551083591335)
Random Forest
('Price Sensitivity Training Accuracy: ', 1.0)
('Price Sensitivity Training Precision: ', 1.0)
('Price Sensitivity Training Recall: ', 1.0)
Price Sensitivity Training Confusion Matrix:
[[11628     0     0     0]
 [    0 11628     0     0]
 [    0     0 11628     0]
 [    0     0     0 11628]]
('Price Sensitivity Testing Accuracy: ', 0.62814483016050771)
('Price Sensitivity Testing Precision: ', 0.63210697618396605)
('Price Sensitivity Testing Recall: ', 0.62814483016050771)
Price Sensitivity Testing Confusion Matrix:
[[1509 1460  179   74]
 [ 757 3244  856  118]
 [ 121 1265 1900   68]
 [  26   43   14 1761]]
('Life Stage OOB: ', 0.957967647564963)
Random Forest
('Life Stage Training Accuracy: ', 0.9999784890724488)
('Life Stage Training Precision: ', 0.99997849184841059)
('Life Stage Training Recall: ', 0.9999784890724488)
Life Stage Training Confusion Matrix:
[[15496     0     0     0     0     0]
 [    0 15496 

In [9]:
#try a two-stage LS
#step 1, predict OT or not
#step 2, for those we predict !OT then predict the rest.

#get the training data, excluding OTs


train_y_LS_excOT = train_y_LS[train_y_LS<> 'OT']
train_X_excOT = train_X[train_y_LS<> 'OT']

#create a second training set where anything other than 'OT' is labelled 'LS'

train_y_LS_BinaryOT = train_y_LS.copy()
train_y_LS_BinaryOT [train_y_LS_BinaryOT<> 'OT'] = 'LS'

test_y_LS_BinaryOT = test_y_LS.copy()
test_y_LS_BinaryOT [test_y_LS_BinaryOT<> 'OT'] = 'LS'


from sklearn.ensemble import RandomForestClassifier #using scikitlearn 0.17.1

#step 1, create a model that clasiffies as LS or not.
rfc_twostage_ls = RandomForestClassifier(n_estimators = 10000, random_state=42, criterion="entropy"
                                , max_features = 10
                                #, max_leaf_nodes=1000
                                #, min_samples_leaf=20
                                , oob_score=True
                               )
print('twostage step 1:')

rfc_twostage_ls = rfc_twostage_ls.fit(train_X, train_y_LS_BinaryOT)

print('Price Sensitivity OOB: ', rfc_twostage_ls.oob_score_)
tryClassifier_LS_OT ("Random Forest", rfc_twostage_ls, train_X, test_X, train_y_LS_BinaryOT, test_y_LS_BinaryOT)

#get the stage 1 predictions
rfc_twostage_ls_preds = rfc_twostage_ls.predict(test_X)

#print(rfc_twostage_ls.feature_importances_ )

#step 2, using the training data that excludes OT, predict the other classes
from sklearn.metrics import accuracy_score
#fit a model on training data that isn't OT
rfc_twostage_ls_2 = RandomForestClassifier(n_estimators = 1000, random_state=42, criterion="entropy"
                                , max_features = 20
                                , max_leaf_nodes=1000
                                , min_samples_leaf=20
                                , oob_score=True
                               )

print('twostage step 2:')
rfc_twostage_ls_2 = rfc_twostage_ls_2.fit(train_X_excOT,train_y_LS_excOT)

#get the test values where we predict LS instead of 'OT'
stage1_LS_test_X = test_X[rfc_twostage_ls_preds=='LS']
stage1_LS_test_y_LS = test_y_LS[rfc_twostage_ls_preds=='LS']
stage1_LS_test_y_OT = test_y_LS[rfc_twostage_ls_preds=='OT']

#construct a new set of Ys and Predictions to evaluate
rfc_twostage_ls_2_preds = rfc_twostage_ls_2.predict(stage1_LS_test_X)
rfc_twostage_ls_OT_preds = rfc_twostage_ls_preds[rfc_twostage_ls_preds=='OT']

constructedY = pd.concat([stage1_LS_test_y_OT,stage1_LS_test_y_LS])
constructedPreds = pd.concat([pd.DataFrame(rfc_twostage_ls_OT_preds), pd.DataFrame(rfc_twostage_ls_2_preds)])

print(confusion_matrix(test_y_LS_BinaryOT, rfc_twostage_ls_preds, labels=["LS", "OT"]))
print('Price Sensitivity OOB: ', rfc_twostage_ls_2.oob_score_)

print(confusion_matrix(stage1_LS_test_y_LS, rfc_twostage_ls_2_preds, labels=["OA", "OF","PE","YA","YF","OT"]))
print(accuracy_score(stage1_LS_test_y_LS,rfc_twostage_ls_2_preds))
print(confusion_matrix(constructedY, constructedPreds, labels=["OA", "OF", "OT","PE","YA","YF"]))
print(accuracy_score(constructedY,constructedPreds))

twostage step 1:
('Price Sensitivity OOB: ', 0.70023037051257442)
Random Forest
('Life Stage Training Accuracy: ', 0.99993600819095152)
('Life Stage Training Precision: ', 0.99993601644902474)
('Life Stage Training Recall: ', 0.99993600819095152)
Life Stage Training Confusion Matrix:
[[15756     2]
 [    0 15496]]
('Life Stage Testing Accuracy: ', 0.6935423665546846)
('Life Stage Testing Precision: ', 0.70778284445277306)
('Life Stage Testing Recall: ', 0.6935423665546846)
Life Stage Testing Confusion Matrix:
[[5612 1153]
 [2952 3678]]
twostage step 2:
[[5612 1153]
 [2952 3678]]
('Price Sensitivity OOB: ', 0.46033760629521514)
[[ 457    0  254  473  199    0]
 [  39    0    8  158  211    0]
 [ 290    0  476   92   27    0]
 [ 186    0   74  950  325    0]
 [ 118    0   37  460  778    0]
 [ 531    0  365 1401  655    0]]
0.310719290051
[[ 457    0  253  254  473  199]
 [  39    0   85    8  158  211]
 [ 531    0 3678  365 1401  655]
 [ 290    0  212  476   92   27]
 [ 186    0  371   

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
gbc_ps = GradientBoostingClassifier(learning_rate=0.05, n_estimators=1000, max_depth=10)
balanced_gbc_ps = GradientBoostingClassifier(learning_rate=0.05, n_estimators=1000, max_depth=10)
upsampled_gbc_ps = GradientBoostingClassifier(learning_rate=0.05, n_estimators=1000, max_depth=10)

#gbc_ls = GradientBoostingClassifier(learning_rate=0.05, n_estimators=1000, max_depth=10)

gbc_ps = gbc_ps.fit(train_X, train_y_PS)
balanced_gbc_ps = balanced_gbc_ps.fit(balanced_PS_train_X, balanced_PS_train_y_PS)
upsampled_gbc_ps = upsampled_gbc_ps.fit(upsampled_PS_train_X, upsampled_PS_train_y_PS)
#gbc_ls = rfc_ls.fit(train_X, train_y_LS)

tryClassifier_PS ("Gradient Boosting Classifier, reg",gbc_ps, train_X, test_X, train_y_PS, test_y_PS)
tryClassifier_PS ("Gradient Boosting Classifier, downsampled",balanced_gbc_ps, balanced_PS_train_X, test_X, balanced_PS_train_y_PS, test_y_PS)
tryClassifier_PS ("Gradient Boosting Classifier, upsampled",upsampled_gbc_ps, upsampled_PS_train_X, test_X, upsampled_PS_train_y_PS, test_y_PS)


Gradient Boosting Classifier, reg
('Price Sensitivity Training Accuracy: ', 1.0)
('Price Sensitivity Training Precision: ', 1.0)
('Price Sensitivity Training Recall: ', 1.0)
Price Sensitivity Training Confusion Matrix:
[[ 7467     0     0     0]
 [    0 11628     0     0]
 [    0     0  7936     0]
 [    0     0     0  4223]]
('Price Sensitivity Testing Accuracy: ', 0.62777155655095185)
('Price Sensitivity Testing Precision: ', 0.64021625712260144)
('Price Sensitivity Testing Recall: ', 0.62777155655095185)
Price Sensitivity Testing Confusion Matrix:
[[1482 1565  144   31]
 [ 718 3394  810   53]
 [ 114 1384 1828   28]
 [  29   75   35 1705]]
Gradient Boosting Classifier, downsampled
('Price Sensitivity Training Accuracy: ', 1.0)
('Price Sensitivity Training Precision: ', 1.0)
('Price Sensitivity Training Recall: ', 1.0)
Price Sensitivity Training Confusion Matrix:
[[4223    0    0    0]
 [   0 4223    0    0]
 [   0    0 4223    0]
 [   0    0    0 4223]]
('Price Sensitivity Testing Ac

In [15]:
from sklearn.ensemble import GradientBoostingClassifier
gbc_LS = GradientBoostingClassifier(learning_rate=0.05, n_estimators=1000, max_depth=10)
balanced_gbc_LS = GradientBoostingClassifier(learning_rate=0.05, n_estimators=1000, max_depth=10)
upsampled_gbc_LS=GradientBoostingClassifier(learning_rate=0.05, n_estimators=1000, max_depth=10)
#gbc_ls = GradientBoostingClassifier(learning_rate=0.05, n_estimators=1000, max_depth=10)

gbc_LS = gbc_LS.fit(train_X, train_y_LS)
balanced_gbc_LS = balanced_gbc_LS.fit(balanced_LS_train_X, balanced_LS_train_y_LS)
upsampled_gbc_LS = upsampled_gbc_LS.fit(upsampled_LS_train_X, upsampled_LS_train_y_LS)


tryClassifier_LS ("Gradient Boosting Classifier, reg",gbc_LS, train_X, test_X, train_y_LS, test_y_LS)
tryClassifier_LS ("Gradient Boosting Classifier, downsampled" ,balanced_gbc_LS, balanced_LS_train_X, test_X, balanced_LS_train_y_LS, test_y_LS)
tryClassifier_LS ("Gradient Boosting Classifier, upsampled" ,upsampled_gbc_LS, upsampled_LS_train_X, test_X, upsampled_LS_train_y_LS, test_y_LS)


Gradient Boosting Classifier, reg
('Life Stage Training Accuracy: ', 0.99993600819095152)
('Life Stage Training Precision: ', 0.99993601644902474)
('Life Stage Training Recall: ', 0.99993600819095152)
Life Stage Training Confusion Matrix:
[[ 3723     0     0     0     0     0]
 [    0  1255     0     0     0     0]
 [    0     0 15496     0     0     0]
 [    0     0     0  2503     0     0]
 [    0     0     2     0  4467     0]
 [    0     0     0     0     0  3808]]
('Life Stage Testing Accuracy: ', 0.51623740201567747)
('Life Stage Testing Precision: ', 0.46368300558700754)
('Life Stage Testing Recall: ', 0.51623740201567747)
Life Stage Testing Confusion Matrix:
[[ 151    0 1287   95   55   48]
 [   7    5  370    2   16  101]
 [ 111    5 5937  148  155  274]
 [ 105    0  730  251    5    6]
 [  55    3 1554   23  165  106]
 [  28    2 1129   11   49  406]]
Gradient Boosting Classifier, downsampled
('Life Stage Training Accuracy: ', 1.0)
('Life Stage Training Precision: ', 1.0)
('L

In [16]:
from sklearn.neighbors import KNeighborsClassifier

knn_ps = KNeighborsClassifier(n_neighbors=10)
balanced_knn_ps = KNeighborsClassifier(n_neighbors=10)
upsampled_knn_ps = KNeighborsClassifier(n_neighbors=10)

knn_ps = knn_ps.fit(train_X,train_y_PS)
balanced_knn_ps = balanced_knn_ps.fit(balanced_PS_train_X,balanced_PS_train_y_PS)
upsampled_knn_ps = upsampled_knn_ps.fit(upsampled_PS_train_X, upsampled_PS_train_y_PS)

tryClassifier_PS ("KNN Classifier, reg",knn_ps, train_X, test_X, train_y_PS, test_y_PS)
tryClassifier_PS ("KNN Classifier, downsampled",balanced_knn_ps, balanced_PS_train_X, test_X, balanced_PS_train_y_PS, test_y_PS)
tryClassifier_PS ("KNN Classifier, upsampled",upsampled_knn_ps, upsampled_PS_train_X, test_X, upsampled_PS_train_y_PS, test_y_PS)

KNN Classifier, reg
('Price Sensitivity Training Accuracy: ', 0.59973123440199649)
('Price Sensitivity Training Precision: ', 0.60491262994696227)
('Price Sensitivity Training Recall: ', 0.59973123440199649)
Price Sensitivity Training Confusion Matrix:
[[3675 2971  639  182]
 [2060 8226 1153  189]
 [1107 3673 3000  156]
 [ 123  192   65 3843]]
('Price Sensitivity Testing Accuracy: ', 0.48294139604329972)
('Price Sensitivity Testing Precision: ', 0.47775244257402044)
('Price Sensitivity Testing Recall: ', 0.48294139604329972)
Price Sensitivity Testing Confusion Matrix:
[[1120 1666  343   93]
 [1233 2841  793  108]
 [ 553 1869  861   71]
 [  64   91   42 1647]]
KNN Classifier, downsampled
('Price Sensitivity Training Accuracy: ', 0.63485673691688371)
('Price Sensitivity Training Precision: ', 0.63232962952582328)
('Price Sensitivity Training Recall: ', 0.63485673691688371)
Price Sensitivity Training Confusion Matrix:
[[2598  900  518  207]
 [1258 2096  726  143]
 [ 947 1107 2012  157]
 [

In [17]:
from sklearn.neighbors import KNeighborsClassifier

knn_LS = KNeighborsClassifier(n_neighbors=10)
balanced_knn_LS = KNeighborsClassifier(n_neighbors=10)
upsampled_knn_LS = KNeighborsClassifier(n_neighbors=10)


knn_LS = knn_LS.fit(train_X,train_y_LS)
balanced_knn_LS = balanced_knn_LS.fit(balanced_LS_train_X,balanced_LS_train_y_LS)
balanced_knn_LS = balanced_knn_LS.fit(upsampled_LS_train_X, upsampled_LS_train_y_LS)

tryClassifier_LS ("KNN Classifier, reg",knn_LS, train_X, test_X, train_y_LS, test_y_LS)
tryClassifier_LS ("KNN Classifier, downsampled",balanced_knn_LS, balanced_LS_train_X, test_X, balanced_LS_train_y_LS, test_y_LS)
tryClassifier_LS ("KNN Classifier, upsampled" ,balanced_knn_LS, upsampled_LS_train_X, test_X, upsampled_LS_train_y_LS, test_y_LS)

KNN Classifier, reg
('Life Stage Training Accuracy: ', 0.5651116657067895)
('Life Stage Training Precision: ', 0.5279202770853334)
('Life Stage Training Recall: ', 0.5651116657067895)
Life Stage Training Confusion Matrix:
[[ 1037    31  1879   173   309   294]
 [  107    95   654    10   138   251]
 [  574    70 13453   191   619   589]
 [  350    11  1315   620   111    96]
 [  328    45  2499    72  1159   366]
 [  260    73  1752    55   370  1298]]
('Life Stage Testing Accuracy: ', 0.46868234415826798)
('Life Stage Testing Precision: ', 0.40507190338460847)
('Life Stage Testing Recall: ', 0.46868234415826798)
Life Stage Testing Confusion Matrix:
[[ 228   16  953   98  169  172]
 [  32   16  305    6   51   91]
 [ 362   51 5235  162  429  391]
 [ 197    7  642  149   50   52]
 [ 155   25 1189   46  280  211]
 [ 137   37  877   22  182  370]]
KNN Classifier, downsampled
('Life Stage Training Accuracy: ', 0.68472775564409027)
('Life Stage Training Precision: ', 0.68038839037965038)
('

In [18]:
#requires sklearn 0.18


from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler #from http://www.kdnuggets.com/2016/10/beginners-guide-neural-networks-python-scikit-learn.html/2

scaler=StandardScaler()
balanced_scaler=StandardScaler()
upsampled_scaler=StandardScaler()

scaler.fit(train_X)
balanced_scaler.fit(balanced_PS_train_X)
upsampled_scaler.fit(upsampled_PS_train_X)


scaled_X_train=scaler.transform(train_X)
scaled_X_test=scaler.transform(test_X)
scaled_balanced_PS_X_train=balanced_scaler.transform(balanced_PS_train_X)
scaled_balanced_PS_X_test=balanced_scaler.transform(test_X)
scaled_upsampled_PS_X_train=balanced_scaler.transform(upsampled_PS_train_X)
scaled_upsampled_PS_X_test=balanced_scaler.transform(test_X)


ann_PS = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(100, 100,100), random_state=42)
balanced_ann_PS = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(100, 100,100), random_state=42)
upsampled_ann_PS = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(100, 100,100), random_state=42)
scaled_ann_PS = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(100, 100,100), random_state=42)
scaled_balanced_ann_PS = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(100, 100,100), random_state=42)
scaled_upsampled_ann_PS = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(100, 100,100), random_state=42)

ann_PS = ann_PS.fit(train_X, train_y_PS)
balanced_ann_PS = balanced_ann_PS.fit(balanced_PS_train_X, balanced_PS_train_y_PS)
upsampled_ann_PS = upsampled_ann_PS.fit(upsampled_PS_train_X, upsampled_PS_train_y_PS)

scaled_ann_PS = scaled_ann_PS.fit(scaled_X_train, train_y_PS)
scaled_balanced_ann_PS = scaled_balanced_ann_PS.fit(scaled_balanced_PS_X_train, balanced_PS_train_y_PS)
scaled_upsampled_ann_PS = scaled_upsampled_ann_PS.fit(scaled_upsampled_PS_X_train, upsampled_PS_train_y_PS)



tryClassifier_PS ("MLP Classifier, reg",ann_PS, train_X, test_X, train_y_PS, test_y_PS)
tryClassifier_PS ("MLP Classifier, downsampled",balanced_ann_PS, balanced_PS_train_X, test_X, balanced_PS_train_y_PS, test_y_PS)
tryClassifier_PS ("MLP Classifier, upsampled",upsampled_ann_PS, upsampled_PS_train_X, test_X, upsampled_PS_train_y_PS, test_y_PS)

tryClassifier_PS ("MLP Classifier Scaled, reg",scaled_ann_PS, scaled_X_train, scaled_X_test, train_y_PS, test_y_PS)
tryClassifier_PS ("MLP Classifier Scaled, downsampled",scaled_balanced_ann_PS, scaled_balanced_PS_X_train, scaled_balanced_PS_X_test, balanced_PS_train_y_PS, test_y_PS)
tryClassifier_PS ("MLP Classifier Scaled, upsampled",scaled_upsampled_ann_PS, scaled_upsampled_PS_X_train, scaled_upsampled_PS_X_test, upsampled_PS_train_y_PS, test_y_PS)



MLP Classifier, reg
('Price Sensitivity Training Accuracy: ', 0.4294170346195687)
('Price Sensitivity Training Precision: ', 0.46250924747087663)
('Price Sensitivity Training Recall: ', 0.4294170346195687)
Price Sensitivity Training Confusion Matrix:
[[3355 2488 1465  159]
 [3595 4777 3060  196]
 [1571 2811 3446  108]
 [1355   45  980 1843]]
('Price Sensitivity Testing Accuracy: ', 0.42792086599477419)
('Price Sensitivity Testing Precision: ', 0.46286154136435592)
('Price Sensitivity Testing Recall: ', 0.42792086599477419)
Price Sensitivity Testing Confusion Matrix:
[[1462 1092  609   59]
 [1579 2047 1278   71]
 [ 686 1196 1416   56]
 [ 585   25  427  807]]
MLP Classifier, downsampled
('Price Sensitivity Training Accuracy: ', 0.3149419843713)
('Price Sensitivity Training Precision: ', 0.26623107099909754)


  'precision', 'predicted', average, warn_for)


('Price Sensitivity Training Recall: ', 0.3149419843713)
Price Sensitivity Training Confusion Matrix:
[[   0 2216 1621  386]
 [   0 2143 1713  367]
 [   0 1761 2027  435]
 [   0 1365 1708 1150]]
('Price Sensitivity Testing Accuracy: ', 0.34199328107502802)
('Price Sensitivity Testing Precision: ', 0.26229036095301422)
('Price Sensitivity Testing Recall: ', 0.34199328107502802)
Price Sensitivity Testing Confusion Matrix:
[[   0 1655 1243  324]
 [   0 2512 1998  465]
 [   0 1417 1609  328]
 [   0  621  763  460]]
MLP Classifier, upsampled
('Price Sensitivity Training Accuracy: ', 0.31357499140006878)
('Price Sensitivity Training Precision: ', 0.27270864467931732)
('Price Sensitivity Training Recall: ', 0.31357499140006878)
Price Sensitivity Training Confusion Matrix:
[[   0 6274 4477  877]
 [   0 6154 4643  831]
 [   0 5114 5576  938]
 [   0 3794 4979 2855]]
('Price Sensitivity Testing Accuracy: ', 0.34572601717058604)
('Price Sensitivity Testing Precision: ', 0.26554102519351314)
('Pric

In [None]:
sample output:
MLP Classifier, reg
('Price Sensitivity Training Accuracy: ', 0.4294170346195687)
('Price Sensitivity Training Precision: ', 0.46250924747087663)
('Price Sensitivity Training Recall: ', 0.4294170346195687)
Price Sensitivity Training Confusion Matrix:
[[3355 2488 1465  159]
 [3595 4777 3060  196]
 [1571 2811 3446  108]
 [1355   45  980 1843]]
('Price Sensitivity Testing Accuracy: ', 0.42792086599477419)
('Price Sensitivity Testing Precision: ', 0.46286154136435592)
('Price Sensitivity Testing Recall: ', 0.42792086599477419)
Price Sensitivity Testing Confusion Matrix:
[[1462 1092  609   59]
 [1579 2047 1278   71]
 [ 686 1196 1416   56]
 [ 585   25  427  807]]
MLP Classifier, downsampled
('Price Sensitivity Training Accuracy: ', 0.3149419843713)
('Price Sensitivity Training Precision: ', 0.26623107099909754)
C:\Users\leander.quiring\AppData\Local\Continuum\Anaconda3\envs\python2\lib\site-packages\sklearn\metrics\classification.py:1113: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
('Price Sensitivity Training Recall: ', 0.3149419843713)
Price Sensitivity Training Confusion Matrix:
[[   0 2216 1621  386]
 [   0 2143 1713  367]
 [   0 1761 2027  435]
 [   0 1365 1708 1150]]
('Price Sensitivity Testing Accuracy: ', 0.34199328107502802)
('Price Sensitivity Testing Precision: ', 0.26229036095301422)
('Price Sensitivity Testing Recall: ', 0.34199328107502802)
Price Sensitivity Testing Confusion Matrix:
[[   0 1655 1243  324]
 [   0 2512 1998  465]
 [   0 1417 1609  328]
 [   0  621  763  460]]
MLP Classifier, upsampled
('Price Sensitivity Training Accuracy: ', 0.31357499140006878)
('Price Sensitivity Training Precision: ', 0.27270864467931732)
('Price Sensitivity Training Recall: ', 0.31357499140006878)
Price Sensitivity Training Confusion Matrix:
[[   0 6274 4477  877]
 [   0 6154 4643  831]
 [   0 5114 5576  938]
 [   0 3794 4979 2855]]
('Price Sensitivity Testing Accuracy: ', 0.34572601717058604)
('Price Sensitivity Testing Precision: ', 0.26554102519351314)
('Price Sensitivity Testing Recall: ', 0.34572601717058604)
Price Sensitivity Testing Confusion Matrix:
[[   0 1747 1227  248]
 [   0 2610 1987  378]
 [   0 1501 1598  255]
 [   0  624  797  423]]
MLP Classifier Scaled, reg
('Price Sensitivity Training Accuracy: ', 0.60488257503039611)
('Price Sensitivity Training Precision: ', 0.6138286578003076)
('Price Sensitivity Training Recall: ', 0.60488257503039611)
Price Sensitivity Training Confusion Matrix:
[[3026 3765  288  388]
 [1622 7813 1807  386]
 [ 220 3347 4101  268]
 [  38  183   37 3965]]
('Price Sensitivity Testing Accuracy: ', 0.59962672639044423)
('Price Sensitivity Testing Precision: ', 0.60815799387493874)
('Price Sensitivity Testing Recall: ', 0.59962672639044423)
Price Sensitivity Testing Confusion Matrix:
[[1269 1634  145  174]
 [ 716 3323  754  182]
 [  93 1418 1740  103]
 [  17  101   26 1700]]
MLP Classifier Scaled, downsampled
('Price Sensitivity Training Accuracy: ', 0.64971584181861242)
('Price Sensitivity Training Precision: ', 0.63249076034333707)
('Price Sensitivity Training Recall: ', 0.64971584181861242)
Price Sensitivity Training Confusion Matrix:
[[2772  709  373  369]
 [1486 1439 1025  273]
 [ 483  791 2679  270]
 [  81    6   51 4085]]
('Price Sensitivity Testing Accuracy: ', 0.56005972377752888)
('Price Sensitivity Testing Precision: ', 0.56141551207754814)
('Price Sensitivity Testing Recall: ', 0.56005972377752888)
Price Sensitivity Testing Confusion Matrix:
[[2063  592  285  282]
 [1804 1598 1241  332]
 [ 445  631 2079  199]
 [  53    3   26 1762]]
MLP Classifier Scaled, upsampled
('Price Sensitivity Training Accuracy: ', 0.64572583419332641)
('Price Sensitivity Training Precision: ', 0.62604645171748519)
('Price Sensitivity Training Recall: ', 0.64572583419332641)
Price Sensitivity Training Confusion Matrix:
[[ 7500  2044  1069  1015]
 [ 4076  3622  3220   710]
 [ 1384  1886  7655   703]
 [  230     4   137 11257]]
('Price Sensitivity Testing Accuracy: ', 0.56648002986188872)
('Price Sensitivity Testing Precision: ', 0.56982058230111743)
('Price Sensitivity Testing Recall: ', 0.56648002986188872)
Price Sensitivity Testing Confusion Matrix:
[[2058  572  325  267]
 [1717 1569 1366  323]
 [ 431  535 2193  195]
 [  52    1   23 1768]]

In [19]:
#requires sklearn 0.18


from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler #from http://www.kdnuggets.com/2016/10/beginners-guide-neural-networks-python-scikit-learn.html/2

scaler=StandardScaler()
balanced_scaler=StandardScaler()
upsampled_scaler=StandardScaler()

scaler.fit(train_X)
balanced_scaler.fit(balanced_LS_train_X)
upsampled_scaler.fit(upsampled_LS_train_X)

scaled_X_train=scaler.transform(train_X)
scaled_X_test=scaler.transform(test_X)
scaled_balanced_LS_X_train=balanced_scaler.transform(balanced_LS_train_X)
scaled_balanced_LS_X_test=balanced_scaler.transform(test_X)
scaled_upsampled_LS_X_train=balanced_scaler.transform(upsampled_LS_train_X)
scaled_upsampled_LS_X_test=balanced_scaler.transform(test_X)



ann_LS = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(100, 100,100), random_state=42)
balanced_ann_LS = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(100, 100,100), random_state=42)
upsampled_ann_LS = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(100, 100,100), random_state=42)

scaled_ann_LS = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(100, 100,100), random_state=42)
scaled_balanced_ann_LS = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(100, 100,100), random_state=42)
scaled_upsampled_ann_LS = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(100, 100,100), random_state=42)


ann_LS = ann_LS.fit(train_X, train_y_LS)
balanced_ann_LS = balanced_ann_LS.fit(balanced_LS_train_X, balanced_LS_train_y_LS)
upsampled_ann_LS = upsampled_ann_LS.fit(upsampled_LS_train_X, upsampled_LS_train_y_LS)

scaled_ann_LS = scaled_ann_LS.fit(scaled_X_train, train_y_LS)
scaled_balanced_ann_LS = scaled_balanced_ann_LS.fit(scaled_balanced_LS_X_train, balanced_LS_train_y_LS)
scaled_upsampled_ann_LS = scaled_upsampled_ann_LS.fit(scaled_upsampled_LS_X_train, upsampled_LS_train_y_LS)


#ann_LS = ann_LS.fit(train_X, train_y_LS)

tryClassifier_LS ("MLP Classifier, reg",ann_LS, train_X, test_X, train_y_LS, test_y_LS)
tryClassifier_LS ("MLP Classifier, downsampled",balanced_ann_LS, balanced_LS_train_X, test_X, balanced_LS_train_y_LS, test_y_LS)
tryClassifier_LS ("MLP Classifier, upsampled",upsampled_ann_LS, upsampled_LS_train_X, test_X, upsampled_LS_train_y_LS, test_y_LS)

tryClassifier_LS ("MLP Classifier Scaled, reg",scaled_ann_LS, scaled_X_train, scaled_X_test, train_y_LS, test_y_LS)
tryClassifier_LS ("MLP Classifier Scaled, downsampled",scaled_balanced_ann_LS, scaled_balanced_LS_X_train, scaled_balanced_LS_X_test, balanced_LS_train_y_LS, test_y_LS)
tryClassifier_LS ("MLP Classifier Scaled, downsampled",scaled_upsampled_ann_LS, scaled_upsampled_LS_X_train, scaled_upsampled_LS_X_test, upsampled_LS_train_y_LS, test_y_LS)


MLP Classifier, reg
('Life Stage Training Accuracy: ', 0.17968899980802458)
('Life Stage Training Precision: ', 0.2298118760958785)
('Life Stage Training Recall: ', 0.17968899980802458)
Life Stage Training Confusion Matrix:
[[ 669  229 1337  105  137 1246]
 [ 220   85  581   29   40  300]
 [2875 1648 3690  782 1900 4601]
 [ 345  166  837   74  126  955]
 [ 796  307 1530  106  188 1542]
 [ 638  239 1818   73  130  910]]
('Life Stage Testing Accuracy: ', 0.17991787980589771)
('Life Stage Testing Precision: ', 0.23110982790006593)
('Life Stage Testing Recall: ', 0.17991787980589771)
Life Stage Testing Confusion Matrix:
[[ 284   91  576   45   60  580]
 [ 105   33  198   11   20  134]
 [1161  746 1598  362  846 1917]
 [ 155   78  352   40   59  413]
 [ 311  138  702   38   66  651]
 [ 258  106  763   53   56  389]]
MLP Classifier, downsampled
('Life Stage Training Accuracy: ', 0.29229747675962814)
('Life Stage Training Precision: ', 0.30431890396068667)
('Life Stage Training Recall: ', 0.2

In [20]:
from sklearn.ensemble import AdaBoostClassifier
ada_PS = AdaBoostClassifier(n_estimators=1000)
balanced_ada_PS = AdaBoostClassifier(n_estimators=1000)
upsampled_ada_PS = AdaBoostClassifier(n_estimators=1000)

ada_PS = ada_PS.fit(train_X, train_y_PS)
balanced_ada_PS = balanced_ada_PS.fit(balanced_PS_train_X, balanced_PS_train_y_PS)
upsampled_ada_PS = upsampled_ada_PS.fit(upsampled_PS_train_X, upsampled_PS_train_y_PS)


tryClassifier_PS ("AdaBoost Classifier - PS, reg",ada_PS, train_X, test_X, train_y_PS, test_y_PS)
tryClassifier_PS ("AdaBoost Classifier - PS, downsampled",balanced_ada_PS, balanced_PS_train_X, test_X, balanced_PS_train_y_PS, test_y_PS)
tryClassifier_PS ("AdaBoost Classifier - PS, upsampled",upsampled_ada_PS, upsampled_PS_train_X, test_X, upsampled_PS_train_y_PS, test_y_PS)


AdaBoost Classifier - PS, reg
('Price Sensitivity Training Accuracy: ', 0.52828437959941132)
('Price Sensitivity Training Precision: ', 0.53359031843164517)
('Price Sensitivity Training Recall: ', 0.52828437959941132)
Price Sensitivity Training Confusion Matrix:
[[2376 3230 1776   85]
 [1834 4953 4742   99]
 [ 496 2194 5157   89]
 [  56   72   70 4025]]
('Price Sensitivity Testing Accuracy: ', 0.50496453900709215)
('Price Sensitivity Testing Precision: ', 0.51029787358052681)
('Price Sensitivity Testing Recall: ', 0.50496453900709215)
Price Sensitivity Testing Confusion Matrix:
[[ 905 1466  801   50]
 [ 819 2060 2034   62]
 [ 216  971 2135   32]
 [  41   64   75 1664]]
AdaBoost Classifier - PS, downsampled
('Price Sensitivity Training Accuracy: ', 0.59951456310679607)
('Price Sensitivity Training Precision: ', 0.59156465572500982)
('Price Sensitivity Training Recall: ', 0.59951456310679607)
Price Sensitivity Training Confusion Matrix:
[[2298  909  950   66]
 [1384 1213 1575   51]
 [ 83

In [21]:
from sklearn.ensemble import AdaBoostClassifier
ada_LS = AdaBoostClassifier(n_estimators=1000)
balanced_ada_LS = AdaBoostClassifier(n_estimators=1000)
upsampled_ada_LS = AdaBoostClassifier(n_estimators=1000)

ada_LS = ada_LS.fit(train_X, train_y_LS)
balanced_ada_LS = balanced_ada_LS.fit(balanced_LS_train_X, balanced_LS_train_y_LS)
upsampled_ada_LS = upsampled_ada_LS.fit(upsampled_LS_train_X, upsampled_LS_train_y_LS)


tryClassifier_LS ("AdaBoost Classifier - LS, reg",ada_LS, train_X, test_X, train_y_LS, test_y_LS)
tryClassifier_LS ("AdaBoost Classifier - LS, downsampled",balanced_ada_LS, balanced_LS_train_X, test_X, balanced_LS_train_y_LS, test_y_LS)
tryClassifier_LS ("AdaBoost Classifier - LS, upsampled",upsampled_ada_LS, upsampled_LS_train_X, test_X, upsampled_LS_train_y_LS, test_y_LS)


AdaBoost Classifier - LS, reg
('Life Stage Training Accuracy: ', 0.49702438087924744)
('Life Stage Training Precision: ', 0.48383361476872677)
('Life Stage Training Recall: ', 0.49702438087924744)
Life Stage Training Confusion Matrix:
[[  540   179  1836   828   202   138]
 [   43   335   550    11    84   232]
 [  406   957 11579  1086   716   752]
 [  197     1   926  1342    25    12]
 [  231   495  2378   203   747   415]
 [  115   905  1471    87   239   991]]
('Life Stage Testing Accuracy: ', 0.48114968271743186)
('Life Stage Testing Precision: ', 0.4635005629478931)
('Life Stage Testing Recall: ', 0.48114968271743186)
Life Stage Testing Confusion Matrix:
[[ 197   87  835  349   98   70]
 [  16  112  225    9   40   99]
 [ 178  438 4889  463  325  337]
 [  72    5  428  570   12   10]
 [  86  208 1032  105  292  183]
 [  62  364  670   39  105  385]]
AdaBoost Classifier - LS, downsampled
('Life Stage Training Accuracy: ', 0.45657370517928286)
('Life Stage Training Precision: ', 0

In [22]:
#do ensemble of the best three sets of predictions
#sklearn.ensemble.VotingClassifier kept crashing, so do it manually

rfc_preds = rfc_ps.predict(test_X)
gbc_preds = gbc_ps.predict(test_X)
ann_preds = scaled_ann_PS.predict(scaled_X_test)

from scipy import stats
ensemble_preds = np.transpose(np.array([rfc_preds,gbc_preds,ann_preds]))
voted_preds_PS = stats.mode(ensemble_preds, axis=1)

print(confusion_matrix(test_y_PS, voted_preds_PS.mode, labels=["LA", "MM", "UM","XX"]))
print("Price Sensitivity Accuracy:",performance_metric(test_y_PS,voted_preds_PS.mode))
print('Price Sensitivity Precision: ', precision_score(test_y_PS, voted_preds_PS.mode, average='weighted'))
print('Price Sensitivity Recall: ', recall_score(test_y_PS, voted_preds_PS.mode, average='weighted'))

NameError: name 'rfc_ps' is not defined

In [24]:
rfc_preds = rfc_ls.predict(test_X)
upsampled_rfc_preds = upsampled_rfc_ls.predict(test_X)
gbc_preds = upsampled_gbc_LS.predict(test_X)
ann_preds = scaled_ann_LS.predict(scaled_X_test)
upsampled_rfc_preds = rfc_upsampled_ls.predict(test_X)
ada_preds = ada_LS.predict(test_X)

from scipy import stats
ensemble_preds = np.transpose(np.array([rfc_preds,gbc_preds,ann_preds,upsampled_rfc_preds,ada_preds]))
voted_preds_LS = stats.mode(ensemble_preds, axis=1)

print(confusion_matrix(test_y_LS, voted_preds_LS.mode, labels=["OA", "OF", "OT","PE","YA","YF"]))
print("Life Stage Accuracy:",performance_metric(test_y_LS,voted_preds_LS.mode))
print('Life Stage Precision: ', precision_score(test_y_LS, voted_preds_LS.mode, average='weighted'))
print('Life Stage Recall: ', recall_score(test_y_LS, voted_preds_LS.mode, average='weighted'))

NameError: name 'rfc_ls' is not defined

[[ 310    9 1027  144   65   81]
 [  22   14  291    1   24  149]
 [ 261   57 5461  178  316  357]
 [ 177    2  541  363    6    8]
 [ 122   27 1246   29  309  173]
 [  83   37  866   14   97  528]]
('Accuracy:', 0.5214632325494587)



# initial result summary
For PS, it seems that random forest has the highest accuracy, so we can do a grid-search around it

In [18]:
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

params = {'n_estimators': [5000,10000], 'max_features': [5,10,15],'random_state':[42], 'criterion':["entropy"],'oob_score':["True"]} #, 
rfc_upsampled_ps=RandomForestClassifier()

grid_upsampled_rfc_ps = GridSearchCV(estimator=rfc_upsampled_ps, param_grid=params, n_jobs=5)

# Fit the grid search object to the data to compute the optimal model
grid_upsampled_rfc_ps = grid_upsampled_rfc_ps.fit(upsampled_PS_train_X, upsampled_PS_train_y_PS)

#print('Price Sensitivity OOB: ', grid_upsampled_rfc_ps.oob_score_)
tryClassifier_PS ("Random Forest", grid_upsampled_rfc_ps, upsampled_PS_train_X, test_X, upsampled_PS_train_y_PS, test_y_PS)

print(grid_upsampled_rfc_ps.best_estimator_)
print(grid_upsampled_rfc_ps.best_params_)

Random Forest
('Price Sensitivity Training Accuracy: ', 1.0)
('Price Sensitivity Training Precision: ', 1.0)
('Price Sensitivity Training Recall: ', 1.0)
Price Sensitivity Training Confusion Matrix:
[[11628     0     0     0]
 [    0 11628     0     0]
 [    0     0 11628     0]
 [    0     0     0 11628]]
('Price Sensitivity Testing Accuracy: ', 0.62575587905935048)
('Price Sensitivity Testing Precision: ', 0.63031556781300335)
('Price Sensitivity Testing Recall: ', 0.62575587905935048)
Price Sensitivity Testing Confusion Matrix:
[[1497 1481  175   69]
 [ 766 3223  869  117]
 [ 104 1282 1901   67]
 [  24   45   14 1761]]
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features=10, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=5000, n_jobs=1, oob_score='True', random_state=42,
            verbose=0, warm_

In [7]:
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

params = {'n_estimators': [500,1000,3000,5000,10000], 'max_features': [5,10,15],'random_state':[42], 'criterion':["entropy"],'oob_score':["True"]} #, 
rfc_upsampled_ls=RandomForestClassifier()

grid_upsampled_rfc_ls = GridSearchCV(estimator=rfc_upsampled_ls, param_grid=params, n_jobs=5)

# Fit the grid search object to the data to compute the optimal model
grid_upsampled_rfc_ls = grid_upsampled_rfc_ls.fit(upsampled_LS_train_X, upsampled_LS_train_y_LS)

#print('Price Sensitivity OOB: ', grid_upsampled_rfc_ls.oob_score_)
tryClassifier_LS ("Random Forest", grid_upsampled_rfc_ls, upsampled_LS_train_X, test_X, upsampled_LS_train_y_LS, test_y_LS)

print(grid_upsampled_rfc_ls.best_estimator_)
print(grid_upsampled_rfc_ls.best_params_)



Random Forest
('Life Stage Training Accuracy: ', 0.9999784890724488)
('Life Stage Training Precision: ', 0.99997849184841059)
('Life Stage Training Recall: ', 0.9999784890724488)
Life Stage Training Confusion Matrix:
[[15496     0     0     0     0     0]
 [    0 15496     0     0     0     0]
 [    0     0 15494     0     2     0]
 [    0     0     0 15496     0     0]
 [    0     0     0     0 15496     0]
 [    0     0     0     0     0 15496]]
('Life Stage Testing Accuracy: ', 0.51519223590892127)
('Life Stage Testing Precision: ', 0.46286981551303791)
('Life Stage Testing Recall: ', 0.51519223590892127)
Life Stage Testing Confusion Matrix:
[[ 228    0  953  153  122  180]
 [  12    0  254    1   42  192]
 [ 196    0 5236  189  461  548]
 [ 169    0  563  337    5   23]
 [  78    0 1094   38  418  278]
 [  36    0  769   14  124  682]]
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features=5, max_leaf_nodes=None,
    

  'precision', 'predicted', average, warn_for)


In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.grid_search import GridSearchCV
params = {'learning_rate': [0.01,0.05,0.1], 'n_estimators': [500,1000,3000], 'max_depth':[5,10,15]} 
upsampled_gbc_ps=GradientBoostingClassifier()
grid_upsampled_gbc_ps = GridSearchCV(estimator=upsampled_gbc_ps, param_grid=params, n_jobs=5)
grid_upsampled_gbc_ps = grid_upsampled_gbc_ps.fit(upsampled_PS_train_X, upsampled_PS_train_y_PS)
tryClassifier_PS ("Gradient Boosting Classifier, GridSearch", grid_upsampled_gbc_ps, upsampled_PS_train_X, test_X, upsampled_PS_train_y_PS, test_y_PS)

print(grid_upsampled_gbc_ps.best_estimator_)
print(grid_upsampled_gbc_ps.best_params_)

In [None]:
sample output:
Gradient Boosting Classifier, GridSearch
('Price Sensitivity Training: ', 0.838650812338121)
[[3551  343  231  111]
 [ 601 2971  540  116]
 [ 240  391 3550  107]
 [  31   10   20 4175]]
('Price Sensitivity Testing: ', 0.66629339305711088)
[[2278  557  245  142]
 [1244 2399 1151  181]
 [ 282  529 2450   93]
 [  21    8   17 1798]]

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.01, loss='deviance', max_depth=5,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=1000, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)
{'n_estimators': 1000, 'learning_rate': 0.01, 'max_depth': 5}

In [None]:
#requires sklearn 0.18


from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler #from http://www.kdnuggets.com/2016/10/beginners-guide-neural-networks-python-scikit-learn.html/2
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

scaler=StandardScaler()
scaler.fit(train_X)

scaled_X_train=scaler.transform(train_X)
scaled_X_test=scaler.transform(test_X)

#params = {'learning_rate': [0.01,0.05,0.1], 'n_estimators': [500,1000,3000], 'max_depth':[5,10,15]} 

params = {'solver':["lbfgs"], 'alpha':[0.00001, 0.0005], 'hidden_layer_sizes':[(100, 100,100),(10), (10,10)], 'random_state':[42]}
scaled_ann_LS=MLPClassifier()

grid_scaled_ann_LS = GridSearchCV(estimator=scaled_ann_LS, param_grid=params, n_jobs=5)
grid_scaled_ann_LS = grid_scaled_ann_LS.fit(scaled_X_train, train_y_LS)

#scaled_ann_LS = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(100, 100,100), random_state=42)


tryClassifier_LS ("MLP Classifier Scaled, GridSearch", grid_scaled_ann_LS, scaled_X_train, test_X, train_y_LS, test_y_LS)

print(grid_scaled_ann_LS.best_estimator_)
print(grid_scaled_ann_LS.best_params_)


In [None]:
#requires sklearn 0.18


from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler #from http://www.kdnuggets.com/2016/10/beginners-guide-neural-networks-python-scikit-learn.html/2

scaler=StandardScaler()

scaler.fit(train_X)

scaled_X_train=scaler.transform(train_X)
scaled_X_test=scaler.transform(test_X)

scaled_ann_LS = MLPClassifier(solver='sgd', alpha=0.0001, learning_rate_init=0.001, learning_rate='constant',
                              hidden_layer_sizes=(100,10), random_state=43, activation='logistic', max_iter=5000,
                             )
scaled_ann_LS = scaled_ann_LS.fit(scaled_X_train, train_y_LS)


tryClassifier_LS ("MLP Classifier Scaled",scaled_ann_LS, scaled_X_train, scaled_X_test, train_y_LS, test_y_LS)


In [None]:
scaled_ann_LS = MLPClassifier(solver='lbfgs', alpha=0.0001, learning_rate_init=0.001, learning_rate='constant',
                              hidden_layer_sizes=(200,200), random_state=42) = .469
scaled_ann_LS = MLPClassifier(solver='lbfgs', alpha=0.0001, learning_rate_init=0.01, learning_rate='constant',
                              hidden_layer_sizes=(200,200), random_state=42) = .469
scaled_ann_LS = MLPClassifier(solver='sgd', alpha=0.0001, learning_rate_init=0.001, learning_rate='constant',
                              hidden_layer_sizes=(200,200), random_state=42) = .497, needs more iterations
scaled_ann_LS = MLPClassifier(solver='sgd', alpha=0.0001, learning_rate_init=0.01, learning_rate='constant',
                              hidden_layer_sizes=(200,200), random_state=42) = 0.447
scaled_ann_LS = MLPClassifier(solver='sgd', alpha=0.0001, learning_rate_init=0.001, learning_rate='constant',
                              hidden_layer_sizes=(200,200), random_state=42, activation='logistic') = 0.508
scaled_ann_LS = MLPClassifier(solver='sgd', alpha=0.0001, learning_rate_init=0.001, learning_rate='constant',
                              hidden_layer_sizes=(200,200), random_state=42, activation='logistic', max_iter=500) = 0.511
scaled_ann_LS = MLPClassifier(solver='sgd', alpha=0.0001, learning_rate_init=0.001, learning_rate='constant',
                              hidden_layer_sizes=(200,200), random_state=42, activation='logistic', max_iter=1000) = 0.511
scaled_ann_LS = MLPClassifier(solver='sgd', alpha=0.0001, learning_rate_init=0.001, learning_rate='constant',
                              hidden_layer_sizes=(20,20), random_state=42, activation='logistic', max_iter=500) = 0.508
scaled_ann_LS = scaled_ann_LS.fit(scaled_X_train, balanced_LS_train_y_LS) = 0.471
scaled_ann_LS = MLPClassifier(solver='adam', alpha=0.0001, learning_rate_init=0.001, learning_rate='constant',
                              hidden_layer_sizes=(200,200), random_state=42, activation='logistic', max_iter=500)=0.480
scaled_ann_LS = MLPClassifier(solver='adam', alpha=0.0001, learning_rate_init=0.001, learning_rate='constant',
                              hidden_layer_sizes=(200,200), random_state=42, activation='tanh', max_iter=500)=0.485
scaled_ann_LS = scaled_ann_LS.fit(scaled_X_train, upsampled_LS_train_y_LS) = 0.473
scaled_ann_LS = MLPClassifier(solver='sgd', alpha=0.0001, learning_rate_init=0.001, learning_rate='adaptive',
                              hidden_layer_sizes=(200,200), random_state=42, activation='logistic', max_iter=500) = 0.510
scaled_ann_LS = MLPClassifier(solver='sgd', alpha=0.0001, learning_rate_init=0.0001, learning_rate='constant',
                              hidden_layer_sizes=(200,200), random_state=42, activation='logistic', max_iter=1000) = 0.498
scaled_ann_LS = MLPClassifier(solver='sgd', alpha=0.0001, learning_rate_init=0.001, learning_rate='constant',
                              hidden_layer_sizes=(100,10), random_state=42, activation='logistic', max_iter=5000) = 0.512

In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

params = {'n_estimators': [1000,3000,5000], 'max_features': [5,10,15], 'random_state':[42], 'criterion':["entropy"]}

rfc_LS=RandomForestClassifier()

grid_rfc_LS = GridSearchCV(estimator=rfc_LS, param_grid=params, n_jobs=5)

# Fit the grid search object to the data to compute the optimal model
grid_rfc_LS = grid_rfc_LS.fit(train_X, train_y_LS)

tryClassifier_LS ("Random Forest, Grid", grid_rfc_LS, train_X, test_X, train_y_LS, test_y_LS)

print(grid_rfc_LS.best_estimator_)
print(grid_rfc_LS.best_params_)

KeyboardInterrupt: 

In [None]:
sample output:
    ('LifeStage Training: ', 0.99910411467332183)
[[ 3717     0     6     0     0     0]
 [    0  1253     2     0     0     0]
 [    0     0 15496     0     0     0]
 [    0     0     3  2500     0     0]
 [    0     0    12     0  4457     0]
 [    0     0     5     0     0  3803]]
('LifeStage Testing: ', 0.51131019036954084)
[[  60    0 1455   53    3   65]
 [   1    0  366    2    0  132]
 [  58    1 6182   67    9  313]
 [  71    0  887  130    0    9]
 [  28    0 1709   10    7  152]
 [   8    0 1136    9    2  470]]
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features=15, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=5000, n_jobs=1, oob_score=False, random_state=42,
            verbose=0, warm_start=False)
{'max_features': 15, 'n_estimators': 5000, 'random_state': 42, 'criterion': 'entropy'}

In [None]:
#random useful code snippets
list(data.columns.values)
data['SPEND'].dtype
data_cross.head(10)