In [2]:
# Obtain the modelling DataFrame from the project helper module's
# preprocess() and print its column summary (names, non-null counts, dtypes).
import casestudy_tools as tools
df = tools.preprocess()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22223 entries, 0 to 22222
Data columns (total 37 columns):
AGE                    22223 non-null float64
BILL                   22223 non-null float64
ORGYN                  22223 non-null int64
AFFL                   22223 non-null int64
LTIME                  22223 non-null float64
GENDER_F               22223 non-null uint8
GENDER_M               22223 non-null uint8
GENDER_U               22223 non-null uint8
TV_REG_Border          22223 non-null uint8
TV_REG_C Scotland      22223 non-null uint8
TV_REG_East            22223 non-null uint8
TV_REG_London          22223 non-null uint8
TV_REG_Midlands        22223 non-null uint8
TV_REG_N East          22223 non-null uint8
TV_REG_N Scot          22223 non-null uint8
TV_REG_N West          22223 non-null uint8
TV_REG_S & S East      22223 non-null uint8
TV_REG_S West          22223 non-null uint8
TV_REG_Ulster          22223 non-null uint8
TV_REG_Wales & West    22223 non-null uint8
TV_RE

<h1>Data Preprocessing</h1>

In [2]:
import pandas as pd
# Read the raw data set; the path is relative to the notebook's working directory.
df = pd.read_csv("datasets/organics.csv")
#df.dtypes

In [3]:
# Discard the columns listed below in a single pass, then print the column
# summary of what remains.
columns_to_drop = ['CUSTID', 'LCDATE', 'ORGANICS', 'AGEGRP1', 'AGEGRP2',
                   'NEIGHBORHOOD']
df = df.drop(columns_to_drop, axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22223 entries, 0 to 22222
Data columns (total 12 columns):
GENDER    19711 non-null object
DOB       22223 non-null object
EDATE     22223 non-null object
AGE       20715 non-null float64
TV_REG    21758 non-null object
NGROUP    21549 non-null object
BILL      22223 non-null float64
REGION    21758 non-null object
CLASS     22223 non-null object
ORGYN     22223 non-null int64
AFFL      21138 non-null float64
LTIME     21942 non-null float64
dtypes: float64(4), int64(1), object(7)
memory usage: 2.0+ MB


In [4]:
# Derives AGE as the number of whole years between DOB and the enrolment date.
# NOTE(review): AGE is recomputed for EVERY row, not only for rows where it is
# missing - confirm that overwriting the supplied ages is intended.
# NOTE(review): the EDATE of the first row is used as the reference date for
# all rows - confirm that EDATE is constant across the data set.
from datetime import datetime
import numpy as np
dateformat = '%Y-%m-%d'

# A calendar year is not a fixed duration, so the 'Y' timedelta unit
# (np.timedelta64(100, 'Y'), astype('<m8[Y]')) is rejected by current pandas.
# Spell the year out as the mean Gregorian year (365.2425 days = 31,556,952 s),
# which is presumably the length the 'Y' unit stood for - verify against NumPy docs.
one_year = pd.Timedelta(seconds=31556952)
one_century = 100 * one_year

edate = pd.Timestamp(df['EDATE'][0])
dob = pd.to_datetime(df['DOB'], format=dateformat)    # 1: parse the date strings
# 2: a birth date on or after the reference date is moved back one century.
dob = dob.where(dob < edate, dob - one_century)
# 3: elapsed time expressed in years, rounded down and stored as float.
df['AGE'] = np.floor((edate - dob) / one_year)

# The date columns are no longer needed once AGE has been derived.
df = df.drop(['EDATE', 'DOB'], axis = 1)
#df.info()

In [5]:
# Clamp erroneous values in the AFFL column: it should be on a 1-30 scale.
# clip() leaves missing values as NaN, so they are still imputed below.
df['AFFL'] = df['AFFL'].clip(lower=1, upper=30)

# Fill missing AFFL values with the column mean (computed after clamping).
# The result is assigned back rather than using fillna(inplace=True) on the
# selected column: the in-place form acts on an intermediate object and is not
# guaranteed to update df (it never does under pandas Copy-on-Write).
df['AFFL'] = df['AFFL'].fillna(df['AFFL'].mean())
# Convert the scale to integer (the fractional part of the imputed mean is
# truncated). Not sure if this is necessary.
df['AFFL'] = df['AFFL'].astype(int)
#df.info()
#df['AFFL'].value_counts(bins=8)
#sorted(df['AFFL'].unique())

In [6]:
# Impute missing loyalty time (LTIME) with the mean LTIME of customers of the
# same AGE. Indexing the frame by AGE lets fillna() align each row with the
# per-age mean; reset_index() then turns AGE back into a regular column.
means = df.groupby(['AGE'])['LTIME'].mean()
df = (
    df.set_index(['AGE'])
      .assign(LTIME=lambda by_age: by_age['LTIME'].fillna(means))
      .reset_index()
)


In [7]:
# Treat a missing gender as 'U' (appears as GENDER_U after one-hot encoding).
# Assign the result back instead of fillna(inplace=True) on the selected
# column, which is not guaranteed to modify df itself.
df['GENDER'] = df['GENDER'].fillna('U')
#df['GENDER'].unique()

In [8]:
# Expand each remaining text column into indicator columns named <COLUMN>_<value>.
df = pd.get_dummies(df)

In [9]:
# Print the final schema to check column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22223 entries, 0 to 22222
Data columns (total 37 columns):
AGE                    22223 non-null float64
BILL                   22223 non-null float64
ORGYN                  22223 non-null int64
AFFL                   22223 non-null int32
LTIME                  22223 non-null float64
GENDER_F               22223 non-null uint8
GENDER_M               22223 non-null uint8
GENDER_U               22223 non-null uint8
TV_REG_Border          22223 non-null uint8
TV_REG_C Scotland      22223 non-null uint8
TV_REG_East            22223 non-null uint8
TV_REG_London          22223 non-null uint8
TV_REG_Midlands        22223 non-null uint8
TV_REG_N East          22223 non-null uint8
TV_REG_N Scot          22223 non-null uint8
TV_REG_N West          22223 non-null uint8
TV_REG_S & S East      22223 non-null uint8
TV_REG_S West          22223 non-null uint8
TV_REG_Ulster          22223 non-null uint8
TV_REG_Wales & West    22223 non-null uint8
TV_RE

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

# Fixed seed passed to the split and to the estimators below so runs are repeatable.
rs = 10

# Separate the target (ORGYN) from the predictors.
y = df['ORGYN']
X = df.drop(['ORGYN'], axis=1)
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .values yields the same ndarray and exists on every pandas version.
X_mat = X.values
# Hold out 30% of the rows for testing, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(X_mat, y, test_size=0.3, stratify=y, random_state=rs)

# Fit the scaler on the training split only and reuse that fitted transform
# for the test split, so test-set statistics do not influence the scaling.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train, y_train)
X_test = scaler.transform(X_test)

In [4]:
from sklearn.neural_network import MLPClassifier

In [5]:
# Baseline: a neural network with default hyper-parameters (only the seed is set).
model = MLPClassifier(random_state=rs)
model.fit(X_train, y_train)

# Accuracy on the data it was trained on vs. the held-out split.
train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)
print("Train accuracy:", train_accuracy)
print("Test accuracy:", test_accuracy)

# Per-class precision / recall / F1 on the held-out split.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

# Show the full parameter set that was actually used.
print(model)

Train accuracy: 0.8254049884289021
Test accuracy: 0.8128093595320234
             precision    recall  f1-score   support

          0       0.84      0.93      0.88      5015
          1       0.69      0.45      0.54      1652

avg / total       0.80      0.81      0.80      6667

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=10, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)


In [13]:
# Same network as the default one, but with training capped at 100 iterations.
mlp_settings = {'max_iter': 100, 'random_state': rs}
model = MLPClassifier(**mlp_settings)
model.fit(X_train, y_train)

# Report accuracy for the training split, then for the held-out split.
for label, features, target in (("Train accuracy:", X_train, y_train),
                                ("Test accuracy:", X_test, y_test)):
    print(label, model.score(features, target))

# Per-class metrics on the held-out split.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

print(model)

Train accuracy: 0.825404988429
Test accuracy: 0.812809359532
             precision    recall  f1-score   support

          0       0.84      0.93      0.88      5015
          1       0.69      0.45      0.54      1652

avg / total       0.80      0.81      0.80      6667

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=100, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=10, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)


In [14]:
# Same network again with the iteration cap lowered to 80.
model = MLPClassifier(random_state=rs, max_iter=80)
model.fit(X_train, y_train)

train_accuracy = model.score(X_train, y_train)
print("Train accuracy:", train_accuracy)
test_accuracy = model.score(X_test, y_test)
print("Test accuracy:", test_accuracy)

# Per-class metrics on the held-out split, followed by the model's parameters.
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

print(model)

Train accuracy: 0.825404988429
Test accuracy: 0.812809359532
             precision    recall  f1-score   support

          0       0.84      0.93      0.88      5015
          1       0.69      0.45      0.54      1652

avg / total       0.80      0.81      0.80      6667

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=80, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=10, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)


In [15]:
# Show (rows, columns) of the training matrix; the column count is the number of input features.
print(X_train.shape)

(15556, 36)


In [16]:
# Coarse search over the width of a single hidden layer: 5, 20 and 35 neurons.
layer_widths = range(5, 36, 15)
params = {'hidden_layer_sizes': [(width,) for width in layer_widths]}

# 3-fold cross-validated grid search, using all available cores.
cv = GridSearchCV(estimator=MLPClassifier(random_state=rs), param_grid=params, cv=3, n_jobs=-1)
cv.fit(X_train, y_train)

# Scores come from the best configuration as refitted by the search.
train_accuracy = cv.score(X_train, y_train)
test_accuracy = cv.score(X_test, y_test)
print("Train accuracy:", train_accuracy)
print("Test accuracy:", test_accuracy)

y_pred = cv.predict(X_test)
print(classification_report(y_test, y_pred))

# Winning hidden-layer size.
print(cv.best_params_)

Train accuracy: 0.813962458215
Test accuracy: 0.817159142043
             precision    recall  f1-score   support

          0       0.83      0.95      0.89      5015
          1       0.72      0.42      0.53      1652

avg / total       0.81      0.82      0.80      6667

{'hidden_layer_sizes': (5,)}


In [17]:
# New parameters: a finer search over small layer widths, now with
# 10-fold cross-validation.
params = {'hidden_layer_sizes': [(width,) for width in (3, 5, 7, 9)]}

cv = GridSearchCV(estimator=MLPClassifier(random_state=rs), param_grid=params, cv=10, n_jobs=-1)
cv.fit(X_train, y_train)

# Accuracy of the best refitted model on each split.
for label, features, target in (("Train accuracy:", X_train, y_train),
                                ("Test accuracy:", X_test, y_test)):
    print(label, cv.score(features, target))

y_pred = cv.predict(X_test)
print(classification_report(y_test, y_pred))

print(cv.best_params_)

Train accuracy: 0.814412445359
Test accuracy: 0.818959052047
             precision    recall  f1-score   support

          0       0.84      0.94      0.89      5015
          1       0.72      0.44      0.55      1652

avg / total       0.81      0.82      0.80      6667

{'hidden_layer_sizes': (3,)}


In [None]:
# Single-candidate grid: a 3-neuron hidden layer with alpha = 0.0001,
# evaluated with 10-fold cross-validation.
params = {
    'hidden_layer_sizes': [(3,)],
    'alpha': [0.0001],
}

search_estimator = MLPClassifier(random_state=rs)
cv = GridSearchCV(param_grid=params, estimator=search_estimator, cv=10, n_jobs=-1)
cv.fit(X_train, y_train)

train_accuracy = cv.score(X_train, y_train)
test_accuracy = cv.score(X_test, y_test)
print("Train accuracy:", train_accuracy)
print("Test accuracy:", test_accuracy)

# Per-class metrics on the held-out split.
y_pred = cv.predict(X_test)
print(classification_report(y_test, y_pred))

print(cv.best_params_)

In [19]:
columns_to_transform = ['AGE', 'AFFL', 'GENDER_F', 'GENDER_M',
                        'GENDER_U', 'REGION_North']

# Work on a copy so the untransformed frame stays available.
df_log = df.copy()

# Apply log(x + 1) to each listed column as one vectorised operation per
# column. The explicit float64 cast keeps the result in double precision even
# for the small-integer indicator columns.
# NOTE(review): several listed columns are 0/1 indicators; after the
# standardisation below a log transform should leave a two-valued column
# unchanged - confirm they are meant to be in this list.
for col in columns_to_transform:
    df_log[col] = np.log(df_log[col].astype('float64') + 1)

# Create X, y and the train/test partitions (same 70/30 stratified split and seed).
y_log = df_log['ORGYN']
X_log = df_log.drop(['ORGYN'], axis=1)
# DataFrame.as_matrix() was removed in pandas 1.0; .values is the equivalent
# that works on every pandas version.
X_mat_log = X_log.values
X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(X_mat_log, y_log, test_size=0.3, stratify=y_log,
                                                                    random_state=rs)

# Standardise again: fit on the training split, reuse the transform on the test split.
scaler_log = StandardScaler()
X_train_log = scaler_log.fit_transform(X_train_log, y_train_log)
X_test_log = scaler_log.transform(X_test_log)

In [20]:
# Joint search over hidden-layer width and alpha on the log-transformed data.
params = {
    'hidden_layer_sizes': [(3,), (5,), (7,), (9,)],
    'alpha': [0.01, 0.001, 0.0001, 0.00001],
}

cv = GridSearchCV(estimator=MLPClassifier(random_state=rs), param_grid=params, cv=10, n_jobs=-1)
cv.fit(X_train_log, y_train_log)

# Accuracy of the best refitted model on each split.
train_accuracy = cv.score(X_train_log, y_train_log)
test_accuracy = cv.score(X_test_log, y_test_log)
print("Train accuracy:", train_accuracy)
print("Test accuracy:", test_accuracy)

y_pred = cv.predict(X_test_log)
print(classification_report(y_test_log, y_pred))

print(cv.best_params_)


Train accuracy: 0.813641038827
Test accuracy: 0.815359232038
             precision    recall  f1-score   support

          0       0.83      0.94      0.88      5015
          1       0.72      0.42      0.53      1652

avg / total       0.80      0.82      0.80      6667

{'alpha': 0.0001, 'hidden_layer_sizes': (7,)}


In [21]:
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression

# Recursive feature elimination with 10-fold cross-validation, using a
# logistic regression as the underlying estimator.
elimination_estimator = LogisticRegression(random_state=rs)
rfe = RFECV(estimator=elimination_estimator, cv=10)
rfe.fit(X_train_log, y_train_log)

# Number of features the search decided to keep.
print(rfe.n_features_)

7


In [22]:
# Reduce both log-transformed splits to the columns RFECV kept.
X_train_rfe, X_test_rfe = rfe.transform(X_train_log), rfe.transform(X_test_log)

# step = int((X_train_rfe.shape[1] + 5)/5);
params = {
    'hidden_layer_sizes': [(3,), (5,), (7,), (9,)],
    'alpha': [0.01, 0.001, 0.0001, 0.00001],
}

cv = GridSearchCV(estimator=MLPClassifier(random_state=rs), param_grid=params, cv=10, n_jobs=-1)
cv.fit(X_train_rfe, y_train_log)

# Accuracy of the best refitted model on the reduced feature set.
for label, features, target in (("Train accuracy:", X_train_rfe, y_train_log),
                                ("Test accuracy:", X_test_rfe, y_test_log)):
    print(label, cv.score(features, target))

y_pred = cv.predict(X_test_rfe)
print(classification_report(y_test_log, y_pred))

print(cv.best_params_)

Train accuracy: 0.812612496786
Test accuracy: 0.819559022049
             precision    recall  f1-score   support

          0       0.83      0.95      0.89      5015
          1       0.73      0.43      0.54      1652

avg / total       0.81      0.82      0.80      6667

{'alpha': 0.001, 'hidden_layer_sizes': (9,)}


In [23]:
from sklearn.tree import DecisionTreeClassifier

# Tune a decision tree on the log-transformed training data with 10-fold
# cross-validation.
params = dict(
    criterion=['gini', 'entropy'],
    # NOTE(review): this tuple tries only the depths 2 and 7. If every depth
    # from 2 up to 6 was intended, it should be range(2, 7) - confirm.
    max_depth=(2, 7),
    min_samples_leaf=range(10, 60, 10),
)

tree_estimator = DecisionTreeClassifier(random_state=rs)
cv = GridSearchCV(param_grid=params, estimator=tree_estimator, cv=10)
cv.fit(X_train_log, y_train_log)

GridSearchCV(cv=10, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=10,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'criterion': ['gini', 'entropy'], 'max_depth': (2, 7), 'min_samples_leaf': range(10, 60, 10)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [24]:
# Pass the best tree found by the grid search, together with the feature
# names, to the project helper that reports feature importances.
tools.analyse_feature_importance(cv.best_estimator_, X_log.columns)

AGE : 0.472357701337
AFFL : 0.307108179854
GENDER_F : 0.154030659391
BILL : 0.0139631481726
GENDER_U : 0.0124680351293
LTIME : 0.0118353590165
GENDER_M : 0.00641199854201
NGROUP_D : 0.00321957784075
CLASS_Gold : 0.00274400229255
NGROUP_C : 0.00226068376193
TV_REG_London : 0.00192915166357
REGION_Scottish : 0.00176643391756
TV_REG_N East : 0.00141801614302
TV_REG_S & S East : 0.00137564171039
NGROUP_B : 0.00136906635238
NGROUP_E : 0.00134709844487
REGION_South East : 0.00120588379591
TV_REG_S West : 0.000903471373456
CLASS_Silver : 0.000880473869656
REGION_Midlands : 0.000736633602924


In [25]:
from sklearn.feature_selection import SelectFromModel

# Use the best decision tree found by GridSearchCV to select features.
# prefit=True tells SelectFromModel that the estimator is already trained,
# so it is used as-is rather than being fitted again.
selectmodel = SelectFromModel(cv.best_estimator_, prefit=True)
X_train_sel_model, X_test_sel_model = (
    selectmodel.transform(split) for split in (X_train_log, X_test_log)
)

# Rows and number of retained features.
print(X_train_sel_model.shape)

(15556, 3)


In [1]:
# Candidate regularisation strengths: one per power of ten from 10^-6 to 10^3.
params = {'C': [10 ** exponent for exponent in range(-6, 4)]}

# 10-fold grid search for a logistic regression on the tree-selected features.
cv = GridSearchCV(estimator=LogisticRegression(random_state=rs), param_grid=params, cv=10, n_jobs=-1)
cv.fit(X_train_sel_model, y_train_log)

train_accuracy = cv.score(X_train_sel_model, y_train_log)
test_accuracy = cv.score(X_test_sel_model, y_test_log)
print("Train accuracy:", train_accuracy)
print("Test accuracy:", test_accuracy)

# Evaluate the best model on the held-out split.
y_pred = cv.predict(X_test_sel_model)
print(classification_report(y_test_log, y_pred))

# Parameters of the best model.
print(cv.best_params_)

NameError: name 'GridSearchCV' is not defined