In [1]:
# Build a frame by calling preprocess() from the casestudy_tools module and
# print its column summary.
# NOTE(review): `df` is reassigned from the raw CSV by the following cell, so
# the frame produced here is not the one the later cells operate on — confirm
# whether this cell is still needed.
import casestudy_tools as tools
df = tools.preprocess()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22223 entries, 0 to 22222
Data columns (total 37 columns):
AGE                    22223 non-null float64
BILL                   22223 non-null float64
ORGYN                  22223 non-null int64
AFFL                   22223 non-null int32
LTIME                  22223 non-null float64
GENDER_F               22223 non-null uint8
GENDER_M               22223 non-null uint8
GENDER_U               22223 non-null uint8
TV_REG_Border          22223 non-null uint8
TV_REG_C Scotland      22223 non-null uint8
TV_REG_East            22223 non-null uint8
TV_REG_London          22223 non-null uint8
TV_REG_Midlands        22223 non-null uint8
TV_REG_N East          22223 non-null uint8
TV_REG_N Scot          22223 non-null uint8
TV_REG_N West          22223 non-null uint8
TV_REG_S & S East      22223 non-null uint8
TV_REG_S West          22223 non-null uint8
TV_REG_Ulster          22223 non-null uint8
TV_REG_Wales & West    22223 non-null uint8
TV_RE

<h1>Data PreProcessing</h1>

In [2]:
import pandas as pd
# Load the raw data set. The path is relative, so it is resolved against the
# kernel's current working directory.
df = pd.read_csv("datasets/organics.csv")
# Uncomment to inspect the inferred column types:
#df.dtypes

In [3]:
# Discard every column that is not carried forward into the cleaning steps,
# then show what is left (names, non-null counts and dtypes).
unused_columns = [
    'CUSTID', 'LCDATE', 'ORGANICS', 'AGEGRP1', 'AGEGRP2',
    'NEIGHBORHOOD',
]
df = df.drop(unused_columns, axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22223 entries, 0 to 22222
Data columns (total 12 columns):
GENDER    19711 non-null object
DOB       22223 non-null object
EDATE     22223 non-null object
AGE       20715 non-null float64
TV_REG    21758 non-null object
NGROUP    21549 non-null object
BILL      22223 non-null float64
REGION    21758 non-null object
CLASS     22223 non-null object
ORGYN     22223 non-null int64
AFFL      21138 non-null float64
LTIME     21942 non-null float64
dtypes: float64(4), int64(1), object(7)
memory usage: 2.0+ MB


In [4]:
# Fill missing AGE values with the number of whole years between DOB and EDATE.
# Rows that already have an AGE keep their recorded value.
from datetime import datetime
import numpy as np
dateformat = '%Y-%m-%d'
# The EDATE of the first row is used as the reference date for every row.
# NOTE(review): this assumes EDATE is identical across rows — confirm against the data.
edate = pd.Timestamp(df['EDATE'][0])
df['DOB'] = pd.to_datetime(df['DOB'], format=dateformat)
# A birth date that is not strictly before the reference date is moved back by
# 100 calendar years. DateOffset is used because a 'Y' timedelta has no fixed
# length and is rejected by current pandas.
df['DOB'] = df['DOB'].where(df['DOB'] < edate, df['DOB'] - pd.DateOffset(years=100))
# Whole years elapsed, using the mean Gregorian year length (365.2425 days);
# the result is float so it fits the existing float AGE column.
age_from_dob = np.floor((edate - df['DOB']).dt.days / 365.2425)
df['AGE'] = df['AGE'].fillna(age_from_dob)

# The date columns are only needed to derive AGE.
df = df.drop(['EDATE', 'DOB'], axis = 1)
#df.info()

In [5]:
# Clamp erroneous values in the AFFL column onto its 1-30 scale.
# Missing values are left as NaN by clip() and handled below.
df['AFFL'] = df['AFFL'].clip(lower=1, upper=30)

# Fill missing AFFL values with the mean of the clamped column. The result is
# assigned back to the frame: calling fillna(inplace=True) on a selected column
# may only modify a temporary copy and leave df unchanged.
df['AFFL'] = df['AFFL'].fillna(df['AFFL'].mean())
# Convert the scale to integer. Note that astype(int) truncates, so the imputed
# mean is rounded towards zero.
df['AFFL'] = df['AFFL'].astype(int)
#df.info()
#df['AFFL'].value_counts(bins=8)
#sorted(df['AFFL'].unique())

In [6]:
# Fill missing LTIME (loyalty time) values with the mean LTIME of rows that
# share the same AGE.
age_mean_ltime = df.groupby('AGE')['LTIME'].transform('mean')
# An age whose rows have no observed LTIME at all yields a NaN group mean, so
# fall back to the overall LTIME mean to avoid leaving gaps for the model.
overall_mean_ltime = df['LTIME'].mean()
df['LTIME'] = df['LTIME'].fillna(age_mean_ltime).fillna(overall_mean_ltime)
# AGE is kept as the leading column so the column order seen by later cells
# stays the same.
df = df[['AGE'] + [col for col in df.columns if col != 'AGE']]


In [7]:
# Give rows with a missing GENDER their own category, 'U'. The filled column is
# assigned back to the frame: fillna(inplace=True) on a selected column may
# only modify a temporary copy and leave df unchanged.
df['GENDER'] = df['GENDER'].fillna('U')
#df['GENDER'].unique()

In [8]:
# One-hot encode the remaining text columns; numeric columns are passed through as they are.
df = pd.get_dummies(data=df)

In [9]:
# Summarise the encoded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22223 entries, 0 to 22222
Data columns (total 37 columns):
AGE                    22223 non-null float64
BILL                   22223 non-null float64
ORGYN                  22223 non-null int64
AFFL                   22223 non-null int32
LTIME                  22223 non-null float64
GENDER_F               22223 non-null uint8
GENDER_M               22223 non-null uint8
GENDER_U               22223 non-null uint8
TV_REG_Border          22223 non-null uint8
TV_REG_C Scotland      22223 non-null uint8
TV_REG_East            22223 non-null uint8
TV_REG_London          22223 non-null uint8
TV_REG_Midlands        22223 non-null uint8
TV_REG_N East          22223 non-null uint8
TV_REG_N Scot          22223 non-null uint8
TV_REG_N West          22223 non-null uint8
TV_REG_S & S East      22223 non-null uint8
TV_REG_S West          22223 non-null uint8
TV_REG_Ulster          22223 non-null uint8
TV_REG_Wales & West    22223 non-null uint8
TV_RE

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

# Seed shared by the split and by every model below.
rs = 10

# ORGYN is the target; every other column is an input feature.
y = df['ORGYN']
X = df.drop(['ORGYN'], axis=1)
# DataFrame.as_matrix() no longer exists in current pandas; .values is the
# equivalent accessor. The cast guarantees a purely numeric float matrix
# whatever dtype the dummy columns have.
X_mat = X.values.astype(float)
# 70/30 split, stratified on the target so both parts keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(X_mat, y, test_size=0.3, stratify=y, random_state=rs)

# Standardise the features. The scaler is fitted on the training part only and
# then applied unchanged to the test part.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [11]:
from sklearn.neural_network import MLPClassifier

In [12]:
# Baseline network: MLPClassifier with library defaults, seeded for repeatability.
model = MLPClassifier(random_state=rs).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Accuracy on both splits, then per-class precision/recall on the test split.
print("Train accuracy:", model.score(X_train, y_train))
print("Test accuracy:", model.score(X_test, y_test))
print(classification_report(y_test, y_pred))

# Echo the estimator so the hyper-parameters in effect are recorded with the results.
print(model)

Train accuracy: 0.825404988429
Test accuracy: 0.812809359532
             precision    recall  f1-score   support

          0       0.84      0.93      0.88      5015
          1       0.69      0.45      0.54      1652

avg / total       0.80      0.81      0.80      6667

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=10, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)


In [13]:
# Same network as the baseline, with the iteration cap lowered to 100.
model = MLPClassifier(max_iter=100, random_state=rs).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Accuracy on both splits, then per-class precision/recall on the test split.
print("Train accuracy:", model.score(X_train, y_train))
print("Test accuracy:", model.score(X_test, y_test))
print(classification_report(y_test, y_pred))

# Echo the estimator so the hyper-parameters in effect are recorded with the results.
print(model)

Train accuracy: 0.825404988429
Test accuracy: 0.812809359532
             precision    recall  f1-score   support

          0       0.84      0.93      0.88      5015
          1       0.69      0.45      0.54      1652

avg / total       0.80      0.81      0.80      6667

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=100, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=10, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)


In [30]:
# Same network as the baseline, with the iteration cap lowered further to 80.
model = MLPClassifier(max_iter=80, random_state=rs).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Accuracy on both splits, then per-class precision/recall on the test split.
print("Train accuracy:", model.score(X_train, y_train))
print("Test accuracy:", model.score(X_test, y_test))
print(classification_report(y_test, y_pred))

# Echo the estimator so the hyper-parameters in effect are recorded with the results.
print(model)

Train accuracy: 0.825404988429
Test accuracy: 0.812809359532
             precision    recall  f1-score   support

          0       0.84      0.93      0.88      5015
          1       0.69      0.45      0.54      1652

avg / total       0.80      0.81      0.80      6667

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=80, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=10, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)


In [14]:
# Show the training matrix dimensions as (rows, feature columns).
print(X_train.shape)

(15556, 36)


In [15]:
# Coarse search over the width of a single hidden layer: 5, 20 and 35 units.
params = {'hidden_layer_sizes': [(5,), (20,), (35,)]}

# 3-fold cross-validated grid search using every available core; fit() returns
# the fitted search object, refitted on the full training split.
cv = GridSearchCV(
    estimator=MLPClassifier(random_state=rs),
    param_grid=params,
    cv=3,
    n_jobs=-1,
).fit(X_train, y_train)
y_pred = cv.predict(X_test)

# Scores and report come from the best estimator found by the search.
print("Train accuracy:", cv.score(X_train, y_train))
print("Test accuracy:", cv.score(X_test, y_test))
print(classification_report(y_test, y_pred))

print(cv.best_params_)

Train accuracy: 0.813962458215
Test accuracy: 0.817159142043
             precision    recall  f1-score   support

          0       0.83      0.95      0.89      5015
          1       0.72      0.42      0.53      1652

avg / total       0.81      0.82      0.80      6667

{'hidden_layer_sizes': (5,)}


In [16]:
# Finer search over small single-layer widths, now with 10-fold cross-validation.
params = {'hidden_layer_sizes': [(width,) for width in (3, 5, 7, 9)]}

cv = GridSearchCV(
    estimator=MLPClassifier(random_state=rs),
    param_grid=params,
    cv=10,
    n_jobs=-1,
).fit(X_train, y_train)
y_pred = cv.predict(X_test)

# Scores and report come from the best estimator found by the search.
print("Train accuracy:", cv.score(X_train, y_train))
print("Test accuracy:", cv.score(X_test, y_test))
print(classification_report(y_test, y_pred))

print(cv.best_params_)

Train accuracy: 0.814412445359
Test accuracy: 0.818959052047
             precision    recall  f1-score   support

          0       0.84      0.94      0.89      5015
          1       0.72      0.44      0.55      1652

avg / total       0.81      0.82      0.80      6667

{'hidden_layer_sizes': (3,)}


In [17]:
# Joint search over the hidden-layer width and the L2 penalty strength (alpha).
params = {
    'hidden_layer_sizes': [(width,) for width in (3, 5, 7, 9)],
    'alpha': [1e-2, 1e-3, 1e-4, 1e-5],
}

cv = GridSearchCV(
    estimator=MLPClassifier(random_state=rs),
    param_grid=params,
    cv=10,
    n_jobs=-1,
).fit(X_train, y_train)
y_pred = cv.predict(X_test)

# Scores and report come from the best estimator found by the search.
print("Train accuracy:", cv.score(X_train, y_train))
print("Test accuracy:", cv.score(X_test, y_test))
print(classification_report(y_test, y_pred))

print(cv.best_params_)

Train accuracy: 0.814412445359
Test accuracy: 0.818959052047
             precision    recall  f1-score   support

          0       0.84      0.94      0.89      5015
          1       0.72      0.44      0.55      1652

avg / total       0.81      0.82      0.80      6667

{'alpha': 0.0001, 'hidden_layer_sizes': (3,)}
