In [1]:
import os
import sys
import re

src_dir = os.path.join(os.getcwd(), '..', 'src')
sys.path.append(src_dir)
data_dir = os.path.join(os.getcwd(), '..', 'data')

# import libraries here; add more as necessary
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

import matplotlib.pyplot as plt

from import_module import import_raw_data
import etl
import pipe

import joblib


# magic word for producing visualizations in notebook
%matplotlib inline

# Supervised Learning Model
Now that you've found which parts of the population are more likely to be customers of the mail-order company, it's time to build a prediction model. Each of the rows in the "MAILOUT" data files represents an individual that was targeted for a mailout campaign. Ideally, we should be able to use the demographic information from each individual to decide whether or not it will be worth it to include that person in the campaign.
The "MAILOUT" data has been split into two approximately equal parts, each with almost 43 000 data rows. In this part, you can verify your model with the "TRAIN" partition, which includes a column, "RESPONSE", that states whether or not a person became a customer of the company following the campaign. In the next part, you'll need to create predictions on the "TEST" partition, where the "RESPONSE" column has been withheld.

Import training data

In [2]:
mailout_train = import_raw_data('Udacity_MAILOUT_052018_TRAIN.csv',data_dir)
mailout_train.head()
mailout_train = etl.clean_data(mailout_train,data_dir,True)

The data dictionary is imported.
The data dictionary is used to map the missing values.
Attribute PRAEGENDE_JUGENDJAHRE_MAINSTREAM is not available in DataFrame.
Attribute PRAEGENDE_JUGENDJAHRE_YEARS is not available in DataFrame.
Attribute CAMEO_INTL_Economic is not available in DataFrame.
Attribute CAMEO_INTL_Family is not available in DataFrame.
The functions engineer_cameo_intl and engineer_praegende_jj are used to engineer additional variables
The following attributes are dropped because they are not in the data dictionary: []
The following attributes are dropped because they have too many missings or too many levels: ['LNR' 'AGER_TYP' 'ALTER_HH' 'ALTER_KIND1' 'ALTER_KIND2' 'ALTER_KIND3'
 'ALTER_KIND4' 'ALTERSKATEGORIE_FEIN' 'CAMEO_DEU_2015' 'CAMEO_INTL_2015'
 'D19_BANKEN_ANZ_12' 'D19_BANKEN_ANZ_24' 'D19_BANKEN_DATUM'
 'D19_BANKEN_OFFLINE_DATUM' 'D19_BANKEN_ONLINE_DATUM'
 'D19_BANKEN_ONLINE_QUOTE_12' 'D19_GESAMT_ANZ_12' 'D19_GESAMT_ANZ_24'
 'D19_GESAMT_DATUM' 'D19_GESAMT_OFFLINE_D

Create transformation pipeline with data dictionary 

In [4]:
data_dict_path = os.path.join(data_dir,'01_preprocessed/','data_dictionary_full.xlsx')
data_dict = pd.read_excel(data_dict_path, index_col=0)
col_trans = pipe.define_pipeline(data_dict)

In [5]:
X_train = mailout_train.drop(columns= 'RESPONSE',axis=1)
y_train = mailout_train["RESPONSE"].values

create pipeline for model fitting and parameter grid

In [6]:
model_pipe = Pipeline([
        ('transform', col_trans),
        ('clf', GradientBoostingClassifier(random_state=42))
        ])

In [None]:
params = [{'clf__max_depth': [3],
           'clf__learning_rate': [0.1],
           'clf__min_samples_split': [4],
           'clf__n_estimators': [50]}]  

In [8]:
params = [{'clf__max_depth': [3,5],
           'clf__learning_rate': [0.1,0.25],
           #'clf__min_samples_split': [2,4],
           'clf__n_estimators': [100]}]   

In [9]:
gridsearch = GridSearchCV(model_pipe,params,scoring='roc_auc',verbose=3,cv=3)
gridsearch.fit(X_train,y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV 1/3] END clf__learning_rate=0.1, clf__max_depth=3, clf__n_estimators=100;, score=0.663 total time= 1.7min
[CV 2/3] END clf__learning_rate=0.1, clf__max_depth=3, clf__n_estimators=100;, score=0.704 total time= 1.8min
[CV 3/3] END clf__learning_rate=0.1, clf__max_depth=3, clf__n_estimators=100;, score=0.690 total time= 2.0min
[CV 1/3] END clf__learning_rate=0.1, clf__max_depth=5, clf__n_estimators=100;, score=0.643 total time= 2.7min
[CV 2/3] END clf__learning_rate=0.1, clf__max_depth=5, clf__n_estimators=100;, score=0.673 total time= 2.7min
[CV 3/3] END clf__learning_rate=0.1, clf__max_depth=5, clf__n_estimators=100;, score=0.663 total time= 3.3min
[CV 1/3] END clf__learning_rate=0.25, clf__max_depth=3, clf__n_estimators=100;, score=0.635 total time= 2.0min
[CV 2/3] END clf__learning_rate=0.25, clf__max_depth=3, clf__n_estimators=100;, score=0.672 total time= 1.7min
[CV 3/3] END clf__learning_rate=0.25, clf__max_depth=3, cl

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('transform',
                                        ColumnTransformer(transformers=[('categorical',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer(fill_value=99,
                                                                                                        strategy='constant')),
                                                                                         ('one_hot',
                                                                                          OneHotEncoder(handle_unknown='ignore'))]),
                                                                         array(['AKT_DAT_KL', 'CAMEO_DEUG_2015', 'CJT_GESAMTTYP',
       'CJT_KATALOGNUTZER', 'CJT_TYP_1', 'CJT_TYP_2', 'CJT_TYP_3',
       'CJT_T...
      dtype=object)),
                       

In [18]:
params = [{'clf__max_depth': [2],
           'clf__learning_rate': [0.025],
           'clf__min_samples_split': [4],
           'clf__n_estimators': [30,50]}]  

In [19]:
gridsearch2 = GridSearchCV(model_pipe,params,scoring='roc_auc',verbose=3,cv=3)
gridsearch2.fit(X_train,y_train)

Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV 1/3] END clf__learning_rate=0.025, clf__max_depth=2, clf__min_samples_split=4, clf__n_estimators=30;, score=0.666 total time=  24.7s
[CV 2/3] END clf__learning_rate=0.025, clf__max_depth=2, clf__min_samples_split=4, clf__n_estimators=30;, score=0.704 total time=  23.4s
[CV 3/3] END clf__learning_rate=0.025, clf__max_depth=2, clf__min_samples_split=4, clf__n_estimators=30;, score=0.698 total time=  27.6s
[CV 1/3] END clf__learning_rate=0.025, clf__max_depth=2, clf__min_samples_split=4, clf__n_estimators=50;, score=0.675 total time=  44.6s
[CV 2/3] END clf__learning_rate=0.025, clf__max_depth=2, clf__min_samples_split=4, clf__n_estimators=50;, score=0.710 total time=  41.9s
[CV 3/3] END clf__learning_rate=0.025, clf__max_depth=2, clf__min_samples_split=4, clf__n_estimators=50;, score=0.706 total time=  43.0s


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('transform',
                                        ColumnTransformer(transformers=[('categorical',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer(fill_value=99,
                                                                                                        strategy='constant')),
                                                                                         ('one_hot',
                                                                                          OneHotEncoder(handle_unknown='ignore'))]),
                                                                         array(['AKT_DAT_KL', 'CAMEO_DEUG_2015', 'CJT_GESAMTTYP',
       'CJT_KATALOGNUTZER', 'CJT_TYP_1', 'CJT_TYP_2', 'CJT_TYP_3',
       'CJT_T...
                                             

check results and save gridsearch object

In [10]:
gridsearch.cv_results_

{'mean_fit_time': array([110.75413338, 171.58127356, 106.67617679, 172.57464838]),
 'std_fit_time': array([ 5.87182777, 17.49297435,  9.95087879,  3.60786238]),
 'mean_score_time': array([0.41880171, 0.42896676, 0.39547229, 0.42808731]),
 'std_score_time': array([0.03402533, 0.00293283, 0.00592868, 0.02276105]),
 'param_clf__learning_rate': masked_array(data=[0.1, 0.1, 0.25, 0.25],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_clf__max_depth': masked_array(data=[3, 5, 3, 5],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_clf__n_estimators': masked_array(data=[100, 100, 100, 100],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'clf__learning_rate': 0.1,
   'clf__max_depth': 3,
   'clf__n_estimators': 100},
  {'clf__learning_rate': 0.1, 'clf__max_depth': 5, 'clf__n_estimators': 100},
  {'clf__learni

In [None]:
gridsearch_path = os.path.join(data_dir,'02_models/','gridsearch.pkl')
joblib.dump(gridsearch, gridsearch_path) 

# Kaggle Competition
Now that you've created a model to predict which individuals are most likely to respond to a mailout campaign, it's time to test that model in competition through Kaggle. If you click on the link here, you'll be taken to the competition page where, if you have a Kaggle account, you can enter.
Your entry to the competition should be a CSV file with two columns. The first column should be a copy of "LNR", which acts as an ID number for each individual in the "TEST" partition. The second column, "RESPONSE", should be some measure of how likely each individual became a customer – this might not be a straightforward probability. As you should have found in Part 2, there is a large output class imbalance, where most individuals did not respond to the mailout. Thus, predicting individual classes and using accuracy does not seem to be an appropriate performance evaluation method. Instead, the competition will be using AUC to evaluate performance. The exact values of the "RESPONSE" column do not matter as much: only that the higher values try to capture as many of the actual customers as possible, early in the ROC curve sweep.

Load previously done object with fitted pipeline

In [None]:
gridsearch = joblib.load(gridsearch_path) 

In [11]:
mailout_test = import_raw_data('Udacity_MAILOUT_052018_TEST.csv',data_dir)
mailout_test_cleaned = etl.clean_data(mailout_test,data_dir)
mailout_test_preds = gridsearch.predict_proba(mailout_test_cleaned)[:,1]

The data dictionary is imported.
The data dictionary is used to map the missing values.
Attribute PRAEGENDE_JUGENDJAHRE_MAINSTREAM is not available in DataFrame.
Attribute PRAEGENDE_JUGENDJAHRE_YEARS is not available in DataFrame.
Attribute CAMEO_INTL_Economic is not available in DataFrame.
Attribute CAMEO_INTL_Family is not available in DataFrame.
The functions engineer_cameo_intl and engineer_praegende_jj are used to engineer additional variables
The following attributes are dropped because they are not in the data dictionary: []
The following attributes are dropped because they have too many missings or too many levels: ['LNR' 'AGER_TYP' 'ALTER_HH' 'ALTER_KIND1' 'ALTER_KIND2' 'ALTER_KIND3'
 'ALTER_KIND4' 'ALTERSKATEGORIE_FEIN' 'CAMEO_DEU_2015' 'CAMEO_INTL_2015'
 'D19_BANKEN_ANZ_12' 'D19_BANKEN_ANZ_24' 'D19_BANKEN_DATUM'
 'D19_BANKEN_OFFLINE_DATUM' 'D19_BANKEN_ONLINE_DATUM'
 'D19_BANKEN_ONLINE_QUOTE_12' 'D19_GESAMT_ANZ_12' 'D19_GESAMT_ANZ_24'
 'D19_GESAMT_DATUM' 'D19_GESAMT_OFFLINE_D

Put all together in dataframe which can be saved afterwards

In [12]:
test_result_df = pd.DataFrame({
  'LNR': mailout_test['LNR'],
  'RESPONSE': mailout_test_preds
}
)

Save in folder models

In [13]:
test_result_path = os.path.join(data_dir,'02_models/','kaggle_output_test.csv')
test_result_df.to_csv(test_result_path,index=False)

