In [1]:
import pandas as pd
import pickle
from joblib import dump

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline 
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_validate, cross_val_predict, GridSearchCV

In [3]:
from sklearn.metrics import log_loss, make_scorer, precision_score, recall_score, accuracy_score, f1_score
from joblib import dump, load
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [4]:
df = (pd.read_csv("inputs\\WA_Fn-UseC_-Marketing-Customer-Value-Analysis.csv").set_index("Customer"))

In [5]:
df.sample(3)

Unnamed: 0_level_0,State,Customer Lifetime Value,Response,Coverage,Education,Effective To Date,EmploymentStatus,Gender,Income,Location Code,...,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Policy Type,Policy,Renew Offer Type,Sales Channel,Total Claim Amount,Vehicle Class,Vehicle Size
Customer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MX48796,California,4873.672682,No,Basic,Bachelor,2/14/11,Unemployed,F,0,Suburban,...,5,0,5,Corporate Auto,Corporate L3,Offer1,Agent,356.08323,Four-Door Car,Medsize
JP55271,Oregon,10633.50496,No,Extended,High School or Below,2/25/11,Employed,F,80046,Rural,...,42,0,4,Personal Auto,Personal L2,Offer2,Call Center,260.627146,SUV,Small
DR93103,Arizona,17556.99121,No,Basic,Master,2/26/11,Employed,F,36212,Rural,...,57,0,2,Personal Auto,Personal L2,Offer1,Agent,119.033795,Two-Door Car,Medsize


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9134 entries, BU79786 to Y167826
Data columns (total 23 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   State                          9134 non-null   object 
 1   Customer Lifetime Value        9134 non-null   float64
 2   Response                       9134 non-null   object 
 3   Coverage                       9134 non-null   object 
 4   Education                      9134 non-null   object 
 5   Effective To Date              9134 non-null   object 
 6   EmploymentStatus               9134 non-null   object 
 7   Gender                         9134 non-null   object 
 8   Income                         9134 non-null   int64  
 9   Location Code                  9134 non-null   object 
 10  Marital Status                 9134 non-null   object 
 11  Monthly Premium Auto           9134 non-null   int64  
 12  Months Since Last Claim        9134 non-null

In [7]:
X = df.drop(['Response'], axis = 1)
y = df.Response.apply(lambda X : 0 if X == 'No' else 1)

In [8]:
# categorical features
cats = [var for var, var_type in X.dtypes.items() if var_type=='object']
# numerical features
nums = [var for var in X.columns if var not in cats]

In [9]:
cats

['State',
 'Coverage',
 'Education',
 'Effective To Date',
 'EmploymentStatus',
 'Gender',
 'Location Code',
 'Marital Status',
 'Policy Type',
 'Policy',
 'Renew Offer Type',
 'Sales Channel',
 'Vehicle Class',
 'Vehicle Size']

In [10]:
nums

['Customer Lifetime Value',
 'Income',
 'Monthly Premium Auto',
 'Months Since Last Claim',
 'Months Since Policy Inception',
 'Number of Open Complaints',
 'Number of Policies',
 'Total Claim Amount']

In [11]:
# #Custom Transformer that extracts columns passed as argument to its constructor 
# class FeatureSelector(BaseEstimator, TransformerMixin ):
#     #Class Constructor 
#     def __init__( self, feature_names):
#         self._feature_names = feature_names 
        
#     #Return self nothing else to do here    
#     def fit( self, X, y = None ):
#         return self 
    
#     #Method that describes what we need this transformer to do
#     def transform( self, X, y = None ):
#         return X[self._feature_names].values 

In [12]:
#Defining the steps in the categorical pipeline 
cat_pipeline = Pipeline([("imputer", SimpleImputer(strategy="most_frequent")),
                         ('one_hot_encoder', OneHotEncoder(sparse=False))])
    
#Defining the steps in the numerical pipeline     
num_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

#Combining numerical and categorical piepline into one full big pipeline horizontally 
#using FeatureUnion
full_pipeline = ColumnTransformer(transformers = [('num_pipeline', num_pipeline, nums),
                                                  ('cat_pipeline', cat_pipeline, cats)])

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
#fit and transform the custom transformer in train
X_train_processed = full_pipeline.fit_transform(X_train)
# transform the test with the trained tansformer
X_test_processed = full_pipeline.transform(X_test)

In [14]:
names = [
    "Nearest Neighbors",
    "XGBoost",
    "Random Forest"
]

In [15]:
classifiers = [
    KNeighborsClassifier(3),
    GradientBoostingClassifier(),
    RandomForestClassifier()
]

In [16]:
scores = {}

# iterate over classifiers
for name, clf in zip(names, classifiers):
    # Cross val prediction    
    cv_preds = cross_val_predict(clf, X_train_processed, y_train, method='predict_proba')
    cv_score = log_loss(y_train, cv_preds)
    
    # holdout data 
    clf.fit(X_train_processed, y_train)
    hd_preds = clf.predict_proba(X_test_processed)
    hd_score = log_loss(y_test, hd_preds)
    
    # append the scores
    scores[name] = [cv_score, hd_score]
    #store the model

In [17]:
scores

{'Nearest Neighbors': [0.6210403921626958, 0.6183440196177838],
 'XGBoost': [0.26516181474693584, 0.27046517105749024],
 'Random Forest': [0.09814905312189907, 0.07476286350133843]}

In [18]:
for model, perf in scores.items():
    print(f"{model} => CV_AUC :  {perf[0]}, Holdout_AUC:{perf[1]}")

Nearest Neighbors => CV_AUC :  0.6210403921626958, Holdout_AUC:0.6183440196177838
XGBoost => CV_AUC :  0.26516181474693584, Holdout_AUC:0.27046517105749024
Random Forest => CV_AUC :  0.09814905312189907, Holdout_AUC:0.07476286350133843


In [19]:
param_grid = [
    {
        'n_estimators': [100, 200],
        'n_jobs': [-1]
    },
    {
        'n_estimators': [50, 100, 200],
        'max_features': ['log2'],
        'n_jobs': [-1]
    },
    {
        'bootstrap': [False],
        'n_estimators': [150, 300],
        'max_features': [2, 4],
        'n_jobs': [-1]
    },
]

In [20]:
scorer = make_scorer(log_loss)

In [21]:
RF = RandomForestClassifier()

In [22]:
grid_search = GridSearchCV(
    RF,
    param_grid,
    cv=10,
    scoring=scorer,
    return_train_score=True
)

In [None]:
%%timeit

grid_search.fit(X_train_processed, y_train)

In [None]:
grid_search.best_estimator_

In [None]:
sk_best = grid_search.best_estimator_

In [None]:
full_pipeline

In [None]:
### Store the one hot encodings (ohe)
cat_step = full_pipeline.get_params()['transformer_list'][-1][-1]
ohe = cat_step.steps[-1][-1]