https://towardsdatascience.com/full-machine-learning-pipeline-from-data-processing-to-model-deployment-4b501740922d

https://www.ahmedbesbes.com/blog/end-to-end-machine-learning

In [1]:
import pandas as pd
import pickle
from joblib import dump

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline 
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_validate, cross_val_predict, GridSearchCV

In [3]:
from sklearn.metrics import log_loss, make_scorer, precision_score, recall_score, accuracy_score, f1_score
from joblib import dump, load
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [4]:
df = (pd.read_csv("data\\Customer-Value-Analysis.csv").set_index("Customer"))

In [5]:
df.sample(3)

Unnamed: 0_level_0,State,Customer Lifetime Value,Response,Coverage,Education,Effective To Date,EmploymentStatus,Gender,Income,Location Code,...,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Policy Type,Policy,Renew Offer Type,Sales Channel,Total Claim Amount,Vehicle Class,Vehicle Size
Customer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BX24569,California,3567.503755,Yes,Basic,Bachelor,1/7/11,Unemployed,M,0,Suburban,...,11,0,1,Personal Auto,Personal L2,Offer2,Agent,660.430456,SUV,Medsize
FZ24763,California,5078.451231,No,Basic,Bachelor,1/4/11,Employed,M,35289,Rural,...,94,0,3,Personal Auto,Personal L1,Offer4,Web,123.945627,Four-Door Car,Medsize
EZ78112,California,5610.96434,No,Basic,High School or Below,2/24/11,Employed,F,77493,Urban,...,35,0,3,Special Auto,Special L3,Offer1,Agent,307.963291,Four-Door Car,Medsize


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9134 entries, BU79786 to Y167826
Data columns (total 23 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   State                          9134 non-null   object 
 1   Customer Lifetime Value        9134 non-null   float64
 2   Response                       9134 non-null   object 
 3   Coverage                       9134 non-null   object 
 4   Education                      9134 non-null   object 
 5   Effective To Date              9134 non-null   object 
 6   EmploymentStatus               9134 non-null   object 
 7   Gender                         9134 non-null   object 
 8   Income                         9134 non-null   int64  
 9   Location Code                  9134 non-null   object 
 10  Marital Status                 9134 non-null   object 
 11  Monthly Premium Auto           9134 non-null   int64  
 12  Months Since Last Claim        9134 non-null

In [7]:
X = df.drop(['Response'], axis = 1)
y = df.Response.apply(lambda X : 0 if X == 'No' else 1)

In [8]:
# categorical features
cats = [var for var, var_type in X.dtypes.items() if var_type=='object']
# numerical features
nums = [var for var in X.columns if var not in cats]

In [9]:
cats

['State',
 'Coverage',
 'Education',
 'Effective To Date',
 'EmploymentStatus',
 'Gender',
 'Location Code',
 'Marital Status',
 'Policy Type',
 'Policy',
 'Renew Offer Type',
 'Sales Channel',
 'Vehicle Class',
 'Vehicle Size']

In [10]:
nums

['Customer Lifetime Value',
 'Income',
 'Monthly Premium Auto',
 'Months Since Last Claim',
 'Months Since Policy Inception',
 'Number of Open Complaints',
 'Number of Policies',
 'Total Claim Amount']

In [11]:
# #Custom Transformer that extracts columns passed as argument to its constructor 
# class FeatureSelector(BaseEstimator, TransformerMixin ):
#     #Class Constructor 
#     def __init__( self, feature_names):
#         self._feature_names = feature_names 
        
#     #Return self nothing else to do here    
#     def fit( self, X, y = None ):
#         return self 
    
#     #Method that describes what we need this transformer to do
#     def transform( self, X, y = None ):
#         return X[self._feature_names].values 

In [12]:
#Defining the steps in the categorical pipeline 
cat_pipeline = Pipeline([("imputer", SimpleImputer(strategy="most_frequent")),
                         ('one_hot_encoder', OneHotEncoder(sparse=False))])
    
#Defining the steps in the numerical pipeline     
num_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

#Combining numerical and categorical piepline into one full big pipeline horizontally 
#using FeatureUnion
full_pipeline = ColumnTransformer(transformers = [('num_pipeline', num_pipeline, nums),
                                                  ('cat_pipeline', cat_pipeline, cats)])

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
#fit and transform the custom transformer in train
X_train_processed = full_pipeline.fit_transform(X_train)
# transform the test with the trained tansformer
X_test_processed = full_pipeline.transform(X_test)

In [14]:
pd.DataFrame(X_train_processed).info(verbose=1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7307 entries, 0 to 7306
Data columns (total 122 columns):
 #    Column  Dtype  
---   ------  -----  
 0    0       float64
 1    1       float64
 2    2       float64
 3    3       float64
 4    4       float64
 5    5       float64
 6    6       float64
 7    7       float64
 8    8       float64
 9    9       float64
 10   10      float64
 11   11      float64
 12   12      float64
 13   13      float64
 14   14      float64
 15   15      float64
 16   16      float64
 17   17      float64
 18   18      float64
 19   19      float64
 20   20      float64
 21   21      float64
 22   22      float64
 23   23      float64
 24   24      float64
 25   25      float64
 26   26      float64
 27   27      float64
 28   28      float64
 29   29      float64
 30   30      float64
 31   31      float64
 32   32      float64
 33   33      float64
 34   34      float64
 35   35      float64
 36   36      float64
 37   37      float64
 38   38    

In [15]:
pd.DataFrame(X_train_processed).sample(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,112,113,114,115,116,117,118,119,120,121
1618,0.173276,-1.242714,-0.391116,-0.503057,-1.320996,-0.41699,-0.411632,0.463909,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1594,3.747575,-0.14335,0.358659,0.288744,1.681386,-0.41699,-0.411632,0.244937,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
6425,0.003252,1.121995,-0.766003,1.377471,-0.320202,-0.41699,-0.411632,-0.52363,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [16]:
full_pipeline.get_params()

{'n_jobs': None,
 'remainder': 'drop',
 'sparse_threshold': 0.3,
 'transformer_weights': None,
 'transformers': [('num_pipeline',
   Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                   ('std_scaler', StandardScaler())]),
   ['Customer Lifetime Value',
    'Income',
    'Monthly Premium Auto',
    'Months Since Last Claim',
    'Months Since Policy Inception',
    'Number of Open Complaints',
    'Number of Policies',
    'Total Claim Amount']),
  ('cat_pipeline',
   Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                   ('one_hot_encoder', OneHotEncoder(sparse=False))]),
   ['State',
    'Coverage',
    'Education',
    'Effective To Date',
    'EmploymentStatus',
    'Gender',
    'Location Code',
    'Marital Status',
    'Policy Type',
    'Policy',
    'Renew Offer Type',
    'Sales Channel',
    'Vehicle Class',
    'Vehicle Size'])],
 'verbose': False,
 'verbose_feature_names_out': True,
 'num_pipeline': Pipeline(ste

In [17]:
cat_step = full_pipeline.get_params()['transformers'][-1][-1]
cat_step

['State',
 'Coverage',
 'Education',
 'Effective To Date',
 'EmploymentStatus',
 'Gender',
 'Location Code',
 'Marital Status',
 'Policy Type',
 'Policy',
 'Renew Offer Type',
 'Sales Channel',
 'Vehicle Class',
 'Vehicle Size']

In [18]:
ohe = full_pipeline.get_params()['cat_pipeline__steps'][-1][-1]
ohe

OneHotEncoder(sparse=False)

In [19]:
ohe_categories = dict(zip(cat_step, [ohe] * len(cat_step)))
ohe_categories

{'State': OneHotEncoder(sparse=False),
 'Coverage': OneHotEncoder(sparse=False),
 'Education': OneHotEncoder(sparse=False),
 'Effective To Date': OneHotEncoder(sparse=False),
 'EmploymentStatus': OneHotEncoder(sparse=False),
 'Gender': OneHotEncoder(sparse=False),
 'Location Code': OneHotEncoder(sparse=False),
 'Marital Status': OneHotEncoder(sparse=False),
 'Policy Type': OneHotEncoder(sparse=False),
 'Policy': OneHotEncoder(sparse=False),
 'Renew Offer Type': OneHotEncoder(sparse=False),
 'Sales Channel': OneHotEncoder(sparse=False),
 'Vehicle Class': OneHotEncoder(sparse=False),
 'Vehicle Size': OneHotEncoder(sparse=False)}

In [20]:
output_path = "model\\ohe_categories.pkl"
with open(output_path, 'wb') as output:
        pickle.dump(ohe_categories, output, pickle.HIGHEST_PROTOCOL)

In [21]:
import numpy as np
import math

# labels_dict : {ind_label: count_label}
# mu : parameter to tune 

def create_class_weight(labels_dict,mu=0.43):
    total = np.sum(list(labels_dict.values()))
    keys = labels_dict.keys()
    class_weight = dict()
    
    for key in keys:
        score = math.log(mu*total/float(labels_dict[key]))
        class_weight[key] = score if score > 1.0 else 1.0
    
    return class_weight

In [22]:
y_train.value_counts()

0    6265
1    1042
Name: Response, dtype: int64

In [23]:
labels_dict = {0: y_train.value_counts()[0], 1: y_train.value_counts()[1]}

In [24]:
class_weight = create_class_weight(labels_dict)

In [25]:
class_weight

{0: 1.0, 1: 1.1037207791833765}

In [26]:
names = [
    "Nearest Neighbors",
    "XGBoost",
    "Random Forest"
]

In [27]:
classifiers = [
    KNeighborsClassifier(3),
    GradientBoostingClassifier(),
    RandomForestClassifier(class_weight=class_weight)
]

In [28]:
scores = {}

# iterate over classifiers
for name, clf in zip(names, classifiers):
    # Cross val prediction    
    cv_preds = cross_val_predict(clf, X_train_processed, y_train, method='predict_proba')
    cv_score = log_loss(y_train, cv_preds)
    
    # holdout data 
    clf.fit(X_train_processed, y_train)
    hd_preds = clf.predict_proba(X_test_processed)
    hd_score = log_loss(y_test, hd_preds)
    
    # append the scores
    scores[name] = [cv_score, hd_score]
    #store the model

In [29]:
scores

{'Nearest Neighbors': [0.6210403921626958, 0.6183440196177838],
 'XGBoost': [0.26508862391305427, 0.27065039179988165],
 'Random Forest': [0.09847746670067138, 0.07681328898726836]}

In [30]:
for model, perf in scores.items():
    print(f"{model} => CV_AUC :  {perf[0]}, Holdout_AUC:{perf[1]}")

Nearest Neighbors => CV_AUC :  0.6210403921626958, Holdout_AUC:0.6183440196177838
XGBoost => CV_AUC :  0.26508862391305427, Holdout_AUC:0.27065039179988165
Random Forest => CV_AUC :  0.09847746670067138, Holdout_AUC:0.07681328898726836


In [31]:
param_grid = [
    {
        'n_estimators': [100, 200],
        'n_jobs': [-1]
    },
    {
        'n_estimators': [50, 100, 200],
        'max_features': ['log2'],
        'n_jobs': [-1]
    },
    {
        'bootstrap': [False],
        'n_estimators': [50, 100, 150, 300],
        'max_features': [2, 3, 5, 7, 11],
        'n_jobs': [-1]
    },
]

In [32]:
scorer = make_scorer(log_loss)

In [33]:
RF = RandomForestClassifier(class_weight=class_weight)

In [34]:
grid_search = GridSearchCV(
    RF,
    param_grid,
    cv=10,
    scoring=scorer,
    return_train_score=True
)

In [35]:
%%timeit

grid_search.fit(X_train_processed, y_train)

3min 43s ± 9.96 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [47]:
grid_search.cv_results_

{'mean_fit_time': array([0.36118982, 0.51271904, 0.25727394, 0.35094037, 0.55161779,
        0.26233773, 0.3924849 , 0.62279427, 1.18965573, 0.42746875,
        0.66012888, 0.93162408, 1.67213814, 0.4616039 , 0.60583453,
        0.87529261, 1.61469913, 0.4766938 , 0.57346642, 0.80620775,
        1.43380997, 0.44590907, 0.48444619, 0.57753236, 0.86647167]),
 'std_fit_time': array([0.03405617, 0.0291258 , 0.02361301, 0.0555283 , 0.04106201,
        0.01433011, 0.02729905, 0.04507722, 0.15791194, 0.03404229,
        0.06787833, 0.06610445, 0.098748  , 0.0239368 , 0.05392378,
        0.07609359, 0.12057662, 0.06336212, 0.03877612, 0.0392023 ,
        0.14975769, 0.07512644, 0.04367939, 0.05527899, 0.03942214]),
 'mean_score_time': array([0.03291688, 0.05405316, 0.02020741, 0.03357511, 0.05909958,
        0.01929958, 0.03250132, 0.04479823, 0.0814081 , 0.02270174,
        0.04218967, 0.04667428, 0.08289676, 0.01977758, 0.03388798,
        0.04449854, 0.08502469, 0.02209685, 0.03224614, 0.04

In [36]:
grid_search.best_score_

0.4301311628094727

In [37]:
grid_search.best_params_

{'bootstrap': False, 'max_features': 2, 'n_estimators': 50, 'n_jobs': -1}

In [38]:
grid_search.n_features_in_

122

In [39]:
# grid_search.cv_results_

In [40]:
sk_best = grid_search.best_estimator_

In [41]:
sk_best

RandomForestClassifier(bootstrap=False,
                       class_weight={0: 1.0, 1: 1.1037207791833765},
                       max_features=2, n_estimators=50, n_jobs=-1)

In [42]:
with open("model\\best.joblib", 'wb') as output:
        pickle.dump(sk_best, output, pickle.HIGHEST_PROTOCOL)

In [43]:
##loading the model from the saved file
with open('model\\best.joblib', 'rb') as input:
    model = pickle.load(input)

In [44]:
# Cross val prediction    
cv_one_preds = cross_val_predict(
    model,
    X_train_processed,
    y_train,
    method='predict'
)

In [45]:
# create perormances dictionary
perf = {'accuracy' : accuracy_score(y_train, cv_one_preds),
       'precision': precision_score(y_train, cv_one_preds),
       'recall' : recall_score(y_train, cv_one_preds),
       'f1_score': f1_score(y_train, cv_one_preds)}

In [46]:
# persist the result
output_path = "model\\sk_best_performances.pkl"
with open(output_path, 'wb') as output:
        pickle.dump(perf, output, pickle.HIGHEST_PROTOCOL)