# **Balanced Model Building**



---
---
## **1. Setting Up**
---
---

In [12]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split

from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.compose import make_column_transformer

from sklearn.preprocessing import FunctionTransformer, RobustScaler, PolynomialFeatures
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, StandardScaler

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from feature_engine.discretisation import EqualFrequencyDiscretiser

from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier, LogisticRegression

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier 

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbPipeline
from sklearn.compose import make_column_transformer
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import IterativeImputer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder, RobustScaler, StandardScaler
from feature_engine.discretisation import EqualFrequencyDiscretiser


import pandas as pd
from joblib import dump, load


import sys
sys.path.insert(0, '../src')

from sklearn import set_config
set_config(display='diagram')

from sklearn import set_config
set_config(transform_output='pandas')

import joblib


%matplotlib inline
sns.set()

In [13]:
train = pd.read_csv('../data/aug_train.csv')
train, valid = train_test_split(train, test_size=0.2, random_state=42)

display(train.head())
train.info()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
20293,380337,Male,30,1,28.0,1,< 1 Year,No,60954.0,152.0,127,0
33179,282196,Male,26,1,6.0,1,< 1 Year,No,24532.0,152.0,216,0
250682,60095,Male,40,1,0.0,0,1-2 Year,Yes,2630.0,47.0,220,0
323143,124730,Male,25,1,8.0,1,< 1 Year,No,44259.0,152.0,223,0
371317,474060,Female,26,1,28.0,1,< 1 Year,No,33615.0,152.0,194,0


<class 'pandas.core.frame.DataFrame'>
Index: 305723 entries, 20293 to 121958
Data columns (total 12 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    305723 non-null  int64  
 1   Gender                305723 non-null  object 
 2   Age                   305723 non-null  int64  
 3   Driving_License       305723 non-null  int64  
 4   Region_Code           305723 non-null  float64
 5   Previously_Insured    305723 non-null  int64  
 6   Vehicle_Age           305723 non-null  object 
 7   Vehicle_Damage        305723 non-null  object 
 8   Annual_Premium        305723 non-null  float64
 9   Policy_Sales_Channel  305723 non-null  float64
 10  Vintage               305723 non-null  int64  
 11  Response              305723 non-null  int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 30.3+ MB


---
---
## **2. Grid Search: Resampling Methods**
---
---

In [18]:
import pipeline_tools as pt
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import make_column_transformer
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier, GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline as imbPipeline
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler, NearMiss


gender_pipeline = make_pipeline(
    OrdinalEncoder(categories=[['Male', 'Female']]),
)
    
vehicle_damage_pipeline = make_pipeline(
    OrdinalEncoder(categories=[['No', 'Yes']]),
)

vehicle_age_pipeline = make_pipeline(
    OrdinalEncoder(categories=[sorted(train['Vehicle_Age'].unique())]),

)

region_code_pipeline = make_pipeline(
    EqualFrequencyDiscretiser(q=4)
)

policy_sales_channel_pipeline = make_pipeline(
    EqualFrequencyDiscretiser(q=4)
)

continuous_pipeline = make_pipeline(
    RobustScaler(),
)

age_pipeline = make_pipeline(
    StandardScaler(),
)

iterative_imputer = IterativeImputer()

column_transformer = make_column_transformer(
    (gender_pipeline, ['Gender']),
    (vehicle_damage_pipeline, ['Vehicle_Damage']),
    (vehicle_age_pipeline, ['Vehicle_Age']),
    (region_code_pipeline, ['Region_Code']),
    (policy_sales_channel_pipeline, ['Policy_Sales_Channel']),
    (continuous_pipeline, ['Annual_Premium']),
    (age_pipeline, ['Age']),
    ('passthrough', ['Driving_License', 'Previously_Insured',]),
)

feature_engineering_pipeline = make_pipeline(column_transformer, 
                                             iterative_imputer,
                                             VarianceThreshold(threshold=0.1)
)

best_knn = KNeighborsClassifier(n_neighbors=7, weights='uniform', p=2)
best_gb = GradientBoostingClassifier(n_estimators=300, learning_rate=0.1)
best_sgd = SGDClassifier(alpha=0.001, learning_rate='optimal', loss='modified_huber', max_iter=1000, penalty='elasticnet')

voting_clf = VotingClassifier(
    estimators=[('GBC', best_gb), ('KNN', best_knn), ('SGD', best_sgd)],
    voting='soft',
    n_jobs=-1,
    verbose=3
)

vc_clf_pipeline = Pipeline(steps=[
    ('preprocessor', feature_engineering_pipeline),
    ('classifier', voting_clf)
])

imb_fe_pipeline = imbPipeline([
    ('column_transformer', column_transformer),
    ('variance_threshold', VarianceThreshold(threshold=0.09))
])

X = imb_fe_pipeline.fit_transform(train.drop(columns='Response'))
y = train['Response']


imb_vc_clf_pipeline = imbPipeline(steps=[
    ('sampler', SMOTE(random_state=42)),  # Placeholder, this will be replaced by GridSearchCV
    ('classifier', voting_clf)
])

param_grid = {
    'sampler': [SMOTE(random_state=42), ADASYN(random_state=42), RandomUnderSampler(random_state=42), NearMiss(version=1)]
}

#rid_search = GridSearchCV(imb_vc_clf_pipeline, param_grid, cv=5, scoring='f1', n_jobs=-1, verbose=3)

#grid_search.fit(X, y)

#print(grid_search.best_params_)



In [20]:

# Resample the training data using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Create the new pipeline
vc_clf_pipeline = Pipeline(steps=[
    ('preprocessor', feature_engineering_pipeline),
    ('classifier', voting_clf)
])

# Fit the pipeline to the resampled training data
vc_clf_pipeline.fit(X_resampled, y_resampled)

ValueError: A given column is not a column of the dataframe

In [5]:
print(grid_search.best_params_)

results = pd.DataFrame(grid_search.cv_results_)
results['sampler_name'] = results['param_sampler'].apply(lambda x: x.__class__.__name__)
results = results.sort_values(by='mean_test_score', ascending=False)

results.to_csv('../data/grid_search_results_imb.csv', index=False)
joblib.dump(grid_search, '../models/grid_search_imb.pkl')

results.head(5)

{'sampler': SMOTE(random_state=42)}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_sampler,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,sampler_name
0,134.616035,3.857972,3.795078,0.864664,SMOTE(random_state=42),{'sampler': SMOTE(random_state=42)},0.56813,0.567526,0.570966,0.566752,0.568807,0.568436,0.001435,1,SMOTE
1,129.92986,4.70344,4.813733,1.140122,ADASYN(random_state=42),{'sampler': ADASYN(random_state=42)},0.561276,0.562377,0.560908,0.559537,0.559216,0.560663,0.00116,2,ADASYN
2,16.929423,1.003869,5.240369,0.622713,RandomUnderSampler(random_state=42),{'sampler': RandomUnderSampler(random_state=42)},0.555271,0.552155,0.557074,0.553605,0.556716,0.554964,0.001863,3,RandomUnderSampler
3,23.751108,0.695694,5.837084,0.409388,NearMiss(),{'sampler': NearMiss()},0.22029,0.210311,0.207894,0.209944,0.209998,0.211687,0.004386,4,NearMiss


In [6]:
valid_transformed = imb_fe_pipeline.transform(valid.drop(columns='Response'))
y_pred_proba = grid_search.best_estimator_.predict_proba(valid_transformed)


In [7]:
from imblearn.metrics import classification_report_imbalanced
tt = y_pred_proba[:, 1] > 0.33
print(classification_report_imbalanced(valid['Response'], tt))


                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.67      0.98      0.80      0.81      0.63     63789
          1       0.37      0.98      0.67      0.54      0.81      0.67     12642

avg / total       0.89      0.72      0.93      0.76      0.81      0.64     76431



---
---
## **3. Tri-Model Voting Classifier: Training on a Resampled Dataset**
---
---

In [10]:
print(grid_search.best_params_)
imb_vc_clf_pipeline = imbPipeline(steps=[
    ('sampler', SMOTE(random_state=42)),  # Placeholder, this will be replaced by GridSearchCV
    ('classifier', voting_clf)
])
imb_vc_clf_pipeline.fit(X, y)


{'sampler': SMOTE(random_state=42)}
[Voting] ...................... (2 of 3) Processing KNN, total=   0.5s
[Voting] ...................... (3 of 3) Processing SGD, total=   0.7s
[Voting] ...................... (1 of 3) Processing GBC, total= 1.3min


In [None]:
prep_valid = prep_pipeline.transform(valid.drop(columns='Response'))
y_pred = imb_vc_clf_pipeline.predict(prep_valid)
print(classification_report_imbalanced(valid['Response'], y_pred))

In [13]:
# Fit the pipeline on the training data
imb_fe_pipeline.fit(train.drop(columns='Response'))

# Transform both training and validation data
X = imb_fe_pipeline.transform(train.drop(columns='Response'))
valid_transformed = imb_fe_pipeline.transform(valid.drop(columns='Response'))

# Fit the model
grid_search.fit(X, train['Response'])

# Make predictions
y_pred = grid_search.predict(valid_transformed)
print(classification_report_imbalanced(valid['Response'], y_pred))

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[Voting] ...................... (2 of 3) Processing KNN, total=   0.1s
[Voting] ...................... (2 of 3) Processing KNN, total=   0.1s
[Voting] ...................... (2 of 3) Processing KNN, total=   0.2s
[Voting] ...................... (2 of 3) Processing KNN, total=   0.2s
[Voting] ...................... (3 of 3) Processing SGD, total=   0.4s
[Voting] ...................... (3 of 3) Processing SGD, total=   0.4s
[Voting] ...................... (2 of 3) Processing KNN, total=   0.1s
[Voting] ...................... (3 of 3) Processing SGD, total=   0.5s
[Voting] ...................... (3 of 3) Processing SGD, total=   0.4s
[Voting] ...................... (3 of 3) Processing SGD, total=   0.5s
[Voting] ...................... (2 of 3) Processing KNN, total=   1.1s
[Voting] ...................... (3 of 3) Processing SGD, total=   1.2s
[Voting] ...................... (2 of 3) Processing KNN, total=   1.0s
[Voting] ........

In [16]:
from joblib import dump
dump(grid_search, '../models/vc_knn_gbc_sgd_pipeline_balanced_trained.joblib')

['../models/vc_knn_gbc_sgd_pipeline_balanced_trained.joblib']

In [17]:
import numpy as np
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc
import plotly.graph_objects as go
import plotly.express as px

def plot_roc_curve(y_test, y_pred, title=''):
    """
    Plot ROC curve.
    
    Compute the ROC curve and ROC area using sklearn's 
    roc_curve and auc functions. Create a plotly figure with the ROC curve
    and a diagonal reference line. Update the layout for titles, axes,
    size. Show or save the figure based on input args.
    """
    fpr, tpr, _ = roc_curve(y_test, y_pred)
    roc_auc = auc(fpr, tpr)

    fig = go.Figure()

    color = px.colors.qualitative.Set3[0]
    
    fig.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name='ROC curve (area = {0:0.2f})'.format(roc_auc), line=dict(color=color)))
    fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='Random', line=dict(color='black', dash='dash')))

    fig.update_layout(title='ROC Curve: '+title, xaxis_title='False Positive Rate', yaxis_title='True Positive Rate', autosize=False, width=600, height=600, margin=dict(l=50, r=50, b=100, t=100, pad=4), plot_bgcolor='whitesmoke')
    fig.update_layout(height=700, width=1000)
    
    fig.show()
    
    
y_pred_proba_class_1 = grid_search.predict_proba(valid_transformed)[:, 1]

plot_roc_curve(valid['Response'], y_pred_proba_class_1, title='Voting Classifier')

In [10]:


gender_pipeline = make_pipeline(OrdinalEncoder(categories=[['Male', 'Female']]))
vehicle_damage_pipeline = make_pipeline(OrdinalEncoder(categories=[['No', 'Yes']]))
vehicle_age_pipeline = make_pipeline(OrdinalEncoder(categories=[sorted(train['Vehicle_Age'].unique())]))
region_code_pipeline = make_pipeline(EqualFrequencyDiscretiser(q=4))
policy_sales_channel_pipeline = make_pipeline(EqualFrequencyDiscretiser(q=4))
continuous_pipeline = make_pipeline(RobustScaler())
age_pipeline = make_pipeline(StandardScaler())

column_transformer = make_column_transformer(
    (gender_pipeline, ['Gender']),
    (vehicle_damage_pipeline, ['Vehicle_Damage']),
    (vehicle_age_pipeline, ['Vehicle_Age']),
    (region_code_pipeline, ['Region_Code']),
    (policy_sales_channel_pipeline, ['Policy_Sales_Channel']),
    (continuous_pipeline, ['Annual_Premium']),
    (age_pipeline, ['Age']),
    ('passthrough', ['Driving_License', 'Previously_Insured']),
)


best_knn = KNeighborsClassifier(n_neighbors=7, weights='uniform', p=2)
best_gb = GradientBoostingClassifier(n_estimators=300, learning_rate=0.1)
best_sgd = SGDClassifier(alpha=0.001, learning_rate='optimal', loss='modified_huber', max_iter=1000, penalty='elasticnet')

voting_clf = VotingClassifier(
    estimators=[('GBC', best_gb), ('KNN', best_knn), ('SGD', best_sgd)],
    voting='soft',
    n_jobs=-1,
    verbose=3
)

# Define the feature engineering pipeline
feature_engineering_pipeline = imbPipeline([
    ('column_transformer', column_transformer),
    ('imputer', IterativeImputer()),
    ('variance_threshold', VarianceThreshold(threshold=0.1))
])

# Combine the feature engineering pipeline and the voting classifier into a final pipeline
vc_clf_pipeline = imbPipeline(steps=[
    ('preprocessor', feature_engineering_pipeline),
    ('sampler', SMOTE(random_state=42)),
    ('classifier', voting_clf)
])

# Fit the pipeline to the training data
X = train.drop(columns='Response')
y = train['Response']
vc_clf_pipeline.fit(X, y)

# Fit the pipeline to the training data
X = train.drop(columns='Response')
y = train['Response']
vc_clf_pipeline.fit(X, y)
X = train.drop(columns='Response')
y = train['Response']
vc_clf_pipeline.fit(X, y)


dump(vc_clf_pipeline, '../models/vc_clf_pipeline_balanced.joblib')

TypeError: All intermediate steps of the chain should not be Pipelines

In [14]:
from sklearn.base import BaseEstimator, ClassifierMixin

class CustomClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, transformer, classifier):
        self.transformer = transformer
        self.classifier = classifier

    def fit(self, X, y):
        self.transformer.fit(X)
        X_transformed = self.transformer.transform(X)
        self.classifier.fit(X_transformed, y)
        return self

    def predict(self, X):
        X_transformed = self.transformer.transform(X)
        return self.classifier.predict(X_transformed)

# Create the custom classifier
custom_clf = CustomClassifier(imb_fe_pipeline, grid_search)

# Fit the classifier to the training data
X = train.drop(columns='Response')
y = train['Response']
custom_clf.fit(X, y)

NameError: name 'imb_fe_pipeline' is not defined

In [17]:
from imblearn.pipeline import Pipeline as imbPipeline

# Create the new pipeline
vc_clf_pipeline = imbPipeline(steps=[
        ('sampler', SMOTE(random_state=42)),
    ('preprocessor', feature_engineering_pipeline),

    ('classifier', voting_clf)
])

# Fit the pipeline to the training data
X = train.drop(columns='Response')
y = train['Response']
vc_clf_pipeline.fit(X, y)

TypeError: All intermediate steps of the chain should not be Pipelines