# **Balanced Model Building**



---
---
## **1. Setting Up**
---
---

In [1]:
import sys
sys.path.insert(0, '../src')

import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split

from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline

from sklearn.preprocessing import FunctionTransformer, RobustScaler, PolynomialFeatures
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, StandardScaler

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, VotingClassifier

from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler, NearMiss
from imblearn.pipeline import Pipeline as imbPipeline
from feature_engine.discretisation import EqualFrequencyDiscretiser
from imblearn.metrics import classification_report_imbalanced


from transformers import IntToFloatTransformer



import pandas as pd
from joblib import dump, load




from sklearn import set_config
set_config(display='diagram')

from sklearn import set_config
set_config(transform_output='pandas')

import joblib


%matplotlib inline
sns.set()

In [2]:
train = pd.read_csv('../data/aug_train.csv')
#train, valid = train_test_split(train, test_size=0.2, random_state=42)

display(train.head())
train.info()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,167647,Male,22,1,7.0,1,< 1 Year,No,2630.0,152.0,16,0
1,17163,Male,42,1,28.0,0,1-2 Year,Yes,43327.0,26.0,135,0
2,32023,Female,66,1,33.0,0,1-2 Year,Yes,35841.0,124.0,253,0
3,87447,Female,22,1,33.0,0,< 1 Year,No,27645.0,152.0,69,0
4,501933,Male,28,1,46.0,1,< 1 Year,No,29023.0,152.0,211,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 382154 entries, 0 to 382153
Data columns (total 12 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    382154 non-null  int64  
 1   Gender                382154 non-null  object 
 2   Age                   382154 non-null  int64  
 3   Driving_License       382154 non-null  int64  
 4   Region_Code           382154 non-null  float64
 5   Previously_Insured    382154 non-null  int64  
 6   Vehicle_Age           382154 non-null  object 
 7   Vehicle_Damage        382154 non-null  object 
 8   Annual_Premium        382154 non-null  float64
 9   Policy_Sales_Channel  382154 non-null  float64
 10  Vintage               382154 non-null  int64  
 11  Response              382154 non-null  int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 35.0+ MB


---
---
## **2. Grid Search: Resampling Methods**
---
---

In [3]:
# Preprocessing pipelines
gender_pipeline = make_pipeline(OrdinalEncoder(categories=[['Male', 'Female']]))
vehicle_damage_pipeline = make_pipeline(OrdinalEncoder(categories=[['No', 'Yes']]))
vehicle_age_pipeline = make_pipeline(OrdinalEncoder(categories=[sorted(train['Vehicle_Age'].unique())]))
region_code_pipeline = make_pipeline(EqualFrequencyDiscretiser(q=4))
policy_sales_channel_pipeline = make_pipeline(EqualFrequencyDiscretiser(q=4))
continuous_pipeline = make_pipeline(RobustScaler())
age_pipeline = make_pipeline(StandardScaler())
binaries_pipeline = make_pipeline(IntToFloatTransformer())

# Column transformer
column_transformer = make_column_transformer(
    (gender_pipeline, ['Gender']),
    (vehicle_damage_pipeline, ['Vehicle_Damage']),
    (vehicle_age_pipeline, ['Vehicle_Age']),
    (region_code_pipeline, ['Region_Code']),
    (policy_sales_channel_pipeline, ['Policy_Sales_Channel']),
    (continuous_pipeline, ['Annual_Premium']),
    (age_pipeline, ['Age']),
    (binaries_pipeline, ['Driving_License', 'Previously_Insured',]),
)

# Feature engineering pipeline
feature_engineering_pipeline = make_pipeline(column_transformer, 
                                             IterativeImputer(),
                                             VarianceThreshold(threshold=0.1))

# Voting classifier
best_knn = KNeighborsClassifier(n_neighbors=7, weights='uniform', p=2)
best_gb = GradientBoostingClassifier(n_estimators=300, learning_rate=0.1)
best_sgd = SGDClassifier(alpha=0.001, learning_rate='optimal', loss='modified_huber', max_iter=1000, penalty='elasticnet')

voting_clf = VotingClassifier(
    estimators=[('GBC', best_gb), ('KNN', best_knn), ('SGD', best_sgd)],
    voting='soft',
    n_jobs=-1,
    verbose=3
)

In [4]:
# Imbalanced pipeline for feature engineering
imb_fe_pipeline = imbPipeline([
    ('column_transformer', column_transformer),
    ('variance_threshold', VarianceThreshold(threshold=0.09))
])

# Transform the training data
X = imb_fe_pipeline.fit_transform(train.drop(columns='Response'))
y = train['Response']

# Imbalanced pipeline for classifier
imb_vc_clf_pipeline = imbPipeline(steps=[
    ('sampler', SMOTE(random_state=42)),  # Placeholder, this will be replaced by GridSearchCV
    ('classifier', voting_clf)
])

param_grid = {
    'sampler': [SMOTE(random_state=42), ADASYN(random_state=42), RandomUnderSampler(random_state=42), NearMiss(version=1)]
}


grid_search = GridSearchCV(imb_vc_clf_pipeline, param_grid, cv=5, scoring='f1', n_jobs=-1, verbose=0)
grid_search.fit(X, y)

[Voting] ...................... (2 of 3) Processing KNN, total=   0.2s
[Voting] ...................... (2 of 3) Processing KNN, total=   0.2s
[Voting] ...................... (2 of 3) Processing KNN, total=   0.2s
[Voting] ...................... (3 of 3) Processing SGD, total=   0.4s
[Voting] ...................... (2 of 3) Processing KNN, total=   0.1s
[Voting] ...................... (3 of 3) Processing SGD, total=   0.5s
[Voting] ...................... (3 of 3) Processing SGD, total=   0.5s
[Voting] ...................... (2 of 3) Processing KNN, total=   0.2s
[Voting] ...................... (3 of 3) Processing SGD, total=   0.4s
[Voting] ...................... (3 of 3) Processing SGD, total=   0.6s
[Voting] ...................... (2 of 3) Processing KNN, total=   1.3s
[Voting] ...................... (3 of 3) Processing SGD, total=   1.7s
[Voting] ...................... (2 of 3) Processing KNN, total=   1.7s
[Voting] ...................... (2 of 3) Processing KNN, total=   1.7s
[Votin

# Transform the training data
X = imb_fe_pipeline.fit_transform(train.drop(columns='Response'))
y = train['Response']

# Define the pipeline for GridSearchCV
imb_vc_clf_pipeline = imbPipeline(steps=[
    ('sampler', None),  # Placeholder, this will be replaced by GridSearchCV
    ('classifier', voting_clf)
])

# Define the parameter grid
param_grid = {
    'sampler': [SMOTE(random_state=42), ADASYN(random_state=42), RandomUnderSampler(random_state=42), NearMiss(version=1)]
}

# Run GridSearchCV
grid_search = GridSearchCV(imb_vc_clf_pipeline, param_grid, cv=5, scoring='f1', n_jobs=-1, verbose=0)
grid_search.fit(X, y)

In [5]:
results = pd.DataFrame(grid_search.cv_results_)
results['sampler_name'] = results['param_sampler'].apply(lambda x: x.__class__.__name__)
results = results.sort_values(by='mean_test_score', ascending=False)

results.to_csv('../data/grid_search_results_imb.csv', index=False)
joblib.dump(grid_search, '../models/imb_grid_search_imb.joblib')

results.head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_sampler,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,sampler_name
0,178.775544,4.244714,4.638939,1.064915,SMOTE(random_state=42),{'sampler': SMOTE(random_state=42)},0.567377,0.57027,0.568506,0.567613,0.567319,0.568217,0.001111,1,SMOTE
1,169.690202,6.279578,6.728414,1.577259,ADASYN(random_state=42),{'sampler': ADASYN(random_state=42)},0.560531,0.56363,0.561815,0.557593,0.561562,0.561026,0.001986,2,ADASYN
2,23.119054,0.459865,8.130846,0.623625,RandomUnderSampler(random_state=42),{'sampler': RandomUnderSampler(random_state=42)},0.553308,0.555187,0.557489,0.553292,0.549944,0.553844,0.002486,3,RandomUnderSampler
3,34.156299,1.044254,8.267622,1.664078,NearMiss(),{'sampler': NearMiss()},0.207443,0.208992,0.213948,0.218337,0.213965,0.212537,0.003903,4,NearMiss


# cheking the
coco = feature_engineering_pipeline.fit_transform(train.drop(columns='Response'))
imb_vc_clf_pipeline.fit(coco, train['Response'])

valid_transformed = imb_fe_pipeline.transform(valid.drop(columns='Response'))
y_pred_proba = grid_search.best_estimator_.predict_proba(valid_transformed)


from imblearn.metrics import classification_report_imbalanced
tt = y_pred_proba[:, 1] > 0.33
print(classification_report_imbalanced(valid['Response'], tt))


---
---
## **3. Tri-Model Voting Classifier: Training on a Resampled Dataset**
---
---

In [6]:
from transformers import ColumnNamePurger

## CREATING THE RESAMPLED DATASET
data_resampling_pipeline = imbPipeline(steps=[
    ('fe_pipeline', imb_fe_pipeline),
    ('sampler', SMOTE(random_state=42)), 
    
])
smote = SMOTE(random_state=42)

print(train.shape)
train_resampled = feature_engineering_pipeline.fit_transform(train.drop(columns='Response'))
X_resampled, y_resampled = smote.fit_resample(train_resampled, train['Response'])

#col_name_purger = ColumnNamePurger()
#X_resampled = col_name_purger.fit_transform(X_resampled)


display(X_resampled.head())
X_resampled.columns




(382154, 12)


Unnamed: 0,pipeline-1__Gender,pipeline-2__Vehicle_Damage,pipeline-3__Vehicle_Age,pipeline-4__Region_Code,pipeline-5__Policy_Sales_Channel,pipeline-6__Annual_Premium,pipeline-7__Age,pipeline-8__Previously_Insured
0,0.0,0.0,1.0,0.0,2.0,-1.950241,-1.086611,1.0
1,0.0,1.0,0.0,1.0,0.0,0.780781,0.226856,0.0
2,1.0,1.0,0.0,2.0,1.0,0.278424,1.803016,0.0
3,1.0,0.0,1.0,2.0,2.0,-0.271579,-1.086611,0.0
4,0.0,0.0,1.0,3.0,2.0,-0.179106,-0.692571,1.0


Index(['pipeline-1__Gender', 'pipeline-2__Vehicle_Damage',
       'pipeline-3__Vehicle_Age', 'pipeline-4__Region_Code',
       'pipeline-5__Policy_Sales_Channel', 'pipeline-6__Annual_Premium',
       'pipeline-7__Age', 'pipeline-8__Previously_Insured'],
      dtype='object')

In [7]:
# Training the model on the resampled dataset
voting_clf = VotingClassifier(
    estimators=[('GBC', best_gb), ('KNN', best_knn), ('SGD', best_sgd)],
    voting='soft',
    n_jobs=-1,
    verbose=3
)

voting_clf.fit(X_resampled, y_resampled)

[Voting] ...................... (2 of 3) Processing KNN, total=   0.7s
[Voting] ...................... (3 of 3) Processing SGD, total=   0.9s
[Voting] ...................... (1 of 3) Processing GBC, total= 1.6min


In [8]:
"""
from joblib import dump
dump(voting_clf, '../models/vc_knn_gbc_sgd_standalone_balanced_trained.joblib1')
"""

"\nfrom joblib import dump\ndump(voting_clf, '../models/vc_knn_gbc_sgd_standalone_balanced_trained.joblib1')\n"

In [9]:
import numpy as np
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc
import plotly.graph_objects as go
import plotly.express as px

def plot_roc_curve(y_test, y_pred, title=''):
    """
    Plot ROC curve.
    
    Compute the ROC curve and ROC area using sklearn's 
    roc_curve and auc functions. Create a plotly figure with the ROC curve
    and a diagonal reference line. Update the layout for titles, axes,
    size. Show or save the figure based on input args.
    """
    fpr, tpr, _ = roc_curve(y_test, y_pred)
    roc_auc = auc(fpr, tpr)

    fig = go.Figure()

    color = px.colors.qualitative.Set3[0]
    
    fig.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name='ROC curve (area = {0:0.2f})'.format(roc_auc), line=dict(color=color)))
    fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='Random', line=dict(color='black', dash='dash')))

    fig.update_layout(title='ROC Curve: '+title, xaxis_title='False Positive Rate', yaxis_title='True Positive Rate', autosize=False, width=600, height=600, margin=dict(l=50, r=50, b=100, t=100, pad=4), plot_bgcolor='whitesmoke')
    fig.update_layout(height=700, width=1000)
    
    fig.show()
    
#valid_transformed = feature_engineering_pipeline.transform(train.drop(columns='Response'))
y_pred_proba_class_1 = voting_clf.predict_proba(X_resampled)[:, 1]

plot_roc_curve(y_resampled,  y_pred_proba_class_1, title='Voting Classifier')

In [11]:
from imblearn.metrics import classification_report_imbalanced

# Your existing code
test = pd.read_csv('../data/aug_test.csv')
X_test = feature_engineering_pipeline.transform(test)
preds = voting_clf.predict(X_test)

y_true = pd.Series(np.load('../data/answer.npy'))

report = classification_report_imbalanced(y_true, preds)

print(report)

                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      0.76      0.90      0.85      0.83      0.67     65455
          1       0.42      0.90      0.76      0.57      0.83      0.69     12818

avg / total       0.88      0.78      0.88      0.81      0.83      0.67     78273

