# **Model Testing**



---
---
## **1. Setting Up**
---
---

In [1]:
import sys
sys.path.insert(0, '../src')

import pipeline_tools as pt
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split

from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.compose import make_column_transformer

from sklearn.preprocessing import FunctionTransformer, RobustScaler, PolynomialFeatures
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, StandardScaler, label_binarize

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from feature_engine.discretisation import EqualFrequencyDiscretiser

from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier, LogisticRegression

from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier 
from sklearn.metrics import roc_curve, auc

from imblearn.pipeline import Pipeline as imbPipeline
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler, NearMiss
from imblearn.metrics import classification_report_imbalanced

import joblib


balanced_vc = joblib.load('../models/vc_knn_gbc_sgd_pipeline_balanced_trained.joblib')

train = pd.read_csv('../data/aug_train.csv')
test = pd.read_csv('../data/aug_test.csv')
y_true = pd.Series(np.load('../data/answer.npy'))

display(test.head())
print(test.info())



### Preprocessing pipeline

gender_pipeline = make_pipeline(
    OrdinalEncoder(categories=[['Male', 'Female']]),
)
    
vehicle_damage_pipeline = make_pipeline(
    OrdinalEncoder(categories=[['No', 'Yes']]),
)

vehicle_age_pipeline = make_pipeline(
    OrdinalEncoder(categories=[sorted(train['Vehicle_Age'].unique())]),

)

region_code_pipeline = make_pipeline(
    EqualFrequencyDiscretiser(q=4)
)

policy_sales_channel_pipeline = make_pipeline(
    EqualFrequencyDiscretiser(q=4)
)

continuous_pipeline = make_pipeline(
    RobustScaler(),
)

age_pipeline = make_pipeline(
    StandardScaler(),
)

iterative_imputer = IterativeImputer()

column_transformer = make_column_transformer(
    (gender_pipeline, ['Gender']),
    (vehicle_damage_pipeline, ['Vehicle_Damage']),
    (vehicle_age_pipeline, ['Vehicle_Age']),
    (region_code_pipeline, ['Region_Code']),
    (policy_sales_channel_pipeline, ['Policy_Sales_Channel']),
    (continuous_pipeline, ['Annual_Premium']),
    (age_pipeline, ['Age']),
    ('passthrough', ['Driving_License', 'Previously_Insured',]),
)

feature_engineering_pipeline = make_pipeline(column_transformer, 
                                             iterative_imputer,
                                             VarianceThreshold(threshold=0.1)
)

### Voting Classifier 

best_knn = KNeighborsClassifier(n_neighbors=7, weights='uniform', p=2)
best_gb = GradientBoostingClassifier(n_estimators=300, learning_rate=0.1)
best_sgd = SGDClassifier(alpha=0.001, learning_rate='optimal', loss='modified_huber', max_iter=1000, penalty='elasticnet')

voting_clf = VotingClassifier(
    estimators=[('GBC', best_gb), ('KNN', best_knn), ('SGD', best_sgd)],
    voting='soft',
    n_jobs=-1,
    verbose=3
)

vc_clf_pipeline = Pipeline(steps=[
    ('preprocessor', feature_engineering_pipeline),
    ('classifier', voting_clf)
])


### Imbalanced pipeline


imb_fe_pipeline = imbPipeline([
    ('column_transformer', column_transformer),
    ('variance_threshold', VarianceThreshold(threshold=0.09))
])

imb_vc_clf_pipeline = imbPipeline(steps=[
    ('sampler', SMOTE(random_state=42)),  # Placeholder, this will be replaced by GridSearchCV
    ('classifier', voting_clf)
])

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
0,57782,Female,34,1,39.0,1,1-2 Year,No,38244.0,124.0,146
1,286811,Female,55,1,28.0,0,> 2 Years,Yes,37577.0,122.0,109
2,117823,Male,39,1,28.0,1,1-2 Year,No,24578.0,26.0,63
3,213992,Male,28,1,50.0,1,1-2 Year,No,40507.0,8.0,129
4,324756,Female,24,1,10.0,0,< 1 Year,Yes,36783.0,152.0,201


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78273 entries, 0 to 78272
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    78273 non-null  int64  
 1   Gender                78273 non-null  object 
 2   Age                   78273 non-null  int64  
 3   Driving_License       78273 non-null  int64  
 4   Region_Code           78273 non-null  float64
 5   Previously_Insured    78273 non-null  int64  
 6   Vehicle_Age           78273 non-null  object 
 7   Vehicle_Damage        78273 non-null  object 
 8   Annual_Premium        78273 non-null  float64
 9   Policy_Sales_Channel  78273 non-null  float64
 10  Vintage               78273 non-null  int64  
dtypes: float64(3), int64(5), object(3)
memory usage: 6.6+ MB
None


In [2]:
fit_train = feature_engineering_pipeline.fit_transform(train.drop(columns='Response'))
imb_vc_clf_pipeline.fit(fit_train, train['Response'])

fit_test = feature_engineering_pipeline.transform(test)

[Voting] ...................... (2 of 3) Processing KNN, total=   0.7s
[Voting] ...................... (3 of 3) Processing SGD, total=   0.9s
[Voting] ...................... (1 of 3) Processing GBC, total= 2.0min


---
---
## **3. Model Evaluation**
---
---

### **Hard Predictions**

In [3]:

test = pd.read_csv('../data/aug_test.csv')
X_test = feature_engineering_pipeline.transform(test)
preds = imb_vc_clf_pipeline.predict(X_test)

y_true = pd.Series(np.load('../data/answer.npy'))

report = classification_report_imbalanced(y_true, preds)

print(report)

                   pre       rec       spe        f1       geo       iba       sup

          0       0.97      0.76      0.90      0.85      0.82      0.67     65455
          1       0.42      0.90      0.76      0.57      0.82      0.69     12818

avg / total       0.88      0.78      0.88      0.81      0.82      0.67     78273



---
### **Probability Predictions**

In [6]:
import viz

y_pred_proba = imb_vc_clf_pipeline.predict_proba(X_test)[:,1]

viz.plot_roc_curve(y_true, y_pred_proba, 'Voting Classifier')  

In [11]:
y_pred_proba_thresh = (y_pred_proba >= 0.39).astype('int')
print(classification_report_imbalanced(y_true, y_pred_proba_thresh, target_names=['0', '1']))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.70      0.95      0.82      0.82      0.65     65455
          1       0.39      0.95      0.70      0.55      0.82      0.69     12818

avg / total       0.89      0.74      0.91      0.78      0.82      0.66     78273

