In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

#from transformers import ColumnNamePurger
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier


import sys
sys.path.insert(0, '../src')

from sklearn import set_config
set_config(display='diagram')

from sklearn import set_config
set_config(transform_output='pandas')

from feature_engine.discretisation import EqualFrequencyDiscretiser
from pipeline_tools import generate_user_input_pipeline
from joblib import load


train = pd.read_csv('../data/aug_train.csv')
train, valid = train_test_split(train, test_size=0.2, random_state=42, stratify=train['Response'])

display(train.head())
train.info()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
97017,142046,Female,23,1,6.0,1,< 1 Year,Yes,40694.0,152.0,148,0
150461,312938,Male,29,1,36.0,1,< 1 Year,No,2630.0,160.0,257,0
277654,470466,Female,24,1,35.0,1,< 1 Year,No,26622.0,152.0,183,0
139225,380625,Female,43,1,8.0,0,1-2 Year,Yes,44543.0,26.0,117,0
374283,448542,Male,25,1,28.0,1,< 1 Year,No,42321.0,152.0,72,0


<class 'pandas.core.frame.DataFrame'>
Index: 305723 entries, 97017 to 244010
Data columns (total 12 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    305723 non-null  int64  
 1   Gender                305723 non-null  object 
 2   Age                   305723 non-null  int64  
 3   Driving_License       305723 non-null  int64  
 4   Region_Code           305723 non-null  float64
 5   Previously_Insured    305723 non-null  int64  
 6   Vehicle_Age           305723 non-null  object 
 7   Vehicle_Damage        305723 non-null  object 
 8   Annual_Premium        305723 non-null  float64
 9   Policy_Sales_Channel  305723 non-null  float64
 10  Vintage               305723 non-null  int64  
 11  Response              305723 non-null  int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 30.3+ MB


In [2]:
# Template for user input
user_input_raw = {
    "Policy_Sales_Channel": "26.0",
    "Region_Code": "28.0",
    "Age": "44",
    "Previously_Insured": "1",
    "Vehicle_Age": "< 1 Year",
    "Vehicle_Damage": "No", 
    "Annual_Premium": "40454.0",
    "Gender": "Female",
}

user_input_raw['Policy_Sales_Channel'] = float(user_input_raw['Policy_Sales_Channel'])
user_input_raw['Region_Code'] = float(user_input_raw['Region_Code'])
user_input_raw['Age'] = int(user_input_raw['Age'])
user_input_raw['Previously_Insured'] = int(user_input_raw['Previously_Insured'])
user_input_raw['Annual_Premium'] = float(user_input_raw['Annual_Premium'])
user_input_df = pd.DataFrame(user_input_raw, index=[0])

preprocessing_pipeline = generate_user_input_pipeline(train)
trained_preprocessed_pipeline = preprocessing_pipeline.fit(train)
preprocessed_user_input = trained_preprocessed_pipeline.transform(user_input_df)

In [3]:
# Creating the dataset on which the the app model will be trained

smote = SMOTE(random_state=42)
train_resampled = preprocessing_pipeline.transform(train.drop(columns='Response'))
X_resampled, y_resampled = smote.fit_resample(train_resampled, train['Response'])
#col_name_purger = ColumnNamePurger()
#X_resampled = col_name_purger.fit_transform(X_resampled)

best_knn = KNeighborsClassifier(n_neighbors=7, weights='uniform', p=2)
best_gb = GradientBoostingClassifier(n_estimators=300, learning_rate=0.1)
best_sgd = SGDClassifier(alpha=0.001, learning_rate='optimal', loss='modified_huber', max_iter=1000, penalty='elasticnet')

voting_clf = VotingClassifier(
    estimators=[('GBC', best_gb), ('KNN', best_knn), ('SGD', best_sgd)],
    voting='soft',
    n_jobs=-1,
    verbose=3
)

voting_clf.fit(X_resampled, y_resampled)
pred = voting_clf.predict(preprocessed_user_input)
pred

[Voting] ...................... (2 of 3) Processing KNN, total=   0.5s
[Voting] ...................... (3 of 3) Processing SGD, total=   0.7s
[Voting] ...................... (1 of 3) Processing GBC, total= 1.3min


array([0])

In [8]:
from joblib import dump, load
dump(voting_clf, '../models/vc_knn_gbc_sgd_standalone_balanced_trained.joblib1')
dump(preprocessing_pipeline, '../models/user_input_preprocessing_pipeline.joblib1')


['../models/user_input_preprocessing_pipeline.joblib1']

In [9]:
import requests
from joblib import load

loaded_voting_clf = load('../models/vc_knn_gbc_sgd_standalone_balanced_trained.joblib')
loaded_preprocessing_pipeline = load('../models/user_input_preprocessing_pipeline.joblib')

# Select a subset of positive responses only and make predictions with the loaded model
positive_responses = train[train['Response'] == 1]
data_subset = positive_responses.sample(n=100)

X = data_subset.drop(columns='Response')
X_preprocessed = loaded_preprocessing_pipeline.transform(X)
y_pred_loaded_model = loaded_voting_clf.predict(X_preprocessed)

y_pred_live_model = voting_clf.predict(X_preprocessed)

y_true = data_subset['Response']


# Make predictions with the Flask model
endpoint = "http://localhost:3000/api/predict"
requests_data = [row.drop('Response', errors='ignore').to_dict() for _, row in data_subset.iterrows()]
responses = [requests.post(endpoint, json=data).json() for data in requests_data]
y_pred_flask_model = [response['prediction'] for response in responses]

# Create a DataFrame with true labels, predicted labels from all models, and their corresponding indices
comparison_df = pd.DataFrame({
    'True Label': y_true,
    'Predicted Label (Realtime Model)': y_pred_live_model,
    'Predicted Label (Joblib Loaded Model)': y_pred_loaded_model,
    'Predicted Label (Remote Model)': y_pred_flask_model
}, index=data_subset.index)

same_model_discrepencies = comparison_df.loc[comparison_df['Predicted Label (Realtime Model)'] != comparison_df['Predicted Label (Remote Model)']]
print(f"Discrepancies between the notebook model and the Flask model predictions: {same_model_discrepencies.shape[0]}")

y_true_vs_y_pred_flask = comparison_df.loc[comparison_df['True Label'] != comparison_df['Predicted Label (Remote Model)']]
print(f"Discrepancies between the true label and the Flask model: {y_true_vs_y_pred_flask.shape[0]}")

Discrepancies between the notebook model and the Flask model predictions: 2
Discrepancies between the true label and the Flask model: 7


In [5]:
comparison_df.head(10)

Unnamed: 0,True Label,Predicted Label (Realtime Model),Predicted Label (Joblib Loaded Model),Predicted Label (Remote Model)
237404,1,1,1,1
226781,1,1,1,1
165421,1,1,1,1
239595,1,1,1,1
251212,1,1,1,1
97070,1,1,1,1
265486,1,1,1,1
245811,1,1,1,1
201665,1,1,1,1
61956,1,1,1,1
