In [134]:
import time
import pickle
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

In [135]:
data = pd.read_csv('Prediction Insurance.csv')

In [136]:
data.head(1)

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,Male,44,1,28,0,> 2 Years,Yes,40454,26,217,1


In [137]:
data['Region_Code'].unique()

array([28,  3, 11, 41, 33,  6, 35, 50, 15, 45,  8, 36, 30, 26, 16, 47, 48,
       19, 39, 23, 37,  5, 17,  2,  7, 29, 46, 27, 25, 13, 18, 20, 49, 22,
       44,  0,  9, 31, 12, 34, 21, 10, 14, 38, 24, 40, 43, 32,  4, 51, 42,
        1, 52], dtype=int64)

In [138]:
#data preprocessing
data = data.drop('id', axis=1)
data['Gender'] = data['Gender'].map({'Male':1, 'Female':0})
data['Vehicle_Damage'] = data['Vehicle_Damage'].map({'Yes':1, 'No':0})

In [139]:
df_region = pd.get_dummies(data['Region_Code'])

In [140]:
df_3 = data[['Gender','Age','Driving_License','Vehicle_Damage','Response']].merge(df_region, left_index=True, right_index=True)

In [141]:
df_3.columns = df_3.columns.astype(str)

In [142]:
df_3.head(1)

Unnamed: 0,Gender,Age,Driving_License,Vehicle_Damage,Response,0,1,2,3,4,...,43,44,45,46,47,48,49,50,51,52
0,1,44,1,1,1,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [143]:
#data modelling using Logistic Regression Balanced Weight
x = df_3.drop('Response', axis=1)
y = df_3['Response']
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=0)

In [144]:
start = time.time()
model = LogisticRegression(solver='liblinear', class_weight='balanced')
model.fit(x_train, y_train)
stop = time.time()
print(f"Training Time {stop-start} seconds")

Training Time 1.4960012435913086 seconds


In [145]:
y_predict = model.predict(x_test)
print(classification_report(y_predict, y_test))

              precision    recall  f1-score   support

           0       0.57      0.99      0.72     38104
           1       0.98      0.24      0.39     38118

    accuracy                           0.62     76222
   macro avg       0.77      0.62      0.55     76222
weighted avg       0.77      0.62      0.55     76222



In [146]:
print(y_train.value_counts())


Response
0    267553
1     37334
Name: count, dtype: int64


In [147]:
with open('model_logistic_reg_balance.pkl','wb') as file:
    pickle.dump(model, file)