In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter("ignore")

In [2]:
df = pd.read_csv("insurance_pred/train.csv")
df.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0
2,3,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1
3,4,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0
4,5,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0


### Exploratory Data Analysis

In [3]:
df.isnull()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
381104,False,False,False,False,False,False,False,False,False,False,False,False
381105,False,False,False,False,False,False,False,False,False,False,False,False
381106,False,False,False,False,False,False,False,False,False,False,False,False
381107,False,False,False,False,False,False,False,False,False,False,False,False


In [7]:
df.Response.value_counts(normalize=True)

0    0.877437
1    0.122563
Name: Response, dtype: float64

### Feature Engineering

In [4]:
df['Gender'] = df['Gender'].map({'Male': 1, 'Female': 2})


In [5]:
df.Vehicle_Age.unique()
df['Vehicle_Age'] = df['Vehicle_Age'].map({'> 2 Years': 1, '1-2 Year': 2, '< 1 Year': 3})


In [6]:
df.Vehicle_Damage.unique()
df['Vehicle_Damage'] = df['Vehicle_Damage'].map({'Yes': 1, 'No': 2})


In [8]:
X = df.drop("Response", axis=1)
X = df.drop("id", axis=1)
X.head()

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,44,1,28.0,0,1,1,40454.0,26.0,217,1
1,1,76,1,3.0,0,2,2,33536.0,26.0,183,0
2,1,47,1,28.0,0,1,1,38294.0,26.0,27,1
3,1,21,1,11.0,1,3,2,28619.0,152.0,203,0
4,2,29,1,41.0,1,3,2,27496.0,152.0,39,0


In [9]:
# y data
y = df["Response"]
y.head()

0    1
1    0
2    1
3    0
4    0
Name: Response, dtype: int64

### Training Model

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

X_test, X_val, y_test, y_val = train_test_split(X_train, y_train, test_size=0.5, random_state=1)

In [11]:
X_train

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
45748,2,22,1,24.0,1,3,2,2630.0,152.0,68,0
150378,1,22,1,46.0,1,3,2,29544.0,152.0,202,0
27587,2,61,1,50.0,0,2,1,34217.0,124.0,122,0
282194,2,29,1,32.0,1,3,1,33889.0,152.0,131,0
45560,1,23,1,25.0,0,3,1,21758.0,152.0,144,0
...,...,...,...,...,...,...,...,...,...,...,...
73349,2,27,1,11.0,1,3,2,26584.0,152.0,295,0
371403,2,39,1,41.0,0,2,1,27136.0,156.0,233,0
312201,1,45,1,28.0,1,2,2,33735.0,124.0,120,0
267336,2,43,1,36.0,1,2,2,35142.0,152.0,108,0


In [12]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df_train = scaler.fit_transform(X_train)
df_val = scaler.transform(X_val)
df_test = scaler.transform(X_test)

In [13]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

lr.fit(X_train, y_train)

In [14]:
LogisticRegressionScore = lr.score(X_val, y_val)
print("Accuracy obtained by Logistic Regression model:",LogisticRegressionScore*100)

Accuracy obtained by Logistic Regression model: 87.70827320196268


In [15]:
LogisticRegressionScore = lr.score(X_test, y_test)
print("Accuracy obtained by Logistic Regression model:",LogisticRegressionScore*100)

Accuracy obtained by Logistic Regression model: 87.7987182094291


In [16]:
LogisticRegressionScore = lr.score(X_train, y_train)
print("Accuracy obtained by Logistic Regression model:",LogisticRegressionScore*100)

Accuracy obtained by Logistic Regression model: 87.75349555737043


### Saving the model

In [17]:
import pickle

In [18]:
c = 1.0

In [19]:
output_file = 'model_C=%s.bin' % c

In [20]:
output_file

'model_C=1.0.bin'

In [21]:
f_out = open(output_file, 'wb')
pickle.dump((scaler,lr), f_out)
f_out.close()

In [22]:
with open(output_file, 'wb') as f_out:
    pickle.dump((scaler,lr), f_out)

### Loading the model

In [4]:
import pickle
import numpy as np
import pandas as pd

In [5]:
model_file = 'model_C=1.0.bin'

In [6]:
with open(model_file, 'rb') as f_in:
   scaler, model = pickle.load(f_in)

In [7]:
scaler, model

(StandardScaler(), LogisticRegression())

In [8]:
    #patient = np.array(X_test.iloc[0])
    #patient = patient.reshape(1, -1)
#[1].to_dict()
# patient = [[1.28938e+05, 1.00000e+00, 3.50000e+01, 1.00000e+00, 4.70000e+01,
#         0.00000e+00, 2.00000e+00, 1.00000e+00, 3.07980e+04, 1.52000e+02,
#         2.38000e+02]]
patient_dict ={'Gender': 1.0,
 'Age': 35.0,
 'Driving_License': 1.0,
 'Region_Code': 47.0,
 'Previously_Insured': 0.0,
 'Vehicle_Age': 2.0,
 'Vehicle_Damage': 1.0,
 'Annual_Premium': 30798.0,
 'Policy_Sales_Channel': 152.0,
 'Vintage': 238.0,
 'Response': 0.0}

patient = pd.DataFrame.from_records(patient_dict, index=[0])

In [9]:
from sklearn.preprocessing import StandardScaler

In [10]:
xyz = scaler.fit_transform(patient)

In [11]:
abc =model.predict(xyz)
abc[0]



0

In [12]:
X_test.iloc[0].to_dict()

NameError: name 'X_test' is not defined

In [None]:
patient ={'id': 128938.0,
 'Gender': 1.0,
 'Age': 35.0,
 'Driving_License': 1.0,
 'Region_Code': 47.0,
 'Previously_Insured': 0.0,
 'Vehicle_Age': 2.0,
 'Vehicle_Damage': 1.0,
 'Annual_Premium': 30798.0,
 'Policy_Sales_Channel': 152.0,
 'Vintage': 238.0}

In [None]:
y_test