In [1]:
#!python3

# load library
import time # menghitung waktu
import pickle # tipe data menyimpan model

import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier # algoritma baru
from sklearn.model_selection import train_test_split # proses belajar
from sklearn.metrics import classification_report # hasil belajar

In [2]:
# load data
data = pd.read_csv('Prediction Insurance.csv')
data.head(2)

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0


In [3]:
data.head(5)

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0
2,3,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1
3,4,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0
4,5,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0


In [4]:
data['Region_Code'].unique()

array([28.,  3., 11., 41., 33.,  6., 35., 50., 15., 45.,  8., 36., 30.,
       26., 16., 47., 48., 19., 39., 23., 37.,  5., 17.,  2.,  7., 29.,
       46., 27., 25., 13., 18., 20., 49., 22., 44.,  0.,  9., 31., 12.,
       34., 21., 10., 14., 38., 24., 40., 43., 32.,  4., 51., 42.,  1.,
       52.])

In [5]:
data.shape

(381109, 12)

In [6]:
# data preprocessing
data = data.drop('id', axis=1)

data['Gender'] = data['Gender'].map({'Male':1, 'Female':0})
df_region = pd.get_dummies(data['Region_Code'])

df = data[['Gender','Age','Driving_License','Response']].merge(df_region, left_index=True, right_index=True)
df.head(5)

Unnamed: 0,Gender,Age,Driving_License,Response,0.0,1.0,2.0,3.0,4.0,5.0,...,43.0,44.0,45.0,46.0,47.0,48.0,49.0,50.0,51.0,52.0
0,1,44,1,1,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,1,76,1,0,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
2,1,47,1,1,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,1,21,1,0,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,0,29,1,0,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [8]:
# pastikan semua nama kolom adalah string
df.columns = df.columns.astype(str)

# data modelling
x = df.drop('Response', axis=1)
y = df['Response']

# proses latih belajar
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=0) 
start = time.time()
model = RandomForestClassifier(n_estimators=100, random_state=0) # Algoritma baru: Random Forest

# hasil belajar
model.fit(x_train, y_train)
stop = time.time()
print(f"Training Time {stop-start} Sekon.....")

Training Time 79.88442349433899 Sekon.....


In [9]:
# model export
with open('modelRandomForest.pkl','wb') as file:
    pickle.dump(model, file)

In [10]:
# model evaluation
y_predict = model.predict(x_test)
print(classification_report(y_predict, y_test))

              precision    recall  f1-score   support

           0       1.00      0.88      0.93     76107
           1       0.00      0.10      0.00       115

    accuracy                           0.88     76222
   macro avg       0.50      0.49      0.47     76222
weighted avg       1.00      0.88      0.93     76222

