In [54]:
import time
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [55]:
# Load Data
data = pd.read_csv('Prediction Insurance.csv')
print(data.head())  # Memeriksa beberapa baris pertama dari data


   id  Gender  Age  Driving_License  Region_Code  Previously_Insured  \
0   1    Male   44                1           28                   0   
1   2    Male   76                1            3                   0   
2   3    Male   47                1           28                   0   
3   4    Male   21                1           11                   1   
4   5  Female   29                1           41                   1   

  Vehicle_Age Vehicle_Damage  Annual_Premium  Policy_Sales_Channel  Vintage  \
0   > 2 Years            Yes           40454                    26      217   
1    1-2 Year             No           33536                    26      183   
2   > 2 Years            Yes           38294                    26       27   
3    < 1 Year             No           28619                   152      203   
4    < 1 Year             No           27496                   152       39   

   Response  
0         1  
1         0  
2         1  
3         0  
4         0  


In [56]:
# Data Preprocessing
data = pd.get_dummies(data, columns=['Gender', 'Vehicle_Age', 'Vehicle_Damage'], drop_first=True)
print(data.head())  # Memeriksa hasil preprocessing

# Define features and target
X = data.drop(['id', 'Response'], axis=1)
y = data['Response']
print(X.head())  # Memeriksa fitur
print(y.head())  # Memeriksa target


   id  Age  Driving_License  Region_Code  Previously_Insured  Annual_Premium  \
0   1   44                1           28                   0           40454   
1   2   76                1            3                   0           33536   
2   3   47                1           28                   0           38294   
3   4   21                1           11                   1           28619   
4   5   29                1           41                   1           27496   

   Policy_Sales_Channel  Vintage  Response  Gender_Male  Vehicle_Age_< 1 Year  \
0                    26      217         1         True                 False   
1                    26      183         0         True                 False   
2                    26       27         1         True                 False   
3                   152      203         0         True                  True   
4                   152       39         0        False                  True   

   Vehicle_Age_> 2 Years  Vehicl

In [57]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)  # Memeriksa ukuran data latih dan uji


(304887, 11) (76222, 11) (304887,) (76222,)


In [58]:
# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
print(X_train[:5])  # Memeriksa hasil standarisasi pada data latih
print(X_test[:5])   # Memeriksa hasil standarisasi pada data uji


[[ 0.01060114  0.04664784 -0.86346971 -0.91993989  1.29441692 -1.05046307
   0.86950541 -1.0847756  -0.87180455 -0.20979889  0.99025045]
 [-0.05387549  0.04664784 -1.16608742 -0.91993989 -0.4366383  -1.58522262
  -1.49771923  0.92184964 -0.87180455 -0.20979889  0.99025045]
 [-1.0855015   0.04664784  0.27134671  1.08702755  0.85464034  0.73821545
   0.14020893  0.92184964  1.14704608 -0.20979889 -1.00984554]
 [-1.02102488  0.04664784  1.10354541  1.08702755 -0.08344925  0.71977547
   1.46728941 -1.0847756   1.14704608 -0.20979889 -1.00984554]
 [ 1.10670378  0.04664784  1.6331264  -0.91993989 -1.61942171  0.77509542
   0.00869645  0.92184964 -0.87180455  4.76646949  0.99025045]]
[[-1.14997813  0.04664784 -1.77132284  1.08702755 -0.58906482  0.88573533
  -0.98362499 -1.0847756   1.14704608 -0.20979889 -1.00984554]
 [ 1.04222715  0.04664784 -0.86346971 -0.91993989  0.40141777 -1.58522262
  -0.62495459  0.92184964 -0.87180455 -0.20979889  0.99025045]
 [ 0.13955439  0.04664784 -1.77132284 -0

In [59]:
# Model Training
start = time.time()
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
stop = time.time()

print(f"Training Time: {stop - start} seconds")


Training Time: 56.729031562805176 seconds


In [60]:
# Save Model
model_filename = 'insurance_model.pkl'
with open(model_filename, 'wb') as file:
    pickle.dump(model, file)
print(f"Model saved to {model_filename}")


Model saved to insurance_model.pkl


In [61]:
# Model Evaluation
y_pred = model.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("Confusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)
print("\nAccuracy:", accuracy)


Confusion Matrix:
 [[64790  1909]
 [ 8376  1147]]

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.97      0.93     66699
           1       0.38      0.12      0.18      9523

    accuracy                           0.87     76222
   macro avg       0.63      0.55      0.55     76222
weighted avg       0.82      0.87      0.83     76222


Accuracy: 0.8650652042717326


In [62]:
# Example predictions
example_data = pd.DataFrame(X_test[:5], columns=X.columns)
example_predictions = model.predict(example_data)

example_data['Predicted_Response'] = example_predictions
print(example_data)


        Age  Driving_License  Region_Code  Previously_Insured  Annual_Premium  \
0 -1.149978         0.046648    -1.771323            1.087028       -0.589065   
1  1.042227         0.046648    -0.863470           -0.919940        0.401418   
2  0.139554         0.046648    -1.771323           -0.919940       -1.619422   
3 -0.827595         0.046648    -1.166087           -0.919940       -1.619422   
4  0.784321         0.046648     1.027891           -0.919940        0.199032   

   Policy_Sales_Channel   Vintage  Gender_Male  Vehicle_Age_< 1 Year  \
0              0.885735 -0.983625    -1.084776              1.147046   
1             -1.585223 -0.624955     0.921850             -0.871805   
2             -1.585223 -0.768423    -1.084776             -0.871805   
3              0.719775  1.359688    -1.084776              1.147046   
4              0.221896  1.323821     0.921850             -0.871805   

   Vehicle_Age_> 2 Years  Vehicle_Damage_Yes  Predicted_Response  
0            

