In [135]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Models import
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

label_encoder = LabelEncoder()

In [136]:
train_data = pd.read_csv("Data/Train_Claim_Master.csv", na_values=["?", "MISSEDDATA"])

train_data.head()

Unnamed: 0,CustomerID,DateOfIncident,TypeOfIncident,TypeOfCollission,SeverityOfIncident,AuthoritiesContacted,IncidentState,IncidentCity,IncidentAddress,IncidentTime,...,DateOfPolicyCoverage,InsurancePolicyState,Policy_CombinedSingleLimit,Policy_Deductible,PolicyAnnualPremium,UmbrellaLimit,InsuredRelationship,VehicleAttribute,VehicleAttributeDetails,ReportedFraud
0,Cust10000,03-02-2015,Multi-vehicle Collision,Side Collision,Total Loss,Police,State7,City1,Location 1311,17,...,36093,State1,100/300,1000,1632.73,0,not-in-family,VehicleID,Vehicle26917,N
1,Cust10001,02-02-2015,Multi-vehicle Collision,Side Collision,Total Loss,Police,State7,City5,Location 1311,10,...,36845,State1,100/300,1000,1255.19,0,not-in-family,VehicleYOM,2006,N
2,Cust10002,15-01-2015,Single Vehicle Collision,Side Collision,Minor Damage,Other,State8,City6,Location 2081,22,...,36934,State3,500/1000,617,1373.38,0,wife,VehicleModel,Jetta,N
3,Cust10003,19-01-2015,Single Vehicle Collision,Side Collision,Minor Damage,Other,State9,City6,Location 2081,22,...,38453,State2,500/1000,722,1337.6,0,own-child,VehicleID,Vehicle37363,N
4,Cust10004,09-01-2015,Single Vehicle Collision,Rear Collision,Minor Damage,Fire,State8,City6,Location 1695,10,...,35363,State2,100/300,500,1353.73,4279863,unmarried,VehicleID,Vehicle28633,N


In [137]:
def get_categorical_columns(df):
    categorical_columns = []
    threshold = 8000
    object_columns = df.select_dtypes(include=['object']).columns

    unique_values = df[object_columns].nunique()
    print(unique_values)
    for k, v in unique_values.items():
        if v <= threshold:
            categorical_columns.append(k)

    return categorical_columns

In [138]:
# Preprocessing

# Dropping rows with NA values. 
train_data = train_data.dropna()


# Removing "Cust" from CustomerId and "Location" from IncidentAddress.
train_data['CustomerID'] = train_data['CustomerID'].str.replace('Cust', '')
train_data['IncidentAddress'] = train_data['IncidentAddress'].str.replace('Location ', '').astype('int64').astype('int32')


# Converting dates to numeric values.
train_data['DateOfIncident'] = pd.to_datetime(train_data['DateOfIncident'], format='%d-%m-%Y')
train_data['DateOfIncident'] = train_data['DateOfIncident'].astype('int64').astype('int32')
train_data['DateOfPolicyCoverage'] = train_data['DateOfPolicyCoverage'].astype('int64').astype('int32')


In [139]:
# train_data["ReportedFraud"].value_counts(normalize=True)

In [140]:
# Splitting train and test data. 
x = train_data.drop('ReportedFraud', axis=1)
y = train_data['ReportedFraud']

# Perform one-hot encoding on the categorical columns
categorical_columns = get_categorical_columns(x)
x = pd.get_dummies(x, columns=categorical_columns)

y = label_encoder.fit_transform(y)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)


CustomerID                    9677
TypeOfIncident                   4
TypeOfCollission                 3
SeverityOfIncident               4
AuthoritiesContacted             4
IncidentState                    7
IncidentCity                     7
PropertyDamage                   2
Witnesses                        5
PoliceReport                     2
InsuredGender                    2
InsuredEducationLevel            7
InsuredOccupation               14
InsuredHobbies                  20
Country                          2
InsurancePolicyState             3
Policy_CombinedSingleLimit       9
InsuredRelationship              6
VehicleAttribute                 4
VehicleAttributeDetails       2454
dtype: int64


In [141]:
# Using Logistic Regression

lr = LogisticRegression()
lr.fit(x_train, y_train)
predlr = lr.predict(x_test)

acc_score = accuracy_score(y_test, predlr)
f1score = f1_score(y_test, predlr)
con_matrix = confusion_matrix(y_test, predlr)
cla_report = classification_report(y_test, predlr)

print(acc_score * 100)
print(f1score * 100)
print(con_matrix)
print(cla_report)

74.17355371900827
35.78767123287671
[[1945  138]
 [ 612  209]]
              precision    recall  f1-score   support

           0       0.76      0.93      0.84      2083
           1       0.60      0.25      0.36       821

    accuracy                           0.74      2904
   macro avg       0.68      0.59      0.60      2904
weighted avg       0.72      0.74      0.70      2904



In [142]:
# Using Decision Tree Classifier

dtc = DecisionTreeClassifier()
dtc.fit(x_train, y_train)
preddtc = dtc.predict(x_test)

acc_score = accuracy_score(y_test, preddtc)
f1score = f1_score(y_test, preddtc)
con_matrix = confusion_matrix(y_test, preddtc)
cla_report = classification_report(y_test, preddtc)

print(acc_score * 100)
print(f1score * 100)
print(con_matrix)
print(cla_report)

87.5
77.98665858095816
[[1898  185]
 [ 178  643]]
              precision    recall  f1-score   support

           0       0.91      0.91      0.91      2083
           1       0.78      0.78      0.78       821

    accuracy                           0.88      2904
   macro avg       0.85      0.85      0.85      2904
weighted avg       0.88      0.88      0.88      2904



In [143]:
# Using Random Forest Classifier.

rfc = RandomForestClassifier()
rfc.fit(x_train, y_train)
predrfc = rfc.predict(x_test)

acc_score = accuracy_score(y_test, predrfc)
f1score = f1_score(y_test, predrfc)
con_matrix = confusion_matrix(y_test, predrfc)
cla_report = classification_report(y_test, predrfc)
auc = roc_auc_score(y_test, predrfc)
fpr, tpr, thresholds = roc_curve(y_test, predrfc)

print(acc_score * 100)
print(f1score * 100)
print(con_matrix)
print(cla_report)
print(auc)

91.73553719008265
84.08488063660477
[[2030   53]
 [ 187  634]]
              precision    recall  f1-score   support

           0       0.92      0.97      0.94      2083
           1       0.92      0.77      0.84       821

    accuracy                           0.92      2904
   macro avg       0.92      0.87      0.89      2904
weighted avg       0.92      0.92      0.91      2904

0.8733924589931953
