In [1]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
import time

In [2]:
# Load the training and testing CSV files into DataFrames
trainfile = r'Insurance Fraud - TRAIN-3000.csv'
testfile = r'Insurance Fraud -TEST-12900.csv'
trainData, testData = (pd.read_csv(csv_path) for csv_path in (trainfile, testfile))

# Training set: (rows, columns) followed by its first five rows
print(trainData.shape, trainData.head(), sep="\n")

(2999, 32)
  MONTH  WEEKOFMONTH  DAYOFWEEK     MAKE ACCIDENTAREA DAYOFWEEKCLAIMED  \
0   Jul            3     Sunday    Honda        Rural        Wednesday   
1   Nov            5     Monday    Mazda        Urban        Wednesday   
2   Jan            1     Monday  Pontiac        Urban        Wednesday   
3   Dec            1     Monday   Toyota        Rural          Tuesday   
4   Dec            5  Wednesday  Pontiac        Urban        Wednesday   

  MONTHCLAIMED  WEEKOFMONTHCLAIMED   SEX MARITALSTATUS  ...  \
0          Jan                   4  Male       Married  ...   
1          Dec                   1  Male        Single  ...   
2          Jan                   1  Male       Married  ...   
3          May                   3  Male       Married  ...   
4          Jan                   1  Male        Single  ...   

   AGEOFPOLICYHOLDER POLICEREPORTFILED WITNESSPRESENT AGENTTYPE  \
0           26_to_30                No             No  External   
1            over_65           

In [3]:
# Testing set: (rows, columns) followed by its first five rows
print(testData.shape, testData.head(), sep="\n")

(12918, 32)
  MONTH  WEEKOFMONTH  DAYOFWEEK     MAKE ACCIDENTAREA DAYOFWEEKCLAIMED  \
0   Jul            3     Sunday    Honda        Rural        Wednesday   
1   Nov            5     Monday    Mazda        Urban        Wednesday   
2   Jan            1     Monday  Pontiac        Urban        Wednesday   
3   Dec            1     Monday   Toyota        Rural          Tuesday   
4   Dec            5  Wednesday  Pontiac        Urban        Wednesday   

  MONTHCLAIMED  WEEKOFMONTHCLAIMED   SEX MARITALSTATUS  ...  \
0          Jan                   4  Male       Married  ...   
1          Dec                   1  Male        Single  ...   
2          Jan                   1  Male       Married  ...   
3          May                   3  Male       Married  ...   
4          Jan                   1  Male        Single  ...   

   AGEOFPOLICYHOLDER POLICEREPORTFILED WITNESSPRESENT AGENTTYPE  \
0           26_to_30                No             No  External   
1            over_65          

In [4]:
# Checking for null values: info() prints each column's non-null count and
# dtype, which can be compared with the number of index entries to spot
# columns that contain missing data
trainData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2999 entries, 0 to 2998
Data columns (total 32 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   MONTH                 2999 non-null   object
 1   WEEKOFMONTH           2999 non-null   int64 
 2   DAYOFWEEK             2999 non-null   object
 3   MAKE                  2999 non-null   object
 4   ACCIDENTAREA          2999 non-null   object
 5   DAYOFWEEKCLAIMED      2999 non-null   object
 6   MONTHCLAIMED          2999 non-null   object
 7   WEEKOFMONTHCLAIMED    2999 non-null   int64 
 8   SEX                   2999 non-null   object
 9   MARITALSTATUS         2999 non-null   object
 10  AGE                   2999 non-null   int64 
 11  FAULT                 2999 non-null   object
 12  POLICYTYPE            2999 non-null   object
 13  VEHICLECATEGORY       2999 non-null   object
 14  VEHICLEPRICE          2999 non-null   object
 15  REPNUMBER             2999 non-null   

In [5]:
# Class balance of the target: number of rows per FRAUDFOUND label
trainData["FRAUDFOUND"].value_counts()

No     2600
Yes     399
Name: FRAUDFOUND, dtype: int64

In [6]:
# Feature-only copies of both datasets: every column except the last one,
# which is read separately as the target
trainData_Copy, testData_Copy = (
    frame.iloc[:, :-1].copy() for frame in (trainData, testData)
)

In [7]:
# Columns to one-hot encode: every training feature column that is also
# present in the testing features (order follows the training set)
categorical_features = [
    name for name in trainData_Copy.columns if name in testData_Copy.columns
]

In [8]:
# AGE is left out of the one-hot encoding list, so it is not expanded into
# dummy columns and stays a single numeric column in the encoded features.
# Filtering (rather than list.remove) keeps this cell safe to re-run: a second
# execution no longer raises ValueError because "AGE" was already removed.
categorical_features = [name for name in categorical_features if name != "AGE"]
print(categorical_features)

['MONTH', 'WEEKOFMONTH', 'DAYOFWEEK', 'MAKE', 'ACCIDENTAREA', 'DAYOFWEEKCLAIMED', 'MONTHCLAIMED', 'WEEKOFMONTHCLAIMED', 'SEX', 'MARITALSTATUS', 'FAULT', 'POLICYTYPE', 'VEHICLECATEGORY', 'VEHICLEPRICE', 'REPNUMBER', 'DEDUCTIBLE', 'DRIVERRATING', 'DAYS_POLICY_ACCIDENT', 'DAYS_POLICY_CLAIM', 'PASTNUMBEROFCLAIMS', 'AGEOFVEHICLE', 'AGEOFPOLICYHOLDER', 'POLICEREPORTFILED', 'WITNESSPRESENT', 'AGENTTYPE', 'NUMBEROFSUPPLIMENTS', 'ADDRESSCHANGE_CLAIM', 'NUMBEROFCARS', 'YEAR', 'BASEPOLICY']


In [9]:
# One-hot encode train and test together so both end up with the same dummy
# columns; the outer index key records each row's origin (0 = train, 1 = test)
combined_data = pd.get_dummies(
    pd.concat([trainData_Copy, testData_Copy], keys=[0, 1]),
    columns=categorical_features,
)

# Split the encoded frame back into its two parts via the outer index key
X_train, X_test = combined_data.xs(0), combined_data.xs(1)

print('----------------------------------------------------------------')
print(X_train.head(), X_test.head(), sep="\n")

----------------------------------------------------------------
   AGE  MONTH_Apr  MONTH_Aug  MONTH_Dec  MONTH_Feb  MONTH_Jan  MONTH_Jul  \
0   21          0          0          0          0          0          1   
1   68          0          0          0          0          0          0   
2   50          0          0          0          0          1          0   
3   39          0          0          1          0          0          0   
4   43          0          0          1          0          0          0   

   MONTH_Jun  MONTH_Mar  MONTH_May  ...  NUMBEROFCARS_2-vehicles  \
0          0          0          0  ...                        0   
1          0          0          0  ...                        0   
2          0          0          0  ...                        0   
3          0          0          0  ...                        1   
4          0          0          0  ...                        0   

   NUMBEROFCARS_3_to_4  NUMBEROFCARS_5_to_8  NUMBEROFCARS_more_than_8

In [10]:
# Shapes of the encoded feature matrices: "shape1" is train, "shape2" is test
for tag, matrix in (("shape1", X_train), ("shape2", X_test)):
    print(tag, matrix.shape)

shape1 (2999, 175)
shape2 (12918, 175)


In [11]:
# Target labels: the last column of each original dataset
y_train, y_test = trainData.iloc[:, -1], testData.iloc[:, -1]
print(y_train.head(), y_test.head(), sep="\n")

0    Yes
1    Yes
2    Yes
3    Yes
4    Yes
Name: FRAUDFOUND, dtype: object
0    Yes
1    Yes
2    Yes
3    Yes
4    Yes
Name: FRAUDFOUND, dtype: object


In [12]:
# Baseline Decision Tree classifier with default hyperparameters.
# random_state is fixed so that re-running the notebook reproduces the same
# tree: scikit-learn permutes the candidate features at every split, so an
# unseeded fit can change whenever several splits tie.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)
clf_predict=clf.predict(X_test)

# Test-set accuracy, confusion matrix and per-class precision/recall/F1
print("Accuracy score (testing) - Decision Tree:{0:6f}".format(clf.score(X_test,y_test)))
print("Confusion Matrix for Decision Tree")
print(confusion_matrix(y_test,clf_predict))
print("Classification Report")
print(classification_report(y_test,clf_predict))

Accuracy score (testing) - Decision Tree:0.886825
Confusion Matrix for Decision Tree
[[11016  1404]
 [   58   440]]
Classification Report
              precision    recall  f1-score   support

          No       0.99      0.89      0.94     12420
         Yes       0.24      0.88      0.38       498

    accuracy                           0.89     12918
   macro avg       0.62      0.89      0.66     12918
weighted avg       0.97      0.89      0.92     12918



In [13]:
# Random Search for Decision Tree
start_time = time.time()
print("Random SearchCV - Decision Tree")

# Search space shared by the random search here and the grid search that follows
parameters={'min_samples_leaf' : range(5,50,5),
            'max_depth': range(5,30,5),
            'criterion':['gini','entropy'],
            'max_features':range(5,20)
            }

# Evaluate 10 sampled parameter combinations with 5-fold cross-validation.
# random_state fixes which combinations are sampled, so the reported best
# parameters are reproducible across runs.
clf_random = RandomizedSearchCV(clf,parameters,n_iter=10,cv=5,random_state=42)
clf_random.fit(X_train, y_train)
grid_param=clf_random.best_params_
print(grid_param)
print("--- %s seconds ---" % (time.time() - start_time))

Random SearchCV - Decision Tree
{'min_samples_leaf': 15, 'max_features': 15, 'max_depth': 20, 'criterion': 'entropy'}
--- 0.4526519775390625 seconds ---


In [14]:
# Exhaustive grid search over the decision-tree search space (5-fold CV), timed
start_time = time.time()
print("GridSearchCV - Decision Tree")
clf_grid = GridSearchCV(clf, parameters, cv=5).fit(X_train, y_train)

# Best parameter combination found by the grid search
grid_param_1 = clf_grid.best_params_
print(grid_param_1)

elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

GridSearchCV - Decision Tree
{'criterion': 'gini', 'max_depth': 20, 'max_features': 13, 'min_samples_leaf': 20}
--- 55.52418780326843 seconds ---


In [15]:
# Refit decision trees with the hyperparameters found by the two searches.
# NOTE: despite its name, grid_param holds the random-search result;
# grid_param_1 holds the grid-search result.
# random_state is fixed because the tuned trees set max_features, which draws
# a random feature subset at each split, so unseeded fits are not repeatable.
clf = DecisionTreeClassifier(random_state=42, **grid_param)
clf1 = DecisionTreeClassifier(random_state=42, **grid_param_1)

clf.fit(X_train,y_train)
clf_predict = clf.predict(X_test)
clf1.fit(X_train,y_train)
clf1_predict = clf1.predict(X_test)

# Accuracy, confusion matrix and classification report on the test set
print("Accuracy (testing) after random search for Decision Tree:{0:6f}".format(clf.score(X_test,y_test)))
print("Accuracy (testing) after grid search for Decision Tree:{0:6f}".format(clf1.score(X_test,y_test)))
print("\n")
print("Confusion Matrix after Random Search for Decision Tree (Testing)")
print(confusion_matrix(y_test,clf_predict))
print("Classification Report for Random Search")
print(classification_report(y_test,clf_predict))
print('\n')
print("Confusion Matrix after Grid Search for Decision Tree (Testing)")
print(confusion_matrix(y_test,clf1_predict))
print("Classification Report for Grid Search (Testing)")
print(classification_report(y_test,clf1_predict))

# 5-fold cross-validated balanced accuracy of the random-search tree,
# computed on the training data only
clf_cv_score = cross_val_score(clf, X_train, y_train, cv=5, scoring="balanced_accuracy")
print(clf_cv_score)
print("\n")

Accuracy (testing) after random search for Decision Tree:0.872271
Accuracy (testing) after grid search for Decision Tree:0.898127


Confusion Matrix after Random Search for Decision Tree (Testing)
[[11086  1334]
 [  316   182]]
Classification Report for Random Search
              precision    recall  f1-score   support

          No       0.97      0.89      0.93     12420
         Yes       0.12      0.37      0.18       498

    accuracy                           0.87     12918
   macro avg       0.55      0.63      0.56     12918
weighted avg       0.94      0.87      0.90     12918



Confusion Matrix after Grid Search for Decision Tree (Testing)
[[11446   974]
 [  342   156]]
Classification Report for Grid Search (Testing)
              precision    recall  f1-score   support

          No       0.97      0.92      0.95     12420
         Yes       0.14      0.31      0.19       498

    accuracy                           0.90     12918
   macro avg       0.55      0.62      0.57

In [16]:
# Hyperparameter search space for the random forest; it is not used by the
# baseline fit below, only by the random-search and grid-search cells
rand_param={'min_samples_leaf' : range(5,25),
                 'max_depth': range(2,20,2),
                 'max_features':range(5,15),
                 'n_estimators':[10,15,20]}

# Baseline Random Forest model with default hyperparameters.
# random_state is fixed so the bootstrap samples and feature subsets, and
# therefore the reported metrics, are reproducible across runs.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)
rfc_predict=rfc.predict(X_test)

# Test-set accuracy, confusion matrix and per-class precision/recall/F1
print("Accuracy Score - Random Forest:{0:6f}".format(rfc.score(X_test,y_test)))
print("Confusion Matrix for Random Forest")
print(confusion_matrix(y_test,rfc_predict))
print("Classification Report for Random Forest")
print(classification_report(y_test,rfc_predict))

Accuracy Score - Random Forest:0.950844
Confusion Matrix for Random Forest
[[11864   556]
 [   79   419]]
Classification Report for Random Forest
              precision    recall  f1-score   support

          No       0.99      0.96      0.97     12420
         Yes       0.43      0.84      0.57       498

    accuracy                           0.95     12918
   macro avg       0.71      0.90      0.77     12918
weighted avg       0.97      0.95      0.96     12918



In [17]:
# Random Search for Random Forest (time is already imported in the first cell)
start_time = time.time()

# A fresh, seeded base estimator is used instead of the mutable `rfc`, so
# re-running this cell does not feed the previously tuned model back in.
# random_state on the search fixes which 10 combinations are sampled.
rfc_random = RandomizedSearchCV(RandomForestClassifier(random_state=42),rand_param,n_iter=10,cv=5,random_state=42)
rfc_random.fit(X_train, y_train)
rand_param_rfc=rfc_random.best_params_
print(rand_param_rfc)

# Refit a forest with the best sampled parameters and predict on the test set;
# `rfc` and `rfc_predict` are overwritten and read by the reporting cell
rfc= RandomForestClassifier(random_state=42, **rand_param_rfc)
rfc.fit(X_train,y_train)
rfc_predict = rfc.predict(X_test)
print('\n')
# Elapsed time covers the search plus the final refit and prediction
print("--- %s seconds ---" % (time.time() - start_time))

{'n_estimators': 20, 'min_samples_leaf': 7, 'max_features': 14, 'max_depth': 10}


--- 1.4717299938201904 seconds ---


In [18]:
# Grid Search for Random Forest (time is already imported in the first cell)
start_time = time.time()

# A fresh, seeded base estimator makes the search independent of whatever
# `rfc` currently holds; every hyperparameter in rand_param is overridden by
# the grid anyway. n_jobs=-1 spreads the exhaustive search across all CPU
# cores; with a fixed random_state the selected parameters do not depend on it.
rfc_grid = GridSearchCV(RandomForestClassifier(random_state=42),rand_param,cv=5,n_jobs=-1)
rfc_grid.fit(X_train, y_train)
grid_param_rfc=rfc_grid.best_params_
print(grid_param_rfc)

# Refit a forest with the best grid parameters and predict on the test set
rfc1= RandomForestClassifier(random_state=42, **grid_param_rfc)
rfc1.fit(X_train,y_train)
rfc1_predict = rfc1.predict(X_test)
print('\n')
# Elapsed time covers the search plus the final refit and prediction
print("--- %s seconds ---" % (time.time() - start_time))

{'max_depth': 14, 'max_features': 13, 'min_samples_leaf': 5, 'n_estimators': 15}


--- 724.8738648891449 seconds ---


In [19]:
# Test-set metrics for the random-search forest (rfc / rfc_predict)
print("Accuracy (testing) after Random Search for Random Forest:{0:6f}".format(rfc.score(X_test,y_test)))
print("Confusion Matrix after Random Search for Random Forest")
print(confusion_matrix(y_test,rfc_predict))
# Heading corrected: this report is computed from rfc_predict, i.e. the
# random-search model, not the grid-search one
print("Classification Report after Random Search for Random Forest")
print(classification_report(y_test,rfc_predict))

# Test-set metrics for the grid-search forest (rfc1 / rfc1_predict)
print("Accuracy (testing) after Grid Search for Random Forest:{0:6f}".format(rfc1.score(X_test,y_test)))
print("Confusion Matrix after Grid Search for Random Forest")
print(confusion_matrix(y_test,rfc1_predict))
print("Classification Report after Grid Search for Random Forest")
print(classification_report(y_test,rfc1_predict))

# Cross-validation scores of the grid-search forest on the training data,
# using cross_val_score's default cv and scoring.
# NOTE(review): the decision-tree cell passes cv=5 and
# scoring="balanced_accuracy"; the two score arrays are therefore computed
# with different metrics — confirm whether that is intended.
rfc1_cv_score = cross_val_score(rfc1, X_train, y_train)
print(rfc1_cv_score)

Accuracy (testing) after Random Search for Random Forest:0.947747
Confusion Matrix after Random Search for Random Forest
[[12152   268]
 [  407    91]]
Classification Report after Grid Search for Random Forest
              precision    recall  f1-score   support

          No       0.97      0.98      0.97     12420
         Yes       0.25      0.18      0.21       498

    accuracy                           0.95     12918
   macro avg       0.61      0.58      0.59     12918
weighted avg       0.94      0.95      0.94     12918

Accuracy (testing) after Grid Search for Random Forest:0.943877
Confusion Matrix after Grid Search for Random Forest
[[12065   355]
 [  370   128]]
Classification Report after Grid Search for Random Forest
              precision    recall  f1-score   support

          No       0.97      0.97      0.97     12420
         Yes       0.27      0.26      0.26       498

    accuracy                           0.94     12918
   macro avg       0.62      0.61      