<a href="https://colab.research.google.com/github/IndraniMandal/New-Revisions/blob/main/Fraud_dataset_cleanup.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import io

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

sns.set()

In [None]:
! pip install kaggle

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
! mkdir ~/.kaggle

In [None]:
!cp /content/drive/MyDrive/kaggle.json ~/.kaggle/kaggle.json

In [None]:
# Download the vehicle-insurance-fraud-detection dataset archive with the Kaggle CLI
# (the CLI reads its credentials from ~/.kaggle/kaggle.json).
! kaggle datasets download khusheekapoor/vehicle-insurance-fraud-detection

Downloading vehicle-insurance-fraud-detection.zip to /content
  0% 0.00/349k [00:00<?, ?B/s]
100% 349k/349k [00:00<00:00, 86.5MB/s]


In [None]:
!unzip /content/vehicle-insurance-fraud-detection.zip
!rm /content/vehicle-insurance-fraud-detection.zip

Archive:  /content/vehicle-insurance-fraud-detection.zip
  inflating: carclaims.csv           


Vehicle insurance fraud dataset used: [Vehicle Insurance Fraud Detection](https://www.kaggle.com/datasets/khusheekapoor/vehicle-insurance-fraud-detection)


In [None]:
# Load the extracted claims file into a DataFrame and preview its first five rows.
fraud = pd.read_csv(filepath_or_buffer="/content/carclaims.csv")
fraud.head(n=5)

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange-Claim,NumberOfCars,Year,BasePolicy,FraudFound
0,Dec,5,Wednesday,Honda,Urban,Tuesday,Jan,1,Female,Single,...,26 to 30,No,No,External,none,1 year,3 to 4,1994,Liability,No
1,Jan,3,Wednesday,Honda,Urban,Monday,Jan,4,Male,Single,...,31 to 35,Yes,No,External,none,no change,1 vehicle,1994,Collision,No
2,Oct,5,Friday,Honda,Urban,Thursday,Nov,2,Male,Married,...,41 to 50,No,No,External,none,no change,1 vehicle,1994,Collision,No
3,Jun,2,Saturday,Toyota,Rural,Friday,Jul,1,Male,Married,...,51 to 65,Yes,No,External,more than 5,no change,1 vehicle,1994,Liability,No
4,Jan,5,Monday,Honda,Urban,Tuesday,Feb,2,Female,Single,...,31 to 35,No,No,External,none,no change,1 vehicle,1994,Collision,No


In [None]:
# Summarize the columns of the freshly loaded frame: dtypes and non-null counts.
fraud.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15420 entries, 0 to 15419
Data columns (total 33 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Month                 15420 non-null  object
 1   WeekOfMonth           15420 non-null  int64 
 2   DayOfWeek             15420 non-null  object
 3   Make                  15420 non-null  object
 4   AccidentArea          15420 non-null  object
 5   DayOfWeekClaimed      15420 non-null  object
 6   MonthClaimed          15420 non-null  object
 7   WeekOfMonthClaimed    15420 non-null  int64 
 8   Sex                   15420 non-null  object
 9   MaritalStatus         15420 non-null  object
 10  Age                   15420 non-null  int64 
 11  Fault                 15420 non-null  object
 12  PolicyType            15420 non-null  object
 13  VehicleCategory       15420 non-null  object
 14  VehiclePrice          15420 non-null  object
 15  PolicyNumber          15420 non-null

# Reformatting the categorical data into numerical data


Formatting the months into values from 0-11 in their correct order.


In [None]:
# Inspect the distinct values currently stored in Month.
fraud["Month"].unique()

array(['Dec', 'Jan', 'Oct', 'Jun', 'Feb', 'Nov', 'Apr', 'Mar', 'Aug',
       'Jul', 'May', 'Sep'], dtype=object)

In [None]:
# Encode month abbreviations as 0-11 in calendar order.
# The result is assigned back to the column: replace(inplace=True) on fraud['Month']
# is chained assignment and leaves `fraud` unchanged under pandas Copy-on-Write.
# astype makes the integer dtype explicit and raises if any label was left unmapped.
month_codes = {'Jan': 0, 'Feb': 1, 'Mar': 2, 'Apr': 3, 'May': 4, 'Jun': 5,
               'Jul': 6, 'Aug': 7, 'Sep': 8, 'Oct': 9, 'Nov': 10, 'Dec': 11}
fraud['Month'] = fraud['Month'].replace(month_codes).astype('int64')

In [None]:
# Verify the encoding by listing the distinct values of Month again.
fraud["Month"].unique()

array([11,  0,  9,  5,  1, 10,  3,  2,  7,  6,  4,  8])

Formatting the days of the week from 0-6 starting from Sunday.

In [None]:
# Inspect the distinct values currently stored in DayOfWeek.
fraud["DayOfWeek"].unique()

array(['Wednesday', 'Friday', 'Saturday', 'Monday', 'Tuesday', 'Sunday',
       'Thursday'], dtype=object)

In [None]:
# Encode weekday names as 0-6 starting from Sunday.
# Assigned back to the column: replace(inplace=True) on fraud["DayOfWeek"] is chained
# assignment and leaves `fraud` unchanged under pandas Copy-on-Write.
# astype makes the integer dtype explicit and raises if any label was left unmapped.
day_codes = {'Sunday': 0, 'Monday': 1, 'Tuesday': 2, 'Wednesday': 3,
             'Thursday': 4, 'Friday': 5, 'Saturday': 6}
fraud["DayOfWeek"] = fraud["DayOfWeek"].replace(day_codes).astype('int64')

In [None]:
# Verify the encoding by listing the distinct values of DayOfWeek again.
fraud["DayOfWeek"].unique()

array([3, 5, 6, 1, 2, 0, 4])

Formatting the car makes from 0-18 in no particular order. A key would be needed for further evaluation.

In [None]:
# Inspect the distinct values currently stored in Make.
fraud["Make"].unique()

array(['Honda', 'Toyota', 'Ford', 'Mazda', 'Chevrolet', 'Pontiac',
       'Accura', 'Dodge', 'Mercury', 'Jaguar', 'Nisson', 'VW', 'Saab',
       'Saturn', 'Porche', 'BMW', 'Mecedes', 'Ferrari', 'Lexus'],
      dtype=object)

In [None]:
# Encode car makes as 0-18 (nominal codes, no meaningful order). The keys keep the
# dataset's own spellings ('Accura', 'Nisson', 'Porche', 'Mecedes') so they match the data;
# the dict doubles as the lookup key for decoding later.
# Assigned back to the column: replace(inplace=True) on fraud["Make"] is chained
# assignment and leaves `fraud` unchanged under pandas Copy-on-Write.
# astype makes the integer dtype explicit and raises if any label was left unmapped.
make_codes = {'Honda': 0, 'Toyota': 1, 'Ford': 2, 'Mazda': 3, 'Chevrolet': 4, 'Pontiac': 5,
              'Accura': 6, 'Dodge': 7, 'Mercury': 8, 'Jaguar': 9, 'Nisson': 10, 'VW': 11,
              'Saab': 12, 'Saturn': 13, 'Porche': 14, 'BMW': 15, 'Mecedes': 16,
              'Ferrari': 17, 'Lexus': 18}
fraud["Make"] = fraud["Make"].replace(make_codes).astype('int64')

In [None]:
# Verify the encoding by listing the distinct values of Make again.
fraud["Make"].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18])

Formatted as simple binary values: Urban = 0, Rural = 1.

In [None]:
# Inspect the distinct values currently stored in AccidentArea.
fraud["AccidentArea"].unique()

array(['Urban', 'Rural'], dtype=object)

In [None]:
# Encode accident area as a binary flag: Urban = 0, Rural = 1.
# Assigned back to the column: replace(inplace=True) on fraud["AccidentArea"] is chained
# assignment and leaves `fraud` unchanged under pandas Copy-on-Write.
# astype makes the integer dtype explicit and raises if any label was left unmapped.
area_codes = {'Urban': 0, 'Rural': 1}
fraud["AccidentArea"] = fraud["AccidentArea"].replace(area_codes).astype('int64')

In [None]:
# Verify the encoding by listing the distinct values of AccidentArea again.
fraud["AccidentArea"].unique()

array([0, 1])

Formatting the days of the week from 0-7 starting from Sunday. 7 is the representation of a null value, or that it was never reported.

In [None]:
# Inspect the distinct values currently stored in DayOfWeekClaimed.
fraud["DayOfWeekClaimed"].unique()

array(['Tuesday', 'Monday', 'Thursday', 'Friday', 'Wednesday', 'Saturday',
       'Sunday', '0'], dtype=object)

In [None]:
# Encode claim weekday as 0-6 starting from Sunday; the raw placeholder string '0'
# is mapped to the extra code 7.
# Assigned back to the column: replace(inplace=True) on fraud["DayOfWeekClaimed"] is
# chained assignment and leaves `fraud` unchanged under pandas Copy-on-Write.
# astype makes the integer dtype explicit and raises if any label was left unmapped.
day_claimed_codes = {'Sunday': 0, 'Monday': 1, 'Tuesday': 2, 'Wednesday': 3,
                     'Thursday': 4, 'Friday': 5, 'Saturday': 6, '0': 7}
fraud["DayOfWeekClaimed"] = fraud["DayOfWeekClaimed"].replace(day_claimed_codes).astype('int64')

In [None]:
# Verify the encoding by listing the distinct values of DayOfWeekClaimed again.
fraud["DayOfWeekClaimed"].unique()

array([2, 1, 4, 5, 3, 6, 0, 7])

Formatting the months into values from 0-12 in their correct order. 12 is the representation of a null value, or that it was never reported.

In [None]:
# Inspect the distinct values currently stored in MonthClaimed.
fraud["MonthClaimed"].unique()

array(['Jan', 'Nov', 'Jul', 'Feb', 'Mar', 'Dec', 'Apr', 'Aug', 'May',
       'Jun', 'Sep', 'Oct', '0'], dtype=object)

In [None]:
# Encode claim month as 0-11 in calendar order; the raw placeholder string '0'
# is mapped to the extra code 12.
# Assigned back to the column: replace(inplace=True) on fraud['MonthClaimed'] is
# chained assignment and leaves `fraud` unchanged under pandas Copy-on-Write.
# astype makes the integer dtype explicit and raises if any label was left unmapped.
month_claimed_codes = {'Jan': 0, 'Feb': 1, 'Mar': 2, 'Apr': 3, 'May': 4, 'Jun': 5,
                       'Jul': 6, 'Aug': 7, 'Sep': 8, 'Oct': 9, 'Nov': 10, 'Dec': 11,
                       '0': 12}
fraud['MonthClaimed'] = fraud['MonthClaimed'].replace(month_claimed_codes).astype('int64')

In [None]:
# Verify the encoding by listing the distinct values of MonthClaimed again.
fraud["MonthClaimed"].unique()

array([ 0, 10,  6,  1,  2, 11,  3,  7,  4,  5,  8,  9, 12])

Formatted as simple binary values; Male = 0, Female = 1

In [None]:
# Inspect the distinct values currently stored in Sex.
fraud["Sex"].unique()

array(['Female', 'Male'], dtype=object)

In [None]:
# Encode sex as a binary flag: Male = 0, Female = 1.
# Assigned back to the column: replace(inplace=True) on fraud["Sex"] is chained
# assignment and leaves `fraud` unchanged under pandas Copy-on-Write.
# astype makes the integer dtype explicit and raises if any label was left unmapped.
sex_codes = {'Male': 0, 'Female': 1}
fraud["Sex"] = fraud["Sex"].replace(sex_codes).astype('int64')

In [None]:
# Verify the encoding by listing the distinct values of Sex again.
fraud["Sex"].unique()

array([1, 0])

Formatting current marital status from 0-3, sorted in no particular order. A key would be required for further evaluation.

In [None]:
# Inspect the distinct values currently stored in MaritalStatus.
fraud["MaritalStatus"].unique()

array(['Single', 'Married', 'Widow', 'Divorced'], dtype=object)

In [None]:
# Encode marital status as 0-3 (nominal codes, no meaningful order); the dict
# doubles as the lookup key for decoding later.
# Assigned back to the column: replace(inplace=True) on fraud["MaritalStatus"] is
# chained assignment and leaves `fraud` unchanged under pandas Copy-on-Write.
# astype makes the integer dtype explicit and raises if any label was left unmapped.
marital_codes = {'Single': 0, 'Married': 1, 'Widow': 2, 'Divorced': 3}
fraud["MaritalStatus"] = fraud["MaritalStatus"].replace(marital_codes).astype('int64')

In [None]:
# Verify the encoding by listing the distinct values of MaritalStatus again.
fraud["MaritalStatus"].unique()

array([0, 1, 2, 3])

Formatted as simple binary values: Policy Holder = 0, Third Party = 1.

In [None]:
# Inspect the distinct values currently stored in Fault.
fraud["Fault"].unique()

array(['Policy Holder', 'Third Party'], dtype=object)

In [None]:
# Encode fault as a binary flag: Policy Holder = 0, Third Party = 1.
# Assigned back to the column: replace(inplace=True) on fraud["Fault"] is chained
# assignment and leaves `fraud` unchanged under pandas Copy-on-Write.
# astype makes the integer dtype explicit and raises if any label was left unmapped.
fault_codes = {'Policy Holder': 0, 'Third Party': 1}
fraud["Fault"] = fraud["Fault"].replace(fault_codes).astype('int64')

In [None]:
# Verify the encoding by listing the distinct values of Fault again.
fraud["Fault"].unique()

array([0, 1])

Formatting the policy type from 0-8. Sorted by vehicle category, then by base policy.

Vehicle category order: Sedan, Utility, Sport.

Base policy order: Liability, Collision, All Perils.

In [None]:
# Inspect the distinct values currently stored in PolicyType.
fraud["PolicyType"].unique()

array(['Sport - Liability', 'Sport - Collision', 'Sedan - Liability',
       'Utility - All Perils', 'Sedan - All Perils', 'Sedan - Collision',
       'Utility - Collision', 'Utility - Liability', 'Sport - All Perils'],
      dtype=object)

In [None]:
# Encode policy type as 0-8, grouped by vehicle category (Sedan, Utility, Sport) and,
# within each group, by coverage (Liability, Collision, All Perils).
# Assigned back to the column: replace(inplace=True) on fraud["PolicyType"] is chained
# assignment and leaves `fraud` unchanged under pandas Copy-on-Write.
# astype makes the integer dtype explicit and raises if any label was left unmapped.
policy_type_codes = {
    'Sedan - Liability': 0, 'Sedan - Collision': 1, 'Sedan - All Perils': 2,
    'Utility - Liability': 3, 'Utility - Collision': 4, 'Utility - All Perils': 5,
    'Sport - Liability': 6, 'Sport - Collision': 7, 'Sport - All Perils': 8,
}
fraud["PolicyType"] = fraud["PolicyType"].replace(policy_type_codes).astype('int64')

In [None]:
# Verify the encoding by listing the distinct values of PolicyType again.
fraud["PolicyType"].unique()

array([6, 7, 0, 5, 2, 1, 4, 3, 8])

Formatting the vehicle category from 0-2.
Order: Sedan, Utility, Sport

In [None]:
# Inspect the distinct values currently stored in VehicleCategory.
fraud["VehicleCategory"].unique()

array(['Sport', 'Utility', 'Sedan'], dtype=object)

In [None]:
# Encode vehicle category as 0-2 in the order Sedan, Utility, Sport.
# Assigned back to the column: replace(inplace=True) on fraud["VehicleCategory"] is
# chained assignment and leaves `fraud` unchanged under pandas Copy-on-Write.
# astype makes the integer dtype explicit and raises if any label was left unmapped.
vehicle_category_codes = {'Sedan': 0, 'Utility': 1, 'Sport': 2}
fraud["VehicleCategory"] = fraud["VehicleCategory"].replace(vehicle_category_codes).astype('int64')

In [None]:
# Verify the encoding by listing the distinct values of VehicleCategory again.
fraud["VehicleCategory"].unique()

array([2, 1, 0])

Formatting the vehicle price range from 0-5. Ordered from lowest to highest price range.

In [None]:
# Inspect the distinct values currently stored in VehiclePrice.
fraud["VehiclePrice"].unique()

array(['more than 69,000', '20,000 to 29,000', '30,000 to 39,000',
       'less than 20,000', '40,000 to 59,000', '60,000 to 69,000'],
      dtype=object)

In [None]:
# Encode the vehicle price bands as ordinal codes 0-5, lowest band first.
# Assigned back to the column: replace(inplace=True) on fraud["VehiclePrice"] is chained
# assignment and leaves `fraud` unchanged under pandas Copy-on-Write.
# astype makes the integer dtype explicit and raises if any label was left unmapped.
vehicle_price_codes = {'less than 20,000': 0, '20,000 to 29,000': 1, '30,000 to 39,000': 2,
                       '40,000 to 59,000': 3, '60,000 to 69,000': 4, 'more than 69,000': 5}
fraud["VehiclePrice"] = fraud["VehiclePrice"].replace(vehicle_price_codes).astype('int64')

In [None]:
# Verify the encoding by listing the distinct values of VehiclePrice again.
fraud["VehiclePrice"].unique()

array([5, 1, 2, 0, 3, 4])

In [None]:
# Inspect the distinct values currently stored in Days:Policy-Accident.
fraud["Days:Policy-Accident"].unique()

array(['more than 30', '15 to 30', 'none', '1 to 7', '8 to 15'],
      dtype=object)

In [None]:
# Encode the Days:Policy-Accident bands as ordinal codes 0-4, shortest span first.
# Assigned back to the column: replace(inplace=True) on the selected column is chained
# assignment and leaves `fraud` unchanged under pandas Copy-on-Write.
# astype makes the integer dtype explicit and raises if any label was left unmapped.
days_policy_accident_codes = {'none': 0, '1 to 7': 1, '8 to 15': 2,
                              '15 to 30': 3, 'more than 30': 4}
fraud["Days:Policy-Accident"] = (
    fraud["Days:Policy-Accident"].replace(days_policy_accident_codes).astype('int64')
)

In [None]:
# Verify the encoding by listing the distinct values of Days:Policy-Accident again.
fraud["Days:Policy-Accident"].unique()

array([4, 3, 0, 1, 2])

In [None]:
# Inspect the distinct values currently stored in Days:Policy-Claim.
fraud["Days:Policy-Claim"].unique()

array(['more than 30', '15 to 30', '8 to 15', 'none'], dtype=object)

In [None]:
# Encode the Days:Policy-Claim bands as ordinal codes 0-3, shortest span first.
# These codes are specific to this column: '8 to 15' is 1 here because the
# column has no '1 to 7' band.
# Assigned back to the column: replace(inplace=True) on the selected column is chained
# assignment and leaves `fraud` unchanged under pandas Copy-on-Write.
# astype makes the integer dtype explicit and raises if any label was left unmapped.
days_policy_claim_codes = {'none': 0, '8 to 15': 1, '15 to 30': 2, 'more than 30': 3}
fraud["Days:Policy-Claim"] = (
    fraud["Days:Policy-Claim"].replace(days_policy_claim_codes).astype('int64')
)

In [None]:
# Verify the encoding by listing the distinct values of Days:Policy-Claim again.
fraud["Days:Policy-Claim"].unique()

array([3, 2, 1, 0])

In [None]:
# Inspect the distinct values currently stored in PastNumberOfClaims.
fraud["PastNumberOfClaims"].unique()

array(['none', '1', '2 to 4', 'more than 4'], dtype=object)

In [None]:
# Encode the past-claims bands as ordinal codes 0-3, fewest claims first.
# Assigned back to the column: replace(inplace=True) on the selected column is chained
# assignment and leaves `fraud` unchanged under pandas Copy-on-Write.
# astype makes the integer dtype explicit and raises if any label was left unmapped.
past_claims_codes = {'none': 0, '1': 1, '2 to 4': 2, 'more than 4': 3}
fraud["PastNumberOfClaims"] = fraud["PastNumberOfClaims"].replace(past_claims_codes).astype('int64')

In [None]:
# Verify the encoding by listing the distinct values of PastNumberOfClaims again.
fraud["PastNumberOfClaims"].unique()

array([0, 1, 2, 3])

In [None]:
# Inspect the distinct values currently stored in AgeOfVehicle.
fraud["AgeOfVehicle"].unique()

array(['3 years', '6 years', '7 years', 'more than 7', '5 years', 'new',
       '4 years', '2 years'], dtype=object)

In [None]:
# Encode the vehicle age bands as ordinal codes 0-7, newest first.
# Assigned back to the column: replace(inplace=True) on fraud["AgeOfVehicle"] is chained
# assignment and leaves `fraud` unchanged under pandas Copy-on-Write.
# astype makes the integer dtype explicit and raises if any label was left unmapped.
vehicle_age_codes = {'new': 0, '2 years': 1, '3 years': 2, '4 years': 3,
                     '5 years': 4, '6 years': 5, '7 years': 6, 'more than 7': 7}
fraud["AgeOfVehicle"] = fraud["AgeOfVehicle"].replace(vehicle_age_codes).astype('int64')

In [None]:
# Verify the encoding by listing the distinct values of AgeOfVehicle again.
fraud["AgeOfVehicle"].unique()

array([2, 5, 6, 7, 4, 0, 3, 1])

In [None]:
# Inspect the distinct values currently stored in AgeOfPolicyHolder.
fraud["AgeOfPolicyHolder"].unique()

array(['26 to 30', '31 to 35', '41 to 50', '51 to 65', '21 to 25',
       '36 to 40', '16 to 17', 'over 65', '18 to 20'], dtype=object)

In [None]:
# Encode the policy-holder age bands as ordinal codes 0-8, youngest first.
# Assigned back to the column: replace(inplace=True) on the selected column is chained
# assignment and leaves `fraud` unchanged under pandas Copy-on-Write.
# astype makes the integer dtype explicit and raises if any label was left unmapped.
holder_age_codes = {'16 to 17': 0, '18 to 20': 1, '21 to 25': 2,
                    '26 to 30': 3, '31 to 35': 4, '36 to 40': 5,
                    '41 to 50': 6, '51 to 65': 7, 'over 65': 8}
fraud["AgeOfPolicyHolder"] = fraud["AgeOfPolicyHolder"].replace(holder_age_codes).astype('int64')

In [None]:
# Verify the encoding by listing the distinct values of AgeOfPolicyHolder again.
fraud["AgeOfPolicyHolder"].unique()

array([3, 4, 6, 7, 2, 5, 0, 8, 1])

In [None]:
# Inspect the distinct values currently stored in PoliceReportFiled.
fraud["PoliceReportFiled"].unique()

array(['No', 'Yes'], dtype=object)

In [None]:
# Encode PoliceReportFiled as a binary flag: No = 0, Yes = 1.
# Assigned back to the column: replace(inplace=True) on the selected column is chained
# assignment and leaves `fraud` unchanged under pandas Copy-on-Write.
# astype makes the integer dtype explicit and raises if any label was left unmapped.
police_report_codes = {'No': 0, 'Yes': 1}
fraud["PoliceReportFiled"] = fraud["PoliceReportFiled"].replace(police_report_codes).astype('int64')

In [None]:
# Verify the encoding by listing the distinct values of PoliceReportFiled again.
fraud["PoliceReportFiled"].unique()

array([0, 1])

In [None]:
# Inspect the distinct values currently stored in WitnessPresent.
fraud["WitnessPresent"].unique()

array(['No', 'Yes'], dtype=object)

In [None]:
# Encode WitnessPresent as a binary flag: No = 0, Yes = 1.
# Assigned back to the column: replace(inplace=True) on the selected column is chained
# assignment and leaves `fraud` unchanged under pandas Copy-on-Write.
# astype makes the integer dtype explicit and raises if any label was left unmapped.
witness_codes = {'No': 0, 'Yes': 1}
fraud["WitnessPresent"] = fraud["WitnessPresent"].replace(witness_codes).astype('int64')

In [None]:
# Verify the encoding by listing the distinct values of WitnessPresent again.
fraud["WitnessPresent"].unique()

array([0, 1])

In [None]:
# Inspect the distinct values currently stored in AgentType.
fraud["AgentType"].unique()

array(['External', 'Internal'], dtype=object)

In [None]:
# Encode AgentType as a binary flag: External = 0, Internal = 1.
# Assigned back to the column: replace(inplace=True) on fraud["AgentType"] is chained
# assignment and leaves `fraud` unchanged under pandas Copy-on-Write.
# astype makes the integer dtype explicit and raises if any label was left unmapped.
agent_type_codes = {'External': 0, 'Internal': 1}
fraud["AgentType"] = fraud["AgentType"].replace(agent_type_codes).astype('int64')

In [None]:
# Verify the encoding by listing the distinct values of AgentType again.
fraud["AgentType"].unique()

array([0, 1])

In [None]:
# Inspect the distinct values currently stored in NumberOfSuppliments.
fraud["NumberOfSuppliments"].unique()

array(['none', 'more than 5', '3 to 5', '1 to 2'], dtype=object)

In [None]:
# Encode the NumberOfSuppliments bands as ordinal codes 0-3, fewest first.
# Assigned back to the column: replace(inplace=True) on the selected column is chained
# assignment and leaves `fraud` unchanged under pandas Copy-on-Write.
# astype makes the integer dtype explicit and raises if any label was left unmapped.
supplement_codes = {'none': 0, '1 to 2': 1, '3 to 5': 2, 'more than 5': 3}
fraud["NumberOfSuppliments"] = fraud["NumberOfSuppliments"].replace(supplement_codes).astype('int64')

In [None]:
# Verify the encoding by listing the distinct values of NumberOfSuppliments again.
fraud["NumberOfSuppliments"].unique()

array([0, 3, 2, 1])

In [None]:
# Inspect the distinct values currently stored in AddressChange-Claim.
fraud["AddressChange-Claim"].unique()

array(['1 year', 'no change', '4 to 8 years', '2 to 3 years',
       'under 6 months'], dtype=object)

In [None]:
# Encode the AddressChange-Claim bands as ordinal codes 0-4: 'no change' first,
# then the time spans from shortest to longest.
# Assigned back to the column: replace(inplace=True) on the selected column is chained
# assignment and leaves `fraud` unchanged under pandas Copy-on-Write.
# astype makes the integer dtype explicit and raises if any label was left unmapped.
address_change_codes = {'no change': 0, 'under 6 months': 1, '1 year': 2,
                        '2 to 3 years': 3, '4 to 8 years': 4}
fraud["AddressChange-Claim"] = (
    fraud["AddressChange-Claim"].replace(address_change_codes).astype('int64')
)

In [None]:
# Verify the encoding by listing the distinct values of AddressChange-Claim again.
fraud["AddressChange-Claim"].unique()

array([2, 0, 4, 3, 1])

In [None]:
# Inspect the distinct values currently stored in NumberOfCars.
fraud["NumberOfCars"].unique()

array(['3 to 4', '1 vehicle', '2 vehicles', '5 to 8', 'more than 8'],
      dtype=object)

In [None]:
# Encode the NumberOfCars bands as ordinal codes 0-4, fewest vehicles first.
# Assigned back to the column: replace(inplace=True) on fraud["NumberOfCars"] is chained
# assignment and leaves `fraud` unchanged under pandas Copy-on-Write.
# astype makes the integer dtype explicit and raises if any label was left unmapped.
car_count_codes = {'1 vehicle': 0, '2 vehicles': 1, '3 to 4': 2, '5 to 8': 3, 'more than 8': 4}
fraud["NumberOfCars"] = fraud["NumberOfCars"].replace(car_count_codes).astype('int64')

In [None]:
# Verify the encoding by listing the distinct values of NumberOfCars again.
fraud["NumberOfCars"].unique()

array([2, 0, 1, 3, 4])

In [None]:
# Inspect the distinct values currently stored in BasePolicy.
fraud["BasePolicy"].unique()

array(['Liability', 'Collision', 'All Perils'], dtype=object)

In [None]:
# Encode BasePolicy as 0-2 in the order Liability, Collision, All Perils.
# Assigned back to the column: replace(inplace=True) on fraud["BasePolicy"] is chained
# assignment and leaves `fraud` unchanged under pandas Copy-on-Write.
# astype makes the integer dtype explicit and raises if any label was left unmapped.
base_policy_codes = {'Liability': 0, 'Collision': 1, 'All Perils': 2}
fraud["BasePolicy"] = fraud["BasePolicy"].replace(base_policy_codes).astype('int64')

In [None]:
# Verify the encoding by listing the distinct values of BasePolicy again.
fraud["BasePolicy"].unique()

array([0, 1, 2])

In [None]:
# Inspect the distinct labels of the target column FraudFound.
fraud["FraudFound"].unique()

array(['No', 'Yes'], dtype=object)

In [None]:
# Re-check column dtypes and non-null counts after the encoding steps.
fraud.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15420 entries, 0 to 15419
Data columns (total 33 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Month                 15420 non-null  int64 
 1   WeekOfMonth           15420 non-null  int64 
 2   DayOfWeek             15420 non-null  int64 
 3   Make                  15420 non-null  int64 
 4   AccidentArea          15420 non-null  int64 
 5   DayOfWeekClaimed      15420 non-null  int64 
 6   MonthClaimed          15420 non-null  int64 
 7   WeekOfMonthClaimed    15420 non-null  int64 
 8   Sex                   15420 non-null  int64 
 9   MaritalStatus         15420 non-null  int64 
 10  Age                   15420 non-null  int64 
 11  Fault                 15420 non-null  int64 
 12  PolicyType            15420 non-null  int64 
 13  VehicleCategory       15420 non-null  int64 
 14  VehiclePrice          15420 non-null  int64 
 15  PolicyNumber          15420 non-null

In [None]:
# Split the frame into the feature matrix and the target label.
X = fraud.drop(columns=['FraudFound'])
y = fraud['FraudFound']
# NOTE(review): X still includes PolicyNumber; if that is only a record identifier
# it should probably be excluded from the features - confirm against the data.

# Seed the tree so the scores are reproducible from run to run.
model = tree.DecisionTreeClassifier(criterion='entropy', random_state=0)

# An integer cv splits the rows in file order without shuffling. The previously
# recorded fold accuracies (0.29 to 0.94) suggest the rows are ordered, so use
# shuffled, stratified folds instead. cross_val_score clones and fits the
# estimator itself, so no separate model.fit call is needed beforehand.
cv_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
scores = cross_val_score(model, X, y, cv=cv_folds)
print("Fold Accuracies: {}".format(scores))
print("Accuracy: {:3.2f}".format(scores.mean()))

Fold Accuracies: [0.29053178 0.76783398 0.76232166 0.8654345  0.93709468]
Accuracy: 0.72


In [None]:
# Hold out 30% of the rows for testing. stratify=y keeps the fraud / non-fraud
# proportions the same in both partitions, which matters because the target is
# heavily imbalanced (the recorded 0.94 accuracy matches predicting a single class).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=2, stratify=y
)

In [None]:
# Fit a depth-1 decision tree (a single split) on the training partition.
# random_state pins the choice between equally good splits so results are reproducible.
# NOTE(review): the undefined-metric warning recorded with the scores below suggests
# this stump predicts only one class; a deeper tree or class weighting may be needed.
model = tree.DecisionTreeClassifier(criterion='entropy', max_depth=1, random_state=0)
model.fit(X_train, y_train)

# Predictions on both partitions: train for fit quality, test for evaluation.
y_train_model = model.predict(X_train)
y_test_model = model.predict(X_test)

In [None]:
# Report test-set metrics for the fraud class ('Yes').
# Weighted averages are dominated by the majority class, and micro-averaged F1 is
# identical to accuracy for single-label problems, so neither shows how well fraud
# is detected. zero_division=0 reports 0 (without a warning) when the model never
# predicts 'Yes'.
print("Accuracy: {:3.2f}".format(accuracy_score(y_test, y_test_model)))
print("Precision (fraud): {:3.2f}".format(
    precision_score(y_test, y_test_model, pos_label='Yes', zero_division=0)))
print("Recall (fraud): {:3.2f}".format(
    recall_score(y_test, y_test_model, pos_label='Yes', zero_division=0)))
print("F1 Score (fraud): {:3.2f}".format(
    f1_score(y_test, y_test_model, pos_label='Yes', zero_division=0)))

# Confusion matrix with explicit label order: rows are actual, columns are predicted.
pd.DataFrame(
    confusion_matrix(y_test, y_test_model, labels=['No', 'Yes']),
    index=['actual No', 'actual Yes'],
    columns=['predicted No', 'predicted Yes'],
)

Precision Accuracy: 0.88
Recall Accuracy: 0.94
Accuracy Accuracy: 0.94
F1 Score Accuracy: 0.94


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Write the processed frame to cleanFraud.csv as UTF-8, leaving out the row index.
fraud.to_csv(path_or_buf='cleanFraud.csv', index=False, encoding='utf-8')