In [1]:
#import necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Read the raw training claims table from the working directory
# and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="training data.csv")
df.head(n=5)

Unnamed: 0,claim_number,age_of_driver,gender,marital_status,safty_rating,annual_income,high_education_ind,address_change_ind,living_status,zip_code,...,liab_prct,channel,policy_report_filed_ind,claim_est_payout,age_of_vehicle,vehicle_category,vehicle_price,vehicle_color,vehicle_weight,fraud
0,1,46,M,1.0,85,38301,1,1,Rent,80006,...,74,Broker,0,7530.940993,9.0,Compact,12885.45235,white,16161.33381,0
1,3,21,F,0.0,75,30445,0,1,Rent,15021,...,79,Online,0,2966.024895,4.0,Large,29429.45218,white,28691.96422,0
2,4,49,F,0.0,87,38923,0,1,Own,20158,...,0,Broker,0,6283.888333,3.0,Compact,21701.18195,white,22090.94758,1
3,5,58,F,1.0,58,40605,1,0,Own,15024,...,99,Broker,1,6169.747994,4.0,Medium,13198.27344,other,38329.58106,1
4,6,38,M,1.0,95,36380,1,0,Rent,50034,...,7,Broker,0,4541.38715,7.0,Medium,38060.21122,gray,25876.56319,0


In [3]:
# Count the missing entries in every column (isnull is an alias of isna).
df.isnull().sum()

claim_number                 0
age_of_driver                0
gender                       0
marital_status               5
safty_rating                 0
annual_income                0
high_education_ind           0
address_change_ind           0
living_status                0
zip_code                     0
claim_date                   0
claim_day_of_week            0
accident_site                0
past_num_of_claims           0
witness_present_ind        132
liab_prct                    0
channel                      0
policy_report_filed_ind      0
claim_est_payout            17
age_of_vehicle               8
vehicle_category             0
vehicle_price                0
vehicle_color                0
vehicle_weight               0
fraud                        0
dtype: int64

In [4]:
# Replacement value for each column that contains missing entries.
# NOTE(review): a 0 fill is indistinguishable from a genuine 0 payout /
# 0-year-old vehicle / absent witness - confirm this is intended downstream.
fill_values = {
    "claim_est_payout": 0,
    "witness_present_ind": 0,
    "age_of_vehicle": 0,
    "marital_status": 2,  # 2 signifying unknown
}

# Fill one column at a time, in the order listed above.
for column, filler in fill_values.items():
    df[column] = df[column].fillna(filler)

df.head()

Unnamed: 0,claim_number,age_of_driver,gender,marital_status,safty_rating,annual_income,high_education_ind,address_change_ind,living_status,zip_code,...,liab_prct,channel,policy_report_filed_ind,claim_est_payout,age_of_vehicle,vehicle_category,vehicle_price,vehicle_color,vehicle_weight,fraud
0,1,46,M,1.0,85,38301,1,1,Rent,80006,...,74,Broker,0,7530.940993,9.0,Compact,12885.45235,white,16161.33381,0
1,3,21,F,0.0,75,30445,0,1,Rent,15021,...,79,Online,0,2966.024895,4.0,Large,29429.45218,white,28691.96422,0
2,4,49,F,0.0,87,38923,0,1,Own,20158,...,0,Broker,0,6283.888333,3.0,Compact,21701.18195,white,22090.94758,1
3,5,58,F,1.0,58,40605,1,0,Own,15024,...,99,Broker,1,6169.747994,4.0,Medium,13198.27344,other,38329.58106,1
4,6,38,M,1.0,95,36380,1,0,Rent,50034,...,7,Broker,0,4541.38715,7.0,Medium,38060.21122,gray,25876.56319,0


In [5]:
# Check the data type pandas inferred for every column, to spot fields
# that need conversion (claim_date is read as object, not datetime).
df.dtypes

claim_number                 int64
age_of_driver                int64
gender                      object
marital_status             float64
safty_rating                 int64
annual_income                int64
high_education_ind           int64
address_change_ind           int64
living_status               object
zip_code                     int64
claim_date                  object
claim_day_of_week           object
accident_site               object
past_num_of_claims           int64
witness_present_ind        float64
liab_prct                    int64
channel                     object
policy_report_filed_ind      int64
claim_est_payout           float64
age_of_vehicle             float64
vehicle_category            object
vehicle_price              float64
vehicle_color               object
vehicle_weight             float64
fraud                        int64
dtype: object

In [6]:
# List the column labels currently present in the frame.
df.columns

Index(['claim_number', 'age_of_driver', 'gender', 'marital_status',
       'safty_rating', 'annual_income', 'high_education_ind',
       'address_change_ind', 'living_status', 'zip_code', 'claim_date',
       'claim_day_of_week', 'accident_site', 'past_num_of_claims',
       'witness_present_ind', 'liab_prct', 'channel',
       'policy_report_filed_ind', 'claim_est_payout', 'age_of_vehicle',
       'vehicle_category', 'vehicle_price', 'vehicle_color', 'vehicle_weight',
       'fraud'],
      dtype='object')

In [7]:
# Parse the claim_date strings (object dtype) into datetime64 values.
# dayfirst=False: ambiguous dates are read month-first.
df["claim_date"] = pd.to_datetime(df["claim_date"], dayfirst=False)

In [8]:
# Split claim_date into separate month, day and year columns.
# Relies on claim_date already being datetime (converted in the cell above),
# because the .dt accessor is only available on datetime-like Series.
# Each component is assigned directly as a Series: wrapping it in a
# DataFrame and assigning through a one-element list key is unnecessary
# and depends on positional pairing of column labels.
df['month'] = df.claim_date.dt.month
df['day'] = df.claim_date.dt.day
df['year'] = df.claim_date.dt.year

In [9]:
# claim_date has been split into month/day/year, so drop the original
# column and preview the result.
df = df.drop(labels=["claim_date"], axis="columns")
df.head()

Unnamed: 0,claim_number,age_of_driver,gender,marital_status,safty_rating,annual_income,high_education_ind,address_change_ind,living_status,zip_code,...,claim_est_payout,age_of_vehicle,vehicle_category,vehicle_price,vehicle_color,vehicle_weight,fraud,month,day,year
0,1,46,M,1.0,85,38301,1,1,Rent,80006,...,7530.940993,9.0,Compact,12885.45235,white,16161.33381,0,12,16,2016
1,3,21,F,0.0,75,30445,0,1,Rent,15021,...,2966.024895,4.0,Large,29429.45218,white,28691.96422,0,2,12,2015
2,4,49,F,0.0,87,38923,0,1,Own,20158,...,6283.888333,3.0,Compact,21701.18195,white,22090.94758,1,12,6,2016
3,5,58,F,1.0,58,40605,1,0,Own,15024,...,6169.747994,4.0,Medium,13198.27344,other,38329.58106,1,5,5,2016
4,6,38,M,1.0,95,36380,1,0,Rent,50034,...,4541.38715,7.0,Medium,38060.21122,gray,25876.56319,0,10,27,2015


In [10]:
# Use claim_number as the row index.
# The returned frame is rebound to `df` rather than mutating with
# inplace=True: the result is the same, but the step reads as an explicit
# transformation and stays compatible with method chaining.
df = df.set_index("claim_number")
df.head()

Unnamed: 0_level_0,age_of_driver,gender,marital_status,safty_rating,annual_income,high_education_ind,address_change_ind,living_status,zip_code,claim_day_of_week,...,claim_est_payout,age_of_vehicle,vehicle_category,vehicle_price,vehicle_color,vehicle_weight,fraud,month,day,year
claim_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,46,M,1.0,85,38301,1,1,Rent,80006,Friday,...,7530.940993,9.0,Compact,12885.45235,white,16161.33381,0,12,16,2016
3,21,F,0.0,75,30445,0,1,Rent,15021,Thursday,...,2966.024895,4.0,Large,29429.45218,white,28691.96422,0,2,12,2015
4,49,F,0.0,87,38923,0,1,Own,20158,Tuesday,...,6283.888333,3.0,Compact,21701.18195,white,22090.94758,1,12,6,2016
5,58,F,1.0,58,40605,1,0,Own,15024,Thursday,...,6169.747994,4.0,Medium,13198.27344,other,38329.58106,1,5,5,2016
6,38,M,1.0,95,36380,1,0,Rent,50034,Tuesday,...,4541.38715,7.0,Medium,38060.21122,gray,25876.56319,0,10,27,2015


In [11]:
# Write the preprocessed frame to fraud.csv; the claim_number index is
# written as the first column (index=True is the to_csv default).
df.to_csv("fraud.csv", index=True)