In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import sklearn
%matplotlib inline

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.base import BaseEstimator, TransformerMixin

# Pandas display configuration: setting a limit to None removes it, so wide and
# long frames are rendered in full instead of being truncated.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [2]:
# Paths to the raw training CSV files
customers_withoutTarget_csv = "./Data/TrainData/Traindata_withoutTarget.csv"
customers_withTarget_csv = "./Data/TrainData/Traindata_with_Target.csv"
claims_csv = "./Data/TrainData/Train_Claim.csv"
demographics_csv = "./Data/TrainData/Train_Demographics.csv"
policy_csv = "./Data/TrainData/Train_Policy.csv"
vehicle_csv = "./Data/TrainData/Train_Vehicle.csv"

# Load each CSV into its own pandas DataFrame
customers_no_target_df = pd.read_csv(customers_withoutTarget_csv)
customers_target_df = pd.read_csv(customers_withTarget_csv)
claims_df = pd.read_csv(claims_csv)
demographics_df = pd.read_csv(demographics_csv)
policy_df = pd.read_csv(policy_csv)
vehicle_df = pd.read_csv(vehicle_csv)

# Join claims, demographics, policy and the labelled customer table into one frame
# for pre-processing. pd.merge defaults to an inner join, so only CustomerIDs present
# in every joined table are kept.
# NOTE(review): vehicle_df and customers_no_target_df are loaded but not joined here.
df1 = pd.merge(claims_df, demographics_df, on='CustomerID')
df2 = pd.merge(df1, policy_df, on='CustomerID')
full_df = pd.merge(df2, customers_target_df, on='CustomerID')

# Boolean row masks for the two target classes
fraud = full_df["ReportedFraud"] == "Y"
not_fraud = full_df["ReportedFraud"] == "N"

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

# Target column on one side, every remaining column as features on the other
y = full_df.loc[:, "ReportedFraud"]
X = full_df.drop(columns="ReportedFraud")

# 80/20 split, stratified on the labels so both parts keep the same class balance
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
    stratify=y,
)

In [4]:
# Preview the first rows of the training features
x_train.head()

Unnamed: 0,CustomerID,DateOfIncident,TypeOfIncident,TypeOfCollission,SeverityOfIncident,AuthoritiesContacted,IncidentState,IncidentCity,IncidentAddress,IncidentTime,NumberOfVehicles,PropertyDamage,BodilyInjuries,Witnesses,PoliceReport,AmountOfTotalClaim,AmountOfInjuryClaim,AmountOfPropertyClaim,AmountOfVehicleDamage,InsuredAge,InsuredZipCode,InsuredGender,InsuredEducationLevel,InsuredOccupation,InsuredHobbies,CapitalGains,CapitalLoss,Country,InsurancePolicyNumber,CustomerLoyaltyPeriod,DateOfPolicyCoverage,InsurancePolicyState,Policy_CombinedSingleLimit,Policy_Deductible,PolicyAnnualPremium,UmbrellaLimit,InsuredRelationship
20936,Cust35427,2015-02-27,Multi-vehicle Collision,Front Collision,Major Damage,Fire,State4,City4,Location 1756,4,3,?,2,1,YES,50565,10529,5901,34135,44,616714,MALE,Masters,prof-specialty,movies,0,0,India,144548,275,2002-10-07,State1,100/300,1000,1072.38,0,husband
19307,Cust33448,2015-02-08,Multi-vehicle Collision,Side Collision,Minor Damage,Other,State8,City7,Location 1025,19,3,?,2,0,?,53301,4978,4710,43613,40,444797,MALE,MD,protective-serv,sleeping,0,-40600,India,142569,227,1992-12-13,State2,100/300,1000,1441.49,0,own-child
2609,Cust13159,2015-01-17,Vehicle Theft,?,Trivial Damage,,State9,City2,Location 1137,8,1,?,2,2,NO,3922,347,693,2882,38,606144,MALE,Masters,transport-moving,video-games,0,-43400,India,122280,251,2004-04-22,State1,500/1000,1507,1046.0,0,husband
15420,Cust28705,2015-02-25,Multi-vehicle Collision,Side Collision,Total Loss,Other,State5,City7,Location 2077,4,2,?,2,0,NO,88690,14503,14503,59684,33,449260,MALE,JD,exec-managerial,paintball,45600,-61400,India,137826,144,1996-01-22,State1,250/500,500,1216.26,3026658,other-relative
21007,Cust35512,2015-01-10,Single Vehicle Collision,Side Collision,Minor Damage,Fire,State7,City4,Location 1479,14,1,NO,0,0,?,62271,6367,12735,43169,47,618455,FEMALE,MD,sales,kayaking,0,-55600,India,144633,284,1992-02-10,State3,100/300,1181,1493.96,0,not-in-family


In [5]:
# Shapes of the train/test feature and label splits, in that order
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((23068, 37), (23068, 1), (5768, 37), (5768, 1))

In [6]:
# Custom transformer to combine specific attributes from the dataset

class CombineAttributes(BaseEstimator, TransformerMixin):
    """Add derived columns built from existing attributes.

    Always adds ``NetCapital`` (``CapitalGains`` + ``CapitalLoss``). When
    ``add_location`` is true it also adds ``IncidentArea``, the incident state
    and city joined by a single space.

    Parameters
    ----------
    add_location : bool, default=True
        Whether to create the ``IncidentArea`` column.
    """

    def __init__(self, add_location=True):
        self.add_location = add_location

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` so that
        ``fit_transform`` and ``Pipeline`` can chain on the result."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the derived columns appended.

        ``X`` must contain ``CapitalGains`` and ``CapitalLoss``, plus
        ``IncidentState`` and ``IncidentCity`` when ``add_location`` is true.
        """
        # Work on a copy so the caller's frame is left untouched
        X = X.copy()
        X["NetCapital"] = X["CapitalGains"] + X["CapitalLoss"]

        # The flag lives on the instance; a bare `add_location` is undefined here
        if self.add_location:
            X["IncidentArea"] = X["IncidentState"] + " " + X["IncidentCity"]

        return X

In [7]:
class RemoveMissingValues(BaseEstimator, TransformerMixin):
    """Drop rows whose value in selected columns is a missing-data placeholder.

    The placeholders checked are ``-5`` in ``IncidentTime``, ``"MISSINGVALUE"``
    in ``Witnesses`` and ``"MISSEDDATA"`` in ``AmountOfTotalClaim``.

    Parameters
    ----------
    incident_time, witnesses, totalclaim : bool, default=True
        Enable or disable the check for the corresponding column.

    Notes
    -----
    ``transform`` returns an ``(X, y)`` tuple rather than ``X`` alone, so this
    class cannot be used as a step inside a scikit-learn ``Pipeline``.
    """

    def __init__(self, incident_time=True, witnesses=True, totalclaim=True):
        self.incident_time = incident_time
        self.witnesses = witnesses
        self.totalclaim = totalclaim

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` for chaining."""
        return self

    def transform(self, X, y=None):
        """Remove placeholder rows from ``X`` and the matching labels from ``y``.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature frame containing the columns whose checks are enabled.
        y : pandas object sharing ``X``'s index, optional
            Labels to filter in step with ``X``. When omitted only ``X`` is
            filtered.

        Returns
        -------
        tuple
            ``(X, y)`` after filtering; ``y`` is ``None`` if none was given.
        """
        # (enabled flag, column, placeholder) in the order the checks are applied
        checks = (
            (self.incident_time, "IncidentTime", -5),
            (self.witnesses, "Witnesses", "MISSINGVALUE"),
            (self.totalclaim, "AmountOfTotalClaim", "MISSEDDATA"),
        )

        for enabled, column, placeholder in checks:
            if not enabled:
                continue
            missing_idx = X[X[column] == placeholder].index
            X = X.drop(missing_idx)
            # y is optional in the signature, so only filter it when provided
            if y is not None:
                y = y.drop(missing_idx)

        return X, y

In [11]:
class OutlierManager(BaseEstimator, TransformerMixin):
    """Drop rows whose ``PolicyAnnualPremium`` is negative.

    Notes
    -----
    ``transform`` returns an ``(X, y)`` tuple rather than ``X`` alone, so this
    class cannot be used as a step inside a scikit-learn ``Pipeline``.
    """

    def __init__(self):
        # No hyper-parameters to store
        pass

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` for chaining."""
        return self

    def transform(self, X, y=None):
        """Remove negative-premium rows from ``X`` and matching labels from ``y``.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature frame containing ``PolicyAnnualPremium``.
        y : pandas object sharing ``X``'s index, optional
            Labels to filter in step with ``X``. When omitted only ``X`` is
            filtered.

        Returns
        -------
        tuple
            ``(X, y)`` after filtering; ``y`` is ``None`` if none was given.
        """
        policyprem_outliers = X[X["PolicyAnnualPremium"] < 0].index
        X = X.drop(policyprem_outliers)
        if y is not None:
            y = y.drop(policyprem_outliers)

        return X, y

In [13]:
#x_train = x_train.drop(["InsuredGender"], axis=1)
#x_train = x_train.drop(["Country"], axis=1)

In [14]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [20]:
# Drop training rows (and their labels) whose total claim amount holds the
# "MISSEDDATA" placeholder instead of a number
totalclaim_missing_idx = x_train.index[x_train["AmountOfTotalClaim"] == "MISSEDDATA"]
x_train = x_train.drop(totalclaim_missing_idx)
y_train = y_train.drop(totalclaim_missing_idx)

# Add the combined capital feature and cast the claim total to integer
x_train = x_train.assign(
    NetCapital=x_train["CapitalGains"] + x_train["CapitalLoss"],
    AmountOfTotalClaim=x_train["AmountOfTotalClaim"].astype('int'),
)

In [21]:
# Drop test rows (and their labels) whose total claim amount holds the
# "MISSEDDATA" placeholder instead of a number
totalclaim_missing_idx_test = x_test.index[x_test["AmountOfTotalClaim"] == "MISSEDDATA"]
x_test = x_test.drop(totalclaim_missing_idx_test)
y_test = y_test.drop(totalclaim_missing_idx_test)

# Add the combined capital feature and cast the claim total to integer
x_test = x_test.assign(
    NetCapital=x_test["CapitalGains"] + x_test["CapitalLoss"],
    AmountOfTotalClaim=x_test["AmountOfTotalClaim"].astype('int'),
)

In [22]:
# Columns treated as categorical features
cat_attributes = [
    'IncidentTime',
    'Witnesses',
    'SeverityOfIncident',
    'AuthoritiesContacted',
    'PropertyDamage',
    'InsuredOccupation',
    'InsuredHobbies',
    'DateOfPolicyCoverage',
    'Policy_CombinedSingleLimit',
    'InsuredRelationship',
]

# Columns treated as numeric features
num_attributes = [
    'NumberOfVehicles',
    'BodilyInjuries',
    'AmountOfTotalClaim',
    'AmountOfInjuryClaim',
    'AmountOfPropertyClaim',
    'AmountOfVehicleDamage',
    'InsuredAge',
    'CapitalGains',
    'CapitalLoss',
    'InsurancePolicyNumber',
    'CustomerLoyaltyPeriod',
    'Policy_Deductible',
    'PolicyAnnualPremium',
    'NetCapital',
]

# Positional indices of those columns within x_train
cat_idx = [x_train.columns.get_loc(column_name) for column_name in cat_attributes]
num_idx = [x_train.columns.get_loc(column_name) for column_name in num_attributes]
inc_time_idx = x_train.columns.get_loc('IncidentTime')
wit_idx = x_train.columns.get_loc('Witnesses')

In [23]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer  # used below; it was never imported
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

# Encode the target as integers. The binarizer is fitted on the training labels
# only and then reused for the test labels, so both splits share one
# class-to-integer mapping.
lb_train = LabelBinarizer()
lb_test = lb_train  # alias retained so any later reference to lb_test still resolves

y_train = pd.DataFrame(lb_train.fit_transform(y_train))
y_test = pd.DataFrame(lb_train.transform(y_test))

# Categorical branch: replace the two missing-value placeholders with the most
# frequent value of each column, then one-hot encode. Both imputers run over every
# categorical column. Categories unseen during fit are ignored at transform time.
cat_transformer = Pipeline([
    ("impute_incident_time", SimpleImputer(missing_values=-5, strategy='most_frequent')),
    ("impute_witnesses", SimpleImputer(missing_values="MISSINGVALUE", strategy='most_frequent')),
    ("1h_encoder", OneHotEncoder(handle_unknown='ignore')),
])

# Numeric branch: standardise each column
num_transformer = Pipeline([
    ("std_scaler", StandardScaler()),
])

# Route each column group through its own branch; columns not listed are dropped
# (ColumnTransformer's default `remainder`)
full_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_transformer, num_attributes),
        ("cat", cat_transformer, cat_attributes),
    ]
)

# Fit the preprocessing on the training features only
data_prepared = full_pipeline.fit_transform(x_train)

In [24]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree classifier with a fixed random seed for reproducibility
tree_clf = DecisionTreeClassifier(random_state=42)

In [25]:
# Fit the tree on the preprocessed training features and the binarized labels
tree_clf.fit(data_prepared, y_train)

DecisionTreeClassifier(random_state=42)

In [26]:
# Apply the already-fitted preprocessing to the test features (transform only, no refit)
data_test = full_pipeline.transform(x_test)

In [27]:
# Mean accuracy of the fitted tree on the held-out test set
tree_clf.score(data_test, y_test)

0.9055227509551927

In [28]:
# Class proportions in the test labels, as a baseline to compare the accuracy against
y_test.value_counts(normalize=True)

0    0.729941
1    0.270059
dtype: float64

In [29]:
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelBinarizer


fraud_predictions = tree_clf.predict(data_test)

# A single binarizer, fitted on the true labels, encodes both vectors. Fitting a
# second binarizer on the predictions would derive its mapping from whichever
# classes the model happened to predict, which can disagree with the label encoding
# (e.g. predictions containing only one class).
lb = LabelBinarizer()
lb2 = lb  # alias retained so any later reference to lb2 still resolves

labels_bin = lb.fit_transform(y_test)
pred_bin = lb.transform(fraud_predictions)

# For 0/1 vectors the MSE equals the misclassification rate (1 - accuracy)
tree_mse = mean_squared_error(labels_bin, pred_bin)
tree_rmse = np.sqrt(tree_mse)

In [30]:
# Test-set mean squared error of the tree's predictions
tree_mse

0.09447724904480723

In [31]:
# Square root of the test-set mean squared error
tree_rmse

0.3073715163199206

In [32]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation on the prepared training data; the scorer returns the
# negated MSE for each fold
scores = cross_val_score(
    tree_clf,
    data_prepared,
    y_train,
    scoring="neg_mean_squared_error",
    cv=5,
)

# Flip the sign back before taking the root to obtain one RMSE per fold
tree_rmse_scores = np.sqrt(np.negative(scores))

In [33]:
# Per-fold RMSE from the cross-validation above
tree_rmse_scores

array([0.3177346 , 0.30518746, 0.31739277, 0.3181106 , 0.30699411])