## Imports

In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import confusion_matrix

## Data Loading

In [2]:
DATA_PATH = "/auto-insurance-fall-2017"
SETS_PATHS = {"train": "/train_auto.csv",
             "test": "/test_auto.csv"}

data = {}

for key in SETS_PATHS.keys():
    path = os.getcwd() + DATA_PATH + SETS_PATHS[key]
    df = pd.read_csv(path)
    data[key] = df

## Explore data

In [3]:
print(data["train"].shape)
data["train"].head()

(8161, 26)


Unnamed: 0,INDEX,TARGET_FLAG,TARGET_AMT,KIDSDRIV,AGE,HOMEKIDS,YOJ,INCOME,PARENT1,HOME_VAL,...,BLUEBOOK,TIF,CAR_TYPE,RED_CAR,OLDCLAIM,CLM_FREQ,REVOKED,MVR_PTS,CAR_AGE,URBANICITY
0,1,0,0.0,0,60.0,0,11.0,"$67,349",No,$0,...,"$14,230",11,Minivan,yes,"$4,461",2,No,3,18.0,Highly Urban/ Urban
1,2,0,0.0,0,43.0,0,11.0,"$91,449",No,"$257,252",...,"$14,940",1,Minivan,yes,$0,0,No,0,1.0,Highly Urban/ Urban
2,4,0,0.0,0,35.0,1,10.0,"$16,039",No,"$124,191",...,"$4,010",4,z_SUV,no,"$38,690",2,No,3,10.0,Highly Urban/ Urban
3,5,0,0.0,0,51.0,0,14.0,,No,"$306,251",...,"$15,440",7,Minivan,yes,$0,0,No,0,6.0,Highly Urban/ Urban
4,6,0,0.0,0,50.0,0,,"$114,986",No,"$243,925",...,"$18,000",1,z_SUV,no,"$19,217",2,Yes,3,17.0,Highly Urban/ Urban


In [4]:
data["train"].columns

Index(['INDEX', 'TARGET_FLAG', 'TARGET_AMT', 'KIDSDRIV', 'AGE', 'HOMEKIDS',
       'YOJ', 'INCOME', 'PARENT1', 'HOME_VAL', 'MSTATUS', 'SEX', 'EDUCATION',
       'JOB', 'TRAVTIME', 'CAR_USE', 'BLUEBOOK', 'TIF', 'CAR_TYPE', 'RED_CAR',
       'OLDCLAIM', 'CLM_FREQ', 'REVOKED', 'MVR_PTS', 'CAR_AGE', 'URBANICITY'],
      dtype='object')

In [5]:
data["train"].describe()

Unnamed: 0,INDEX,TARGET_FLAG,TARGET_AMT,KIDSDRIV,AGE,HOMEKIDS,YOJ,TRAVTIME,TIF,CLM_FREQ,MVR_PTS,CAR_AGE
count,8161.0,8161.0,8161.0,8161.0,8155.0,8161.0,7707.0,8161.0,8161.0,8161.0,8161.0,7651.0
mean,5151.867663,0.263816,1504.324648,0.171057,44.790313,0.721235,10.499286,33.485725,5.351305,0.798554,1.695503,8.328323
std,2978.893962,0.440728,4704.02693,0.511534,8.627589,1.116323,4.092474,15.908333,4.146635,1.158453,2.147112,5.700742
min,1.0,0.0,0.0,0.0,16.0,0.0,0.0,5.0,1.0,0.0,0.0,-3.0
25%,2559.0,0.0,0.0,0.0,39.0,0.0,9.0,22.0,1.0,0.0,0.0,1.0
50%,5133.0,0.0,0.0,0.0,45.0,0.0,11.0,33.0,4.0,0.0,1.0,8.0
75%,7745.0,1.0,1036.0,0.0,51.0,1.0,13.0,44.0,7.0,2.0,3.0,12.0
max,10302.0,1.0,107586.13616,4.0,81.0,5.0,23.0,142.0,25.0,5.0,13.0,28.0


At this point we see that a lot of features are categorical, and that prices are formated as strings.

We also know (from the feature mean) that TARGET_FLAG is unbalanced.

In [6]:
# Here I inspected non-numerical features to check which were categorical and which were prices
# This revealed that there are somme missing data in "JOB"

# data["train"]["JOB"].values
np.unique(data["train"]["URBANICITY"].values)

array(['Highly Urban/ Urban', 'z_Highly Rural/ Rural'], dtype=object)

We should also evaluate the ammount of missing data

In [7]:
# Evaluate the ammount of missing data
print(data["train"].isna().sum(axis=0))

INDEX            0
TARGET_FLAG      0
TARGET_AMT       0
KIDSDRIV         0
AGE              6
HOMEKIDS         0
YOJ            454
INCOME         445
PARENT1          0
HOME_VAL       464
MSTATUS          0
SEX              0
EDUCATION        0
JOB            526
TRAVTIME         0
CAR_USE          0
BLUEBOOK         0
TIF              0
CAR_TYPE         0
RED_CAR          0
OLDCLAIM         0
CLM_FREQ         0
REVOKED          0
MVR_PTS          0
CAR_AGE        510
URBANICITY       0
dtype: int64


## Preprocessing

In [8]:
# get targets and indices before preprocessing features
train_targets = data["train"]["TARGET_FLAG"].values
train_indices = data["train"]["INDEX"]
test_indices = data["test"]["INDEX"]

I apply the following preprocessings:

- turn prices to numerical data

- replace missing values in YOJ, INCOME, HOME_VAL, CAR_AGE, AGE by the average value (loosing 1/4 of the dataset seemed to costly so I didn't discard them)

- consider "Unknown" as a job type to handle missing values in "JOB"

- 1-hot encode categoricals values

- Normalize features

In [9]:
def price_to_float(value):
    "Converts prices (strings starting by $ into floats) without changing other values"
    try:
        if value.startswith('$'):
            return float(value[1:].replace(',','.'))
        else:
            return value
    except:
        return value

In [10]:
# Apply all the transforms on both train and test data:

for mode in data.keys():
    # Remove targets from features
    data[mode].drop(["TARGET_FLAG", "TARGET_AMT"], axis=1, inplace=True)
    
    # Convert prices
    prices_features = ['INCOME', 'HOME_VAL', 'OLDCLAIM', 'BLUEBOOK']
    for feature in prices_features:
        data[mode][feature] = data[mode][feature].apply(price_to_float)
    
    # Replace missing prices/age values with mean
    data[mode].fillna(data[mode].mean(), inplace=True)
    
    # Make missing job information a new category
    data[mode]["JOB"].replace({float('nan'): "Unknown"}, inplace=True)
    
    # One-hot encode categorical features
    # drop_first prevents us from duplicating 
    data[mode] = pd.get_dummies(data[mode], drop_first=True)

In [11]:
# Normalize dataframes

train_mean = data["train"].mean()
train_std = data["train"].std()

data["train"] =(data["train"]-train_mean)/train_std
data["test"] =(data["test"]-train_mean)/train_std

In [12]:
# Check for biases in test dataset
print(data["train"].mean() - data["test"].mean())

INDEX                               0.000594
KIDSDRIV                            0.016649
AGE                                -0.026264
HOMEKIDS                            0.003416
YOJ                                 0.030223
INCOME                              0.070127
HOME_VAL                            0.013155
TRAVTIME                            0.020961
BLUEBOOK                            0.028561
TIF                                 0.025698
OLDCLAIM                           -0.020342
CLM_FREQ                           -0.008989
MVR_PTS                            -0.032832
CAR_AGE                             0.026256
PARENT1_Yes                         0.022832
MSTATUS_z_No                        0.009610
SEX_z_F                            -0.020828
EDUCATION_Bachelors                 0.007511
EDUCATION_Masters                   0.017375
EDUCATION_PhD                      -0.024599
EDUCATION_z_High School            -0.011101
JOB_Doctor                         -0.028580
JOB_Home M

The mean values for each feature is similar in the train and test sets.

## Feature Engineering

Here I would design other features or select the most relevant ones if needed. (I didn't have time to do so in the 2 hours)

## Tring different models and assessing theire performances 

In [13]:
def mean_cross_val_score(features, targets, model):
    "Computes the cross_validation score on a 5-fold basis"
    scores = cross_val_score(model, features, targets)
    return scores.mean()

def plot_confusion_matrix(features, targets, model):
    "Computes confusion matrices for a single fold"
    train_feats = features[:6000]
    test_feats = features[6000:]
    train_targets = targets[:6000]
    test_targets = targets[6000:]
    model.fit(train_feats, train_targets)
    preds = model.predict(test_feats)
    model_confusion = confusion_matrix(test_targets, preds, normalize='all')
    return(model_confusion)

train_features = data["train"].values

# Models I have tested don't get above 0.79 accuracy
# I plotted the confusion matrix to check the relevance of the predictions
# The choice of balancing the classes depends on the relevant metric for the project

models = {
    "regression_model": LogisticRegression(class_weight="balanced"),
    "svc_model": SVC(class_weight="balanced"),
    "randomForest_model": RandomForestClassifier(class_weight="balanced"),
    "adaboost_model": AdaBoostClassifier(),
}

for model_name in models.keys():
    model = models[model_name]
    print(f"---{model_name}---")
    print(mean_cross_val_score(train_features, train_targets, model))
    print(plot_confusion_matrix(train_features, train_targets, model))

---regression_model---
0.7174362564989253
[[0.50670986 0.23970384]
 [0.0532161  0.2003702 ]]
---svc_model---
0.7261363363471537
[[0.52429431 0.22211939]
 [0.06339658 0.19018973]]
---randomForest_model---
0.77943930634103
[[0.71957427 0.02683943]
 [0.18880148 0.06478482]]
---adaboost_model---
0.7778456437688364
[[0.66728366 0.07913003]
 [0.14298936 0.11059695]]


I assumed that the main objective was to maximise the True Positive rate (predict when a client will demand a compensation), so I used the logistic regression model with class weights for the predictions.

## Predictions

In [14]:
final_model = LogisticRegression(class_weight="balanced")

final_model.fit(data["train"], train_targets)

predictions = final_model.predict(data["test"])

predictions_df = pd.DataFrame({"predictions": predictions}, index=test_indices)

predictions_df.to_csv("test_predictions.csv")

In [15]:
# Check the balance of predictions in the test dataset:
print(predictions_df["predictions"].mean())

0.4348435310602522
