# Late payments

## Data scientist position hometask: payments latency probability prediction

In [1]:
import os
import operator
import datetime

import numpy as np
import pandas as pd

from dateutil.relativedelta import relativedelta

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [2]:
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 120)
pd.set_option('display.max_columns', 200)

### Reading & parsing CSV

In [3]:
data_dir = "data"
invoice_dataset_file_name = "InvoiceData.csv"
policy_dataset_file_name = "PolicyData.csv"

invoice_dataset_path = os.path.join(data_dir, invoice_dataset_file_name)
policy_dataset_path = os.path.join(data_dir, policy_dataset_file_name)

In [4]:
policy_df = pd.read_csv(
    policy_dataset_path, encoding="ISO-8859-1", sep=";",
    parse_dates=["ClientBirthday", "PolicyIssueDate", "PolicyStartDate", "PolicyEndDate", "PolicyActualEndDate"],
#     index_col="policy_guid", # weird bug, will be set later..
)

invoice_df = pd.read_csv(
    invoice_dataset_path, encoding="ISO-8859-1", sep=";", parse_dates=["due_date", "paid_date"], index_col="invoice_guid"
)

In [5]:
policy_df["Premium"] = policy_df["Premium"].str.replace(",", ".").astype(float)

invoice_df["amount_premium"] = invoice_df["amount_premium"].str.replace(",", ".").astype(float)

### Overall stats

In [6]:
print("policy_df.shape:\t{}".format(policy_df.shape))
print("invoice_df.shape:\t{}".format(invoice_df.shape))

policy_df.shape:	(68638, 39)
invoice_df.shape:	(358850, 5)


In [7]:
policy_df.head()

Unnamed: 0,policy_guid,Country,VehicleType,VehicleUsage,Power,Weight,VehicleFirstRegistrationYear,Mark,Model,Deductible_general,Fire,Theft,Natural_disasters,Road_assistance,Total_loss,Vandalism,Glass,Accident,Replacement_car,Gender,ClientBirthday,Region,BMClassMOD,Leasing,PolicyIssueDate,PolicyStartDate,PolicyEndDate,PolicyActualEndDate,Nb_of_payments,Premium,Channel,FuelType,avgFuelConsumption,DriveTrain,RenewalIndicator,IsRenewed,BalticRating,Terminated,sales_type
0,{E4FD3CD8-4141-4E55-A74C-C983ED7EFD8E},EE,Passenger car,Regular,108,1895,2007,TOYOTA,AVENSIS,191.0,1,1,1,0,1,1,0,1,0,M,1983-10-19,HARJUMAA,0,0,2015-09-13,2015-09-13,2016-09-12,2016-05-24,12,261.09,Lean Operator,Gasoline,8.0,Front,0,0,3,1,Renewal
1,{CD63AD22-81C2-4A86-8E56-AF1084BE49E3},EE,Passenger car,Regular,120,2050,2008,VOLVO,S60,191.0,1,1,1,0,1,1,0,1,0,M,1959-08-11,PÄRNUMAA,0,0,2015-09-23,2015-09-23,2016-09-22,2015-10-22,12,297.22,Lean Operator,Diesel,7.0,Front,0,1,0,1,Renewal
2,{480DDB89-BA11-4219-A92C-330ABC6BE1EC},EE,Passenger car,Regular,125,1970,2012,VOLKSWAGEN,CC,191.0,1,1,1,1,1,1,1,1,0,M,1966-10-17,HARJUMAA,0,1,2015-09-14,2015-09-14,2016-09-13,2016-09-13,4,534.22,Direct,Diesel,5.0,Front,0,0,0,0,New sale
3,{74A1885E-CC4D-435B-B7CF-CAD0287FA814},EE,Passenger car,Regular,85,2510,2007,VOLKSWAGEN,SHARAN,191.0,1,1,1,1,1,1,1,1,0,M,1954-03-08,TARTUMAA,A1,0,2015-09-20,2015-09-20,2016-09-19,2015-10-04,12,275.24,Direct,Missing,,Missing,0,1,5,1,New sale
4,{FF6D78C8-B660-4D58-8A9D-0FD6E32E2A03},EE,Passenger car,Regular,77,1780,2013,SKODA,OCTAVIA,191.0,1,0,1,1,1,1,1,1,0,M,1944-07-30,JÕGEVAMAA,0,0,2015-11-23,2015-11-23,2016-11-22,2016-04-22,12,239.77,Direct,Gasoline,,Missing,0,0,2,1,New sale


In [8]:
invoice_df.head()

Unnamed: 0_level_0,payment_status,policy_guid,amount_premium,due_date,paid_date
invoice_guid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
{55B3ABD8-B64E-498E-A617-3C6E88C4D06F},Paid,{480DDB89-BA11-4219-A92C-330ABC6BE1EC},133.55,2015-12-13,2015-12-14
{40544861-68D0-462F-AF9C-8740B2316620},Paid,{480DDB89-BA11-4219-A92C-330ABC6BE1EC},133.55,2016-03-13,2016-03-14
{B3C6B66F-280E-413B-8FEE-A804903D2FE8},Paid,{480DDB89-BA11-4219-A92C-330ABC6BE1EC},133.57,2015-09-14,2015-09-14
{674FDC26-D0E8-4C20-BB86-1AE9979EB1AA},Paid,{480DDB89-BA11-4219-A92C-330ABC6BE1EC},133.55,2016-06-13,2016-06-13
{EE21ECF7-7B70-4DA3-B663-87371073DE18},Paid,{74A1885E-CC4D-435B-B7CF-CAD0287FA814},23.01,2015-09-19,2015-09-21


In [9]:
policy_df.dtypes

policy_guid                             object
Country                                 object
VehicleType                             object
VehicleUsage                            object
Power                                    int64
Weight                                   int64
VehicleFirstRegistrationYear             int64
Mark                                    object
Model                                   object
Deductible_general                     float64
Fire                                     int64
Theft                                    int64
Natural_disasters                        int64
Road_assistance                          int64
Total_loss                               int64
Vandalism                                int64
Glass                                    int64
Accident                                 int64
Replacement_car                          int64
Gender                                  object
ClientBirthday                  datetime64[ns]
Region       

In [10]:
invoice_df.dtypes

payment_status            object
policy_guid               object
amount_premium           float64
due_date          datetime64[ns]
paid_date         datetime64[ns]
dtype: object

In [11]:
def categorical_stats(df):
    values_df = df.select_dtypes(include="object").nunique().to_frame(name="unique")
    values_df["values"] = values_df.apply(lambda item: df[item.name].unique().tolist(), axis=1)
    return values_df

In [12]:
categorical_stats(policy_df)

Unnamed: 0,unique,values
policy_guid,68638,"[{E4FD3CD8-4141-4E55-A74C-C983ED7EFD8E}, {CD63AD22-81C2-4A86-8E56-AF1084BE49E3}, {480DDB89-BA11-4219-A92C-330ABC6BE1..."
Country,1,[EE]
VehicleType,7,"[Passenger car, Van, Bus, Lorry, Truck, Motorcycle, Tractor]"
VehicleUsage,10,"[Regular, Taxi, Remote Line, Other, Short-term rental, Operative Transport, Dangerous Cargo, ATV, Agricultural Machi..."
Mark,69,"[TOYOTA, VOLVO, VOLKSWAGEN, SKODA, OPEL, FORD, HONDA, NISSAN, MAZDA, SUBARU, MERCEDES BENZ, LEXUS, AUDI, PEUGEOT, KI..."
Model,657,"[AVENSIS, S60, CC, SHARAN, OCTAVIA, ASTRA, XC90, S-MAX, CR V, FABIA, QASHQAI, FOCUS, PASSAT, 6, LAND CRUISER, LEGACY..."
Gender,3,"[M, F, L]"
Region,16,"[HARJUMAA, PÄRNUMAA, TARTUMAA, JÕGEVAMAA, IDA-VIRUMAA, LÄÄNE-VIRUMAA, JÄRVAMAA, VÕRUMAA, RAPLAMAA, VALGAMAA, HIIUMAA..."
BMClassMOD,21,"[0, A1, A2, P1, P6, P2, P10, P3, P4, A4, A3, A9, A10, nan, A5, A7, P7, A6, P5, A8, P8, P9]"
Channel,6,"[Lean Operator, Direct, Partner, Internet, Unknown, Broker]"


In [13]:
categorical_stats(invoice_df)

Unnamed: 0,unique,values
payment_status,2,"[Paid, Canceled and paid]"
policy_guid,68638,"[{480DDB89-BA11-4219-A92C-330ABC6BE1EC}, {74A1885E-CC4D-435B-B7CF-CAD0287FA814}, {56A693C1-4AE0-41F1-8EE3-C650B595DA..."


In [14]:
policy_df.isnull().sum()

policy_guid                         0
Country                             0
VehicleType                         0
VehicleUsage                        0
Power                               0
Weight                              0
VehicleFirstRegistrationYear        0
Mark                                0
Model                               0
Deductible_general                  2
Fire                                0
Theft                               0
Natural_disasters                   0
Road_assistance                     0
Total_loss                          0
Vandalism                           0
Glass                               0
Accident                            0
Replacement_car                     0
Gender                              0
ClientBirthday                     43
Region                              0
BMClassMOD                         16
Leasing                             0
PolicyIssueDate                     0
PolicyStartDate                     0
PolicyEndDat

In [15]:
invoice_df.isnull().sum()

payment_status    0
policy_guid       0
amount_premium    0
due_date          0
paid_date         0
dtype: int64

### Cleanup

In [16]:
# zero entropy column
policy_df.drop(columns=["Country"], inplace=True)

# rows with nan values
policy_df.dropna(subset=["Deductible_general", "ClientBirthday", "BMClassMOD"], inplace=True)

# column with too much nans
policy_df.drop(columns=["avgFuelConsumption"], inplace=True)

### Transformations

In [17]:
# assigning target variable
invoice_df["late"] = invoice_df.apply(lambda item: item["paid_date"] > item["due_date"], axis=1).astype(int)

In [18]:
invoice_df.head()

Unnamed: 0_level_0,payment_status,policy_guid,amount_premium,due_date,paid_date,late
invoice_guid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
{55B3ABD8-B64E-498E-A617-3C6E88C4D06F},Paid,{480DDB89-BA11-4219-A92C-330ABC6BE1EC},133.55,2015-12-13,2015-12-14,1
{40544861-68D0-462F-AF9C-8740B2316620},Paid,{480DDB89-BA11-4219-A92C-330ABC6BE1EC},133.55,2016-03-13,2016-03-14,1
{B3C6B66F-280E-413B-8FEE-A804903D2FE8},Paid,{480DDB89-BA11-4219-A92C-330ABC6BE1EC},133.57,2015-09-14,2015-09-14,0
{674FDC26-D0E8-4C20-BB86-1AE9979EB1AA},Paid,{480DDB89-BA11-4219-A92C-330ABC6BE1EC},133.55,2016-06-13,2016-06-13,0
{EE21ECF7-7B70-4DA3-B663-87371073DE18},Paid,{74A1885E-CC4D-435B-B7CF-CAD0287FA814},23.01,2015-09-19,2015-09-21,1


In [19]:
print("late payments number:\t{}".format(invoice_df["late"].sum()))
print("late payments fraction:\t{:.3f}".format(invoice_df["late"].mean()))

late payments number:	59118
late payments fraction:	0.165


In [20]:
grouped_invoice_df = invoice_df.groupby("policy_guid")["late"].agg(["sum", "count"])

In [21]:
grouped_invoice_df.head(10)

Unnamed: 0_level_0,sum,count
policy_guid,Unnamed: 1_level_1,Unnamed: 2_level_1
{00002943-8C7A-4293-9715-0E4F09C6311D},0,1
{000118B2-9E9C-4A98-A338-DC31DD7C1F59},1,12
{0001A384-9AAC-44DF-BE77-D247F880B80B},1,1
{0001C1D4-FB9E-4DD3-A539-1F53C3E2BDD9},1,1
{00027C87-4647-41EB-A406-6EB60D5D83D8},0,12
{00030555-C9C4-4343-A38D-EDF0FE229912},1,12
{0003A96C-442D-4005-B467-E9F0D8F0DF60},0,1
{0003AF5E-7349-401F-A43A-79CC29EE8848},2,2
{0003E2FF-D68D-49C5-AB97-A1817382F506},5,7
{00049948-CDDF-402F-8341-4C8A3BBFA667},1,4


In [22]:
grouped_invoice_df.rename(columns={"sum": "late_count", "count": "payments_total"}, inplace=True)
grouped_invoice_df.head()

Unnamed: 0_level_0,late_count,payments_total
policy_guid,Unnamed: 1_level_1,Unnamed: 2_level_1
{00002943-8C7A-4293-9715-0E4F09C6311D},0,1
{000118B2-9E9C-4A98-A338-DC31DD7C1F59},1,12
{0001A384-9AAC-44DF-BE77-D247F880B80B},1,1
{0001C1D4-FB9E-4DD3-A539-1F53C3E2BDD9},1,1
{00027C87-4647-41EB-A406-6EB60D5D83D8},0,12


In [23]:
policy_df = policy_df.join(grouped_invoice_df, on="policy_guid")

In [24]:
# for the sake of brevity
df = policy_df

In [25]:
df.head()

Unnamed: 0,policy_guid,VehicleType,VehicleUsage,Power,Weight,VehicleFirstRegistrationYear,Mark,Model,Deductible_general,Fire,Theft,Natural_disasters,Road_assistance,Total_loss,Vandalism,Glass,Accident,Replacement_car,Gender,ClientBirthday,Region,BMClassMOD,Leasing,PolicyIssueDate,PolicyStartDate,PolicyEndDate,PolicyActualEndDate,Nb_of_payments,Premium,Channel,FuelType,DriveTrain,RenewalIndicator,IsRenewed,BalticRating,Terminated,sales_type,late_count,payments_total
0,{E4FD3CD8-4141-4E55-A74C-C983ED7EFD8E},Passenger car,Regular,108,1895,2007,TOYOTA,AVENSIS,191.0,1,1,1,0,1,1,0,1,0,M,1983-10-19,HARJUMAA,0,0,2015-09-13,2015-09-13,2016-09-12,2016-05-24,12,261.09,Lean Operator,Gasoline,Front,0,0,3,1,Renewal,6,9
1,{CD63AD22-81C2-4A86-8E56-AF1084BE49E3},Passenger car,Regular,120,2050,2008,VOLVO,S60,191.0,1,1,1,0,1,1,0,1,0,M,1959-08-11,PÄRNUMAA,0,0,2015-09-23,2015-09-23,2016-09-22,2015-10-22,12,297.22,Lean Operator,Diesel,Front,0,1,0,1,Renewal,0,2
2,{480DDB89-BA11-4219-A92C-330ABC6BE1EC},Passenger car,Regular,125,1970,2012,VOLKSWAGEN,CC,191.0,1,1,1,1,1,1,1,1,0,M,1966-10-17,HARJUMAA,0,1,2015-09-14,2015-09-14,2016-09-13,2016-09-13,4,534.22,Direct,Diesel,Front,0,0,0,0,New sale,2,4
3,{74A1885E-CC4D-435B-B7CF-CAD0287FA814},Passenger car,Regular,85,2510,2007,VOLKSWAGEN,SHARAN,191.0,1,1,1,1,1,1,1,1,0,M,1954-03-08,TARTUMAA,A1,0,2015-09-20,2015-09-20,2016-09-19,2015-10-04,12,275.24,Direct,Missing,Missing,0,1,5,1,New sale,1,2
4,{FF6D78C8-B660-4D58-8A9D-0FD6E32E2A03},Passenger car,Regular,77,1780,2013,SKODA,OCTAVIA,191.0,1,0,1,1,1,1,1,1,0,M,1944-07-30,JÕGEVAMAA,0,0,2015-11-23,2015-11-23,2016-11-22,2016-04-22,12,239.77,Direct,Gasoline,Missing,0,0,2,1,New sale,0,6


In [26]:
def days_between(d1, d2):
    rd = relativedelta(d1, d2)
    now = datetime.datetime.now()
    then = now - rd
    diff = now - then
    return diff.days

In [27]:
df["Policy_pre_lifetime_days"] = df.apply(
    lambda item: days_between(item["PolicyStartDate"], item["PolicyIssueDate"]), axis=1)

df["Policy_lifetime_days"] = df.apply(
    lambda item: days_between(item["PolicyEndDate"], item["PolicyStartDate"]), axis=1)

df["Policy_actual_lifetime_days"] = df.apply(
    lambda item: days_between(item["PolicyActualEndDate"], item["PolicyStartDate"]), axis=1)

In [28]:
df["Client_age_on_issue"] = df.apply(
    lambda item: relativedelta(item["PolicyIssueDate"], item["ClientBirthday"]).years, axis=1)

df.drop(labels=["ClientBirthday"], axis=1, inplace=True)

In [29]:
def decompose_date(df, column, drop_source=False):
    df[column + "_y"] = df.apply(lambda item: item[column].year, axis=1)
    df[column + "_m"] = df.apply(lambda item: item[column].month, axis=1)
    df[column + "_d"] = df.apply(lambda item: item[column].day, axis=1)
    
    if drop_source:
        df.drop(labels=[column], axis=1, inplace=True)

    return df

In [30]:
date_columns = [
    "PolicyIssueDate",
    "PolicyStartDate",
    "PolicyEndDate",
    "PolicyActualEndDate"
]

for column in date_columns:
    df = decompose_date(df, column=column, drop_source=True)

In [31]:
def add_dummies(df, column, prefix=None, prefix_sep='_', drop_first=False):
    dummy_df = pd.get_dummies(df[column], prefix=prefix, prefix_sep=prefix_sep, drop_first=drop_first)
    df = pd.concat([df, dummy_df], axis=1)
    df.drop(labels=[column], axis=1, inplace=True)
    return df

In [32]:
categorical_columns = [
    "VehicleType", 
    "VehicleUsage",
    "Gender",
    "Region",
    "BMClassMOD",
    "Channel",
    "FuelType",
    "DriveTrain",
    "sales_type"
]

for column in categorical_columns:
    df = add_dummies(df, column=column, prefix=column, drop_first=True)

In [33]:
df.set_index("policy_guid", inplace=True)

In [34]:
df["late_count"].nunique()

15

In [35]:
sorted(df["late_count"].unique().tolist())

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]

In [36]:
# so we have a multi-class problem..

### Training. Approach 1. Logistic regression

In [37]:
backup_df = df.copy()

#### Last specific preprocessing

In [38]:
# this was not done previously
df = add_dummies(df, column="Mark", prefix="Mark", drop_first=True)

In [39]:
# LogisticRegression fitting works pretty slow, so we will do some simplifications:

# 1. make problem binary
df["late_count"] = df["late_count"].astype(bool).astype(int)

# 2. drop "Model" field to get rid of 600+ one-hot features
df.drop(labels=["Model"], axis=1, inplace=True)

In [40]:
print("dataset dimensions now: {}".format(df.shape))

dataset dimensions now: (68579, 173)


In [41]:
def set_target_last(df, target):
    cols = df.columns.values.tolist()[:]
    cols.remove(target)
    return df[cols + [target]]

In [42]:
df = set_target_last(df, target="late_count")

In [43]:
df.head()

Unnamed: 0_level_0,Power,Weight,VehicleFirstRegistrationYear,Deductible_general,Fire,Theft,Natural_disasters,Road_assistance,Total_loss,Vandalism,Glass,Accident,Replacement_car,Leasing,Nb_of_payments,Premium,RenewalIndicator,IsRenewed,BalticRating,Terminated,payments_total,Policy_pre_lifetime_days,Policy_lifetime_days,Policy_actual_lifetime_days,Client_age_on_issue,PolicyIssueDate_y,PolicyIssueDate_m,PolicyIssueDate_d,PolicyStartDate_y,PolicyStartDate_m,PolicyStartDate_d,PolicyEndDate_y,PolicyEndDate_m,PolicyEndDate_d,PolicyActualEndDate_y,PolicyActualEndDate_m,PolicyActualEndDate_d,VehicleType_Lorry,VehicleType_Motorcycle,VehicleType_Passenger car,VehicleType_Tractor,VehicleType_Truck,VehicleType_Van,VehicleUsage_Agricultural Machinery,VehicleUsage_Dangerous Cargo,VehicleUsage_Forest Machinery,VehicleUsage_Operative Transport,VehicleUsage_Other,VehicleUsage_Regular,VehicleUsage_Remote Line,VehicleUsage_Short-term rental,VehicleUsage_Taxi,Gender_M,Region_HIIUMAA,Region_IDA-VIRUMAA,Region_JÄRVAMAA,Region_JÕGEVAMAA,Region_LÄÄNE-VIRUMAA,Region_LÄÄNEMAA,Region_Missing,Region_PÄRNUMAA,Region_PÕLVAMAA,Region_RAPLAMAA,Region_SAAREMAA,Region_TARTUMAA,Region_VALGAMAA,Region_VILJANDIMAA,Region_VÕRUMAA,BMClassMOD_A1,BMClassMOD_A10,BMClassMOD_A2,BMClassMOD_A3,BMClassMOD_A4,BMClassMOD_A5,BMClassMOD_A6,BMClassMOD_A7,BMClassMOD_A8,BMClassMOD_A9,BMClassMOD_P1,BMClassMOD_P10,BMClassMOD_P2,BMClassMOD_P3,BMClassMOD_P4,BMClassMOD_P5,BMClassMOD_P6,BMClassMOD_P7,BMClassMOD_P8,BMClassMOD_P9,Channel_Direct,Channel_Internet,Channel_Lean Operator,Channel_Partner,Channel_Unknown,FuelType_Diesel,FuelType_Diesel-Hybrid,FuelType_Electricity,FuelType_Gasoline,FuelType_Gasoline-Hybrid,FuelType_Missing,DriveTrain_Front,DriveTrain_Missing,DriveTrain_Rear,sales_type_Renewal,sales_type_Upsale,Mark_APRILIA,Mark_AUDI,Mark_BMW,Mark_BUELL,Mark_CADILLAC,Mark_CFMOTO,Mark_CHEVROLET,Mark_CHRYSLER,Mark_CITROEN,Mark_DACIA,Mark_DAF,Mark_DODGE,Mark_DUCATI,Mark_FENDT,Mark_FIAT,Mark_FORD,Mark_GMC,Mark_HARLEY-DAVIDS,Mark_HONDA,Mark_HUMMER,Mark_HYUNDAI,Mark_INFINITI,Mark_ISUZU,Mark_IVECO,Mark_JAGUAR,Mark_JEEP,Mark_JOHN DEERE,Mark_KAWASAKI,Mark_KIA,Mark_KTM,Mark_LADA,Mark_LANCIA,Mark_LAND ROVER,Mark_LEXUS,Mark_LINCOLN,Mark_MAN,Mark_MASSEY FERGUS,Mark_MAZDA,Mark_MERCEDES BENZ,Mark_MICROCAR,Mark_MINI,Mark_MITSUBISHI,Mark_MOTO STAR,Mark_NEW HOLLAND,Mark_NISSAN,Mark_OPEL,Mark_OTHER,Mark_PEUGEOT,Mark_PORSCHE,Mark_RENAULT,Mark_ROVER,Mark_RSM,Mark_SAAB,Mark_SATURN,Mark_SCANIA,Mark_SCANIA IRIZAR,Mark_SEAT,Mark_SKODA,Mark_SSANGYONG,Mark_SUBARU,Mark_SUZUKI,Mark_TEMSA,Mark_TOYOTA,Mark_VALTRA,Mark_VAZ,Mark_VOLKSWAGEN,Mark_VOLVO,Mark_YAMAHA,late_count
policy_guid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1
{E4FD3CD8-4141-4E55-A74C-C983ED7EFD8E},108,1895,2007,191.0,1,1,1,0,1,1,0,1,0,0,12,261.09,0,0,3,1,9,0,365,254,31,2015,9,13,2015,9,13,2016,9,12,2016,5,24,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
{CD63AD22-81C2-4A86-8E56-AF1084BE49E3},120,2050,2008,191.0,1,1,1,0,1,1,0,1,0,0,12,297.22,0,1,0,1,2,0,365,29,56,2015,9,23,2015,9,23,2016,9,22,2015,10,22,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
{480DDB89-BA11-4219-A92C-330ABC6BE1EC},125,1970,2012,191.0,1,1,1,1,1,1,1,1,0,1,4,534.22,0,0,0,0,4,0,365,365,48,2015,9,14,2015,9,14,2016,9,13,2016,9,13,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
{74A1885E-CC4D-435B-B7CF-CAD0287FA814},85,2510,2007,191.0,1,1,1,1,1,1,1,1,0,0,12,275.24,0,1,5,1,2,0,365,14,61,2015,9,20,2015,9,20,2016,9,19,2015,10,4,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
{FF6D78C8-B660-4D58-8A9D-0FD6E32E2A03},77,1780,2013,191.0,1,0,1,1,1,1,1,1,0,0,12,239.77,0,0,2,1,6,0,365,150,71,2015,11,23,2015,11,23,2016,11,22,2016,4,22,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


#### Test-train split

In [44]:
def make_train_test_split(df, test_size=0.1, random_state=0):
    X = df.drop("late_count", axis=1).values
    y = df["late_count"].values
    
    print("X.shape: {}".format(X.shape))
    print("y.shape: {}".format(y.shape))
    print()
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    
    print("X_train.shape: {}".format(X_train.shape))
    print("X_test.shape: {}".format(X_test.shape))
    print("y_train.shape: {}".format(y_train.shape))
    print("y_test.shape: {}".format(y_test.shape))
    
    return X_train, X_test, y_train, y_test

In [45]:
X_train, X_test, y_train, y_test = make_train_test_split(df)

X.shape: (68579, 172)
y.shape: (68579,)

X_train.shape: (61721, 172)
X_test.shape: (6858, 172)
y_train.shape: (61721,)
y_test.shape: (6858,)


#### Learning itself

In [46]:
sc = StandardScaler()

In [47]:
sc.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [48]:
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

In [49]:
lr = LogisticRegression(C=1000.0, random_state=0)

In [50]:
lr.fit(X_train_std, y_train)

LogisticRegression(C=1000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=0,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [51]:
def sum_values_by_prefix(d, prefix):
    total = sum([v for k, v in d.items() if k.startswith(prefix + "_")])
    clean_d = {k: v for k, v in d.items() if not k.startswith(prefix + "_")}
    clean_d[prefix] = total
    return clean_d


def print_feature_importances(feature_names, feature_weights, prefices=()):
    feature_importances = dict(zip(feature_names, feature_weights))
    
    for prefix in prefices:
            feature_importances = sum_values_by_prefix(feature_importances, prefix=prefix)

    sorted_fe = sorted(feature_importances.items(), key=operator.itemgetter(1), reverse=True)

    print("Feature importances:")

    for i, item in enumerate(sorted_fe):
        name, value = item
        print("{}\t{:.4f}\t {}".format(i + 1, value, name))

In [52]:
feature_names = df.columns.values.tolist()[:-1]
feature_weights = np.abs(np.std(X_train_std, 0) * lr.coef_)[0]

feature_weights = feature_weights / np.sum(feature_weights)

In [53]:
print_feature_importances(feature_names, feature_weights, prefices=[
    "VehicleType", 
    "VehicleUsage",
    "Gender",
    "Region",
    "BMClassMOD",
    "Channel",
    "FuelType",
    "DriveTrain",
    "sales_type",
    "Mark"
])

Feature importances:
1	0.5137	 Mark
2	0.1243	 PolicyEndDate_y
3	0.0693	 PolicyIssueDate_y
4	0.0693	 PolicyStartDate_y
5	0.0314	 PolicyEndDate_m
6	0.0307	 VehicleUsage
7	0.0202	 FuelType
8	0.0178	 PolicyIssueDate_m
9	0.0178	 PolicyStartDate_m
10	0.0177	 PolicyActualEndDate_y
11	0.0066	 VehicleType
12	0.0056	 Nb_of_payments
13	0.0050	 Policy_actual_lifetime_days
14	0.0048	 Policy_lifetime_days
15	0.0045	 Channel
16	0.0041	 PolicyActualEndDate_m
17	0.0041	 Accident
18	0.0040	 Terminated
19	0.0037	 Fire
20	0.0037	 BMClassMOD
21	0.0034	 Region
22	0.0033	 PolicyEndDate_d
23	0.0033	 payments_total
24	0.0027	 VehicleFirstRegistrationYear
25	0.0027	 Natural_disasters
26	0.0027	 Total_loss
27	0.0027	 Vandalism
28	0.0026	 Leasing
29	0.0023	 sales_type
30	0.0020	 Power
31	0.0019	 Client_age_on_issue
32	0.0019	 RenewalIndicator
33	0.0019	 DriveTrain
34	0.0014	 Road_assistance
35	0.0014	 IsRenewed
36	0.0014	 BalticRating
37	0.0010	 PolicyActualEndDate_d
38	0.0008	 PolicyIssueDate_d
39	0.0008	 Policy

In [54]:
def print_prediction_example(y_test, y_probs, y_pred, n_ex = 20):
    
    pred_ex_df = pd.DataFrame(data=list(zip(
        y_test[:n_ex],
        y_pred[:n_ex],
        y_probs[:n_ex]
    )), columns=["Test", "Pred", "Prob"])
    
    print("Prediction example:")
    print(pred_ex_df)

In [55]:
class_probs = lr.predict_proba(X_test_std)

In [56]:
class_probs

array([[0.59361662, 0.40638338],
       [0.71082744, 0.28917256],
       [0.51466713, 0.48533287],
       ...,
       [0.73009265, 0.26990735],
       [0.42104567, 0.57895433],
       [0.79818048, 0.20181952]])

In [57]:
# probabilities of positive class
y_probs = class_probs[:, 1]

# predicted class
y_pred = lr.predict(X_test)

In [58]:
print_prediction_example(y_test, y_probs, y_pred)

Prediction example:
    Test  Pred      Prob
0      0     0  0.406383
1      1     0  0.289173
2      0     0  0.485333
3      1     0  0.745900
4      0     0  0.554429
5      0     0  0.770274
6      0     0  0.495248
7      1     0  0.403677
8      0     0  0.132083
9      0     0  0.521494
10     1     0  0.273508
11     0     0  0.218920
12     1     0  0.148715
13     1     0  0.450918
14     1     0  0.344617
15     1     0  0.596601
16     0     0  0.311817
17     0     0  0.077022
18     1     0  0.472923
19     0     0  0.325888


#### Performance measures

In [59]:
def print_performance_report(y_test, y_pred):
    
    cm = confusion_matrix(y_test, y_pred)
    
    print("Confusion matrix:\n{}\n".format(cm))
    
    TN, FP, FN, TP = cm.ravel()
    
    print("TN: {}".format(TN))
    print("FP: {}".format(FP))
    print("FN: {}".format(FN))
    print("TP: {}".format(TP))
    
    # kindly explained at 
    # https://stackoverflow.com/a/43331484

    # Sensitivity, hit rate, recall, or true positive rate
    TPR = TP / (TP + FN)

    # Specificity or true negative rate
    TNR = TN / (TN + FP)

    # Precision or positive predictive value
    PPV = TP / (TP + FP)

    # Negative predictive value
    NPV = TN / (TN + FN)

    # Fall out or false positive rate
    FPR = FP / (FP + TN)

    # False negative rate
    FNR = FN / (TP + FN)

    # False discovery rate
    FDR = FP / (TP + FP)

    # Overall accuracy
    ACC = (TP + TN) / (TP + FP + FN + TN)
    
    performance_df = pd.DataFrame(
        data=[
            ("Sensitivity, hit rate, recall, or true positive rate", TPR),
            ("Specificity or true negative rate", TNR),
            ("Precision or positive predictive value", PPV),
            ("Negative predictive value", NPV),
            ("Fall out or false positive rate", FPR),
            ("False negative rate", FNR),
            ("False discovery rate", FDR),
            ("Overall accuracy", ACC)
        ], columns=["Measure", "Value"]
    )

    performance_df.set_index("Measure", inplace=True)
    performance_df["Value"] = performance_df["Value"].round(3)

    print("\nPerformance measures:\n{}\n".format(performance_df))
    
    cr = classification_report(y_test, y_pred)
    print("Scikit-learn classification report:\n{}".format(cr))

In [60]:
print_performance_report(y_test, y_pred)

Confusion matrix:
[[4361    4]
 [2489    4]]

TN: 4361
FP: 4
FN: 2489
TP: 4

Performance measures:
                                                      Value
Measure                                                    
Sensitivity, hit rate, recall, or true positive rate  0.002
Specificity or true negative rate                     0.999
Precision or positive predictive value                0.500
Negative predictive value                             0.637
Fall out or false positive rate                       0.001
False negative rate                                   0.998
False discovery rate                                  0.500
Overall accuracy                                      0.636

Scikit-learn classification report:
             precision    recall  f1-score   support

          0       0.64      1.00      0.78      4365
          1       0.50      0.00      0.00      2493

avg / total       0.59      0.64      0.50      6858



### Training. Approach 2. Random forest

In [61]:
df = backup_df

#### Last specific preprocessing

In [62]:
df["Mark_Model"] = df.apply(lambda item: "{}_{}".format(item["Mark"], item["Model"]), axis=1)

print(df["Mark"].nunique())
print(df["Model"].nunique())
print(df["Mark_Model"].nunique())

69
656
698


In [63]:
# thus merging Mark & Model may reduce variable number without information loss 
df.drop(labels=["Mark", "Model"], axis=1, inplace=True)

In [64]:
df = add_dummies(df, column="Mark_Model", prefix="Mark_Model", drop_first=True)

In [65]:
df = set_target_last(df, target="late_count")

In [66]:
print("dataset dimensions now: {}".format(df.shape))

dataset dimensions now: (68579, 802)


In [67]:
df.head()

Unnamed: 0_level_0,Power,Weight,VehicleFirstRegistrationYear,Deductible_general,Fire,Theft,Natural_disasters,Road_assistance,Total_loss,Vandalism,Glass,Accident,Replacement_car,Leasing,Nb_of_payments,Premium,RenewalIndicator,IsRenewed,BalticRating,Terminated,payments_total,Policy_pre_lifetime_days,Policy_lifetime_days,Policy_actual_lifetime_days,Client_age_on_issue,PolicyIssueDate_y,PolicyIssueDate_m,PolicyIssueDate_d,PolicyStartDate_y,PolicyStartDate_m,PolicyStartDate_d,PolicyEndDate_y,PolicyEndDate_m,PolicyEndDate_d,PolicyActualEndDate_y,PolicyActualEndDate_m,PolicyActualEndDate_d,VehicleType_Lorry,VehicleType_Motorcycle,VehicleType_Passenger car,VehicleType_Tractor,VehicleType_Truck,VehicleType_Van,VehicleUsage_Agricultural Machinery,VehicleUsage_Dangerous Cargo,VehicleUsage_Forest Machinery,VehicleUsage_Operative Transport,VehicleUsage_Other,VehicleUsage_Regular,VehicleUsage_Remote Line,VehicleUsage_Short-term rental,VehicleUsage_Taxi,Gender_M,Region_HIIUMAA,Region_IDA-VIRUMAA,Region_JÄRVAMAA,Region_JÕGEVAMAA,Region_LÄÄNE-VIRUMAA,Region_LÄÄNEMAA,Region_Missing,Region_PÄRNUMAA,Region_PÕLVAMAA,Region_RAPLAMAA,Region_SAAREMAA,Region_TARTUMAA,Region_VALGAMAA,Region_VILJANDIMAA,Region_VÕRUMAA,BMClassMOD_A1,BMClassMOD_A10,BMClassMOD_A2,BMClassMOD_A3,BMClassMOD_A4,BMClassMOD_A5,BMClassMOD_A6,BMClassMOD_A7,BMClassMOD_A8,BMClassMOD_A9,BMClassMOD_P1,BMClassMOD_P10,BMClassMOD_P2,BMClassMOD_P3,BMClassMOD_P4,BMClassMOD_P5,BMClassMOD_P6,BMClassMOD_P7,BMClassMOD_P8,BMClassMOD_P9,Channel_Direct,Channel_Internet,Channel_Lean Operator,Channel_Partner,Channel_Unknown,FuelType_Diesel,FuelType_Diesel-Hybrid,FuelType_Electricity,FuelType_Gasoline,FuelType_Gasoline-Hybrid,FuelType_Missing,DriveTrain_Front,...,Mark_Model_SUBARU_LEGACY,Mark_Model_SUBARU_LEVORG,Mark_Model_SUBARU_OTHER,Mark_Model_SUBARU_OUTBACK,Mark_Model_SUBARU_TRIBECA,Mark_Model_SUBARU_XV,Mark_Model_SUZUKI_ALTO,Mark_Model_SUZUKI_BALENO,Mark_Model_SUZUKI_GRAND VITARA,Mark_Model_SUZUKI_IGNIS,Mark_Model_SUZUKI_JIMNY,Mark_Model_SUZUKI_LIANA,Mark_Model_SUZUKI_SPLASH,Mark_Model_SUZUKI_SWIFT,Mark_Model_SUZUKI_SX4,Mark_Model_SUZUKI_VITARA,Mark_Model_SUZUKI_VL 1500,Mark_Model_SUZUKI_VZR 1800,Mark_Model_SUZUKI_WAGON R+,Mark_Model_TEMSA_OTHER,Mark_Model_TOYOTA_AURIS,Mark_Model_TOYOTA_AVENSIS,Mark_Model_TOYOTA_AYGO,Mark_Model_TOYOTA_C-HR,Mark_Model_TOYOTA_CAMRY,Mark_Model_TOYOTA_CARINA,Mark_Model_TOYOTA_CELICA,Mark_Model_TOYOTA_COROLLA,Mark_Model_TOYOTA_COROLLA VERSO,Mark_Model_TOYOTA_GT86,Mark_Model_TOYOTA_HIACE,Mark_Model_TOYOTA_HILUX,Mark_Model_TOYOTA_IQ,Mark_Model_TOYOTA_LAND CRUISER,Mark_Model_TOYOTA_MR2,Mark_Model_TOYOTA_OTHER,Mark_Model_TOYOTA_PREVIA,Mark_Model_TOYOTA_PRIUS,Mark_Model_TOYOTA_PROACE,Mark_Model_TOYOTA_RAV-4,Mark_Model_TOYOTA_STARLET,Mark_Model_TOYOTA_URBAN CRUISER,Mark_Model_TOYOTA_VERSO,Mark_Model_TOYOTA_YARIS,Mark_Model_TOYOTA_YARIS VERSO,Mark_Model_VALTRA_T151E,Mark_Model_VALTRA_T202 VERSU,Mark_Model_VAZ_111,Mark_Model_VOLKSWAGEN_AMAROK,Mark_Model_VOLKSWAGEN_ARTEON,Mark_Model_VOLKSWAGEN_BEETLE,Mark_Model_VOLKSWAGEN_BORA,Mark_Model_VOLKSWAGEN_CADDY,Mark_Model_VOLKSWAGEN_CARAVELLE,Mark_Model_VOLKSWAGEN_CC,Mark_Model_VOLKSWAGEN_CRAFTER,Mark_Model_VOLKSWAGEN_EOS,Mark_Model_VOLKSWAGEN_GOLF,Mark_Model_VOLKSWAGEN_JETTA,Mark_Model_VOLKSWAGEN_KOMBI,Mark_Model_VOLKSWAGEN_LT35,Mark_Model_VOLKSWAGEN_LT46,Mark_Model_VOLKSWAGEN_MULTIVAN,Mark_Model_VOLKSWAGEN_OTHER,Mark_Model_VOLKSWAGEN_PASSAT,Mark_Model_VOLKSWAGEN_PHAETON,Mark_Model_VOLKSWAGEN_POLO,Mark_Model_VOLKSWAGEN_SCIROCCO,Mark_Model_VOLKSWAGEN_SHARAN,Mark_Model_VOLKSWAGEN_T-ROC,Mark_Model_VOLKSWAGEN_TIGUAN,Mark_Model_VOLKSWAGEN_TOUAREG,Mark_Model_VOLKSWAGEN_TOURAN,Mark_Model_VOLKSWAGEN_TRANSPORTER,Mark_Model_VOLKSWAGEN_UP!,Mark_Model_VOLVO_9700,Mark_Model_VOLVO_C30,Mark_Model_VOLVO_C70,Mark_Model_VOLVO_FH,Mark_Model_VOLVO_FH-440,Mark_Model_VOLVO_FH-480,Mark_Model_VOLVO_FH-480 6X2,Mark_Model_VOLVO_FL,Mark_Model_VOLVO_FL612,Mark_Model_VOLVO_FM,Mark_Model_VOLVO_FM12,Mark_Model_VOLVO_S40,Mark_Model_VOLVO_S60,Mark_Model_VOLVO_S80,Mark_Model_VOLVO_V40,Mark_Model_VOLVO_V50,Mark_Model_VOLVO_V60,Mark_Model_VOLVO_V70,Mark_Model_VOLVO_XC60,Mark_Model_VOLVO_XC70,Mark_Model_VOLVO_XC90,Mark_Model_YAMAHA_FZ6-S,Mark_Model_YAMAHA_XJ6,Mark_Model_YAMAHA_YZF-R1,late_count
policy_guid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1,Unnamed: 187_level_1,Unnamed: 188_level_1,Unnamed: 189_level_1,Unnamed: 190_level_1,Unnamed: 191_level_1,Unnamed: 192_level_1,Unnamed: 193_level_1,Unnamed: 194_level_1,Unnamed: 195_level_1,Unnamed: 196_level_1,Unnamed: 197_level_1,Unnamed: 198_level_1,Unnamed: 199_level_1,Unnamed: 200_level_1,Unnamed: 201_level_1
{E4FD3CD8-4141-4E55-A74C-C983ED7EFD8E},108,1895,2007,191.0,1,1,1,0,1,1,0,1,0,0,12,261.09,0,0,3,1,9,0,365,254,31,2015,9,13,2015,9,13,2016,9,12,2016,5,24,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6
{CD63AD22-81C2-4A86-8E56-AF1084BE49E3},120,2050,2008,191.0,1,1,1,0,1,1,0,1,0,0,12,297.22,0,1,0,1,2,0,365,29,56,2015,9,23,2015,9,23,2016,9,22,2015,10,22,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
{480DDB89-BA11-4219-A92C-330ABC6BE1EC},125,1970,2012,191.0,1,1,1,1,1,1,1,1,0,1,4,534.22,0,0,0,0,4,0,365,365,48,2015,9,14,2015,9,14,2016,9,13,2016,9,13,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2
{74A1885E-CC4D-435B-B7CF-CAD0287FA814},85,2510,2007,191.0,1,1,1,1,1,1,1,1,0,0,12,275.24,0,1,5,1,2,0,365,14,61,2015,9,20,2015,9,20,2016,9,19,2015,10,4,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
{FF6D78C8-B660-4D58-8A9D-0FD6E32E2A03},77,1780,2013,191.0,1,0,1,1,1,1,1,1,0,0,12,239.77,0,0,2,1,6,0,365,150,71,2015,11,23,2015,11,23,2016,11,22,2016,4,22,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


#### Test-train split

In [68]:
X_train, X_test, y_train, y_test = make_train_test_split(df)

X.shape: (68579, 801)
y.shape: (68579,)

X_train.shape: (61721, 801)
X_test.shape: (6858, 801)
y_train.shape: (61721,)
y_test.shape: (6858,)


#### Learning itself

In [69]:
rf = RandomForestClassifier(n_estimators=100, random_state=0)

In [70]:
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [71]:
feature_names = df.columns.values.tolist()[:-1]
feature_weights = rf.feature_importances_

print_feature_importances(feature_names, feature_weights, prefices=[
    "VehicleType", 
    "VehicleUsage",
    "Gender",
    "Region",
    "BMClassMOD",
    "Channel",
    "FuelType",
    "DriveTrain",
    "sales_type",
    "Mark_Model"
])

Feature importances:
1	0.1436	 Mark_Model
2	0.0550	 Region
3	0.0477	 Premium
4	0.0468	 payments_total
5	0.0452	 Client_age_on_issue
6	0.0439	 Weight
7	0.0401	 Power
8	0.0372	 VehicleFirstRegistrationYear
9	0.0336	 PolicyActualEndDate_d
10	0.0331	 Nb_of_payments
11	0.0329	 PolicyEndDate_d
12	0.0327	 PolicyIssueDate_d
13	0.0326	 PolicyStartDate_d
14	0.0283	 BMClassMOD
15	0.0266	 BalticRating
16	0.0243	 PolicyActualEndDate_m
17	0.0229	 PolicyEndDate_m
18	0.0229	 PolicyStartDate_m
19	0.0228	 PolicyIssueDate_m
20	0.0215	 Channel
21	0.0212	 DriveTrain
22	0.0208	 Policy_actual_lifetime_days
23	0.0189	 FuelType
24	0.0129	 Gender
25	0.0125	 sales_type
26	0.0121	 PolicyActualEndDate_y
27	0.0114	 Policy_lifetime_days
28	0.0109	 RenewalIndicator
29	0.0105	 Leasing
30	0.0103	 PolicyIssueDate_y
31	0.0101	 PolicyStartDate_y
32	0.0100	 PolicyEndDate_y
33	0.0088	 IsRenewed
34	0.0061	 Terminated
35	0.0044	 Deductible_general
36	0.0039	 VehicleType
37	0.0036	 Theft
38	0.0025	 VehicleUsage
39	0.0022	 Glas

In [72]:
class_probs = rf.predict_proba(X_test)

In [73]:
class_probs

array([[0.68, 0.17, 0.05, ..., 0.  , 0.  , 0.  ],
       [0.59, 0.16, 0.18, ..., 0.  , 0.  , 0.  ],
       [0.6 , 0.15, 0.11, ..., 0.  , 0.  , 0.  ],
       ...,
       [0.61, 0.21, 0.06, ..., 0.  , 0.  , 0.  ],
       [0.62, 0.14, 0.1 , ..., 0.  , 0.  , 0.  ],
       [0.85, 0.1 , 0.01, ..., 0.01, 0.  , 0.  ]])

In [74]:
# probabilities of positive class
y_probs = 1 - class_probs[:, 0]

# predicted class
y_pred = rf.predict(X_test)

In [75]:
y_test_bin = y_test.astype(bool).astype(int)
y_pred_bin = y_pred.astype(bool).astype(int)

In [76]:
print_prediction_example(y_test_bin, y_probs, y_pred_bin)

Prediction example:
    Test  Pred  Prob
0      0     0  0.32
1      1     0  0.41
2      0     0  0.40
3      1     1  0.72
4      0     0  0.54
5      0     0  0.70
6      0     0  0.51
7      1     0  0.36
8      0     0  0.13
9      0     0  0.41
10     1     0  0.37
11     0     0  0.30
12     1     0  0.14
13     1     0  0.37
14     1     0  0.42
15     1     0  0.49
16     0     0  0.37
17     0     0  0.17
18     1     0  0.45
19     0     0  0.48


#### Performance measures

In [77]:
print_performance_report(y_test_bin, y_pred_bin)

Confusion matrix:
[[4283   82]
 [2247  246]]

TN: 4283
FP: 82
FN: 2247
TP: 246

Performance measures:
                                                      Value
Measure                                                    
Sensitivity, hit rate, recall, or true positive rate  0.099
Specificity or true negative rate                     0.981
Precision or positive predictive value                0.750
Negative predictive value                             0.656
Fall out or false positive rate                       0.019
False negative rate                                   0.901
False discovery rate                                  0.250
Overall accuracy                                      0.660

Scikit-learn classification report:
             precision    recall  f1-score   support

          0       0.66      0.98      0.79      4365
          1       0.75      0.10      0.17      2493

avg / total       0.69      0.66      0.56      6858



### Conclusion

In [78]:
# True positive rate is one of key performance measures here and both models
# failed at it, though RandomForest was a bit better.

# Let's recall top-5 significant features:

# LogisticRegression:
# 1 0.5137 Mark
# 2 0.1243 PolicyEndDate_y
# 3 0.0693 PolicyIssueDate_y
# 4 0.0693 PolicyStartDate_y
# 5 0.0314 PolicyEndDate_m

# RandomForest
# 1 0.1436 Mark_Model
# 2 0.0550 Region
# 3 0.0477 Premium
# 4 0.0468 payments_total
# 5 0.0452 Client_age_on_issue

# Car Mark seems to be the most significatn feature in both cases. Probably 
# policyholder's income influents both his ability to buy cars of certain
# Marks set and probability to miss payments.

# Regarding model usage at insurance business: as told in task definition, 
# "For financial companies, including insurance providers, it is crucial 
# to track their cash flows in order to meet liabilities".

# That's all, folks.
# i.