### INSURANCE POLICY LAPSE PREDICTION

### Import Packages

In [26]:
#A00
#import basic packages.
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier

### Import Files

In [27]:
#A00
#import files
client_df = pd.read_csv('Practice/data/client_data.csv')
payment_df = pd.read_csv('Practice/data/payment_history.csv')
policy_df = pd.read_csv('Practice/data/policy_data.csv')
main_train_df = pd.read_csv('Practice/data/train.csv')
smp_sub_df = pd.read_csv('Practice/data/sample_sub.csv')

### Expolaratory Data Analysis

In [3]:
#EDA on main_train_df

main_train_df.head(3)

Unnamed: 0,Policy ID,Lapse,Lapse Year
0,PID_4928TWH,?,?
1,PID_KBLLEGK,?,?
2,PID_90F0QA3,?,?


The Policy ID with Lapse and Lapse Year '?' values are meant to be in the test df and will be separated

In [4]:
main_train_df.shape

(51685, 3)

The train df has 51,683 rows and 3 columns

In [5]:
main_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51685 entries, 0 to 51684
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Policy ID   51685 non-null  object
 1   Lapse       51685 non-null  object
 2   Lapse Year  51685 non-null  object
dtypes: object(3)
memory usage: 1.2+ MB


In [9]:
main_train_df.isna().sum()

Policy ID     0
Lapse         0
Lapse Year    0
dtype: int64

No null values

In [10]:
len(main_train_df['Policy ID'].unique())

51685

There are no duplicate Policy ID values

In [11]:
main_train_df['Lapse'].unique()

array(['?', '1'], dtype=object)

Apart from '?', the Lapse value is equal to 1

In [12]:
main_train_df['Lapse Year'].unique()

array(['?', '2019', '2018', '2017'], dtype=object)

The Lapse Years are between (2017-2019)

### Feature Engineering

First, we create the test df. Our target variable is Lapse, therefore we drop the Lapse Year column

In [68]:
#A00
test_df = main_train_df[main_train_df['Lapse'] == '?'].drop(['Lapse Year'], axis =1)
test_df.head(3)

Unnamed: 0,Policy ID,Lapse
0,PID_4928TWH,?
1,PID_KBLLEGK,?
2,PID_90F0QA3,?


In [14]:
test_df['Lapse'].unique()

array(['?'], dtype=object)

In [15]:
test_df.shape

(43707, 2)

Out of the 51,685 rows, 43,707 are in our test df

We create the train_df without the test rows

In [69]:
#A00
train_df = main_train_df[main_train_df['Lapse'] != '?']
print(train_df.shape)
train_df.head(3)

(7978, 3)


Unnamed: 0,Policy ID,Lapse,Lapse Year
16,PID_MFAAYNJ,1,2019
23,PID_TICDPAY,1,2019
30,PID_SPACC3N,1,2018


In [33]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7978 entries, 16 to 51680
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Policy ID   7978 non-null   object
 1   Lapse       7978 non-null   object
 2   Lapse Year  7978 non-null   object
dtypes: object(3)
memory usage: 249.3+ KB


Our train df has 7,978 columns. The Lapse and Lapse Year are stored as objects. We will convert them to integers

In [70]:
#A00
#Converting Lapse & Lapse Year to integers
train_df.loc[:, ('Lapse Year')] = pd.to_numeric(train_df['Lapse Year'])
train_df.loc[:, ('Lapse')] = pd.to_numeric(train_df['Lapse'])
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7978 entries, 16 to 51680
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Policy ID   7978 non-null   object
 1   Lapse       7978 non-null   int64 
 2   Lapse Year  7978 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 249.3+ KB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.loc[:, ('Lapse Year')] = pd.to_numeric(train_df['Lapse Year'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.loc[:, ('Lapse')] = pd.to_numeric(train_df['Lapse'])


Success!

In [16]:
train_df['Lapse'].unique()

array([1], dtype=int64)

In [17]:
train_df['Lapse Year'].unique()

array([2019, 2018, 2017], dtype=int64)

We combine the following main features to the train_df: <br>
POLICY_ID, LAPSE, LAPSE_YEAR - from train_df <br/>
SEX, BIRTH_DATE - from client_df <br/>
AMOUNT_PAID, DATE_PAID, PREMIUM_DUE_DATE - from payment_df <br/>
PRODUCT_CODE(PPR_PRODCD), PRINC_RSHIP(CLF_LIFECD), SUM_ASSURED(NPR_SUM_ASSURE), CATEGORY - from policy_df


In [71]:
#A00 
#Combine main columns in the datasets to the train_df
client_df_features = ['Policy ID','NPH_SEX','NPH_BIRTHDATE']
payment_df_features = ['Policy ID', 'AMOUNTPAID', 'DATEPAID', 'PREMIUMDUEDATE']
policy_df_features = ['Policy ID', 'PPR_PRODCD', 'CLF_LIFECD', 'NPR_SUMASSURED', 'CATEGORY']

#Join the Client Dataset
train_df = train_df.join(
    client_df[client_df_features].set_index('Policy ID'),
    on = 'Policy ID', how = 'inner')

#Join the Payment Dataset
train_df = train_df.join(
    payment_df[payment_df_features].set_index('Policy ID'),
    on = 'Policy ID', how = 'inner')

#Join the Policy Dataset
train_df = train_df.join(
    policy_df[policy_df_features].set_index('Policy ID'),
    on = 'Policy ID', how = 'inner')

#Rename the columns
rename_cols = {
    'Policy ID': 'POLICY_ID', 'NPH_SEX': 'GENDER',
    'NPH_BIRTHDATE': 'BIRTH YEAR', 'AMOUNTPAID': 'AMOUNT_PAID',
    'DATEPAID': 'DATE_PAID', 'PREMIUMDUEDATE': 'PREMIUM_DUE_DATE',
    'PPR_PRODCD': 'PRODUCT_CODE', 'CLF_LIFECD': 'PRINC_RSHIP',
    'NPR_SUMASSURED': 'SUM_ASSURED', 'Lapse': 'LAPSE', 'Lapse Year': 'LAPSE_YEAR'
}

train_df.rename(columns=rename_cols, inplace=True)

#Re-order the features
train_df = train_df.reindex(columns=
    ['POLICY_ID'] + [col for col in train_df.columns if col not in\
        ['POLICY_ID', 'LAPSE', 'LAPSE_YEAR']] + ['LAPSE_YEAR'] + ['LAPSE']
    )

train_df

 

Unnamed: 0,POLICY_ID,GENDER,BIRTH YEAR,AMOUNT_PAID,DATE_PAID,PREMIUM_DUE_DATE,PRODUCT_CODE,PRINC_RSHIP,SUM_ASSURED,CATEGORY,LAPSE_YEAR,LAPSE
16,PID_MFAAYNJ,F,1987,0.0,2018-01-10 00:00:00,2018-01-10 00:00:00,PPR_PRODCD_KOFUYNN,4,27631.086529,CATEGORY_GWW4FYB,2019,1
16,PID_MFAAYNJ,F,1987,0.0,2018-01-10 00:00:00,2018-01-10 00:00:00,PPR_PRODCD_KOFUYNN,1,123487.430359,CATEGORY_GWW4FYB,2019,1
16,PID_MFAAYNJ,F,1987,0.0,2018-01-10 00:00:00,2018-01-10 00:00:00,PPR_PRODCD_KOFUYNN,2,123487.430359,CATEGORY_GWW4FYB,2019,1
16,PID_MFAAYNJ,F,1987,0.0,2018-01-10 00:00:00,2018-01-10 00:00:00,PPR_PRODCD_B2KVCE7,2,,CATEGORY_GWW4FYB,2019,1
16,PID_MFAAYNJ,F,1987,0.0,2018-01-10 00:00:00,2018-01-10 00:00:00,PPR_PRODCD_B2KVCE7,3,,CATEGORY_GWW4FYB,2019,1
...,...,...,...,...,...,...,...,...,...,...,...,...
51671,PID_BS40NIJ,F,1984,0.0,2018-01-11 00:00:00,2018-01-11 00:00:00,PPR_PRODCD_KOFUYNN,2,27631.086529,CATEGORY_GWW4FYB,2019,1
51671,PID_BS40NIJ,F,1984,0.0,2018-01-11 00:00:00,2018-01-11 00:00:00,PPR_PRODCD_B2KVCE7,4,,CATEGORY_GWW4FYB,2019,1
51671,PID_BS40NIJ,F,1984,0.0,2018-01-11 00:00:00,2018-01-11 00:00:00,PPR_PRODCD_KOFUYNN,3,27631.086529,CATEGORY_GWW4FYB,2019,1
51671,PID_BS40NIJ,F,1984,0.0,2018-01-11 00:00:00,2018-01-11 00:00:00,PPR_PRODCD_KOFUYNN,1,27631.086529,CATEGORY_GWW4FYB,2019,1
