In [3]:
import sklearn as sk
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

print('numpy version:', np.__version__)
print('pandas version:', pd.__version__)
print('scikit-learn version:', sk.__version__)
print('matplotlib version:', matplotlib.__version__)

%matplotlib inline

numpy version: 1.12.1
pandas version: 0.20.1
scikit-learn version: 0.18.1
matplotlib version: 2.0.2


In [4]:
datafolder = 'data/'
filename = 'dataset_informa.csv'
df = pd.read_csv(datafolder + filename, sep=',', low_memory=False, encoding = 'ISO-8859-1')
print(df.shape)
df.head()

(190571, 49)


Unnamed: 0,AGENCY_ID,PRIMARY_AGENCY_ID,PROD_ABBR,PROD_LINE,STATE_ABBR,STAT_PROFILE_DATE_YEAR,RETENTION_POLY_QTY,POLY_INFORCE_QTY,PREV_POLY_INFORCE_QTY,NB_WRTN_PREM_AMT,...,PL_BOUND_CT_ELINKS,PL_QUO_CT_ELINKS,PL_BOUND_CT_PLRANK,PL_QUO_CT_PLRANK,PL_BOUND_CT_eQTte,PL_QUO_CT_eQTte,PL_BOUND_CT_APPLIED,PL_QUO_CT_APPLIED,PL_BOUND_CT_TRANSACTNOW,PL_QUO_CT_TRANSACTNOW
0,3.0,3,BOILERMACH,CL,IN,2005.0,0.0,0.0,0.0,40.0,...,0.0,0.0,0.0,103.0,50.0,288.0,0.0,0.0,0.0,0.0
1,3.0,3,BOILERMACH,CL,IN,2006.0,0.0,0.0,0.0,151.0,...,0.0,0.0,0.0,103.0,50.0,288.0,0.0,0.0,0.0,0.0
2,3.0,3,BOILERMACH,CL,IN,2007.0,0.0,0.0,0.0,40.0,...,0.0,0.0,0.0,103.0,50.0,288.0,0.0,0.0,0.0,0.0
3,3.0,3,BOILERMACH,CL,IN,2008.0,0.0,0.0,0.0,69.0,...,0.0,0.0,0.0,103.0,50.0,288.0,0.0,0.0,0.0,0.0
4,3.0,3,BOILERMACH,CL,IN,2009.0,0.0,0.0,0.0,28.0,...,0.0,0.0,0.0,103.0,50.0,288.0,0.0,0.0,0.0,0.0


In [5]:
# Primary_Agency_ID: When this column data is missing/99999 we are going to replace it with the Agency_ID. 
# We can not find any way to associate a primary agency with the agency in question so will replace it with its ID.
df['PRIMARY_AGENCY_ID'] = np.where(df['PRIMARY_AGENCY_ID'] == 0, df['PRIMARY_AGENCY_ID'], df['AGENCY_ID'])

# Prev_Poly_Inforce_QTY: Contains 99999 values. 
# These values mean there are no previous years or this year did not have data (CL).
# These values will be replaces by 0
df['PREV_POLY_INFORCE_QTY'] = df['PREV_POLY_INFORCE_QTY'].replace(9999,0)

df

Unnamed: 0,AGENCY_ID,PRIMARY_AGENCY_ID,PROD_ABBR,PROD_LINE,STATE_ABBR,STAT_PROFILE_DATE_YEAR,RETENTION_POLY_QTY,POLY_INFORCE_QTY,PREV_POLY_INFORCE_QTY,NB_WRTN_PREM_AMT,...,PL_BOUND_CT_ELINKS,PL_QUO_CT_ELINKS,PL_BOUND_CT_PLRANK,PL_QUO_CT_PLRANK,PL_BOUND_CT_eQTte,PL_QUO_CT_eQTte,PL_BOUND_CT_APPLIED,PL_QUO_CT_APPLIED,PL_BOUND_CT_TRANSACTNOW,PL_QUO_CT_TRANSACTNOW
0,3.0,3.0,BOILERMACH,CL,IN,2005.0,0.0,0.0,0.0,40.00,...,0.0,0.0,0.0,103.0,50.0,288.0,0.0,0.0,0.0,0.0
1,3.0,3.0,BOILERMACH,CL,IN,2006.0,0.0,0.0,0.0,151.00,...,0.0,0.0,0.0,103.0,50.0,288.0,0.0,0.0,0.0,0.0
2,3.0,3.0,BOILERMACH,CL,IN,2007.0,0.0,0.0,0.0,40.00,...,0.0,0.0,0.0,103.0,50.0,288.0,0.0,0.0,0.0,0.0
3,3.0,3.0,BOILERMACH,CL,IN,2008.0,0.0,0.0,0.0,69.00,...,0.0,0.0,0.0,103.0,50.0,288.0,0.0,0.0,0.0,0.0
4,3.0,3.0,BOILERMACH,CL,IN,2009.0,0.0,0.0,0.0,28.00,...,0.0,0.0,0.0,103.0,50.0,288.0,0.0,0.0,0.0,0.0
5,3.0,3.0,BOILERMACH,CL,IN,2010.0,0.0,0.0,0.0,120.00,...,0.0,0.0,0.0,103.0,50.0,288.0,0.0,0.0,0.0,0.0
6,3.0,3.0,BOILERMACH,CL,IN,2011.0,0.0,0.0,0.0,231.00,...,0.0,0.0,0.0,103.0,50.0,288.0,0.0,0.0,0.0,0.0
7,3.0,3.0,BOILERMACH,CL,IN,2012.0,0.0,0.0,0.0,0.00,...,0.0,0.0,0.0,103.0,50.0,288.0,0.0,0.0,0.0,0.0
8,3.0,3.0,BOILERMACH,CL,IN,2013.0,0.0,0.0,0.0,111.00,...,0.0,0.0,0.0,103.0,50.0,288.0,0.0,0.0,0.0,0.0
9,3.0,3.0,BOILERMACH,CL,IN,2014.0,0.0,0.0,0.0,213.00,...,0.0,0.0,0.0,103.0,50.0,288.0,0.0,0.0,0.0,0.0


In [8]:
#Get incomplete rows in the dataframe
df_incomplete = df[(df["MONTHS"] != 12)]
df_incomplete[recal].head(5)

Unnamed: 0,POLY_INFORCE_QTY,NB_WRTN_PREM_AMT,WRTN_PREM_AMT,PRD_ERND_PREM_AMT,PRD_INCRD_LOSSES_AMT
0,0.0,40.0,3074.63,3005.77,0.0
10,0.0,0.0,1922.83,1534.44,0.0
11,0.0,0.0,361.68,216.2,0.0
21,0.0,0.0,0.0,530.27,0.0
22,0.0,0.0,13607.47,17667.42,16667.51


In [31]:
#Get a list of all values needing to be recalculated per item
recal = ["POLY_INFORCE_QTY", "NB_WRTN_PREM_AMT", "WRTN_PREM_AMT", "PRD_ERND_PREM_AMT", "PRD_INCRD_LOSSES_AMT"]
df_filtered = df_incomplete
for index, row in df_filtered.iterrows():
    #Amount of months
    months = row["MONTHS"]
    for column in recal:
        newvalue = row[column] * (12 - months) #Value times amount of months missing
        df_filtered.set_value(index, column, newvalue) #Set new row in dataset
    row["MONTHS"] = 12 #Reset the months to be 12
df_filtered[recal].head(5)

Unnamed: 0,POLY_INFORCE_QTY,NB_WRTN_PREM_AMT,WRTN_PREM_AMT,PRD_ERND_PREM_AMT,PRD_INCRD_LOSSES_AMT
0,0.0,640.0,49194.08,48092.32,0.0
10,0.0,0.0,94218.67,75187.56,0.0
11,0.0,0.0,5786.88,3459.2,0.0
21,0.0,0.0,0.0,25983.23,0.0
22,0.0,0.0,217719.52,282678.72,266680.16


In [15]:
df_old = df_filtered #Copy of the dataset for testing

In [32]:
#RETENTION_RATIO = RETENTION_POLY_QTY / PREV_POLY_INFORCE_QTY
#LOSS_RATIO = PRD_INCRD_LOSSES_AMT / WRTN_PREM_AMT

for index, row in df_filtered.iterrows():
    if (row["WRTN_PREM_AMT"] > 0) and (row["PRD_INCRD_LOSSES_AMT"] > 0):
        df_filtered.set_value(index, "LOSS_RATIO", row["PRD_INCRD_LOSSES_AMT"] / row["WRTN_PREM_AMT"])
    else:
        df_filtered.set_value(index, "LOSS_RATIO", 0)
    if (row["PREV_POLY_INFORCE_QTY"] > 0) and (row["RETENTION_POLY_QTY"] > 0):
        df_filtered.set_value(index, "RETENTION_RATIO", row["RETENTION_POLY_QTY"] / row["PREV_POLY_INFORCE_QTY"])
    else:
        df_filtered.set_value(index, "RETENTION_RATIO",0)

In [29]:
df_old[["LOSS_RATIO", "RETENTION_RATIO"]]

Unnamed: 0,LOSS_RATIO,RETENTION_RATIO
0,0.000000,99999.000000
10,0.000000,99999.000000
11,0.000000,99999.000000
21,99999.000000,99999.000000
22,1.224879,99999.000000
32,0.030583,99999.000000
33,0.000000,99999.000000
43,0.000000,99999.000000
44,0.717888,99999.000000
54,0.273193,99999.000000


In [34]:
df_filtered[["PRD_INCRD_LOSSES_AMT","WRTN_PREM_AMT","LOSS_RATIO"]]

Unnamed: 0,PRD_INCRD_LOSSES_AMT,WRTN_PREM_AMT,LOSS_RATIO
0,0.00,49194.08,0.000000
10,0.00,94218.67,0.000000
11,0.00,5786.88,0.000000
21,0.00,0.00,0.000000
22,266680.16,217719.52,1.224879
32,24117.80,788606.00,0.030583
33,0.00,28750.56,0.000000
43,0.00,18395.58,0.000000
44,1170601.12,1630618.72,0.717888
54,878932.11,3217257.19,0.273193


In [35]:
df_filtered[["RETENTION_POLY_QTY","PREV_POLY_INFORCE_QTY", "RETENTION_RATIO"]]

Unnamed: 0,RETENTION_POLY_QTY,PREV_POLY_INFORCE_QTY,RETENTION_RATIO
0,0.0,0.0,0.000000
10,0.0,0.0,0.000000
11,0.0,0.0,0.000000
21,0.0,0.0,0.000000
22,0.0,0.0,0.000000
32,0.0,0.0,0.000000
33,0.0,0.0,0.000000
43,0.0,0.0,0.000000
44,0.0,0.0,0.000000
54,0.0,0.0,0.000000


In [3]:
df['VENDORID'] = df['VENDOR'].map({'Unknown': 0, 'A': 1, 'B':2, 'C' : 3, 'E':4})

In [6]:
df.head(50)

Unnamed: 0,AGENCY_ID,PRIMARY_AGENCY_ID,PROD_ABBR,PROD_LINE,STATE_ABBR,STAT_PROFILE_DATE_YEAR,RETENTION_POLY_QTY,POLY_INFORCE_QTY,PREV_POLY_INFORCE_QTY,NB_WRTN_PREM_AMT,...,PL_QUO_CT_ELINKS,PL_BOUND_CT_PLRANK,PL_QUO_CT_PLRANK,PL_BOUND_CT_eQTte,PL_QUO_CT_eQTte,PL_BOUND_CT_APPLIED,PL_QUO_CT_APPLIED,PL_BOUND_CT_TRANSACTNOW,PL_QUO_CT_TRANSACTNOW,VENDORID
0,3.0,3,BOILERMACH,CL,IN,2005.0,0.0,0.0,0.0,40.0,...,0.0,0.0,103.0,50.0,288.0,0.0,0.0,0.0,0.0,0.0
1,3.0,3,BOILERMACH,CL,IN,2006.0,0.0,0.0,0.0,151.0,...,0.0,0.0,103.0,50.0,288.0,0.0,0.0,0.0,0.0,0.0
2,3.0,3,BOILERMACH,CL,IN,2007.0,0.0,0.0,0.0,40.0,...,0.0,0.0,103.0,50.0,288.0,0.0,0.0,0.0,0.0,0.0
3,3.0,3,BOILERMACH,CL,IN,2008.0,0.0,0.0,0.0,69.0,...,0.0,0.0,103.0,50.0,288.0,0.0,0.0,0.0,0.0,0.0
4,3.0,3,BOILERMACH,CL,IN,2009.0,0.0,0.0,0.0,28.0,...,0.0,0.0,103.0,50.0,288.0,0.0,0.0,0.0,0.0,0.0
5,3.0,3,BOILERMACH,CL,IN,2010.0,0.0,0.0,0.0,120.0,...,0.0,0.0,103.0,50.0,288.0,0.0,0.0,0.0,0.0,0.0
6,3.0,3,BOILERMACH,CL,IN,2011.0,0.0,0.0,0.0,231.0,...,0.0,0.0,103.0,50.0,288.0,0.0,0.0,0.0,0.0,0.0
7,3.0,3,BOILERMACH,CL,IN,2012.0,0.0,0.0,0.0,0.0,...,0.0,0.0,103.0,50.0,288.0,0.0,0.0,0.0,0.0,0.0
8,3.0,3,BOILERMACH,CL,IN,2013.0,0.0,0.0,0.0,111.0,...,0.0,0.0,103.0,50.0,288.0,0.0,0.0,0.0,0.0,0.0
9,3.0,3,BOILERMACH,CL,IN,2014.0,0.0,0.0,0.0,213.0,...,0.0,0.0,103.0,50.0,288.0,0.0,0.0,0.0,0.0,0.0
