In [1]:
import sklearn as sk
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

# Report the versions of the core libraries so the run can be reproduced.
for label, module in (
    ('numpy', np),
    ('pandas', pd),
    ('scikit-learn', sk),
    ('matplotlib', matplotlib),
):
    print('{} version:'.format(label), module.__version__)

%matplotlib inline

numpy version: 1.12.1
pandas version: 0.20.1
scikit-learn version: 0.18.1
matplotlib version: 2.0.2


In [2]:
# Location of the raw export. `filename` is reused later when the cleaned
# splits are written back to disk.
datafolder = 'data/'
filename = 'dataset_informa.csv'
csv_path = datafolder + filename

# low_memory=False makes pandas read the file in one pass instead of in chunks.
df = pd.read_csv(
    csv_path,
    sep=',',
    encoding='ISO-8859-1',
    low_memory=False,
)
print(df.shape)
df.head()

(190571, 49)


Unnamed: 0,AGENCY_ID,PRIMARY_AGENCY_ID,PROD_ABBR,PROD_LINE,STATE_ABBR,STAT_PROFILE_DATE_YEAR,RETENTION_POLY_QTY,POLY_INFORCE_QTY,PREV_POLY_INFORCE_QTY,NB_WRTN_PREM_AMT,...,PL_BOUND_CT_ELINKS,PL_QUO_CT_ELINKS,PL_BOUND_CT_PLRANK,PL_QUO_CT_PLRANK,PL_BOUND_CT_eQTte,PL_QUO_CT_eQTte,PL_BOUND_CT_APPLIED,PL_QUO_CT_APPLIED,PL_BOUND_CT_TRANSACTNOW,PL_QUO_CT_TRANSACTNOW
0,3.0,3,BOILERMACH,CL,IN,2005.0,0.0,0.0,0.0,40.0,...,0.0,0.0,0.0,103.0,50.0,288.0,0.0,0.0,0.0,0.0
1,3.0,3,BOILERMACH,CL,IN,2006.0,0.0,0.0,0.0,151.0,...,0.0,0.0,0.0,103.0,50.0,288.0,0.0,0.0,0.0,0.0
2,3.0,3,BOILERMACH,CL,IN,2007.0,0.0,0.0,0.0,40.0,...,0.0,0.0,0.0,103.0,50.0,288.0,0.0,0.0,0.0,0.0
3,3.0,3,BOILERMACH,CL,IN,2008.0,0.0,0.0,0.0,69.0,...,0.0,0.0,0.0,103.0,50.0,288.0,0.0,0.0,0.0,0.0
4,3.0,3,BOILERMACH,CL,IN,2009.0,0.0,0.0,0.0,28.0,...,0.0,0.0,0.0,103.0,50.0,288.0,0.0,0.0,0.0,0.0


In [3]:
# The comments in this cell describe 99999 as the marker for a missing value.
MISSING = 99999

# PRIMARY_AGENCY_ID: when this value is missing (NaN or 99999) it is replaced
# with the agency's own AGENCY_ID, because there is no way to associate a
# primary agency with the agency in question. Valid IDs are left untouched.
primary_missing = df['PRIMARY_AGENCY_ID'].isnull() | (df['PRIMARY_AGENCY_ID'] == MISSING)
df['PRIMARY_AGENCY_ID'] = np.where(primary_missing, df['AGENCY_ID'], df['PRIMARY_AGENCY_ID'])

# PREV_POLY_INFORCE_QTY: contains 99999 values.
# These mean there are no previous years or this year did not have data (CL).
# They are replaced by 0.
df['PREV_POLY_INFORCE_QTY'] = df['PREV_POLY_INFORCE_QTY'].replace(MISSING, 0)

# AGENCY_APPOINTMENT_YEAR: about 5000 missing values, replaced with the average.
# The average is computed over the known years only, so the 99999 markers do
# not inflate it. Series.replace returns a new Series, so the result has to be
# assigned back to the column.
known_year = df['AGENCY_APPOINTMENT_YEAR'] != MISSING
mean = df.loc[known_year, 'AGENCY_APPOINTMENT_YEAR'].mean()
df['AGENCY_APPOINTMENT_YEAR'] = df['AGENCY_APPOINTMENT_YEAR'].replace(MISSING, mean)

# ACTIVE_PRODUCERS, MAX_AGE and MIN_AGE:
# if the number of producers is not set (99999) it becomes 0; the same goes
# for the max and min age.
# (VENDOR)_START_YEAR: values vary between 1994 and 2015.
# A missing value (99999) means the vendor was not used, so it becomes 0.
zero_fill_columns = ['ACTIVE_PRODUCERS', 'MAX_AGE', 'MIN_AGE', 'PL_START_YEAR', 'CL_START_YEAR']
df[zero_fill_columns] = df[zero_fill_columns].replace(MISSING, 0)

# (VENDOR)_END_YEAR: the year the agency stopped using the vendor.
# A missing value means either the vendor was never used or it is still in use.
# Never used (start year 0) -> 0, still in use -> 1. Recorded end years are kept.
# NOTE(review): earlier versions overwrote every end year with the 0/1 flag;
# confirm that keeping real end years is what downstream consumers expect.
for vendor_line in ('CL', 'PL'):
    start_col = vendor_line + '_START_YEAR'
    end_col = vendor_line + '_END_YEAR'
    usage_flag = np.where(df[start_col] == 0, 0, 1)
    df[end_col] = np.where(df[end_col] == MISSING, usage_flag, df[end_col])


In [4]:
# Columns that are rescaled for rows that do not cover a full 12 months.
recal = ["POLY_INFORCE_QTY", "NB_WRTN_PREM_AMT", "WRTN_PREM_AMT", "PRD_ERND_PREM_AMT", "PRD_INCRD_LOSSES_AMT"]

# Work on an explicit copy so the writes below land in this frame and `df`
# itself is left untouched (no SettingWithCopyWarning).
df_incomplete = df[df["MONTHS"] != 12].copy()
# Same object under a second name; later cells refer to it as df_filtered.
df_filtered = df_incomplete

# Scale each value by the number of months missing from the year.
# NOTE(review): this keeps the existing formula value * (12 - MONTHS). If the
# goal is to extrapolate a partial year to a full year, value * 12 / MONTHS
# would be the usual form - confirm the intent before relying on these numbers.
missing_months = 12 - df_filtered["MONTHS"]
df_filtered[recal] = df_filtered[recal].mul(missing_months, axis=0)

# Mark the rescaled rows as covering a full year.
df_filtered["MONTHS"] = 12
df_filtered[recal].head(5)





Unnamed: 0,POLY_INFORCE_QTY,NB_WRTN_PREM_AMT,WRTN_PREM_AMT,PRD_ERND_PREM_AMT,PRD_INCRD_LOSSES_AMT
0,0.0,160.0,12298.52,12023.08,0.0
10,0.0,0.0,13459.81,10741.08,0.0
11,0.0,0.0,1446.72,864.8,0.0
21,0.0,0.0,0.0,3711.89,0.0
22,0.0,0.0,54429.88,70669.68,66670.04


In [5]:
# Second name for the frame used in the checks below. This is a plain
# assignment, so df_old refers to the same DataFrame object as df_filtered.
df_old = df_filtered
# First five rows of the recalculated columns for the incomplete-year rows.
df_incomplete.loc[:, recal].head(5)

Unnamed: 0,POLY_INFORCE_QTY,NB_WRTN_PREM_AMT,WRTN_PREM_AMT,PRD_ERND_PREM_AMT,PRD_INCRD_LOSSES_AMT
0,0.0,160.0,12298.52,12023.08,0.0
10,0.0,0.0,13459.81,10741.08,0.0
11,0.0,0.0,1446.72,864.8,0.0
21,0.0,0.0,0.0,3711.89,0.0
22,0.0,0.0,54429.88,70669.68,66670.04


In [6]:
# RETENTION_RATIO = RETENTION_POLY_QTY / PREV_POLY_INFORCE_QTY
# LOSS_RATIO = PRD_INCRD_LOSSES_AMT / WRTN_PREM_AMT
# A ratio is only defined when both numerator and denominator are positive;
# in every other case (including missing values) it is set to 0.
ratio_definitions = [
    ("LOSS_RATIO", "PRD_INCRD_LOSSES_AMT", "WRTN_PREM_AMT"),
    ("RETENTION_RATIO", "RETENTION_POLY_QTY", "PREV_POLY_INFORCE_QTY"),
]
for ratio_col, numerator_col, denominator_col in ratio_definitions:
    numerator = df_filtered[numerator_col]
    denominator = df_filtered[denominator_col]
    is_defined = (numerator > 0) & (denominator > 0)
    # Assign the column in place (no rebinding) so other names bound to this
    # frame, such as df_old, see the new columns as well.
    df_filtered[ratio_col] = (numerator / denominator).where(is_defined, 0)

In [7]:
# Both derived ratio columns, read through the df_old name.
df_old.loc[:, ["LOSS_RATIO", "RETENTION_RATIO"]]

Unnamed: 0,LOSS_RATIO,RETENTION_RATIO
0,0.000000,0.000000
10,0.000000,0.000000
11,0.000000,0.000000
21,0.000000,0.000000
22,1.224879,0.000000
32,0.030583,0.000000
33,0.000000,0.000000
43,0.000000,0.000000
44,0.717888,0.000000
54,0.273193,0.000000


In [8]:
# Loss ratio next to the two columns it is computed from.
df_filtered.loc[:, ["PRD_INCRD_LOSSES_AMT", "WRTN_PREM_AMT", "LOSS_RATIO"]]

Unnamed: 0,PRD_INCRD_LOSSES_AMT,WRTN_PREM_AMT,LOSS_RATIO
0,0.00,12298.52,0.000000
10,0.00,13459.81,0.000000
11,0.00,1446.72,0.000000
21,0.00,0.00,0.000000
22,66670.04,54429.88,1.224879
32,3445.40,112658.00,0.030583
33,0.00,7187.64,0.000000
43,0.00,2627.94,0.000000
44,292650.28,407654.68,0.717888
54,125561.73,459608.17,0.273193


In [9]:
# Retention ratio next to the two columns it is computed from.
df_filtered.loc[:, ["RETENTION_POLY_QTY", "PREV_POLY_INFORCE_QTY", "RETENTION_RATIO"]]

Unnamed: 0,RETENTION_POLY_QTY,PREV_POLY_INFORCE_QTY,RETENTION_RATIO
0,0.0,0.0,0.000000
10,0.0,0.0,0.000000
11,0.0,0.0,0.000000
21,0.0,0.0,0.000000
22,0.0,0.0,0.000000
32,0.0,0.0,0.000000
33,0.0,0.0,0.000000
43,0.0,0.0,0.000000
44,0.0,0.0,0.000000
54,0.0,0.0,0.000000


In [10]:
# Integer code for each vendor label. Series.map yields NaN for any label
# that is not a key of this mapping.
vendor_codes = {'Unknown': 0, 'A': 1, 'B': 2, 'C': 3, 'E': 4}
df['VENDORID'] = df['VENDOR'].map(vendor_codes)

In [11]:
# First 50 rows of the frame, now including the VENDORID column.
df.iloc[:50]

Unnamed: 0,AGENCY_ID,PRIMARY_AGENCY_ID,PROD_ABBR,PROD_LINE,STATE_ABBR,STAT_PROFILE_DATE_YEAR,RETENTION_POLY_QTY,POLY_INFORCE_QTY,PREV_POLY_INFORCE_QTY,NB_WRTN_PREM_AMT,...,PL_QUO_CT_ELINKS,PL_BOUND_CT_PLRANK,PL_QUO_CT_PLRANK,PL_BOUND_CT_eQTte,PL_QUO_CT_eQTte,PL_BOUND_CT_APPLIED,PL_QUO_CT_APPLIED,PL_BOUND_CT_TRANSACTNOW,PL_QUO_CT_TRANSACTNOW,VENDORID
0,3.0,3.0,BOILERMACH,CL,IN,2005.0,0.0,0.0,0.0,40.0,...,0.0,0.0,103.0,50.0,288.0,0.0,0.0,0.0,0.0,0.0
1,3.0,3.0,BOILERMACH,CL,IN,2006.0,0.0,0.0,0.0,151.0,...,0.0,0.0,103.0,50.0,288.0,0.0,0.0,0.0,0.0,0.0
2,3.0,3.0,BOILERMACH,CL,IN,2007.0,0.0,0.0,0.0,40.0,...,0.0,0.0,103.0,50.0,288.0,0.0,0.0,0.0,0.0,0.0
3,3.0,3.0,BOILERMACH,CL,IN,2008.0,0.0,0.0,0.0,69.0,...,0.0,0.0,103.0,50.0,288.0,0.0,0.0,0.0,0.0,0.0
4,3.0,3.0,BOILERMACH,CL,IN,2009.0,0.0,0.0,0.0,28.0,...,0.0,0.0,103.0,50.0,288.0,0.0,0.0,0.0,0.0,0.0
5,3.0,3.0,BOILERMACH,CL,IN,2010.0,0.0,0.0,0.0,120.0,...,0.0,0.0,103.0,50.0,288.0,0.0,0.0,0.0,0.0,0.0
6,3.0,3.0,BOILERMACH,CL,IN,2011.0,0.0,0.0,0.0,231.0,...,0.0,0.0,103.0,50.0,288.0,0.0,0.0,0.0,0.0,0.0
7,3.0,3.0,BOILERMACH,CL,IN,2012.0,0.0,0.0,0.0,0.0,...,0.0,0.0,103.0,50.0,288.0,0.0,0.0,0.0,0.0,0.0
8,3.0,3.0,BOILERMACH,CL,IN,2013.0,0.0,0.0,0.0,111.0,...,0.0,0.0,103.0,50.0,288.0,0.0,0.0,0.0,0.0,0.0
9,3.0,3.0,BOILERMACH,CL,IN,2014.0,0.0,0.0,0.0,213.0,...,0.0,0.0,103.0,50.0,288.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Independent copy so that filling in averages does not alter `df`.
df_average = df.copy()
# Columns whose zero values are later replaced by a group average.
average = ["PREV_POLY_INFORCE_QTY", "POLY_INFORCE_QTY", "NB_WRTN_PREM_AMT", "WRTN_PREM_AMT", "PRD_ERND_PREM_AMT"]

states = df_average.STATE_ABBR.unique()
prod_line = df_average.PROD_LINE.unique()

# state_dict maps (state, product line) -> Series of column means for the rows
# in that group. A combination with no rows yields a Series of NaN.
state_dict = {}
for state in states:
    # The state mask does not depend on the product line, so build it once.
    in_state = df_average["STATE_ABBR"] == state
    for line in prod_line:
        group = df_average[in_state & (df_average["PROD_LINE"] == line)]
        # numeric_only=True: the frame also holds text columns, which newer
        # pandas versions refuse to average instead of silently skipping.
        state_dict[(state, line)] = group.mean(numeric_only=True)


In [17]:
# Replace every 0 in the `average` columns with the mean of that column for
# the row's (STATE_ABBR, PROD_LINE) group, as stored in state_dict.
# When no usable mean exists (unknown group, or a mean that is NaN) the
# fallback is 0, i.e. the value stays 0.
row_keys = list(zip(df_average["STATE_ABBR"], df_average["PROD_LINE"]))
for col in average:
    # Per-group mean of this column, with NaN means turned into the 0 fallback.
    col_means = {}
    for key, group_means in state_dict.items():
        group_value = group_means.get(col, 0)
        col_means[key] = 0 if pd.isnull(group_value) else group_value

    # One replacement value per row, aligned on the frame's index.
    replacement = pd.Series(
        [col_means.get(key, 0) for key in row_keys],
        index=df_average.index,
    )
    # Only cells that are exactly 0 are replaced; NaN cells are left as they are.
    df_average[col] = df_average[col].mask(df_average[col] == 0, replacement)


In [18]:
# First ten rows of the columns that had their zeros filled in.
df_average.loc[:, average].head(10)

Unnamed: 0,PREV_POLY_INFORCE_QTY,POLY_INFORCE_QTY,NB_WRTN_PREM_AMT,WRTN_PREM_AMT,PRD_ERND_PREM_AMT
0,14.320342,14.131276,40.0,3074.63,3005.77
1,14.320342,14.131276,151.0,6562.28,6433.44
2,14.320342,14.131276,40.0,6205.11,6357.09
3,14.320342,14.131276,69.0,5159.89,5315.01
4,14.320342,14.131276,28.0,4527.71,4706.47
5,14.320342,14.131276,120.0,4768.49,4653.02
6,14.320342,14.131276,231.0,4988.11,4790.67
7,14.320342,14.131276,1676.076536,4905.49,5005.71
8,14.320342,14.131276,111.0,4545.51,4749.26
9,14.320342,14.131276,213.0,3662.71,3838.8


In [23]:
# Integer codes for the categorical columns STATE_ABBR, PROD_ABBR and PROD_LINE.
# In each mapping a missing value (NaN) is coded as 0. The keys are spelled
# exactly as listed here, including embedded spaces, so they must not be
# normalised.
# np.nan is used for the NaN key because the np.NaN alias no longer exists in
# NumPy 2.0.
statedict = {np.nan: 0, "IN": 1, "KY": 2, "MI": 3, "OH": 4, "PA": 5, "WV": 6}

proddict = {
    np.nan: 0,
    "BOILERMACH": 1, "BOP": 2, "COMMAUTO": 3, "COMMINLMAR": 4, "COMMPOL": 5,
    "COMMUMBREL": 6, "CRIME": 7, "FIREALLIED": 8, "GARAGE": 9, "GENERALIAB": 10,
    "WORKCOMP": 11, "ANNIV": 12, "ANNIV   12": 13, "CYCLES": 14, "DTALK": 15,
    "DWELLFIRE": 16, "HOMEONWERS": 17, "MOBILEHOME": 18, "MOTORHOM12": 19,
    "MOTORHOME": 20, "PERSUMBREL": 21, "YACHT": 22, "DTALK   12": 23,
    "PERSINLMAR": 24, "SNOWMOBILE": 25, "CYCLES  12": 26, "SNOWMOBBI12": 27,
    "PERSAIP": 28,
}

prodlinedict = {np.nan: 0, "CL": 1, "PL": 2}

In [16]:
# Split the data by product line: commercial (CL) and personal (PL).
df_cl = df[df.PROD_LINE == 'CL']
df_pl = df[df.PROD_LINE == 'PL']

# Drop the columns that belong to the other product line: the CL split loses
# every column whose name starts with 'PL_' and vice versa.
# Raw strings keep '\w' a regex token rather than an invalid string escape.
df_cl = df_cl.filter(regex=r'^(?!PL_)\w+', axis=1)
df_pl = df_pl.filter(regex=r'^(?!CL_)\w+', axis=1)

# Write both splits as CSV files into the current working directory.
df_cl.to_csv('cleaned_CL_' + filename)
df_pl.to_csv('cleaned_PL_' + filename)