In [73]:
import sklearn as sk
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

print('numpy version:', np.__version__)
print('pandas version:', pd.__version__)
print('scikit-learn version:', sk.__version__)
print('matplotlib version:', matplotlib.__version__)

%matplotlib inline

numpy version: 1.12.1
pandas version: 0.20.1
scikit-learn version: 0.18.1
matplotlib version: 2.0.2


In [74]:
datafolder = 'data/'
filename = 'dataset_informa.csv'
df = pd.read_csv(datafolder + filename, sep=',', low_memory=False, encoding = 'ISO-8859-1')
print(df.shape)
df.head()

(190570, 49)


Unnamed: 0,AGENCY_ID,PRIMARY_AGENCY_ID,PROD_ABBR,PROD_LINE,STATE_ABBR,STAT_PROFILE_DATE_YEAR,RETENTION_POLY_QTY,POLY_INFORCE_QTY,PREV_POLY_INFORCE_QTY,NB_WRTN_PREM_AMT,...,PL_BOUND_CT_ELINKS,PL_QUO_CT_ELINKS,PL_BOUND_CT_PLRANK,PL_QUO_CT_PLRANK,PL_BOUND_CT_eQTte,PL_QUO_CT_eQTte,PL_BOUND_CT_APPLIED,PL_QUO_CT_APPLIED,PL_BOUND_CT_TRANSACTNOW,PL_QUO_CT_TRANSACTNOW
0,3,3,BOILERMACH,CL,IN,2005,0,0,0,40.0,...,0,0,0,103,50,288,0,0,0,0
1,3,3,BOILERMACH,CL,IN,2006,0,0,0,151.0,...,0,0,0,103,50,288,0,0,0,0
2,3,3,BOILERMACH,CL,IN,2007,0,0,0,40.0,...,0,0,0,103,50,288,0,0,0,0
3,3,3,BOILERMACH,CL,IN,2008,0,0,0,69.0,...,0,0,0,103,50,288,0,0,0,0
4,3,3,BOILERMACH,CL,IN,2009,0,0,0,28.0,...,0,0,0,103,50,288,0,0,0,0


In [75]:
df = df.drop(['RETENTION_RATIO'], axis = 1)

# Primary_Agency_ID: When this column data is missing/99999 we are going to replace it with the Agency_ID. 
# We can not find any way to associate a primary agency with the agency in question so will replace it with its ID.
df['PRIMARY_AGENCY_ID'] = np.where(df['PRIMARY_AGENCY_ID'] == 0, df['PRIMARY_AGENCY_ID'], df['AGENCY_ID'])

# (VENDOR)_END _YEAR: The year the agency stopped using the vendor. 
# This can either mean the vendor is never used or the vendor is still in use. 
# If the vendor has never been used we can replace the value with 0, if the vendor is still in use we can replace it with 1.
df['CL_END_YEAR'] = np.where(df['CL_START_YEAR'] == 0, 0, 1)
df['PL_END_YEAR'] = np.where(df['PL_START_YEAR'] == 0, 0, 1)

# Agency_appointment_year: 5000 missing data, irrelevant column, will be replaced with average.
mean = df['AGENCY_APPOINTMENT_YEAR'].mean()
df['AGENCY_APPOINTMENT_YEAR'] = df['AGENCY_APPOINTMENT_YEAR'].replace(99999, mean, inplace=True) 

# replace the rest of the 99999 values with 0
df = df.replace(99999, 0)

In [76]:
#Get a list of all values needing to be recalculated per item
recal = ["POLY_INFORCE_QTY", "NB_WRTN_PREM_AMT", "WRTN_PREM_AMT", "PRD_ERND_PREM_AMT", "PRD_INCRD_LOSSES_AMT"]

for index, row in df.iterrows():
    #Amount of months
    months = row["MONTHS"]
    for column in recal:
        newvalue = row[column] * (12 - months) #Value times amount of months missing
        df.set_value(index, column, newvalue) #Set new row in dataset
    row["MONTHS"] = 12 #Reset the months to be 12
df[recal].head(5)


Unnamed: 0,POLY_INFORCE_QTY,NB_WRTN_PREM_AMT,WRTN_PREM_AMT,PRD_ERND_PREM_AMT,PRD_INCRD_LOSSES_AMT
0,0,160.0,12298.52,12023.08,0.0
1,0,0.0,0.0,0.0,0.0
2,0,0.0,0.0,0.0,0.0
3,0,0.0,0.0,0.0,0.0
4,0,0.0,0.0,0.0,0.0


In [77]:
df[recal].head(5) #Get incomplete rows in the dataframe

Unnamed: 0,POLY_INFORCE_QTY,NB_WRTN_PREM_AMT,WRTN_PREM_AMT,PRD_ERND_PREM_AMT,PRD_INCRD_LOSSES_AMT
0,0,160.0,12298.52,12023.08,0.0
1,0,0.0,0.0,0.0,0.0
2,0,0.0,0.0,0.0,0.0
3,0,0.0,0.0,0.0,0.0
4,0,0.0,0.0,0.0,0.0


In [78]:
#RETENTION_RATIO = RETENTION_POLY_QTY / PREV_POLY_INFORCE_QTYDataFrame.drop
#LOSS_RATIO = PRD_INCRD_LOSSES_AMT / WRTN_PREM_AMT

for index, row in df.iterrows():
    if (row["WRTN_PREM_AMT"] > 0) & (row["PRD_INCRD_LOSSES_AMT"] > 0):
        df.set_value(index, "LOSS_RATIO", row["PRD_INCRD_LOSSES_AMT"] / row["WRTN_PREM_AMT"])
    else:
        df.set_value(index, "LOSS_RATIO", 0)
    
df['LOSS_RATIO'] = df['LOSS_RATIO'].replace(99998, 0)
df['LOSS_RATIO'] = df['LOSS_RATIO'].replace(99997, 0)


In [79]:
df["LOSS_RATIO"]

0         0.000000
1         0.000000
2         0.000000
3         0.000000
4         0.000000
5         0.000000
6         0.000000
7         0.000000
8         0.000000
9         0.000000
10        0.000000
11        0.000000
12        0.000000
13        0.000000
14        0.000000
15        0.000000
16        0.000000
17        0.000000
18        0.000000
19        0.000000
20        0.000000
21        0.000000
22        1.224879
23        0.000000
24        0.000000
25        0.000000
26        0.000000
27        0.000000
28        0.000000
29        0.000000
            ...   
190540    0.000000
190541    0.000000
190542    0.000000
190543    0.000000
190544    0.000000
190545    0.000000
190546    0.000000
190547    0.000000
190548    0.000000
190549    0.000000
190550    0.000000
190551    0.000000
190552    0.000000
190553    0.000000
190554    0.000000
190555    0.000000
190556    0.000000
190557    0.000000
190558    0.000000
190559    0.000000
190560    0.000000
190561    0.

In [80]:
df[["PRD_INCRD_LOSSES_AMT","WRTN_PREM_AMT","LOSS_RATIO"]]

Unnamed: 0,PRD_INCRD_LOSSES_AMT,WRTN_PREM_AMT,LOSS_RATIO
0,0.00,12298.52,0.000000
1,0.00,0.00,0.000000
2,0.00,0.00,0.000000
3,0.00,0.00,0.000000
4,0.00,0.00,0.000000
5,0.00,0.00,0.000000
6,0.00,0.00,0.000000
7,0.00,0.00,0.000000
8,0.00,0.00,0.000000
9,0.00,0.00,0.000000


In [81]:
df[["RETENTION_POLY_QTY","PREV_POLY_INFORCE_QTY"]]

Unnamed: 0,RETENTION_POLY_QTY,PREV_POLY_INFORCE_QTY
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,0,0
9,0,0


In [82]:
df.head(50)

Unnamed: 0,AGENCY_ID,PRIMARY_AGENCY_ID,PROD_ABBR,PROD_LINE,STATE_ABBR,STAT_PROFILE_DATE_YEAR,RETENTION_POLY_QTY,POLY_INFORCE_QTY,PREV_POLY_INFORCE_QTY,NB_WRTN_PREM_AMT,...,PL_BOUND_CT_ELINKS,PL_QUO_CT_ELINKS,PL_BOUND_CT_PLRANK,PL_QUO_CT_PLRANK,PL_BOUND_CT_eQTte,PL_QUO_CT_eQTte,PL_BOUND_CT_APPLIED,PL_QUO_CT_APPLIED,PL_BOUND_CT_TRANSACTNOW,PL_QUO_CT_TRANSACTNOW
0,3,3,BOILERMACH,CL,IN,2005,0,0,0,160.0,...,0,0,0,103,50,288,0,0,0,0
1,3,3,BOILERMACH,CL,IN,2006,0,0,0,0.0,...,0,0,0,103,50,288,0,0,0,0
2,3,3,BOILERMACH,CL,IN,2007,0,0,0,0.0,...,0,0,0,103,50,288,0,0,0,0
3,3,3,BOILERMACH,CL,IN,2008,0,0,0,0.0,...,0,0,0,103,50,288,0,0,0,0
4,3,3,BOILERMACH,CL,IN,2009,0,0,0,0.0,...,0,0,0,103,50,288,0,0,0,0
5,3,3,BOILERMACH,CL,IN,2010,0,0,0,0.0,...,0,0,0,103,50,288,0,0,0,0
6,3,3,BOILERMACH,CL,IN,2011,0,0,0,0.0,...,0,0,0,103,50,288,0,0,0,0
7,3,3,BOILERMACH,CL,IN,2012,0,0,0,0.0,...,0,0,0,103,50,288,0,0,0,0
8,3,3,BOILERMACH,CL,IN,2013,0,0,0,0.0,...,0,0,0,103,50,288,0,0,0,0
9,3,3,BOILERMACH,CL,IN,2014,0,0,0,0.0,...,0,0,0,103,50,288,0,0,0,0


In [84]:
average = ["PREV_POLY_INFORCE_QTY","POLY_INFORCE_QTY", "NB_WRTN_PREM_AMT", "WRTN_PREM_AMT", "PRD_ERND_PREM_AMT"]

states = df.STATE_ABBR.unique()
prod_line = df.PROD_LINE.unique()
years = df.STAT_PROFILE_DATE_YEAR.unique()
   
state_dict = {}    
for state in states:
    for line in prod_line:
            for year in years:
                state_mean = df[(df["STATE_ABBR"] == state) & (df["PROD_LINE"] == line) & (df["STAT_PROFILE_DATE_YEAR"] == year)].mean()
                state_dict[(state, line)] = state_mean


In [85]:
for index, row in df.iterrows():
    for col in average:
        if(row[col] == 0):
            try:
                newvalue = state_dict[(row["STATE_ABBR"], row["PROD_LINE"])][col]
            except(ValueError): #No average can be found since its all 0
                newvalue = 0 
            df.set_value(index, col, newvalue)

In [86]:
df[average].head(10)

Unnamed: 0,PREV_POLY_INFORCE_QTY,POLY_INFORCE_QTY,NB_WRTN_PREM_AMT,WRTN_PREM_AMT,PRD_ERND_PREM_AMT
0,5,38,160.0,12298.52,12023.08
1,5,38,4522.760189,43178.105074,44998.273698
2,5,38,4522.760189,43178.105074,44998.273698
3,5,38,4522.760189,43178.105074,44998.273698
4,5,38,4522.760189,43178.105074,44998.273698
5,5,38,4522.760189,43178.105074,44998.273698
6,5,38,4522.760189,43178.105074,44998.273698
7,5,38,4522.760189,43178.105074,44998.273698
8,5,38,4522.760189,43178.105074,44998.273698
9,5,38,4522.760189,43178.105074,44998.273698


In [87]:
statedict = {np.NaN : 0, "IN" : 1, "KY" : 2, "MI" : 3, "OH" : 4, "PA" : 5, "WV" : 6}
proddict = {np.NaN : 0, "BOILERMACH" : 1, "BOP" : 2, "COMMAUTO" : 3, "COMMINLMAR" : 4, "COMMPOL": 5, "COMMUMBREL" : 6, "CRIME" : 7,
           "FIREALLIED" : 8, "GARAGE" : 9, "GENERALIAB" : 10, "WORKCOMP" : 11, "ANNIV" : 12, "ANNIV   12":13, "CYCLES":14,
           "DTALK":15, "DWELLFIRE":16, "HOMEONWERS":17,"MOBILEHOME":18,"MOTORHOM12":19,"MOTORHOME":20,"PERSUMBREL":21, "YACHT":22,
           "DTALK   12":23, "PERSINLMAR":24, "SNOWMOBILE":25,"CYCLES  12":26,"SNOWMOBBI12":27,"PERSAIP":28}
prodlinedict = {np.NaN : 0, "CL":1, "PL":2}
vendordict = {'Unknown': 0, 'A': 1, 'B':2, 'C' : 3, 'E':4}

df['STATE_ID'] = df['STATE_ABBR'].map(statedict)
df['PROD_ID'] = df["PROD_ABBR"].map(proddict)
df['PROD_LINE_ID'] = df['PROD_LINE'].map(prodlinedict)
df['VENDOR_ID'] = df['VENDOR'].map(vendordict)
df[["STATE_ABBR", "PROD_ABBR", "PROD_LINE", "VENDOR", "STATE_ID", "PROD_ID", "PROD_LINE_ID", "VENDOR_ID"]].head(50)

Unnamed: 0,STATE_ABBR,PROD_ABBR,PROD_LINE,VENDOR,STATE_ID,PROD_ID,PROD_LINE_ID,VENDOR_ID
0,IN,BOILERMACH,CL,Unknown,1,1.0,1,0.0
1,IN,BOILERMACH,CL,Unknown,1,1.0,1,0.0
2,IN,BOILERMACH,CL,Unknown,1,1.0,1,0.0
3,IN,BOILERMACH,CL,Unknown,1,1.0,1,0.0
4,IN,BOILERMACH,CL,Unknown,1,1.0,1,0.0
5,IN,BOILERMACH,CL,Unknown,1,1.0,1,0.0
6,IN,BOILERMACH,CL,Unknown,1,1.0,1,0.0
7,IN,BOILERMACH,CL,Unknown,1,1.0,1,0.0
8,IN,BOILERMACH,CL,Unknown,1,1.0,1,0.0
9,IN,BOILERMACH,CL,Unknown,1,1.0,1,0.0


In [88]:
def ProgressBar(count, total):
    complete = ["=", "=", "=", "=","=","=","=","=","=","="]
    empty = ["_","_","_","_","_","_","_","_","_","_"]
    percentage = int(count/total*10)
    if(percentage%1 == 0):
        clear_output()
        output = "|";
        for index in range(0, percentage):
            output += complete[index]
        for index in range(percentage, 10):
            output += empty[index]
        output += "|"
        print(output)
        print(str(percentage*10) + "%")

In [89]:
#POLY_GROWTH_RATIO = 
#PREM_GROWTH_RATIO = WRTN_PREM_AMT (Current)- WRTN_PREM_AMT (Last Year) / (Current + Last year)

#index, row, df.itter
#try:
#   prevyear = df where(agencyid == id, prodid == prodid, prodline == prodline, year == currentyear-1)[columnname]
#except:
#prevyear = 0
#if prevyear > 0
#df put growthratio (this-prev/prev)

column = "POLY_INFORCE_QTY"
columnname= "POLY_GROWTH_RATIO"
def GrowthRatioCalculator(df, column, columnname):
    for index, row in df.iterrows():
        if(index > 0): #Start off at 1 so we don't get an index of -1
            prevyear = df.loc[index-1] #Get the previous item in the dataframe
            if(prevyear["STATE_ABBR"] == row["STATE_ABBR"]) & (prevyear["PROD_LINE"] == row["PROD_LINE"]) & (prevyear["STAT_PROFILE_DATE_YEAR"] == (row["STAT_PROFILE_DATE_YEAR"]-1)) & (prevyear["AGENCY_ID"] == row["AGENCY_ID"])& (prevyear["PROD_ABBR"] == row["PROD_ABBR"]):
                prevyear = prevyear[column] #Get the actual value of the previous year
                try:
                    if(prevyear > 0): #Check if its bigger than 0/ works
                        df.set_value(index, columnname, (row[column] -prevyear)/prevyear) #Set the value in the column to be the percentage
                        print((row[column] -prevyear)/prevyear) #Print out the value so we know it's working

                except:
                    #Can go wrong when we continue to a new product.
                    #If there is a missmatch within the if statement and it returns false, it will generate a kind of null value.
                    #We can not work with this value so it will throw an exception.
                    #If it goes wrong, put a 0 in the dataframe.
                    df.set_value(index, columnname, 0)
    

In [90]:
df.head(50)

Unnamed: 0,AGENCY_ID,PRIMARY_AGENCY_ID,PROD_ABBR,PROD_LINE,STATE_ABBR,STAT_PROFILE_DATE_YEAR,RETENTION_POLY_QTY,POLY_INFORCE_QTY,PREV_POLY_INFORCE_QTY,NB_WRTN_PREM_AMT,...,PL_BOUND_CT_eQTte,PL_QUO_CT_eQTte,PL_BOUND_CT_APPLIED,PL_QUO_CT_APPLIED,PL_BOUND_CT_TRANSACTNOW,PL_QUO_CT_TRANSACTNOW,STATE_ID,PROD_ID,PROD_LINE_ID,VENDOR_ID
0,3,3,BOILERMACH,CL,IN,2005,0,38,5,160.0,...,50,288,0,0,0,0,1,1.0,1,0.0
1,3,3,BOILERMACH,CL,IN,2006,0,38,5,4522.760189,...,50,288,0,0,0,0,1,1.0,1,0.0
2,3,3,BOILERMACH,CL,IN,2007,0,38,5,4522.760189,...,50,288,0,0,0,0,1,1.0,1,0.0
3,3,3,BOILERMACH,CL,IN,2008,0,38,5,4522.760189,...,50,288,0,0,0,0,1,1.0,1,0.0
4,3,3,BOILERMACH,CL,IN,2009,0,38,5,4522.760189,...,50,288,0,0,0,0,1,1.0,1,0.0
5,3,3,BOILERMACH,CL,IN,2010,0,38,5,4522.760189,...,50,288,0,0,0,0,1,1.0,1,0.0
6,3,3,BOILERMACH,CL,IN,2011,0,38,5,4522.760189,...,50,288,0,0,0,0,1,1.0,1,0.0
7,3,3,BOILERMACH,CL,IN,2012,0,38,5,4522.760189,...,50,288,0,0,0,0,1,1.0,1,0.0
8,3,3,BOILERMACH,CL,IN,2013,0,38,5,4522.760189,...,50,288,0,0,0,0,1,1.0,1,0.0
9,3,3,BOILERMACH,CL,IN,2014,0,38,5,4522.760189,...,50,288,0,0,0,0,1,1.0,1,0.0


In [92]:
# split PL and CL datasets
df_cl = df[df.PROD_LINE=='CL']
df_pl = df[df.PROD_LINE=='PL']

# remove irrelevant columns from splits
df_cl = df_cl.filter(regex='^(?!PL_)\w+', axis=1)
df_pl = df_pl.filter(regex='^(?!CL_)\w+', axis=1)

# write dataframes to csv files
df_cl.to_csv(datafolder+'cleaned_CL_'+filename)
df_pl.to_csv(datafolder+'cleaned_PL_'+filename)