In [None]:
import sklearn as sk
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

# Print the installed version of each core library used by this notebook.
for label, module in (
    ('numpy version:', np),
    ('pandas version:', pd),
    ('scikit-learn version:', sk),
    ('matplotlib version:', matplotlib),
):
    print(label, module.__version__)

%matplotlib inline

In [None]:
# Location of the raw CSV export, relative to the notebook's working directory.
datafolder = 'data/'
filename = 'dataset_informa.csv'

# Load the complete file into `df`; every later cell works on this frame.
csv_path = datafolder + filename
df = pd.read_csv(
    csv_path,
    sep=',',
    low_memory=False,
    encoding='ISO-8859-1',
)

# Show the dimensions, then preview the first rows.
print(df.shape)
df.head()

In [None]:
# Value the raw export uses to mark "no data" (see the column notes below).
MISSING = 99999

# Primary_Agency_ID: when this column is missing/99999 we replace it with the Agency_ID.
# We can not find any way to associate a primary agency with the agency in question,
# so the agency's own ID is used. Known primary agency IDs are kept as they are.
primary_missing = df['PRIMARY_AGENCY_ID'].isna() | (df['PRIMARY_AGENCY_ID'] == MISSING)
df['PRIMARY_AGENCY_ID'] = np.where(primary_missing, df['AGENCY_ID'], df['PRIMARY_AGENCY_ID'])

# Prev_Poly_Inforce_QTY: contains 99999 values.
# These values mean there are no previous years or this year did not have data (CL).
# These values are replaced by 0.
df['PREV_POLY_INFORCE_QTY'] = df['PREV_POLY_INFORCE_QTY'].replace(MISSING, 0)

# Agency_appointment_year: ~5000 missing values, irrelevant column, replaced with the average.
# The average is taken over the known years only; including the 99999 markers
# would push it far outside the range of real years.
appointment_year = df['AGENCY_APPOINTMENT_YEAR']
mean = appointment_year[appointment_year != MISSING].mean()
df['AGENCY_APPOINTMENT_YEAR'] = appointment_year.replace(MISSING, mean)

# Active_Producers, Max_age and Min_age:
# if the amount of producers is not set (99999) it is set to 0; same for max and min age.
#
# (VENDOR)_START_YEAR: values vary between 1994 and 2015.
# When the value is missing (99999) the vendor was not used, so it is replaced with 0.
#
# Series.replace returns a new Series, so the result has to be assigned back.
for zero_filled in ['ACTIVE_PRODUCERS', 'MAX_AGE', 'MIN_AGE', 'PL_START_YEAR', 'CL_START_YEAR']:
    df[zero_filled] = df[zero_filled].replace(MISSING, 0)

# (VENDOR)_END_YEAR: the year the agency stopped using the vendor.
# A missing value (99999) means either the vendor was never used or it is still in use:
# never used (start year 0) -> 0, still in use -> 1.
# Recorded end years are real data and are left untouched.
for vendor_line in ('CL', 'PL'):
    start_col = vendor_line + '_START_YEAR'
    end_col = vendor_line + '_END_YEAR'
    never_used = df[start_col] == 0
    df[end_col] = np.where(
        df[end_col] == MISSING,
        np.where(never_used, 0, 1),
        df[end_col],
    )


In [None]:
# Columns that are rescaled to a full year when a row covers fewer than 12 months.
recal = ["POLY_INFORCE_QTY", "NB_WRTN_PREM_AMT", "WRTN_PREM_AMT", "PRD_ERND_PREM_AMT", "PRD_INCRD_LOSSES_AMT"]
FULL_YEAR_MONTHS = 12

incomplete = df["MONTHS"] != FULL_YEAR_MONTHS
# Only rows reporting between 1 and 11 months can be extrapolated. A month count
# of 0, a missing value or a value above 12 gives no basis for scaling, so those
# rows are left exactly as they are.
scalable = incomplete & (df["MONTHS"] > 0) & (df["MONTHS"] < FULL_YEAR_MONTHS)

# Extrapolate to a full year: value / months * 12.
# (Multiplying by the number of *missing* months, as was done before, turns
# 11 months of data into an unchanged value and 1 month into an 11x value.)
scale = float(FULL_YEAR_MONTHS) / df.loc[scalable, "MONTHS"]
df[recal] = df[recal].astype(float)  # scaled values are fractional
df.loc[scalable, recal] = df.loc[scalable, recal].mul(scale, axis=0)
df.loc[scalable, "MONTHS"] = FULL_YEAR_MONTHS  # these rows now represent 12 months

# The update is applied to `df` itself so that later cells, which all read `df`,
# see the rescaled values. `df_incomplete` / `df_filtered` are an independent copy
# of the rows that were flagged as incomplete when this cell ran; re-running the
# cell without reloading `df` will therefore select fewer rows.
df_incomplete = df[incomplete].copy()
df_filtered = df_incomplete
df_filtered[recal].head(5)





In [None]:
#RETENTION_RATIO = RETENTION_POLY_QTY / PREV_POLY_INFORCE_QTY
#LOSS_RATIO = PRD_INCRD_LOSSES_AMT / WRTN_PREM_AMT

# (ratio column, numerator column, denominator column)
ratio_specs = [
    ("LOSS_RATIO", "PRD_INCRD_LOSSES_AMT", "WRTN_PREM_AMT"),
    ("RETENTION_RATIO", "RETENTION_POLY_QTY", "PREV_POLY_INFORCE_QTY"),
]

for ratio_col, numerator_col, denominator_col in ratio_specs:
    numerator = df_filtered[numerator_col]
    denominator = df_filtered[denominator_col]
    # A ratio is only meaningful when both parts are strictly positive;
    # every other row (zero, negative or missing input) gets 0.
    computable = (numerator > 0) & (denominator > 0)
    ratio = (numerator / denominator).where(computable, 0)

    df_filtered[ratio_col] = ratio
    # df_filtered is a filtered subset of df, so also write the result to the
    # matching rows of df (aligned on the index); later cells only read df.
    df.loc[df_filtered.index, ratio_col] = ratio

In [None]:
# Display the first 50 rows of df.
df.head(50)

In [None]:
# df_average is a second name for df (not a copy): changes made through it also change df.
df_average = df
# Columns in which a 0 is later replaced by the matching group mean.
average = ["PREV_POLY_INFORCE_QTY","POLY_INFORCE_QTY", "NB_WRTN_PREM_AMT", "WRTN_PREM_AMT", "PRD_ERND_PREM_AMT"]

states = df_average.STATE_ABBR.unique()
prod_line = df_average.PROD_LINE.unique()

# state_dict maps (state, product line) -> Series holding the mean of every
# numeric column for that group. numeric_only=True is required because the
# frame also holds text columns, for which a mean cannot be computed.
# Combinations without rows (including a missing state or product line, which
# never compares equal) map to a Series of NaN.
# NOTE(review): rows whose value is 0 are included in these means even though 0
# is treated as "missing" when the means are applied - confirm this is intended.
state_dict = {
    (state, line): df_average[
        (df_average["STATE_ABBR"] == state) & (df_average["PROD_LINE"] == line)
    ].mean(numeric_only=True)
    for state in states
    for line in prod_line
}


In [None]:
def _group_mean(state, line, col):
    """Return the (state, product line) mean of ``col``, or 0 when no mean is available."""
    group_means = state_dict.get((state, line))
    if group_means is None or col not in group_means.index:
        return 0
    value = group_means[col]
    # A NaN mean means the group had no usable values at all.
    return 0 if pd.isna(value) else value


# Replace every 0 in the `average` columns with the mean of the row's
# (STATE_ABBR, PROD_LINE) group, falling back to 0 when there is no such mean.
for col in average:
    is_zero = df_average[col] == 0
    if not is_zero.any():
        continue
    replacements = [
        _group_mean(state, line, col)
        for state, line in zip(
            df_average.loc[is_zero, "STATE_ABBR"],
            df_average.loc[is_zero, "PROD_LINE"],
        )
    ]
    # Means are fractional, so make sure the column can hold them.
    df_average[col] = df_average[col].astype(float)
    df_average.loc[is_zero, col] = replacements

In [None]:
# Display the first 10 rows of the columns listed in `average`.
df_average[average].head(10)

In [None]:
# Integer codes for the categorical text columns. A missing value (NaN) is
# coded 0 for state, product and product line; a VENDOR of 'Unknown' is coded 0.
# np.nan is used as the key: the np.NaN alias no longer exists in NumPy 2.0.
# NOTE(review): the keys presumably mirror df.<COLUMN>.unique(); a value that is
# not listed maps to NaN. "HOMEONWERS" and "SNOWMOBBI12" look like possible
# typos - verify them against the data before relying on PROD_ID.
statedict = {np.nan : 0, "IN" : 1, "KY" : 2, "MI" : 3, "OH" : 4, "PA" : 5, "WV" : 6}
proddict = {np.nan : 0, "BOILERMACH" : 1, "BOP" : 2, "COMMAUTO" : 3, "COMMINLMAR" : 4, "COMMPOL": 5, "COMMUMBREL" : 6, "CRIME" : 7,
           "FIREALLIED" : 8, "GARAGE" : 9, "GENERALIAB" : 10, "WORKCOMP" : 11, "ANNIV" : 12, "ANNIV   12":13, "CYCLES":14,
           "DTALK":15, "DWELLFIRE":16, "HOMEONWERS":17,"MOBILEHOME":18,"MOTORHOM12":19,"MOTORHOME":20,"PERSUMBREL":21, "YACHT":22,
           "DTALK   12":23, "PERSINLMAR":24, "SNOWMOBILE":25,"CYCLES  12":26,"SNOWMOBBI12":27,"PERSAIP":28}
prodlinedict = {np.nan : 0, "CL":1, "PL":2}
vendordict = {'Unknown': 0, 'A': 1, 'B':2, 'C' : 3, 'E':4}

# Add one *_ID column per categorical column.
df['STATE_ID'] = df['STATE_ABBR'].map(statedict)
df['PROD_ID'] = df["PROD_ABBR"].map(proddict)
df['PROD_LINE_ID'] = df['PROD_LINE'].map(prodlinedict)
df['VENDOR_ID'] = df['VENDOR'].map(vendordict)

# Show the original values next to their codes for a visual check.
df[["STATE_ABBR", "PROD_ABBR", "PROD_LINE", "VENDOR", "STATE_ID", "PROD_ID", "PROD_LINE_ID", "VENDOR_ID"]].head(50)

In [None]:
# Split the data into one frame per product line (CL and PL).
df_cl = df[df.PROD_LINE=='CL']
df_pl = df[df.PROD_LINE=='PL']

# Remove the columns that belong to the other product line: the negative
# lookahead keeps every column whose name does not start with the given prefix.
# Raw strings are used so that `\w` reaches the regex engine unchanged instead
# of being parsed as an (invalid) string escape sequence.
df_cl = df_cl.filter(regex=r'^(?!PL_)\w+', axis=1)
df_pl = df_pl.filter(regex=r'^(?!CL_)\w+', axis=1)

# Write both frames to CSV files in the current working directory.
df_cl.to_csv('cleaned_CL_'+filename)
df_pl.to_csv('cleaned_PL_'+filename)

In [None]:
#POLY_GROWTH_RATIO = 
#PREM_GROWTH_RATIO = WRTN_PREM_AMT (Current)- WRTN_PREM_AMT (Last Year) / (Current + Last year)

# Pseudo-code for the growth-ratio calculation:
# for index, row in df.iterrows():
#     try:
#         prevyear = df where (agencyid == id, prodid == prodid, prodline == prodline, year == currentyear - 1)[columnname]
#     except:
#         prevyear = 0
#     if prevyear > 0:
#         df put growthratio = (this - prev) / prev

column = "POLY_INFORCE_QTY"
columnname = "POLY_GROWTH_RATIO"


def GrowthRatioCalculator(df, column, columnname):
    """Write the year-over-year growth ratio of ``column`` into ``columnname``.

    Each row is compared with the row directly before it in ``df``. The ratio
    ``(current - previous) / previous`` is stored when the previous row

    * has the same STATE_ABBR, PROD_LINE, AGENCY_ID and PROD_ABBR,
    * has a STAT_PROFILE_DATE_YEAR exactly one lower than the current row, and
    * holds a value greater than 0 in ``column``.

    All other rows are left untouched, so they keep their existing value, or
    NaN when ``columnname`` did not exist before the call. Values in ``column``
    that cannot be read as numbers are treated as missing.

    "Previous row" is positional, so the frame may carry any index (for
    example after filtering); the rows are expected to be ordered so that
    consecutive years of one series are adjacent.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update in place.
    column : str
        Name of the column holding the measured quantity.
    columnname : str
        Name of the column that receives the growth ratio; created when absent.

    Returns
    -------
    None
    """
    key_columns = ["STATE_ABBR", "PROD_LINE", "AGENCY_ID", "PROD_ABBR"]
    year_column = "STAT_PROFILE_DATE_YEAR"

    if columnname not in df.columns:
        df[columnname] = np.nan
    if df.empty:
        return

    # True where the previous row describes the same agency/product series.
    # Missing keys never compare equal, so such rows are never matched.
    same_series = pd.Series(True, index=df.index)
    for key in key_columns:
        same_series = same_series & df[key].eq(df[key].shift(1))

    # True where the previous row belongs to the directly preceding year.
    consecutive = df[year_column].shift(1).eq(df[year_column] - 1)

    current = pd.to_numeric(df[column], errors="coerce")
    previous = current.shift(1)
    # previous > 0 also rules out a division by zero and a missing previous value.
    usable = same_series & consecutive & (previous > 0)

    growth = (current - previous) / previous
    # Plain arrays are used so the assignment is positional and does not
    # depend on the index labels being unique.
    df.loc[usable.values, columnname] = growth[usable].values
    

In [None]:
# Display the first 50 rows of df.
df.head(50)