In [None]:
import sklearn as sk
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

# Report the versions of the core libraries used by this notebook.
for version_label, library in (
    ('numpy version:', np),
    ('pandas version:', pd),
    ('scikit-learn version:', sk),
    ('matplotlib version:', matplotlib),
):
    print(version_label, library.__version__)

%matplotlib inline

In [None]:
# Folder and file name of the raw dataset.
datafolder = 'data/'
filename = 'dataset_informa.csv'

# Read the comma-separated file as ISO-8859-1 text. low_memory=False makes
# pandas process the file in one go instead of in internal chunks.
raw_csv_path = datafolder + filename
df = pd.read_csv(
    raw_csv_path,
    sep=',',
    low_memory=False,
    encoding='ISO-8859-1',
)

# Print the (rows, columns) shape, then show the first rows as the cell output.
print(df.shape)
df.head()

In [None]:
# Placeholder value this cell treats as "missing".
MISSING_SENTINEL = 99999

# RETENTION_RATIO is removed from the frame. errors='ignore' keeps the cell
# re-runnable: a second run no longer fails because the column is already gone.
df = df.drop(columns=['RETENTION_RATIO'], errors='ignore')

# PRIMARY_AGENCY_ID: when this value is missing (NaN or 99999) fall back to the
# agency's own AGENCY_ID, since no primary agency can be associated with it.
# Known primary ids are kept as they are. (The previous np.where call had its
# branches swapped and tested for 0, which overwrote every known primary id.)
primary_missing = (
    df['PRIMARY_AGENCY_ID'].isna()
    | (df['PRIMARY_AGENCY_ID'] == MISSING_SENTINEL)
)
df['PRIMARY_AGENCY_ID'] = df['PRIMARY_AGENCY_ID'].where(~primary_missing, df['AGENCY_ID'])

# (VENDOR)_END_YEAR: the year the agency stopped using the vendor.
# A missing end year means either that the vendor was never used or that it is
# still in use. Only the missing entries are replaced: 0 when the start year is
# missing as well (never used), 1 otherwise (still in use). Real end years are
# kept. The start year is compared against both 99999 and 0 because the global
# sentinel replacement at the bottom of this cell has not run yet at this point.
for line_prefix in ('CL', 'PL'):
    start_col = line_prefix + '_START_YEAR'
    end_col = line_prefix + '_END_YEAR'
    end_missing = df[end_col] == MISSING_SENTINEL
    never_used = df[start_col].isin([MISSING_SENTINEL, 0])
    df.loc[end_missing, end_col] = np.where(never_used[end_missing], 0, 1)

# AGENCY_APPOINTMENT_YEAR: missing entries are replaced with the average year.
# The average is taken over the known years only, so the 99999 placeholders do
# not inflate it, and the result is assigned back to the column (an in-place
# replace on a column selection is not guaranteed to reach the frame).
appointment_year = df['AGENCY_APPOINTMENT_YEAR']
known_years = appointment_year[appointment_year != MISSING_SENTINEL]
if not known_years.empty:
    df['AGENCY_APPOINTMENT_YEAR'] = appointment_year.replace(
        MISSING_SENTINEL, known_years.mean()
    )

# Replace the rest of the 99999 values with 0.
df = df.replace(MISSING_SENTINEL, 0)


In [None]:
# Columns whose values are rescaled for rows that cover fewer than 12 months.
recal = ["POLY_INFORCE_QTY", "NB_WRTN_PREM_AMT", "WRTN_PREM_AMT", "PRD_ERND_PREM_AMT", "PRD_INCRD_LOSSES_AMT"]

# Rows that do not cover a full year.
df_incomplete = df[(df["MONTHS"] != 12)]

# Work on an explicit copy so the selection above keeps its reported values
# and the edits below never write into a slice of df.
# NOTE(review): nothing in this cell writes df_filtered back into df --
# confirm whether the rescaled rows are supposed to end up in the export.
df_filtered = df_incomplete.copy()

# Extrapolate the reported amounts to a full year: value * 12 / MONTHS.
# (The previous value * (12 - MONTHS) left 11-month rows unchanged and
# multiplied 6-month rows by 6.) Rows with MONTHS <= 0 or missing cannot be
# extrapolated and are left untouched.
months_reported = df_filtered["MONTHS"]
scalable = months_reported > 0
annual_factor = 12 / months_reported[scalable]

# The scaled values are fractional, so the columns are converted to float first.
df_filtered[recal] = df_filtered[recal].astype(float)
df_filtered.loc[scalable, recal] = df_filtered.loc[scalable, recal].mul(annual_factor, axis=0)

# The rescaled rows now represent 12 months. This is written to the frame
# itself; assigning to the row yielded by iterrows() only changed a copy.
df_filtered.loc[scalable, "MONTHS"] = 12

df_filtered[recal].head(5)





In [None]:
# Snapshot of df_filtered for testing. An explicit .copy() is needed: a plain
# assignment only binds a second name to the same frame, so every later change
# to df_filtered would also show up in df_old.
df_old = df_filtered.copy()

# Preview the recalculated columns for the rows that do not cover 12 months.
df_incomplete[recal].head(5)

In [None]:
# RETENTION_RATIO = RETENTION_POLY_QTY / PREV_POLY_INFORCE_QTY
# LOSS_RATIO = PRD_INCRD_LOSSES_AMT / WRTN_PREM_AMT

# The ratio is only defined when both the written premium and the incurred
# losses are positive; every other row gets a loss ratio of 0.
written_premium = df_filtered["WRTN_PREM_AMT"]
incurred_losses = df_filtered["PRD_INCRD_LOSSES_AMT"]
ratio_defined = (written_premium > 0) & (incurred_losses > 0)

# Computed for the whole column at once (DataFrame.set_value no longer exists
# in pandas >= 1.0). Rows where the ratio is not defined are masked to 0, which
# also discards the inf/NaN produced by dividing by a zero premium.
loss_ratio = (incurred_losses / written_premium).where(ratio_defined, 0)

# 99998 and 99997 are mapped to 0 as before; the result is assigned back to
# the column instead of relying on an in-place replace of a column selection.
df_filtered["LOSS_RATIO"] = loss_ratio.replace([99998, 99997], 0)


    

In [None]:
# Display the LOSS_RATIO column of the df_old frame.
df_old.loc[:, "LOSS_RATIO"]

In [None]:
# Display the loss ratio next to the two amounts it is derived from.
loss_ratio_columns = ["PRD_INCRD_LOSSES_AMT", "WRTN_PREM_AMT", "LOSS_RATIO"]
df_filtered[loss_ratio_columns]

In [None]:
# Display the two policy-quantity columns named in the RETENTION_RATIO formula.
retention_columns = ["RETENTION_POLY_QTY", "PREV_POLY_INFORCE_QTY"]
df_filtered[retention_columns]

In [None]:
# Integer id columns derived from the text columns: each distinct value gets
# its dense rank in descending order, so equal values share an id and the ids
# are consecutive integers starting at 1.
id_source_columns = {
    'VENDOR_ID': 'VENDOR',
    'STATE_ID': 'STATE_ABBR',
    'PRODUCT_LINE_ID': 'PROD_LINE',
    'PRODUCT_ID': 'PROD_ABBR',
}
for id_column, source_column in id_source_columns.items():
    dense_rank = df[source_column].rank(method='dense', ascending=False)
    df[id_column] = dense_rank.astype(int)

In [None]:
# Show the first 50 rows of the frame.
df.iloc[:50]

In [None]:
# df_average is a second name for df (not a copy): changes made through
# df_average are also visible through df.
df_average = df

# Columns that are candidates for mean imputation.
average = ["PREV_POLY_INFORCE_QTY","POLY_INFORCE_QTY", "NB_WRTN_PREM_AMT", "WRTN_PREM_AMT", "PRD_ERND_PREM_AMT"]

states = df_average.STATE_ABBR.unique()
prod_line = df_average.PROD_LINE.unique()

# state_dict maps (state, product line) -> Series holding the mean of every
# numeric column over the rows of that combination. A combination without rows
# yields a Series of NaN.
state_dict = {}
for state in states:
    # The state mask does not depend on the product line, so build it once.
    in_state = df_average["STATE_ABBR"] == state
    for line in prod_line:
        group_rows = df_average[in_state & (df_average["PROD_LINE"] == line)]
        # numeric_only=True is required on pandas >= 2.0, where mean() raises
        # on frames that still contain text columns instead of skipping them.
        state_dict[(state, line)] = group_rows.mean(numeric_only=True)


In [None]:
# Replace zero entries in the `average` columns with the mean of the row's
# (state, product line) group, taken from state_dict.
#
# The group means are fractional, so the columns are converted to float up
# front instead of relying on an implicit upcast during assignment.
df_average[average] = df_average[average].astype(float)

for (state, line), group_means in state_dict.items():
    in_group = (df_average["STATE_ABBR"] == state) & (df_average["PROD_LINE"] == line)
    if not in_group.any():
        continue
    for col in average:
        # When no usable mean exists for this group/column (missing or NaN)
        # the zeros are kept as 0. The earlier fallback set newvalue = 0 and
        # then indexed it with newvalue[col], which raised a TypeError.
        fill_value = group_means.get(col, 0)
        if pd.isna(fill_value):
            fill_value = 0
        # Mask-based assignment replaces the per-cell DataFrame.set_value
        # calls; set_value no longer exists in pandas >= 1.0.
        df_average.loc[in_group & (df_average[col] == 0), col] = fill_value


In [None]:
# Show the first 10 rows of the columns listed in `average`.
df_average.loc[:, average].head(10)

In [None]:
# Map every 99999 in the frame to 0. The replacement is done in place, so it
# modifies the existing frame object rather than creating a new one.
df_average.replace(to_replace=99999, value=0, inplace=True)

In [None]:
# Split the data into one frame per product line (CL and PL).
df_cl = df[df.PROD_LINE=='CL']
df_pl = df[df.PROD_LINE=='PL']

# Remove the columns that belong to the other product line: the CL frame
# drops every column whose name starts with 'PL_' and the PL frame drops every
# column whose name starts with 'CL_'. The patterns are raw strings so that
# '\w' reaches the regex engine unchanged; in a normal string literal it is an
# invalid escape sequence, which newer Python versions warn about.
df_cl = df_cl.filter(regex=r'^(?!PL_)\w+', axis=1)
df_pl = df_pl.filter(regex=r'^(?!CL_)\w+', axis=1)

# Write both frames to CSV files in the current working directory, named
# after the input file.
df_cl.to_csv('cleaned_CL_'+filename)
df_pl.to_csv('cleaned_PL_'+filename)