In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

## 1) Adding new Features

### 1. Population Density 
#### : 'main' if the customer is in the main city (Region_Code 28), else 'notmain'

In [None]:
# Flag customers that live in the main region (Region_Code 28).
# A boolean mask is used instead of comparing a positional counter with index
# labels, so the result is correct even when df does not have a 0..n-1 index.
is_main_region = df['Region_Code'] == 28

df['Population'] = np.where(is_main_region, 'main', 'notmain')

### 2. Basic Insurance Customer
##### : Customers with the lowest annual premium (2630) are treated as having only basic insurance. (Basic: 'basic', otherwise: 'option')

In [6]:
# Flag customers whose annual premium equals 2630 (described above as the
# lowest premium level) as holding only basic insurance.
# The mask is evaluated row-wise, so it does not rely on a 0..n-1 index.
is_basic_premium = df['Annual_Premium'] == 2630

df['Basic_Annual'] = np.where(is_basic_premium, 'basic', 'option')

### 3. Beneficiary of insurance
#### : Treated as an insurance beneficiary if the customer was previously insured and has a history of vehicle damage (Yes: 'benefit', No: 'not_benefit')

In [7]:
# 'benefit' when the customer was previously insured AND reports vehicle damage,
# 'not_benefit' for every other row.
was_insured = df['Previously_Insured'].eq(1)
had_damage = df['Vehicle_Damage'].eq('Yes')
df['Beneficiary'] = np.where(was_insured & had_damage, 'benefit', 'not_benefit')

### 4. Risk of an accident
##### : If a customer has already damaged a vehicle that is less than one year old, their driving habits are judged to be dangerous
##### -> A high risk of accidents implies a high need for insurance

In [8]:
# Accident-risk level derived from vehicle damage history and vehicle age.
# np.select evaluates the conditions in order and falls back to `default`,
# which mirrors the original if / elif / else chain without a Python loop.
vehicle_damage = df['Vehicle_Damage']
vehicle_age = df['Vehicle_Age']

df['Danger'] = np.select(
    [
        # High-risk group: damage reported on a vehicle younger than one year
        (vehicle_damage == 'Yes') & (vehicle_age == '< 1 Year'),
        # Low-risk group: no damage reported on a vehicle older than two years
        (vehicle_damage == 'No') & (vehicle_age == '> 2 Years'),
    ],
    ['high', 'low'],
    # Everything else
    default='mid',
)

### 5. Risk of an accident2
#### : The customer had a car accident while not previously insured.

In [9]:
# 'high' when the customer was NOT previously insured but reports vehicle damage,
# 'low' for every other row.
not_insured = df['Previously_Insured'].eq(0)
reported_damage = df['Vehicle_Damage'].eq('Yes')
df['N_Danger'] = np.where(not_insured & reported_damage, 'high', 'low')

### 6. Percentage of accident experiences by age
##### : Percentage of people who have experienced accidents by age group

In [10]:
def damage_rate(group):
    """Return the share of rows in `group` with Vehicle_Damage == 'Yes', rounded to 4 decimals."""
    # Selecting by label ('Yes') instead of by value_counts() position keeps the
    # rate correct regardless of which category happens to be the most frequent.
    return round((group['Vehicle_Damage'] == 'Yes').mean(), 4)


# Accident rate inside each age band (bands: <=25, 26-36, 37-49, 50-85).
df1 = df[df['Age'] <= 25]
r1 = damage_rate(df1)

df2 = df[(df['Age'] >= 26) & (df['Age'] <= 36)]
r2 = damage_rate(df2)

df3 = df[(df['Age'] >= 37) & (df['Age'] <= 49)]
r3 = damage_rate(df3)

df4 = df[(df['Age'] >= 50) & (df['Age'] <= 85)]
r4 = damage_rate(df4)

# Attach the band-level rate to every customer. Conditions are checked in
# order, so age 25 now receives r1 (the original loop used `< 25` and sent
# age 25 to the last band). Rows matching no condition fall back to r4, as
# in the original else-branch.
age = df['Age']
df['Age_damaged'] = np.select(
    [age <= 25, age <= 36, age <= 49],
    [r1, r2, r3],
    default=r4,
)

### 7. Grouping Main Channel
##### : The most frequently used values of the channel variable are treated as the main channels

In [12]:
# Sales channels 152, 26 and 124 are treated as the main channels.
# isin() tests every row at once, replacing the per-row .loc loop and the
# integer placeholder column that was later overwritten with strings.
MAIN_CHANNELS = [152, 26, 124]

df['Main_Channel'] = np.where(
    df['Policy_Sales_Channel'].isin(MAIN_CHANNELS), 'main_ch', 'notmain_ch'
)

### 8. Grouping Major channels by age group
##### : Channels are grouped because different channels dominate in different age groups (split at age 36)

In [13]:
# Group sales channels: 26 / 124 -> 'main_over', 152 / 160 -> 'main_under',
# anything else -> 'channel'.
# NOTE(review): the 'over' / 'under' suffixes presumably refer to the age-36
# split mentioned in the heading - confirm.
sales_channel = df['Policy_Sales_Channel']

df['Age_Channel'] = np.select(
    [sales_channel.isin([26, 124]), sales_channel.isin([152, 160])],
    ['main_over', 'main_under'],
    default='channel',
)

### 9. Age Group
##### : Add age group variables because many characteristics are determined by age group

In [14]:
# Bucket customers into four age bands. The conditions are evaluated in order,
# so each upper bound implicitly excludes the previous band; rows matching no
# condition fall into the last band, as in the original else-branch.
age = df['Age']

df['Age_group'] = np.select(
    [age <= 25, age <= 36, age <= 49],
    ['~25', '26~36', '37~49'],
    default='50~',
)

### 10. Young & Rich
#### : When Previously_Insured and Response are both 1, we guess the customer is young and rich

In [15]:
# Rows with Previously_Insured == 1 and Response == 1 get 'Not_YR'; all other
# rows get 'YR'. Computed in one vectorized step instead of a per-row loop.
# NOTE(review): the section heading says these rows ARE the young-and-rich
# group, yet they are labelled 'Not_YR' - the labels are kept as they were;
# confirm which side is intended.
# NOTE(review): this feature reads 'Response'; if that column is the modeling
# target, the feature leaks the label - confirm before using it for training.
insured_and_responded = (df['Previously_Insured'] == 1) & (df['Response'] == 1)

df['Young_Rich'] = np.where(insured_and_responded, 'Not_YR', 'YR')

In [None]:
# Export the dataframe with the added variables.
# errors='ignore' keeps this cell re-runnable: a second run no longer raises
# KeyError once 'id' has already been removed.
df = df.drop(columns=['id'], errors='ignore')
df.to_csv('var_df.csv', index=False)

## 2 ) Data Generation for Sub-Modeling
##### The data is divided into two groups by Vehicle_Age because the two subsets show different distributions

In [None]:
# Write one CSV per (feature set, Vehicle_Age subset) combination.
# The '> 2 Years' subset is saved as "*_balance", the remainder as "*_imbalance".

#1 : Adding new variables X & submodeling X
df = pd.read_csv('origin_df.csv')

#1-1 : Adding new variables X & submodeling O ('> 2 Years' subset)
model_b = df[df['Vehicle_Age'] == '> 2 Years']
model_b.to_csv("origin_balance.csv", index = False)

#1-2 : Adding new variables X & submodeling O (remaining vehicle ages)
model_imb = df[df['Vehicle_Age'] != '> 2 Years']
model_imb.to_csv("origin_imbalance.csv", index = False)

#2 : Adding new variables O & submodeling X
var_df = pd.read_csv('var_df.csv')

#2-1 : Adding new variables O & submodeling O ('> 2 Years' subset)
var_b = var_df[var_df['Vehicle_Age'] == '> 2 Years']
var_b.to_csv("var_balance.csv", index = False)

#2-2 : Adding new variables O & submodeling O (remaining vehicle ages)
# Built from var_df; the previous version filtered df1 (the Age <= 25 subset
# created in an earlier cell), which silently dropped every older customer.
var_imb = var_df[var_df['Vehicle_Age'] != '> 2 Years']
var_imb.to_csv("var_imbalance.csv", index = False)

## 3 ) Data Preprocessing

In [None]:
#Split data and maintain the origin ratio of target variables
def data_split(df, target = "target", test_size = 0.3, random_state = None):
    """Split `df` into train and test frames, splitting each target class separately.

    Rows whose target is 0 and rows whose target is 1 are split independently
    with the same `test_size`, so both outputs keep roughly the original class
    ratio. Rows with any other target value are not included in either output.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data containing the `target` column.
    target : str, default "target"
        Name of the binary label column.
    test_size : float or int, default 0.3
        Forwarded to `train_test_split` for each class.
    random_state : int or None, default None
        Seed used for both the per-class split and the final shuffle. None
        keeps the previous non-deterministic behaviour.

    Returns
    -------
    (df_train, df_test) : tuple of pandas.DataFrame
        Shuffled frames with a fresh 0..n-1 index; the target column is placed
        last, as in the previous implementation.
    """
    # Keep the historical column layout: all features first, target last.
    ordered_cols = [col for col in df.columns if col != target] + [target]

    train_parts, test_parts = [], []
    for label in (0, 1):
        class_rows = df.loc[df[target] == label, ordered_cols]
        part_train, part_test = train_test_split(
            class_rows, test_size = test_size, random_state = random_state
        )
        train_parts.append(part_train)
        test_parts.append(part_test)

    # Shuffle so the two classes are interleaved, then discard the old index.
    df_train = pd.concat(train_parts, axis = 0).sample(frac = 1, random_state = random_state)
    df_test = pd.concat(test_parts, axis = 0).sample(frac = 1, random_state = random_state)

    df_train = df_train.reset_index(drop = True)
    df_test = df_test.reset_index(drop = True)

    return df_train, df_test

In [None]:
#Function for LabelEncoding
def LabelEncoding(df):
    """Return a copy of `df` with every object-dtype column label-encoded.

    Each object column `col` is replaced by an integer column named
    'encode_' + col, produced by a LabelEncoder fitted on that column of `df`.
    The encoded columns are appended after the remaining columns, in the order
    the object columns appeared. The input frame is left untouched.
    """
    o_lst = df.select_dtypes('object').columns.tolist()

    # Work on a copy so the caller's frame does not gain 'encode_*' columns
    # as a side effect.
    encoded = df.copy()
    for col in o_lst:
        # A fresh encoder per column: each column gets its own code mapping.
        encoded['encode_' + col] = LabelEncoder().fit_transform(encoded[col])

    return encoded.drop(columns = o_lst)

In [None]:
#Function for OneHotEncoding
def OnehotEncoding(df):
    """Return `df` with every object-dtype column one-hot encoded.

    The first category of each column is dropped (drop_first=True). Dummy
    columns are named 'encode_<column>_<category>'. Including the source
    column in the prefix keeps the names unique when two columns share a
    category value (for example two columns that both contain 'low'); a single
    shared 'encode' prefix produced duplicate column names in that case.
    """
    o_lst = df.select_dtypes('object').columns.tolist()
    # Per-column prefixes, matching the 'encode_' + column naming used by LabelEncoding.
    prefixes = {col: 'encode_' + col for col in o_lst}
    df = pd.get_dummies(data = df, columns = o_lst, prefix = prefixes, drop_first = True)

    return df

In [None]:
#Find errors in the data collection process and outliers that do not conform to common sense
def outlier(df):
    """Drop rows with Driving_License == 0 and Vehicle_Damage == 'No'.

    The rows are removed from `df` in place (by index label) and the same
    frame object is returned.
    """
    no_license = df['Driving_License'] == 0
    no_damage = df['Vehicle_Damage'] == 'No'
    labels_to_drop = df.index[no_license & no_damage]

    df.drop(index=labels_to_drop, inplace = True)
    return df

In [None]:
#Generating Train Data according to the number of cases of the preprocessing methodology
def train_ver(df_tr, ver,name):
    """Preprocess the training frame for variant `ver` and write it to CSV.

    ver 1: outlier removal + label encoding   -> name + "1.csv"
    ver 2: label encoding only                -> name + "2.csv"
    ver 3: outlier removal + one-hot encoding -> name + "3.csv"
    other: one-hot encoding only              -> name + "4.csv"

    Nothing is returned; the result is only written to disk.
    """
    if ver == 1:
        suffix, remove_outliers, encode = "1", True, LabelEncoding
    elif ver == 2:
        suffix, remove_outliers, encode = "2", False, LabelEncoding
    elif ver == 3:
        suffix, remove_outliers, encode = "3", True, OnehotEncoding
    else:
        # Any value other than 1, 2 or 3 is handled as variant 4.
        suffix, remove_outliers, encode = "4", False, OnehotEncoding

    prepared = outlier(df_tr) if remove_outliers else df_tr
    ndf_train = encode(prepared)
    ndf_train.to_csv(name + suffix + ".csv", index = False)

In [None]:
#Generating Test Data according to the number of cases of the preprocessing methodology
def test_ver(df_test, ver,name):
    """Encode the test frame for variant `ver` and write it to name + str(ver) + ".csv".

    Variants 1 and 2 use label encoding, variants 3 and 4 use one-hot encoding;
    no outlier removal is applied to test data. Any other `ver` writes nothing.
    The encoding is derived from `df_test` itself.
    """
    if ver in (1, 2):
        encoder = LabelEncoding
    elif ver in (3, 4):
        encoder = OnehotEncoding
    else:
        return

    ndf_test = encoder(df_test)
    ndf_test.to_csv(name + str(ver) + ".csv", index = False)

In [None]:
# Input files: every (feature set, Vehicle_Age subset) combination plus the two full datasets.
DP = ["origin_imbalance.csv", "var_imbalance.csv", "origin_balance.csv", "var_balance.csv", 
      "origin_df.csv", "var_df.csv"]


def _dataset_prefix(data_path):
    """Build the two-character output prefix for an input file name.

    The prefix is the first character of the file name followed by
    - the character right before the "." for short names (<= 13 characters,
      i.e. the "*_df.csv" files), or
    - the character right after the "_" for the longer names.
    With the paths in DP this yields "oi", "vi", "ob", "vb", "of" and "vf".

    NOTE(review): reconstructed from the index arithmetic of the previous
    version, which added the integer positions to the string and raised
    TypeError - confirm these are the expected file names.
    """
    dot_pos = data_path.index(".")
    underscore_pos = data_path.index("_")

    if len(data_path) <= 13:
        return data_path[0] + data_path[dot_pos - 1]
    return data_path[0] + data_path[underscore_pos + 1]


def make_data(DATA_PATH):
    """Read `DATA_PATH`, split it into train/test and write all four preprocessing variants.

    For each variant 1-4 the files "<prefix>_train<ver>.csv" and
    "<prefix>_test<ver>.csv" are written by train_ver / test_ver.
    """
    df = pd.read_csv(DATA_PATH)
    df_train, df_test = data_split(df)
    name = _dataset_prefix(DATA_PATH)

    for i in range(1, 5):
        # Fresh copies per variant: the preprocessing helpers may modify their input.
        df_tr, df_ts = df_train.copy(), df_test.copy()
        train_ver(df_tr, i, name + "_train")
        test_ver(df_ts, i, name + "_test")


for DATA_PATH in DP:
    make_data(DATA_PATH)