In [16]:
import pandas as pd

In [17]:
train = pd.read_csv('/content/sample_data/train.csv')
test = pd.read_csv('/content/sample_data/test.csv')

In [18]:
len(train), len(test)

(1200000, 800000)

In [19]:
df = pd.concat([train, test], axis=0)

In [20]:
del df['id']

In [21]:
# annual income & premium amount has a long tail -> log transformer
from sklearn.preprocessing import FunctionTransformer
import numpy as np

log_transformer = FunctionTransformer(np.log, inverse_func=np.exp)
df['Annual Income'] = log_transformer.fit_transform(df[['Annual Income']])
df['Premium Amount'] = log_transformer.fit_transform(df[['Premium Amount']])



In [22]:
# Vehicle Age null -> median
df['Vehicle Age'] = df['Vehicle Age'].fillna(df['Vehicle Age'].median())

# Insurance Duration null -> Mode
df['Insurance Duration'] = df['Insurance Duration'].fillna(df['Insurance Duration'].mode()[0])

np.random.seed(42)

# Age, Number of Dependents null -> Uniform Distribution
for col in ['Age', 'Number of Dependents']:
    df[col] = df[col].fillna(np.random.randint(df['Age'].min(), df['Age'].max()))

# Annual Income, Number of Dependents, Health Score, Credit Score null -> Normal Distribution
for col in ['Annual Income', 'Health Score', 'Credit Score']:
    df[col] = df[col].fillna(np.random.normal(df[col].mean(), df[col].std()))

# Marital Status, Occupation, Customer Feedback null -> Pick one randomly
import random

random.seed(42)

for col in ['Marital Status', 'Occupation', 'Customer Feedback']:
    df[col].fillna(random.choice(df[col].dropna().tolist()), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(random.choice(df[col].dropna().tolist()), inplace=True)


In [23]:
df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 2000000 entries, 0 to 799999
Data columns (total 20 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   Age                   2000000 non-null  float64
 1   Gender                2000000 non-null  object 
 2   Annual Income         2000000 non-null  float64
 3   Marital Status        2000000 non-null  object 
 4   Number of Dependents  2000000 non-null  float64
 5   Education Level       2000000 non-null  object 
 6   Occupation            2000000 non-null  object 
 7   Health Score          2000000 non-null  float64
 8   Location              2000000 non-null  object 
 9   Policy Type           2000000 non-null  object 
 10  Previous Claims       1393169 non-null  float64
 11  Vehicle Age           2000000 non-null  float64
 12  Credit Score          2000000 non-null  float64
 13  Insurance Duration    2000000 non-null  float64
 14  Policy Start Date     2000000 non-null  

In [24]:
# convert 'Policy Start Date' to 'Policy Start Day'
df1 = df.copy()
df1['Policy Start Date'] = pd.to_datetime(df1['Policy Start Date'])
df1['Policy Start Day'] = pd.DatetimeIndex(df1['Policy Start Date']).day
df1['Policy Start Day'].astype('object')
df['Policy Start Day'] = df1['Policy Start Day'].astype('object')
del df['Policy Start Date']
df.insert(14, 'Policy Start Day', df.pop('Policy Start Day'))

In [25]:
df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 2000000 entries, 0 to 799999
Data columns (total 20 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   Age                   2000000 non-null  float64
 1   Gender                2000000 non-null  object 
 2   Annual Income         2000000 non-null  float64
 3   Marital Status        2000000 non-null  object 
 4   Number of Dependents  2000000 non-null  float64
 5   Education Level       2000000 non-null  object 
 6   Occupation            2000000 non-null  object 
 7   Health Score          2000000 non-null  float64
 8   Location              2000000 non-null  object 
 9   Policy Type           2000000 non-null  object 
 10  Previous Claims       1393169 non-null  float64
 11  Vehicle Age           2000000 non-null  float64
 12  Credit Score          2000000 non-null  float64
 13  Insurance Duration    2000000 non-null  float64
 14  Policy Start Day      2000000 non-null  

In [26]:
df['Policy Start Day'].value_counts()

Unnamed: 0_level_0,count
Policy Start Day,Unnamed: 1_level_1
2,69807
21,67785
9,67127
19,66519
25,66517
7,66497
13,66487
8,66421
18,66330
14,66306


In [27]:
%pip install catboost



In [28]:
df = pd.get_dummies(df, dtype='int')

In [29]:
df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 2000000 entries, 0 to 799999
Data columns (total 70 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   Age                          2000000 non-null  float64
 1   Annual Income                2000000 non-null  float64
 2   Number of Dependents         2000000 non-null  float64
 3   Health Score                 2000000 non-null  float64
 4   Previous Claims              1393169 non-null  float64
 5   Vehicle Age                  2000000 non-null  float64
 6   Credit Score                 2000000 non-null  float64
 7   Insurance Duration           2000000 non-null  float64
 8   Premium Amount               1200000 non-null  float64
 9   Gender_Female                2000000 non-null  int64  
 10  Gender_Male                  2000000 non-null  int64  
 11  Marital Status_Divorced      2000000 non-null  int64  
 12  Marital Status_Married       2000000 non-null  i

In [31]:
# make a regression model to predict previous claims
df_previous_claims = df[df['Previous Claims'].notna()]
df_no_previous_claims = df[df['Previous Claims'].isna()]

del df_previous_claims['Premium Amount']
del df_no_previous_claims['Premium Amount']

# train_test split
from sklearn.model_selection import train_test_split
train_set, test_set_from_train = train_test_split(df_previous_claims, test_size=0.2, random_state=42)

# split target and features
X_train = df_previous_claims.drop('Previous Claims', axis=1)
y_train = df_previous_claims['Previous Claims'].copy()

# build model
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import LinearRegression
import xgboost as xgb
from catboost import CatBoostRegressor as Catboost

voting_reg = VotingRegressor(
    estimators=[
        ('lin_reg', LinearRegression()),
        ('xgboost', xgb.XGBRegressor(max_depth=8, n_estimators=50, random_state=42)),
        ('catboost',Catboost(iterations=40, depth=7, learning_rate=1, random_state=42)),
    ]
)
voting_reg.fit(X_train, y_train)

0:	learn: 0.9681832	total: 492ms	remaining: 19.2s
1:	learn: 0.9669116	total: 919ms	remaining: 17.5s
2:	learn: 0.9649160	total: 1.35s	remaining: 16.7s
3:	learn: 0.9647926	total: 1.78s	remaining: 16s
4:	learn: 0.9646705	total: 2.21s	remaining: 15.5s
5:	learn: 0.9646185	total: 2.68s	remaining: 15.2s
6:	learn: 0.9645238	total: 3.03s	remaining: 14.3s
7:	learn: 0.9641864	total: 3.25s	remaining: 13s
8:	learn: 0.9641268	total: 3.46s	remaining: 11.9s
9:	learn: 0.9638928	total: 3.69s	remaining: 11.1s
10:	learn: 0.9638398	total: 3.92s	remaining: 10.3s
11:	learn: 0.9637838	total: 4.14s	remaining: 9.65s
12:	learn: 0.9636917	total: 4.35s	remaining: 9.03s
13:	learn: 0.9636183	total: 4.56s	remaining: 8.47s
14:	learn: 0.9635607	total: 4.79s	remaining: 7.98s
15:	learn: 0.9634953	total: 4.98s	remaining: 7.47s
16:	learn: 0.9634227	total: 5.2s	remaining: 7.04s
17:	learn: 0.9632431	total: 5.42s	remaining: 6.62s
18:	learn: 0.9631852	total: 5.61s	remaining: 6.2s
19:	learn: 0.9631194	total: 5.81s	remaining: 5.

In [32]:
# test
X_test = test_set_from_train.drop('Previous Claims', axis=1)
y_test = test_set_from_train['Previous Claims'].copy()

y_pred = voting_reg.predict(X_test)

from sklearn.metrics import root_mean_squared_error
rmse = root_mean_squared_error(y_test, y_pred)
rmse

0.9577458880673393

In [33]:
test_set = df_no_previous_claims.copy()
del test_set['Previous Claims']
y_pred = voting_reg.predict(test_set)

# fill nan of df['Previous Claims'] with y_pred
df.loc[df['Previous Claims'].isna(), 'Previous Claims'] = y_pred

In [34]:
df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 2000000 entries, 0 to 799999
Data columns (total 70 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   Age                          2000000 non-null  float64
 1   Annual Income                2000000 non-null  float64
 2   Number of Dependents         2000000 non-null  float64
 3   Health Score                 2000000 non-null  float64
 4   Previous Claims              2000000 non-null  float64
 5   Vehicle Age                  2000000 non-null  float64
 6   Credit Score                 2000000 non-null  float64
 7   Insurance Duration           2000000 non-null  float64
 8   Premium Amount               1200000 non-null  float64
 9   Gender_Female                2000000 non-null  int64  
 10  Gender_Male                  2000000 non-null  int64  
 11  Marital Status_Divorced      2000000 non-null  int64  
 12  Marital Status_Married       2000000 non-null  i

In [35]:
train = df.iloc[:len(train)]
test = df.iloc[len(train):]

In [36]:
train.to_csv('06_train.csv', index=False)
test.to_csv('06_test.csv', index=False)