In [1]:
import pandas as pd

In [2]:
train = pd.read_csv('/content/sample_data/train.csv')
test = pd.read_csv('/content/sample_data/test.csv')

In [3]:
len(train), len(test)

(1200000, 800000)

In [4]:
df = pd.concat([train, test], axis=0)

In [5]:
del df['id']

In [6]:
# annual income & premium amount has a long tail -> log transformer
from sklearn.preprocessing import FunctionTransformer
import numpy as np

log_transformer = FunctionTransformer(np.log, inverse_func=np.exp)
df['Annual Income'] = log_transformer.fit_transform(df[['Annual Income']])
df['Premium Amount'] = log_transformer.fit_transform(df[['Premium Amount']])



In [7]:
# Insurance Duration, Age, Number of Dependents null -> Mode
for col in ['Insurance Duration', 'Age', 'Number of Dependents']:
    df[col] = df[col].fillna(df[col].mode()[0])

# Annual Income, Health Score, Credit Score null -> median
for col in ['Vehicle Age', 'Annual Income', 'Health Score', 'Credit Score']:
    df[col] = df[col].fillna(df[col].median())

# Marital Status, Occupation, Customer Feedback null -> Unknown
for col in ['Marital Status', 'Occupation', 'Customer Feedback']:
    df[col].fillna('Unknown', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna('Unknown', inplace=True)


In [8]:
df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 2000000 entries, 0 to 799999
Data columns (total 20 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   Age                   2000000 non-null  float64
 1   Gender                2000000 non-null  object 
 2   Annual Income         2000000 non-null  float64
 3   Marital Status        2000000 non-null  object 
 4   Number of Dependents  2000000 non-null  float64
 5   Education Level       2000000 non-null  object 
 6   Occupation            2000000 non-null  object 
 7   Health Score          2000000 non-null  float64
 8   Location              2000000 non-null  object 
 9   Policy Type           2000000 non-null  object 
 10  Previous Claims       1393169 non-null  float64
 11  Vehicle Age           2000000 non-null  float64
 12  Credit Score          2000000 non-null  float64
 13  Insurance Duration    2000000 non-null  float64
 14  Policy Start Date     2000000 non-null  

In [9]:
# convert 'Policy Start Date' to 'Policy Start Year'
df1 = df.copy()
df1['Policy Start Date'] = pd.to_datetime(df1['Policy Start Date'])
df1['Policy Start Year'] = pd.DatetimeIndex(df1['Policy Start Date']).year
df1['Policy Start Year'].astype('object')
df['Policy Start Year'] = df1['Policy Start Year'].astype('object')
del df['Policy Start Date']
df.insert(14, 'Policy Start Year', df.pop('Policy Start Year'))

In [10]:
df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 2000000 entries, 0 to 799999
Data columns (total 20 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   Age                   2000000 non-null  float64
 1   Gender                2000000 non-null  object 
 2   Annual Income         2000000 non-null  float64
 3   Marital Status        2000000 non-null  object 
 4   Number of Dependents  2000000 non-null  float64
 5   Education Level       2000000 non-null  object 
 6   Occupation            2000000 non-null  object 
 7   Health Score          2000000 non-null  float64
 8   Location              2000000 non-null  object 
 9   Policy Type           2000000 non-null  object 
 10  Previous Claims       1393169 non-null  float64
 11  Vehicle Age           2000000 non-null  float64
 12  Credit Score          2000000 non-null  float64
 13  Insurance Duration    2000000 non-null  float64
 14  Policy Start Year     2000000 non-null  

In [11]:
%pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [12]:
df = pd.get_dummies(df, dtype='int')

In [13]:
df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 2000000 entries, 0 to 799999
Data columns (total 48 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   Age                          2000000 non-null  float64
 1   Annual Income                2000000 non-null  float64
 2   Number of Dependents         2000000 non-null  float64
 3   Health Score                 2000000 non-null  float64
 4   Previous Claims              1393169 non-null  float64
 5   Vehicle Age                  2000000 non-null  float64
 6   Credit Score                 2000000 non-null  float64
 7   Insurance Duration           2000000 non-null  float64
 8   Premium Amount               1200000 non-null  float64
 9   Gender_Female                2000000 non-null  int64  
 10  Gender_Male                  2000000 non-null  int64  
 11  Marital Status_Divorced      2000000 non-null  int64  
 12  Marital Status_Married       2000000 non-null  i

In [14]:
# make a regression model to predict previous claims
df_previous_claims = df[df['Previous Claims'].notna()]
df_no_previous_claims = df[df['Previous Claims'].isna()]

del df_previous_claims['Premium Amount']
del df_no_previous_claims['Premium Amount']

# train_test split
from sklearn.model_selection import train_test_split
train_set, test_set_from_train = train_test_split(df_previous_claims, test_size=0.2, random_state=42)

# split target and features
X_train = df_previous_claims.drop('Previous Claims', axis=1)
y_train = df_previous_claims['Previous Claims'].copy()

# build model
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import LinearRegression
import xgboost as xgb
from catboost import CatBoostRegressor as Catboost

voting_reg = VotingRegressor(
    estimators=[
        ('lin_reg', LinearRegression()),
        ('xgboost', xgb.XGBRegressor(max_depth=8, n_estimators=50, random_state=42)),
        ('catboost',Catboost(iterations=40, depth=7, learning_rate=1, random_state=42)),
    ]
)
voting_reg.fit(X_train, y_train)

0:	learn: 0.9665648	total: 292ms	remaining: 11.4s
1:	learn: 0.9638763	total: 539ms	remaining: 10.2s
2:	learn: 0.9621559	total: 779ms	remaining: 9.61s
3:	learn: 0.9619329	total: 976ms	remaining: 8.79s
4:	learn: 0.9617519	total: 1.2s	remaining: 8.39s
5:	learn: 0.9616733	total: 1.42s	remaining: 8.06s
6:	learn: 0.9616213	total: 1.66s	remaining: 7.83s
7:	learn: 0.9613925	total: 1.88s	remaining: 7.52s
8:	learn: 0.9612903	total: 2.11s	remaining: 7.27s
9:	learn: 0.9612186	total: 2.35s	remaining: 7.05s
10:	learn: 0.9611361	total: 2.58s	remaining: 6.81s
11:	learn: 0.9610756	total: 2.83s	remaining: 6.6s
12:	learn: 0.9610114	total: 3.04s	remaining: 6.32s
13:	learn: 0.9609576	total: 3.29s	remaining: 6.12s
14:	learn: 0.9608948	total: 3.53s	remaining: 5.88s
15:	learn: 0.9607836	total: 3.77s	remaining: 5.66s
16:	learn: 0.9607223	total: 4s	remaining: 5.41s
17:	learn: 0.9606216	total: 4.21s	remaining: 5.14s
18:	learn: 0.9604995	total: 4.46s	remaining: 4.93s
19:	learn: 0.9603139	total: 4.68s	remaining: 4

In [15]:
# test
X_test = test_set_from_train.drop('Previous Claims', axis=1)
y_test = test_set_from_train['Previous Claims'].copy()

y_pred = voting_reg.predict(X_test)

from sklearn.metrics import root_mean_squared_error
rmse = root_mean_squared_error(y_test, y_pred)
rmse

0.9551751175648605

In [16]:
test_set = df_no_previous_claims.copy()
del test_set['Previous Claims']
y_pred = voting_reg.predict(test_set)

# fill nan of df['Previous Claims'] with y_pred
df.loc[df['Previous Claims'].isna(), 'Previous Claims'] = y_pred

In [17]:
df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 2000000 entries, 0 to 799999
Data columns (total 48 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   Age                          2000000 non-null  float64
 1   Annual Income                2000000 non-null  float64
 2   Number of Dependents         2000000 non-null  float64
 3   Health Score                 2000000 non-null  float64
 4   Previous Claims              2000000 non-null  float64
 5   Vehicle Age                  2000000 non-null  float64
 6   Credit Score                 2000000 non-null  float64
 7   Insurance Duration           2000000 non-null  float64
 8   Premium Amount               1200000 non-null  float64
 9   Gender_Female                2000000 non-null  int64  
 10  Gender_Male                  2000000 non-null  int64  
 11  Marital Status_Divorced      2000000 non-null  int64  
 12  Marital Status_Married       2000000 non-null  i

In [18]:
train = df.iloc[:len(train)]
test = df.iloc[len(train):]

In [19]:
train.to_csv('07_train.csv', index=False)
test.to_csv('07_test.csv', index=False)