In [1]:
import pandas as pd

In [2]:
train = pd.read_csv('/content/sample_data/train.csv')
test = pd.read_csv('/content/sample_data/test.csv')

In [3]:
len(train), len(test)

(1200000, 800000)

In [4]:
df = pd.concat([train, test], axis=0)

In [5]:
del df['id']

In [6]:
# annual income & premium amount has a long tail -> log transformer
from sklearn.preprocessing import FunctionTransformer
import numpy as np

log_transformer = FunctionTransformer(np.log, inverse_func=np.exp)
df['Annual Income'] = log_transformer.fit_transform(df[['Annual Income']])
df['Premium Amount'] = log_transformer.fit_transform(df[['Premium Amount']])



In [7]:
# Vehicle Age null -> median
df['Vehicle Age'] = df['Vehicle Age'].fillna(df['Vehicle Age'].median())

# Insurance Duration null -> Mode
df['Insurance Duration'] = df['Insurance Duration'].fillna(df['Insurance Duration'].mode()[0])

np.random.seed(42)

# Age, Number of Dependents null -> Uniform Distribution
for col in ['Age', 'Number of Dependents']:
    df[col] = df[col].fillna(np.random.randint(df['Age'].min(), df['Age'].max()))

# Annual Income, Number of Dependents, Health Score, Credit Score null -> Normal Distribution
for col in ['Annual Income', 'Health Score', 'Credit Score']:
    df[col] = df[col].fillna(np.random.normal(df[col].mean(), df[col].std()))

# Marital Status, Occupation, Customer Feedback null -> Pick one randomly
import random

random.seed(42)

for col in ['Marital Status', 'Occupation', 'Customer Feedback']:
    df[col].fillna(random.choice(df[col].dropna().tolist()), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(random.choice(df[col].dropna().tolist()), inplace=True)


In [8]:
df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 2000000 entries, 0 to 799999
Data columns (total 20 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   Age                   2000000 non-null  float64
 1   Gender                2000000 non-null  object 
 2   Annual Income         2000000 non-null  float64
 3   Marital Status        2000000 non-null  object 
 4   Number of Dependents  2000000 non-null  float64
 5   Education Level       2000000 non-null  object 
 6   Occupation            2000000 non-null  object 
 7   Health Score          2000000 non-null  float64
 8   Location              2000000 non-null  object 
 9   Policy Type           2000000 non-null  object 
 10  Previous Claims       1393169 non-null  float64
 11  Vehicle Age           2000000 non-null  float64
 12  Credit Score          2000000 non-null  float64
 13  Insurance Duration    2000000 non-null  float64
 14  Policy Start Date     2000000 non-null  

In [9]:
# invert 'Policy Start Date' to 'Policy Start Month'
df1 = df.copy()
df1['Policy Start Date'] = pd.to_datetime(df1['Policy Start Date'])
df1['Policy Start Month'] = pd.DatetimeIndex(df1['Policy Start Date']).month
df1['Policy Start Month'].astype('object')
df['Policy Start Month'] = df1['Policy Start Month'].astype('object')
del df['Policy Start Date']
df.insert(14, 'Policy Start Month', df.pop('Policy Start Month'))

In [10]:
df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 2000000 entries, 0 to 799999
Data columns (total 20 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   Age                   2000000 non-null  float64
 1   Gender                2000000 non-null  object 
 2   Annual Income         2000000 non-null  float64
 3   Marital Status        2000000 non-null  object 
 4   Number of Dependents  2000000 non-null  float64
 5   Education Level       2000000 non-null  object 
 6   Occupation            2000000 non-null  object 
 7   Health Score          2000000 non-null  float64
 8   Location              2000000 non-null  object 
 9   Policy Type           2000000 non-null  object 
 10  Previous Claims       1393169 non-null  float64
 11  Vehicle Age           2000000 non-null  float64
 12  Credit Score          2000000 non-null  float64
 13  Insurance Duration    2000000 non-null  float64
 14  Policy Start Month    2000000 non-null  

In [11]:
%pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [12]:
df = pd.get_dummies(df, dtype='int')

In [13]:
df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 2000000 entries, 0 to 799999
Data columns (total 51 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   Age                          2000000 non-null  float64
 1   Annual Income                2000000 non-null  float64
 2   Number of Dependents         2000000 non-null  float64
 3   Health Score                 2000000 non-null  float64
 4   Previous Claims              1393169 non-null  float64
 5   Vehicle Age                  2000000 non-null  float64
 6   Credit Score                 2000000 non-null  float64
 7   Insurance Duration           2000000 non-null  float64
 8   Premium Amount               1200000 non-null  float64
 9   Gender_Female                2000000 non-null  int64  
 10  Gender_Male                  2000000 non-null  int64  
 11  Marital Status_Divorced      2000000 non-null  int64  
 12  Marital Status_Married       2000000 non-null  i

In [14]:
# make a regression model to predict previous claims
df_previous_claims = df[df['Previous Claims'].notna()]
df_no_previous_claims = df[df['Previous Claims'].isna()]

del df_previous_claims['Premium Amount']
del df_no_previous_claims['Premium Amount']

# train_test split
from sklearn.model_selection import train_test_split
train_set, test_set_from_train = train_test_split(df_previous_claims, test_size=0.2, random_state=42)

# split target and features
X_train = df_previous_claims.drop('Previous Claims', axis=1)
y_train = df_previous_claims['Previous Claims'].copy()

# build model
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import LinearRegression
import xgboost as xgb
from catboost import CatBoostRegressor as Catboost

voting_reg = VotingRegressor(
    estimators=[
        ('lin_reg', LinearRegression()),
        ('xgboost', xgb.XGBRegressor(max_depth=6, n_estimators=40, random_state=42)),
        ('catboost',Catboost(iterations=10, depth=8, learning_rate=1, random_state=42)),
    ]
)
voting_reg.fit(X_train, y_train)

test_set = df_no_previous_claims.copy()
del test_set['Previous Claims']
y_pred = voting_reg.predict(test_set)

# fill nan of df['Previous Claims'] with y_pred
df.loc[df['Previous Claims'].isna(), 'Previous Claims'] = y_pred

0:	learn: 0.9679402	total: 295ms	remaining: 2.65s
1:	learn: 0.9662059	total: 722ms	remaining: 2.89s
2:	learn: 0.9647735	total: 1.2s	remaining: 2.79s
3:	learn: 0.9645102	total: 1.62s	remaining: 2.43s
4:	learn: 0.9641904	total: 2.08s	remaining: 2.08s
5:	learn: 0.9639432	total: 2.49s	remaining: 1.66s
6:	learn: 0.9638254	total: 2.9s	remaining: 1.24s
7:	learn: 0.9636999	total: 3.34s	remaining: 834ms
8:	learn: 0.9635398	total: 3.8s	remaining: 422ms
9:	learn: 0.9634163	total: 4.3s	remaining: 0us


In [15]:
df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 2000000 entries, 0 to 799999
Data columns (total 51 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   Age                          2000000 non-null  float64
 1   Annual Income                2000000 non-null  float64
 2   Number of Dependents         2000000 non-null  float64
 3   Health Score                 2000000 non-null  float64
 4   Previous Claims              2000000 non-null  float64
 5   Vehicle Age                  2000000 non-null  float64
 6   Credit Score                 2000000 non-null  float64
 7   Insurance Duration           2000000 non-null  float64
 8   Premium Amount               1200000 non-null  float64
 9   Gender_Female                2000000 non-null  int64  
 10  Gender_Male                  2000000 non-null  int64  
 11  Marital Status_Divorced      2000000 non-null  int64  
 12  Marital Status_Married       2000000 non-null  i

In [25]:
train = df.iloc[:len(train)]
test = df.iloc[len(train):]

In [26]:
train.to_csv('04_train.csv', index=False)
test.to_csv('04_test.csv', index=False)

In [22]:
df1 = df.copy()

In [23]:
# reduce dimentionality
from sklearn.datasets import fetch_openml
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

X = df1.drop('Premium Amount', axis=1)
y = df1['Premium Amount']

scaler = StandardScaler()

X = scaler.fit_transform(X)

pca = PCA()
pca.fit(X)
cumsum = np.cumsum(pca.explained_variance_ratio_)
d = np.argmax(cumsum >= 0.95) + 1

In [24]:
pca = PCA(n_components=0.95)
X_reduced = pca.fit_transform(X)
pca.n_components_

37

In [28]:
X_reduced.shape

(2000000, 37)

In [32]:
train = X_reduced[:len(train)]
test = X_reduced[len(train):]

# add y to train
train = np.concatenate((train, y[:len(train)].values.reshape(-1, 1)), axis=1)
train.shape

(1200000, 38)

In [34]:
train = pd.DataFrame(train)
test = pd.DataFrame(test)

train.to_csv('05_train.csv', index=False)
test.to_csv('05_test.csv', index=False)