In [211]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.cluster import KMeans

from scipy.stats import boxcox

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, HuberRegressor

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [72]:
df = pd.read_csv('E:/Guvi_projects/Smart_premium/data/train.csv')

In [6]:
pd.set_option('display.max_columns', None)

In [34]:
df

Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
0,0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,Premium,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House,2869.0
1,1,39.0,Female,31678.0,Divorced,3.0,Master's,,15.569731,Rural,Comprehensive,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House,1483.0
2,2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,Premium,1.0,14.0,,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House,567.0
3,3,21.0,Male,141855.0,Married,2.0,Bachelor's,,10.938144,Rural,Basic,1.0,0.0,367.0,1.0,2024-06-12 15:21:39.226954,Poor,Yes,Daily,Apartment,765.0
4,4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376094,Rural,Premium,0.0,8.0,598.0,4.0,2021-12-01 15:21:39.252145,Poor,Yes,Weekly,House,2022.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1199995,1199995,36.0,Female,27316.0,Married,0.0,Master's,Unemployed,13.772907,Urban,Premium,,5.0,372.0,3.0,2023-05-03 15:21:39.257696,Poor,No,Daily,Apartment,1303.0
1199996,1199996,54.0,Male,35786.0,Divorced,,Master's,Self-Employed,11.483482,Rural,Comprehensive,,10.0,597.0,4.0,2022-09-10 15:21:39.134960,Poor,No,Weekly,Apartment,821.0
1199997,1199997,19.0,Male,51884.0,Divorced,0.0,Master's,,14.724469,Suburban,Basic,0.0,19.0,,6.0,2021-05-25 15:21:39.106582,Good,No,Monthly,Condo,371.0
1199998,1199998,55.0,Male,,Single,1.0,PhD,,18.547381,Suburban,Premium,1.0,7.0,407.0,4.0,2021-09-19 15:21:39.190215,Poor,No,Daily,Apartment,596.0


In [203]:
df1 = df.copy()

df1['Age'] = df1.groupby('Education Level')['Age'].transform(lambda x: x.fillna(x.median()))

df1['Number of Dependents'] = df1.groupby(['Age','Education Level'])['Number of Dependents'].transform(lambda x: x.fillna(x.mode()[0]))

df1['Annual Income'] = df1.groupby(['Education Level','Age','Number of Dependents'])['Annual Income'].transform(lambda x: x.fillna(x.median()))

kmeans = KMeans(n_clusters=5, random_state=42)
df1['Income Groups'] = kmeans.fit_predict(df1[['Annual Income']])


df1['Marital Status'] = df1.groupby(['Age','Education Level'])['Marital Status'].transform(lambda x: x.fillna(x.mode()[0]))

df1['Occupation'] = df1.groupby(['Income Groups', 'Gender', 'Age', 'Education Level', 'Number of Dependents'])['Occupation'].transform(lambda x: x.fillna(x.mode()[0]))

df1['Health Score'] = df1.groupby(['Age', 'Education Level', 'Income Groups'])['Health Score'].transform(lambda x: x.fillna(x.median()))

df1['Previous Claims'] = df1.groupby(['Income Groups', 'Age', 'Number of Dependents'])['Previous Claims'].transform(lambda x:x.fillna(x.mode()[0]))

df1['Vehicle Age'] = df1['Vehicle Age'].fillna(df1['Vehicle Age'].mode()[0])

df1['Credit Score'] = df1.groupby(['Income Groups', 'Occupation', 'Number of Dependents'])['Credit Score'].transform(lambda x:x.fillna(x.median()))

df1['Customer Feedback'] = df1.groupby(['Policy Type', 'Previous Claims'])['Customer Feedback'].transform(lambda x:x.fillna(x.mode()[0]))

mode = df1[df1['Premium Amount'] == 1040.0]['Insurance Duration'].mode()[0]

df1['Insurance Duration'] = df1['Insurance Duration'].fillna(mode)

nominal_cols = ['Gender', 'Marital Status', 'Location',]
ohe = OneHotEncoder(sparse_output=False)

encoded = ohe.fit_transform(df[nominal_cols])

encoded_df = pd.DataFrame(data=encoded, columns=ohe.get_feature_names_out(nominal_cols))

df1 = pd.concat([df1.drop(nominal_cols, axis = 1), encoded_df], axis = 1)

edu_order = {'High School' : 0, "Bachelor's" : 1, "Master's" : 2, 'PhD' : 3}
df1['Education Level'] = df1['Education Level'].map(edu_order)

occ_order = {'Unemployed' : 0, 'Employed' : 2, 'Self-Employed' : 1}
df1['Occupation'] = df1['Occupation'].map(occ_order)

policy_order = {'Basic' : 0, 'Comprehensive' : 1, 'Premium' : 2}
df1['Policy Type'] = df1['Policy Type'].map(policy_order)

policy_order = {'Poor' : 0, 'Average' : 1, 'Good' : 2}
df1['Customer Feedback'] = df1['Customer Feedback'].map(policy_order)

smoke_pref = {'No' : 0, 'Yes' : 1}
df1['Smoking Status'] = df1['Smoking Status'].map(smoke_pref)

exer_pref = {'Rarely' : 0, 'Monthly' : 1, 'Weekly' : 2, 'Daily' : 3}
df1['Exercise Frequency'] = df1['Exercise Frequency'].map(exer_pref)


property_order = {'Apartment' : 0, 'Condo' : 1, 'House' : 2}
df1['Property Type'] = df1['Property Type'].map(property_order)

df1['Annual_income'], fitted_lambda = boxcox(df1['Annual Income'])

df1.drop(['Income Groups','id', 'Policy Start Date', 'Annual Income'], axis = 1, inplace=True)

In [204]:
df1.head()

Unnamed: 0,Age,Number of Dependents,Education Level,Occupation,Health Score,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount,Gender_Female,Gender_Male,Marital Status_Divorced,Marital Status_Married,Marital Status_Single,Marital Status_nan,Location_Rural,Location_Suburban,Location_Urban,Annual_income
0,19.0,1.0,1,1,22.598761,2,2.0,17.0,372.0,5.0,0,0,2,2,2869.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,43.296957
1,39.0,3.0,2,2,15.569731,1,1.0,12.0,694.0,2.0,1,1,1,2,1483.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,60.998984
2,23.0,3.0,0,1,47.177549,2,1.0,14.0,622.0,3.0,2,1,2,2,567.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,57.273577
3,21.0,2.0,1,0,10.938144,0,1.0,0.0,367.0,1.0,0,1,3,0,765.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,94.539816
4,21.0,1.0,1,1,20.376094,2,0.0,8.0,598.0,4.0,0,1,2,2,2022.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,65.173499


In [206]:
X = df1.drop('Premium Amount', axis = 1)
y = df1['Premium Amount']

In [207]:
scaler = StandardScaler()

X = scaler.fit_transform(X)


In [213]:
models = [LinearRegression(), Lasso(), Ridge(), ElasticNet(), HuberRegressor(), DecisionTreeRegressor(max_depth=8), RandomForestRegressor(n_estimators=100),
          GradientBoostingRegressor(n_estimators=100), ExtraTreesRegressor(n_estimators=100), XGBRegressor(), CatBoostRegressor(verbose=0), LGBMRegressor(n_estimators=100)]

In [217]:
model = ExtraTreesRegressor(n_estimators=10)
model.fit(X,y)
y_pred = model.predict(X)

mse = mean_squared_error(y, y_pred)
mae = mean_absolute_error(y, y_pred)
rmse = mse ** (0.5)
r2score = r2_score(y, y_pred)

print(f'******{type(model).__name__}******\n')
print('Model Performance for Training Set\n')
print(f'Mean Squared Error ---> {mse}')
print(f'Mean Absolute Error ---> {mae}')
print(f'Root Mean Squared Error ---> {rmse}')
print(f'R2 Score ---> {r2score}\n\n')

MemoryError: could not allocate 268435456 bytes