In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from matplotlib import pyplot as plt
import joblib
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score

# Show every column when a DataFrame is displayed (None removes the column cap).
# Uses the public pd.set_option entry point rather than the incidental pd.pandas attribute.
pd.set_option('display.max_columns', None)

In [2]:
from sklearn .linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

In [3]:
import xgboost as xgb

In [4]:
# Read the preprocessed feature tables for the train and test splits.
X_train = pd.read_csv(filepath_or_buffer='../processed/xtrain.csv')
X_test = pd.read_csv(filepath_or_buffer='../processed/xtest.csv')

In [5]:
# Preview the first rows of the training features as a sanity check on the load.
X_train.head()

Unnamed: 0,age,bmi,bloodpressure,children,gender_male,diabetic_Yes,smoker_Yes,region_northwest,region_southeast,region_southwest
0,0.52381,0.314208,0.579851,0.2,0.0,1.0,0.0,0.0,0.0,1.0
1,0.714286,0.721311,0.438892,1.0,1.0,1.0,0.0,0.0,0.0,1.0
2,0.97619,0.478142,0.558919,0.4,0.0,0.0,0.0,1.0,0.0,0.0
3,0.190476,0.311475,0.438892,0.6,1.0,0.0,0.0,0.0,0.0,0.0
4,0.928571,0.289617,0.135866,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Read the target tables that pair with the train and test feature tables.
y_train = pd.read_csv(filepath_or_buffer="../processed/ytrain.csv")
y_test = pd.read_csv(filepath_or_buffer="../processed/ytest.csv")

In [7]:
# Preview the first rows of the training target.
y_train.head()

Unnamed: 0,claim
0,8.517963
1,8.804811
2,9.414887
3,9.06024
4,8.021604


In [8]:
# Preview the first rows of the test target.
y_test.head()

Unnamed: 0,claim
0,8.990525
1,9.132325
2,8.759913
3,9.791377
4,9.564836


In [9]:
def eval(model, x, y, label='train'):
    """Print and return error metrics for a fitted regressor.

    Both the targets and the predictions are passed through ``np.exp``
    before scoring, so the metrics are reported on the exponentiated scale
    (presumably the target was log-transformed during preprocessing --
    confirm against the preprocessing step).

    Note: this function shadows Python's builtin ``eval``; the name is kept
    so existing calls in the notebook continue to work.

    Parameters
    ----------
    model : fitted estimator exposing ``predict(x)``.
    x : features to predict on.
    y : true targets matching ``x``, on the same scale the model was fit on.
    label : str, default 'train'
        Prefix for the printed metric lines. The default reproduces the
        historical output; pass e.g. ``'test'`` when scoring held-out data.

    Returns
    -------
    (rmse, r2) : tuple
        ``rmse`` is truncated to an int; ``r2`` is the float returned by
        ``r2_score``.
    """
    # Back-transform once and reuse for every metric.
    actual = np.exp(y)
    predicted = np.exp(model.predict(x))

    raw_mse = mean_squared_error(actual, predicted)
    mse = int(raw_mse)
    print('{} mse: {}'.format(label, mse))

    # Root of the untruncated MSE. This replaces `squared=False`, which was
    # deprecated in scikit-learn 1.4 and removed in 1.6. For a single-column
    # target (the case in this notebook) the value is the same as before.
    rmse = int(np.sqrt(raw_mse))
    print('{} rmse: {}'.format(label, rmse))

    r2 = r2_score(actual, predicted)
    print('{} r2: {}'.format(label, r2))
    print()

    return rmse, r2

In [10]:
# Collects one [rmse, r2, model name] row per fitted model for the comparison table.
model_result = []

## Linear Regression

In [11]:
# Baseline: linear regression with scikit-learn's default settings.
linreg_model = LinearRegression()
linreg_model.fit(X_train, y_train)

In [12]:
# Score on the test split. NOTE: the "train ..." labels in the printed output come
# from eval(); the numbers below are for X_test / y_test.
rmse, r2 = eval(linreg_model, X_test, y_test)

train mse: 55890371
train rmse: 7475
train r2: 0.5932743957298937



In [13]:
# Record the linear-regression scores for the comparison table.
model_result.append([rmse, r2, 'linear regression'])

## Random forest

In [14]:
# random_state=0 pins the forest's randomness so the fit is repeatable.
randforest_model = RandomForestRegressor(random_state=0)
# .values.ravel() flattens the target frame to a 1-D array before fitting.
randforest_model.fit(X_train, y_train.values.ravel())

In [15]:
# Score on the test split. NOTE: the "train ..." labels in the printed output come
# from eval(); the numbers below are for X_test / y_test.
rmse, r2 = eval(randforest_model, X_test, y_test)

train mse: 36136232
train rmse: 6011
train r2: 0.7370292817211925



In [16]:
# Record the random-forest scores for the comparison table.
model_result.append([rmse, r2, 'Random forest'])

## KNN

In [17]:
# k-nearest-neighbours regressor with scikit-learn's default hyperparameters.
knn_model = KNeighborsRegressor()
knn_model.fit(X_train, y_train)

In [18]:
# Score on the test split. NOTE: the "train ..." labels in the printed output come
# from eval(); the numbers below are for X_test / y_test.
rmse, r2 = eval(knn_model, X_test, y_test)

train mse: 57894765
train rmse: 7608
train r2: 0.5786880116423005



In [19]:
# Record the k-nearest-neighbours scores for the comparison table.
model_result.append([rmse, r2, 'KNearest Neighbor'])

## XGBoost

In [20]:
# XGBoost regressor with a squared-error objective; random_state=0 fixes the seed.
xgb_r = xgb.XGBRegressor(objective ='reg:squarederror', random_state=0)
xgb_r.fit(X_train, y_train)

In [21]:
# Score on the test split. NOTE: the "train ..." labels in the printed output come
# from eval(); the numbers below are for X_test / y_test.
rmse, r2 = eval(xgb_r, X_test, y_test)

train mse: 47300173
train rmse: 6877
train r2: 0.6557870123420813



In [22]:
# Record the XGBoost scores for the comparison table.
model_result.append([rmse, r2, 'XGBoost'])

## Result comparison

In [23]:
# Turn the collected [rmse, r2, model name] rows into a comparison table.
result = pd.DataFrame(
    data=model_result,
    columns=['rmse', 'r2_score', 'model'],
)

In [24]:
# Show the scores in the order the models were evaluated.
result.head()

Unnamed: 0,rmse,r2_score,model
0,7475,0.593274,linear regression
1,6011,0.737029,Random forest
2,7608,0.578688,KNearest Neighbor
3,6877,0.655787,XGBoost


In [25]:
# Rank the models best-first by R^2. Rebinding the sorted copy (instead of
# inplace=True) avoids mutating the frame object that an earlier cell displayed.
result = result.sort_values(by=['r2_score'], ascending=False)

In [26]:
# Show the ranking, best R^2 first.
result.head()

Unnamed: 0,rmse,r2_score,model
1,6011,0.737029,Random forest
3,6877,0.655787,XGBoost
0,7475,0.593274,linear regression
2,7608,0.578688,KNearest Neighbor


Random forest achieved the best result on the test set, with the highest R² (0.737) and the lowest RMSE (6011) of the four models.