In [47]:
import pandas as pd

data = pd.read_csv('./insurance.csv')
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [None]:
# Checking if there is Missing Data
data.isna().sum()
# Handle Missing Values
data.dropna(inplace=True)

In [49]:
# Outliers
def remove_outliers (df, column):
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    lower_limit = q1 - 1.5 * iqr
    upper_limit = q3 + 1.5 *iqr
    return df[(df[column] >= lower_limit) & (df[column] <= upper_limit)]

# Remove Outliers
for col in ['age', 'bmi', 'charges']:
    data = remove_outliers(data,col)


In [50]:
# Categorical Features
data = pd.get_dummies(data, columns=['sex', 'smoker', 'region'], dtype= int)
data.head()

Unnamed: 0,age,bmi,children,charges,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,1,0,0,1,0,0,0,1
1,18,33.77,1,1725.5523,0,1,1,0,0,0,1,0
2,28,33.0,3,4449.462,0,1,1,0,0,0,1,0
3,33,22.705,0,21984.47061,0,1,1,0,0,1,0,0
4,32,28.88,0,3866.8552,0,1,1,0,0,1,0,0


In [51]:
# Scaler using min-max
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
numeric_cols = ['age', 'bmi']
data[numeric_cols] = scaler.fit_transform(data[numeric_cols])


In [52]:
# Training and Testing
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

X = data.drop(columns=['charges'])
y = data['charges']

# Train Linear Regression Model
sum_train_error = 0
sum_test_error = 0
epochs = 100

train_errors = []
test_errors = []

for k in range(epochs):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
    linReg = LinearRegression()
    linReg.fit(X_train, y_train)

    y_train_pred = linReg.predict(X_train)
    train_accuracy = mean_squared_error(y_train, y_train_pred) ** 0.5
    sum_train_error += train_accuracy
    train_errors.append(train_accuracy)  

    # Test Error (RMSE)
    y_test_pred = linReg.predict(X_test)
    test_accuracy = mean_squared_error(y_test, y_test_pred) ** 0.5
    sum_test_error += test_accuracy
    test_errors.append(test_accuracy)  
    # print('test error =', accuracy)
print('Sum Train Error =',sum_train_error / epochs)
print('Sum Test Error =', sum_test_error / epochs)

Sum Train Error = 4505.347790646084
Sum Test Error = 4536.936437708067


In [53]:
# Train Random Forest 
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

for k in range(5, 20):
    rf = RandomForestRegressor(n_estimators=200, max_depth=k, max_samples=0.63, n_jobs=-1)
    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_train)
    error = mean_squared_error(y_train, y_pred)
    train_error = error*0.5

    y_pred = linReg.predict(X_test)
    error = mean_squared_error(y_test, y_pred)
    test_error = error*0.5

    print('train error=', train_error, 'test error=', test_error)

train error= 6887074.315375724 test error= 20235234.84568521
train error= 5910422.660391162 test error= 20235234.84568521
train error= 4862460.117064126 test error= 20235234.84568521
train error= 4012829.0868050265 test error= 20235234.84568521
train error= 3395217.518207851 test error= 20235234.84568521
train error= 3024358.3311416768 test error= 20235234.84568521
train error= 2791932.4921146813 test error= 20235234.84568521
train error= 2768066.6080942494 test error= 20235234.84568521
train error= 2684586.137184235 test error= 20235234.84568521
train error= 2635648.3499002876 test error= 20235234.84568521
train error= 2689131.3994551515 test error= 20235234.84568521
train error= 2680465.8307230077 test error= 20235234.84568521
train error= 2671163.546223867 test error= 20235234.84568521
train error= 2665287.601770853 test error= 20235234.84568521
train error= 2661039.530412192 test error= 20235234.84568521


In [54]:
# Train Polynomial Regression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error

for d in range(2, 6):
    poly = PolynomialFeatures(degree=d)
    X_train_poly = poly.fit_transform(X_train)
    X_test_poly = poly.transform(X_test)
    poly_model = LinearRegression()
    poly_model.fit(X_train_poly, y_train)
    
    poly_train_preds = poly_model.predict(X_train_poly)
    poly_test_preds = poly_model.predict(X_test_poly)

    train_error = mean_squared_error(y_train, poly_train_preds) ** 0.5
    test_error = mean_squared_error(y_test, poly_test_preds) ** 0.5

    print('train error=', train_error, 'test error=', test_error)


train error= 4036.4228626039812 test error= 6169.513369023004
train error= 3878.7388224005254 test error= 6332.223859044846
train error= 3682.8206655604763 test error= 6486.1092210678235
train error= 3372.579185545319 test error= 66300.97080479836
