In [0]:
import os

# Relative directory that holds the insurance dataset (desktop layout).
INSURANCE_PATH = os.path.join("datasets", "insurance")

In [0]:
import numpy as np
import pandas as pd

def load_housing_data(insurance_path=INSURANCE_PATH, skiprows=None):
    """Load ``insurance.csv`` from ``insurance_path`` into a DataFrame.

    The function name is kept for compatibility with existing callers even
    though the file it reads is the insurance dataset.

    Parameters
    ----------
    insurance_path : str
        Directory that contains ``insurance.csv``.
    skiprows : int, list-like or callable, optional
        Forwarded unchanged to ``pandas.read_csv``. The default ``None`` reads
        the file from its first line. A hard-coded ``skiprows=75`` was used
        previously; with an integer, pandas treats the first line that is not
        skipped as the header, while the later cells select columns by name
        (``sex``, ``smoker``, ``region``, ``charges``) and therefore need the
        real header line. Pass a value explicitly only if the file really
        starts with lines that must be ignored.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    csv_path = os.path.join(insurance_path, "insurance.csv")
    return pd.read_csv(csv_path, skiprows=skiprows)

In [0]:
# Read the CSV through the loader defined above and preview the first rows.
insurance = load_housing_data()
insurance.head()

ParserError: ignored

Check for null values

In [0]:
# Summarise column dtypes and non-null counts to spot missing values.
insurance.info()

There are no null values. The columns `sex`, `smoker`, and `region` are categorical, and the `charges` column is the response variable. Encode the categorical columns using `pd.get_dummies()`.

In [0]:
# One-hot encode the categorical columns and append the indicator columns
# to the remaining (non-categorical) columns.
categorical_columns = ['sex', 'smoker', 'region']
insurance_cat = insurance[categorical_columns]
insurance_dummies = pd.get_dummies(insurance_cat)
insurance = insurance.drop(columns=categorical_columns).join(insurance_dummies)
insurance.head()

Check correlations.

In [0]:
# Pairwise correlations; show every column's correlation with the target,
# strongest positive first.
corr_matrix = insurance.corr()
charges_corr = corr_matrix['charges']
charges_corr.sort_values(ascending=False)

Split data into train and test sets.

In [0]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
insurance_train, insurance_test = train_test_split(
    insurance,
    test_size=0.2,
    random_state=42,
)

Get train/test set labels.

In [0]:
# Separate the target column from the features for both splits.
# Selecting with a single key yields a 1-D Series. A one-column DataFrame
# (``[['charges']]``) is a column vector, which makes RandomForestRegressor
# emit a DataConversionWarning on every fit, including each fold of the
# cross-validation and grid-search cells.
insurance_train_labels = insurance_train['charges']
insurance_train = insurance_train.drop(columns=['charges'])

insurance_test_labels = insurance_test['charges']
insurance_test = insurance_test.drop(columns=['charges'])

Try linear model.

In [0]:
from sklearn.linear_model import LinearRegression

# Fit a linear-regression baseline on the training split.
lin_reg = LinearRegression()
lin_reg.fit(insurance_train, insurance_train_labels)

Try on some test data.

In [0]:
# Spot-check the linear model on the first few rows of the test split.
sample_size = 5
some_data = insurance_test.iloc[:sample_size]
some_data_labels = insurance_test_labels.iloc[:sample_size]
some_predictions = lin_reg.predict(some_data)
print("Predictions:", some_predictions)
print("Actual:", some_data_labels)

Calculate error (RMSE).

In [0]:
from sklearn.metrics import mean_squared_error

# Root-mean-squared error of the linear model over the whole test split.
insurance_predictions = lin_reg.predict(insurance_test)
lin_mse = mean_squared_error(
    y_true=insurance_test_labels,
    y_pred=insurance_predictions,
)
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

Try another model, `DecisionTreeRegressor`.

In [0]:
from sklearn.tree import DecisionTreeRegressor

# The fixed seed keeps the fitted tree reproducible between runs.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(X=insurance_train, y=insurance_train_labels)

Calculate RMSE.

In [0]:
# Root-mean-squared error of the decision tree over the whole test split.
insurance_predictions = tree_reg.predict(insurance_test)
tree_mse = mean_squared_error(
    y_true=insurance_test_labels,
    y_pred=insurance_predictions,
)
tree_rmse = np.sqrt(tree_mse)
print(tree_rmse)

Try `RandomForestRegressor` model.

In [0]:
from sklearn.ensemble import RandomForestRegressor

# Random forest of 100 trees, seeded for reproducibility.
forest_reg = RandomForestRegressor(n_estimators=100, random_state=42)
forest_reg.fit(X=insurance_train, y=insurance_train_labels)

Evaluate each model with cross-validation.

In [0]:
from sklearn.model_selection import cross_val_score

def _cv_neg_mse(estimator):
    """Return the 10-fold negated-MSE scores of ``estimator`` on the training split."""
    return cross_val_score(
        estimator,
        insurance_train,
        insurance_train_labels,
        scoring="neg_mean_squared_error",
        cv=10,
    )


# The scorer reports negated MSE, so flip the sign before taking the root.
tree_scores = _cv_neg_mse(tree_reg)
tree_rmse_scores = np.sqrt(-tree_scores)

lin_scores = _cv_neg_mse(lin_reg)
lin_rmse_scores = np.sqrt(-lin_scores)

forest_scores = _cv_neg_mse(forest_reg)
forest_rmse_scores = np.sqrt(-forest_scores)

In [0]:
def display_scores(scores):
    """Print a score array followed by its mean and standard deviation."""
    # Each statistic is computed only when its own line is printed.
    report = (
        ("Scores:", lambda values: values),
        ("Mean:", lambda values: values.mean()),
        ("Standard deviation:", lambda values: values.std()),
    )
    for label, compute in report:
        print(label, compute(scores))
    
# Report the cross-validated RMSE summary of each model in turn.
for heading, rmse_scores in (
    ("Tree:", tree_rmse_scores),
    ("\nLinear:", lin_rmse_scores),
    ("\nForest:", forest_rmse_scores),
):
    print(heading)
    display_scores(rmse_scores)

The best model so far is `RandomForestRegressor`, with a mean cross-validation RMSE of \$5106.11. Now, fine-tune this model's hyperparameters.

In [0]:
from sklearn.model_selection import GridSearchCV

# Two sub-grids are searched:
#   * 3 x 6 = 18 combinations of n_estimators and max_features
#   * 2 x 3 = 6 combinations with bootstrap set to False
param_grid = [
    {
        'n_estimators': [3, 10, 30],
        'max_features': [2, 4, 6, 8, 10, 11],
    },
    {
        'bootstrap': [False],
        'n_estimators': [3, 10],
        'max_features': [2, 3, 4],
    },
]

forest_reg = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(
    estimator=forest_reg,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
    return_train_score=True,
)
grid_search.fit(insurance_train, insurance_train_labels)

In [0]:
# Show the cross-validated RMSE next to each parameter combination tried.
cvres = grid_search.cv_results_
candidate_rmse = np.sqrt(-cvres["mean_test_score"])
for rmse, params in zip(candidate_rmse, cvres["params"]):
    print(rmse, params)

In [0]:
# Parameter combination that achieved the best mean cross-validation score.
grid_search.best_params_

In [0]:
# Estimator selected by the search, configured with the best parameters.
grid_search.best_estimator_

Evaluate model with the best hyperparameters on the test data.

In [0]:
# Score the estimator selected by the grid search on the held-out test split.
forest_reg_best = grid_search.best_estimator_

insurance_predictions = forest_reg_best.predict(insurance_test)
forest_best_mse = mean_squared_error(
    y_true=insurance_test_labels,
    y_pred=insurance_predictions,
)
forest_best_rmse = np.sqrt(forest_best_mse)

print("Forest RMSE:", forest_best_rmse)

The best model (the tuned `RandomForestRegressor`) has a test-set RMSE of \$4570.18.