In [None]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from sklearn.manifold import TSNE
from sklearn.decomposition import FastICA

In [None]:
# Load the insurance dataset (path is relative to the notebook's working
# directory) and preview the first five rows.
DATA_PATH = "./insurance.csv"
df = pd.read_csv(DATA_PATH)
df.head(5)

Unnamed: 0,age,gender,bmi,children,smoker,region,insurance_cost
0,18,male,33.77,1,no,southeast,1725.5523
1,18,male,34.1,0,no,southeast,1137.011
2,18,female,26.315,0,no,northeast,2198.18985
3,18,female,38.665,2,no,northeast,3393.35635
4,18,female,35.625,0,no,northeast,2211.13075


In [None]:
# Preview the last five rows of the frame.
df.tail(5)

Unnamed: 0,age,gender,bmi,children,smoker,region,insurance_cost
1333,64,female,31.825,2,no,northeast,16069.08475
1334,64,female,26.885,0,yes,northwest,29330.98315
1335,64,male,26.41,0,no,northeast,14394.5579
1336,64,male,36.96,2,yes,southeast,49577.6624
1337,64,male,23.76,0,yes,southeast,26926.5144


In [None]:
# Number of rows per region, most frequent region first.
region_counts = df['region'].value_counts()
region_counts

southeast    364
southwest    325
northwest    325
northeast    324
Name: region, dtype: int64

In [None]:
# Count of missing values in each column (isnull is an alias of isna).
df.isnull().sum()

age               0
gender            0
bmi               0
children          0
smoker            0
region            0
insurance_cost    0
dtype: int64

In [None]:
# Summary statistics for the numeric columns, with the quartiles spelled out
# explicitly (these are the values describe() uses by default).
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,age,bmi,children,insurance_cost
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [None]:
# Separate the target column from the predictors, then hold out 20% of the
# rows as a test set.
y = df['insurance_cost']
X = df.drop(columns='insurance_cost')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,  # fixed seed so the split is the same on every run
)

In [None]:
# Columns that are scaled as numbers.
numeric_features = [
    'age',
    'bmi',
    'children',
]
# Columns that are one-hot encoded as categories.
categorical_features = [
    'gender',
    'smoker',
    'region',
]

In [None]:
# Numeric columns: standardize with StandardScaler.
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Categorical columns: one-hot encode.
# handle_unknown='ignore' makes the encoder emit an all-zero row for a
# category it did not see during fit instead of raising, so a cross-validation
# fold or a future prediction batch with a new category value cannot crash
# the pipeline. Output for categories seen during fit is unchanged.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group through its transformer. Columns not listed in
# either group are dropped (ColumnTransformer's default remainder behaviour).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [None]:
# Candidate models, each paired with the hyper-parameter grid that
# GridSearchCV will explore. Grid keys use the 'regressor__' prefix because
# the estimator is installed as the pipeline step named 'regressor'.
regressors = {
    # random_state pins the forest's bootstrap/feature sampling so the
    # reported metrics are reproducible from run to run (same seed as the
    # train/test split).
    'Random Forest': (
        RandomForestRegressor(random_state=42),
        {'regressor__n_estimators': [10, 50, 100]},
    ),
    # No hyper-parameters to tune: the empty grid fits a single candidate.
    'Linear Regression': (LinearRegression(), {}),
    'SVR': (
        SVR(),
        {'regressor__kernel': ['linear', 'rbf'], 'regressor__C': [0.1, 1, 10]},
    ),
}

In [None]:
# For every candidate model: tune it with 5-fold cross-validated grid search
# on the training split, then score the refit best estimator on the test split.
results = {}
for name, (regressor, params) in regressors.items():
    search = GridSearchCV(
        Pipeline(steps=[('preprocessor', preprocessor), ('regressor', regressor)]),
        params,
        cv=5,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)

    best_model = search.best_estimator_
    y_pred = best_model.predict(X_test)

    # Keep the fitted pipeline next to its held-out metrics.
    results[name] = {
        'best_model': best_model,
        'mse': mean_squared_error(y_test, y_pred),
        'mae': mean_absolute_error(y_test, y_pred),
        'r2': r2_score(y_test, y_pred),
    }

In [None]:
# One row per model; columns are 'best_model', 'mse', 'mae', 'r2'.
results_df = pd.DataFrame(results).T

# Show only the metrics, as floats, using the notebook's rich table display.
# The 'best_model' column holds fitted pipelines whose repr is truncated noise
# in a table; it stays available in results_df for later use.
# errors='ignore' keeps this cell from failing when results is empty.
results_df.drop(columns='best_model', errors='ignore').astype(float)

                                                          best_model  \
Random Forest      (ColumnTransformer(transformers=[('num',\n    ...   
Linear Regression  (ColumnTransformer(transformers=[('num',\n    ...   
SVR                (ColumnTransformer(transformers=[('num',\n    ...   

                                mse          mae        r2  
Random Forest       19693165.270989  2519.368567  0.860578  
Linear Regression   36000021.710749  4307.778697  0.745131  
SVR                121162192.371116  5575.650037  0.142208  


Overall, Random Forest is the most suitable regression algorithm for predicting insurance costs in this dataset: it has the lowest errors and the highest R² on the test set (R² ≈ 0.86, MAE ≈ 2,519). Its ensemble nature allows it to handle complex relationships between features and the target variable effectively. Linear Regression, while simpler, still provides reasonable results (R² ≈ 0.75) but may struggle with capturing non-linear relationships present in the data. SVR, although capable of handling non-linear data, performed far worse than both Random Forest and Linear Regression in this specific scenario (R² ≈ 0.14).

In [None]:
# Same candidate models and hyper-parameter grids as the baseline experiment,
# re-declared for the dimensionality-reduction experiment below. Grid keys use
# the 'regressor__' prefix because the estimator is installed as the pipeline
# step named 'regressor'.
regressors = {
    # random_state pins the forest's bootstrap/feature sampling so the
    # reported metrics are reproducible from run to run.
    'Random Forest': (
        RandomForestRegressor(random_state=42),
        {'regressor__n_estimators': [10, 50, 100]},
    ),
    # No hyper-parameters to tune: the empty grid fits a single candidate.
    'Linear Regression': (LinearRegression(), {}),
    'SVR': (
        SVR(),
        {'regressor__kernel': ['linear', 'rbf'], 'regressor__C': [0.1, 1, 10]},
    ),
}

In [None]:
# Dimensionality-reduction steps to try between preprocessing and the model.
dimensionality_reduction_methods = {
    # NOTE: scikit-learn's TSNE offers fit_transform() but no transform(), so
    # it cannot embed unseen rows and cannot serve as an intermediate Pipeline
    # step. The experiment loop replaces this entry with 'passthrough'.
    't-SNE': TSNE(),
    # random_state makes the ICA solution (and the metrics built on it)
    # reproducible from run to run.
    'ICA': FastICA(random_state=42)
}

In [None]:
# Repeat the grid search with a dimensionality-reduction step inserted between
# preprocessing and the regressor, for every (model, reducer) combination.
results = {}
for name, (regressor, params) in regressors.items():
    for dr_name, dr_method in dimensionality_reduction_methods.items():
        if hasattr(dr_method, 'transform'):
            reduce_step = dr_method
            label = f'{name} with {dr_name}'
        else:
            # A reducer without transform() (e.g. TSNE, which only provides
            # fit_transform) cannot be an intermediate Pipeline step, so the
            # step is bypassed. The result is labelled accordingly so it is
            # not mistaken for a model that actually used the reducer.
            reduce_step = 'passthrough'
            label = f'{name} without reduction ({dr_name} skipped)'

        pipe = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('reduce_dim', reduce_step),
            ('regressor', regressor),
        ])
        # error_score='raise' surfaces fit failures as exceptions instead of
        # silently scoring them as NaN; they are reported below and the loop
        # moves on to the next combination.
        grid_search = GridSearchCV(pipe, params, cv=5, n_jobs=-1, error_score='raise')
        try:
            grid_search.fit(X_train, y_train)
            best_model = grid_search.best_estimator_
            y_pred = best_model.predict(X_test)
            mse = mean_squared_error(y_test, y_pred)
            mae = mean_absolute_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            results[label] = {'best_model': best_model, 'mse': mse, 'mae': mae, 'r2': r2}
        except Exception as e:
            print(f"An error occurred for {label}: {str(e)}")



In [None]:
# One row per (model, reducer) combination; columns are 'best_model', 'mse',
# 'mae', 'r2'.
results_df = pd.DataFrame(results).T

# Show only the metrics, as floats, using the notebook's rich table display.
# The 'best_model' column holds fitted pipelines whose repr is truncated noise
# in a table; it stays available in results_df for later use.
# errors='ignore' keeps this cell from failing when every combination errored
# and results is empty.
results_df.drop(columns='best_model', errors='ignore').astype(float)

                                                                     best_model  \
Random Forest with t-SNE      (ColumnTransformer(transformers=[('num',\n    ...   
Random Forest with ICA        (ColumnTransformer(transformers=[('num',\n    ...   
Linear Regression with t-SNE  (ColumnTransformer(transformers=[('num',\n    ...   
Linear Regression with ICA    (ColumnTransformer(transformers=[('num',\n    ...   
SVR with t-SNE                (ColumnTransformer(transformers=[('num',\n    ...   
SVR with ICA                  (ColumnTransformer(transformers=[('num',\n    ...   

                                           mse          mae        r2  
Random Forest with t-SNE        19715231.38995   2546.71986  0.860422  
Random Forest with ICA         24774616.264497  3046.429225  0.824603  
Linear Regression with t-SNE   36000021.710749  4307.778697  0.745131  
Linear Regression with ICA     35861701.407376  4268.054025   0.74611  
SVR with t-SNE                121162192.371116  5575.65003

Random Forest regression consistently outperformed Linear Regression and Support Vector Regression (SVR) across all dimensionality reduction techniques. It showcased lower mean squared error (MSE), mean absolute error (MAE), and higher R2 scores, indicating its superiority in capturing the relationships within the data.

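The choice of dimensionality reduction method influenced model performance, but the comparison needs care. The rows reported for "t-SNE" do not actually use t-SNE: the code replaces the t-SNE step with `'passthrough'`, so those rows are the models trained on the preprocessed features with no dimensionality reduction (which is why the Linear Regression and SVR figures match the earlier table exactly). Against that baseline, Independent Component Analysis (ICA) lowered Random Forest's performance (R² ≈ 0.825 vs. 0.860) and left Linear Regression essentially unchanged (R² ≈ 0.746 vs. 0.745). This suggests that, for this dataset, ICA does not improve on the original features; no conclusion about t-SNE itself can be drawn from this experiment.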

SVR performed poorly in this experiment, exhibiting significantly higher MSE and MAE values and a much lower R² score than the other models (about 0.14 in the earlier results table — low, but not negative). This indicates that SVR struggled to effectively capture the relationships within the data and generalize to unseen samples. Further investigation and fine-tuning of SVR parameters may be necessary to improve its performance.

Based on these findings, employing Random Forest regression on the preprocessed features without dimensionality reduction (the configuration reported in the "t-SNE" rows, where the reduction step is bypassed) would be the recommended approach for predicting insurance costs in this dataset. This combination demonstrated the best performance metrics, providing more accurate predictions than the other models and than the ICA-reduced variants.