In [1]:
import pandas as pd
from dotenv import load_dotenv
import os

load_dotenv("../.env")

url = os.getenv("local_ins_ds")

df_orig = pd.read_csv(url)

df = df_orig.copy()

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [2]:
df = pd.concat([df, pd.get_dummies(df['sex'], prefix='sex', drop_first=True)], axis=1)
df = pd.concat([df, pd.get_dummies(df['smoker'], prefix='smok', drop_first=True)], axis=1)
df = pd.concat([df, pd.get_dummies(df['region'], prefix='reg', drop_first=True)], axis=1)

In [3]:
df = df.drop(columns=['sex', 'smoker', 'region'])

In [4]:
x = df.copy().drop(columns = ['charges'])
y = df['charges'].copy()

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

x_scaled = scaler.fit_transform(x)
y_scaled = (y - y.min())/(y.max() - y.min())

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(x_scaled, y_scaled, test_size=0.2, random_state=42)

In [7]:
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [8]:
y_pred_train = lin_reg.predict(X_train)
y_pred_test = lin_reg.predict(X_test)

In [9]:
import numpy as np

In [10]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

results = []
results.append({
    "Model": "Linear Regression",
    "MAE": mean_absolute_error(y_test, y_pred_test),
    "RMSE": np.sqrt(mean_squared_error(y_test, y_pred_test)),
    "R2": r2_score(y_test, y_pred_test)
})

In [11]:
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

svr = SVR(kernel='rbf')
svr.fit(X_train, y_train)

y_pred_train = svr.predict(X_train)
y_pred_test = svr.predict(X_test)

In [12]:
mae = mean_absolute_error(y_test, y_pred_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
r2 = r2_score(y_test, y_pred_test)

results.append({
    "Model": "SVR (rbf)",
    "MAE": mae,
    "RMSE": rmse,
    "R2": r2
})

In [13]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
y_pred_test = rf.predict(X_test)

results.append({
    "Model": "Random Forest",
    "MAE": mean_absolute_error(y_test, y_pred_test),
    "RMSE": np.sqrt(mean_squared_error(y_test, y_pred_test)),
    "R2": r2_score(y_test, y_pred_test)
})

In [14]:
gb = GradientBoostingRegressor(random_state=42)
gb.fit(X_train, y_train)
y_pred_test = gb.predict(X_test)

results.append({
    "Model": "Gradient Boosting",
    "MAE": mean_absolute_error(y_test, y_pred_test),
    "RMSE": np.sqrt(mean_squared_error(y_test, y_pred_test)),
    "R2": r2_score(y_test, y_pred_test)
})

In [15]:
xgb = XGBRegressor(random_state=42, verbosity=0)
xgb.fit(X_train, y_train)
y_pred_test = xgb.predict(X_test)

results.append({
    "Model": "XGBoost",
    "MAE": mean_absolute_error(y_test, y_pred_test),
    "RMSE": np.sqrt(mean_squared_error(y_test, y_pred_test)),
    "R2": r2_score(y_test, y_pred_test)
})

In [16]:
cat = CatBoostRegressor(random_state=42, verbose=0)
cat.fit(X_train, y_train)
y_pred_test = cat.predict(X_test)

results.append({
    "Model": "CatBoost",
    "MAE": mean_absolute_error(y_test, y_pred_test),
    "RMSE": np.sqrt(mean_squared_error(y_test, y_pred_test)),
    "R2": r2_score(y_test, y_pred_test)
})

In [17]:
X_train_df = pd.DataFrame(X_train, columns=X_train.columns if hasattr(X_train, 'columns') else [f'feat_{i}' for i in range(X_train.shape[1])])
X_test_df = pd.DataFrame(X_test, columns=X_train_df.columns)

lgb = LGBMRegressor(random_state=42, verbose=-1)
lgb.fit(X_train_df, y_train)

y_pred_test = lgb.predict(X_test_df)
y_pred_train = lgb.predict(X_train_df)

results.append({
    "Model": "LightGBM",
    "MAE": mean_absolute_error(y_test, y_pred_test),
    "RMSE": np.sqrt(mean_squared_error(y_test, y_pred_test)),
    "R2": r2_score(y_test, y_pred_test)
})

In [18]:
df_results = pd.DataFrame(results)
df_results.sort_values(by='R2', ascending=False, inplace=True)
print(df_results)

               Model       MAE      RMSE        R2
3  Gradient Boosting  0.039117  0.069256  0.878741
2      Random Forest  0.040598  0.072692  0.866414
5           CatBoost  0.041632  0.072734  0.866256
6           LightGBM  0.041640  0.072824  0.865925
4            XGBoost  0.043316  0.076838  0.850739
0  Linear Regression  0.066740  0.092521  0.783593
1          SVR (rbf)  0.093298  0.103401  0.729699
