In [17]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import xgboost as xgb
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score

In [18]:
url = "https://raw.githubusercontent.com/stedy/Machine-Learning-with-R-datasets/master/insurance.csv"
df = pd.read_csv(url)

# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() only appends a stray "None" line to the output.
df.info()

# isnull().sum() counts the missing (null) entries in each column;
# a column with no missing data shows 0.
missing_values = df.isnull().sum()
print("--- Missing Values Count ---")
print(missing_values)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB
None
--- Missing Values Count ---
age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64


In [22]:
# One-hot encode the categorical columns. drop_first=True keeps k-1 dummy
# columns for a k-level category, removing the redundant first level.
df_clean = pd.get_dummies(df, drop_first=True)

# Interaction feature: BMI for smokers, zero for non-smokers.
df_clean['bmi_smoker'] = df_clean['bmi'] * df_clean['smoker_yes']

X = df_clean.drop('charges', axis=1)
y = df_clean['charges']

# Model the target on the log1p scale; np.expm1 is its exact inverse.
y_log = np.log1p(y)

# Seed the split so the train/test partition (and every score computed from
# it) is identical on each Restart & Run All. 42 matches the seed already
# used for the estimator and the hyper-parameter search.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_log, test_size=0.2, random_state=42
)



In [24]:
# Why XGBoost (gradient boosting) rather than a single decision tree:
# model 1 makes predictions and leaves some error, model 2 is fitted to
# correct model 1's error, model 3 corrects what is still left, and so on.
# Golf analogy: every stroke brings the ball closer to the hole. Each stroke
# is one model, the stroke strength is the learning rate, and the remaining
# distance to the hole is the error.

# Candidate values sampled by the randomized search below.
params = {
    'n_estimators': [100, 500, 1000],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# n_jobs=1 here on purpose: the search below already runs its fits in parallel
# (n_jobs=-1). Letting every XGBoost fit also grab all cores oversubscribes
# the CPU and the resulting thread contention slows both levels down.
xg_reg = xgb.XGBRegressor(objective='reg:squarederror', n_jobs=1, random_state=42)

search = RandomizedSearchCV(
    estimator=xg_reg,
    param_distributions=params,
    n_iter=50,       # Try 50 combinations
    scoring='r2',    # We want to maximize R2
    cv=5,            # 5-Fold Cross Validation
    verbose=1,
    random_state=42,
    n_jobs=-1        # Parallelize across candidate/fold fits
)

search.fit(X_train, y_train)

print(f"Best Params: {search.best_params_}")
best_model = search.best_estimator_
y_pred_log = best_model.predict(X_test)

# The model was trained on log1p(charges); undo the transform with expm1 so
# the final score is reported on the original charges scale.
y_pred = np.expm1(y_pred_log)
y_test_actual = np.expm1(y_test)

final_r2 = r2_score(y_test_actual, y_pred)
print(f"\nFinal Test R2 Score: {final_r2:.4f}")

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Params: {'subsample': 0.8, 'n_estimators': 500, 'max_depth': 3, 'learning_rate': 0.01, 'colsample_bytree': 1.0}

Final Test R2 Score: 0.8597
