In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
pip install category_encoders

In [None]:
pip install scikit_optimize

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import sys
from sklearn.model_selection import train_test_split
from category_encoders import OneHotEncoder
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import math
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.feature_selection import RFE

In [None]:
# Directory added to the import path; the project helper modules imported
# further down are presumably located here (verify against the Drive layout).
ML_PIPELINE_DIR = '/content/drive/MyDrive/Colab Notebooks/DS_PRACTICE/ml_pipeline'
# Guarded so that re-running this cell does not append duplicate entries.
if ML_PIPELINE_DIR not in sys.path:
    sys.path.append(ML_PIPELINE_DIR)

In [None]:
from eda import plot_histograms, plot_univariate_numeric, plot_univariate_categorical, plot_heatmap, plot_paired_boxplots, plot_paired_scatterplots, plot_residuals, plot_pearson_wrt_target
from stats import chi2, anova
from model_performance import calc_model_performance, compare_model_performance, calc_preds_in_residual_range, calc_preds_in_residual_perc_range
from sklearn.metrics import mean_squared_error # Import mean_squared_error

def calc_model_performance(y_true, y_pred):
    """Summarise regression error metrics for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    pd.Series
        Indexed by 'Root Mean Squared Error', 'Mean Absolute Error' and
        'R-squared'.
    """
    results = {}
    # mean_squared_error returns the mean *squared* error. Take the square
    # root explicitly so the value matches its 'Root Mean Squared Error' label
    # without relying on the `squared=False` argument.
    results['Root Mean Squared Error'] = np.sqrt(
        mean_squared_error(y_true, y_pred)
    )
    results['Mean Absolute Error'] = mean_absolute_error(
        y_true, y_pred
    )
    results['R-squared'] = r2_score(
        y_true, y_pred
    )
    return pd.Series(results)

# Assuming these functions also need to be defined or imported if they are not in model_performance.py
from sklearn.metrics import mean_absolute_error, r2_score

In [None]:
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/DS_PRACTICE/insurance.csv')

In [None]:
data.head()

# Inspect the Data Types of the Columns

In [None]:
data.info()

In [None]:
data.isnull().sum()

There are no null values, so no imputation is needed.
The target we want to predict is the `charges` column, so split the dataset into:
1.) Features (X)
2.) Target (y)

In [None]:
# Separate the column to be predicted (y) from the remaining predictor columns (X).
target = 'charges'
y = data[target]
X = data.drop(columns=[target])

In [None]:
X.shape, y.shape

((1338, 6), (1338,))

In [None]:
plot_histograms(X)

In [None]:
plot_histograms(pd.DataFrame(y), height=300)

In [None]:
plot_univariate_numeric(
    X.select_dtypes(include=np.number),
    y
)

In [None]:
plot_univariate_categorical(
    X[['sex', 'smoker', 'region', 'children']],
    y
)

In [None]:
plot_heatmap(
    X[['age', 'bmi', 'children']],
    y,
    bins=10
)

In [None]:
plot_paired_boxplots(
    X[['sex', 'smoker', 'region']],
    y
)

In [None]:
plot_paired_scatterplots(X, y)

In [None]:
px.scatter_matrix(
    X.select_dtypes(include=np.number)
)

In [None]:
px.imshow(X.select_dtypes(include=np.number).corr())

In [None]:
X_chi2 = chi2(X.select_dtypes(object))

In [None]:
X_chi2

In [None]:
X_chi2[X_chi2['p_value'] < 0.05]

In [None]:
X_anova = anova(X)

In [None]:
X_anova

In [None]:
X_anova[X_anova['p_value'] < 0.05]

In [None]:
plot_pearson_wrt_target(X, y)

In [None]:
data_anova = anova(data)
anova_wrt_target = data_anova[data_anova['num_column']=='charges']

In [None]:
anova_wrt_target

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42
)

In [None]:
# Predictors excluded from the linear model's inputs.
cols_to_drop = ['children', 'region', 'sex']
# Rebind to the result rather than using inplace=True: the frames returned by
# train_test_split are derived from X, and mutating them in place can trigger
# pandas' SettingWithCopyWarning.
X_train = X_train.drop(columns=cols_to_drop)
X_test = X_test.drop(columns=cols_to_drop)

In [None]:
ohe = OneHotEncoder(use_cat_names=True)
X_train = ohe.fit_transform(X_train)
X_test = ohe.transform(X_test)

In [None]:
# Remove the 'smoker_no' dummy column produced by the one-hot encoder.
cols_to_drop = ['smoker_no']
# Rebind to the result rather than mutating the encoder's output in place.
X_train = X_train.drop(columns=cols_to_drop)
X_test = X_test.drop(columns=cols_to_drop)


In [None]:
pt = PowerTransformer(method='yeo-johnson')
y_train_t = pt.fit_transform(y_train.values.reshape(-1,1))[:,0]
y_test_t = pt.transform(y_test.values.reshape(-1,1))[:,0]

In [None]:
# Both histograms are drawn on the same axes; add transparency and a legend so
# the transformed train and test target distributions can be told apart.
ax = pd.Series(y_train_t).hist(figsize=(5, 3), alpha=0.6, label='train')
pd.Series(y_test_t).hist(ax=ax, alpha=0.6, label='test')
ax.legend();

# Linear Regression Model Training

In [None]:
sample_weight = y_train/y_train.min()

In [None]:
lr = LinearRegression()
lr.fit(
    X_train,
    y_train_t,
    sample_weight=sample_weight
)

In [None]:
y_pred_train = lr.predict(X_train)
y_pred_test = lr.predict(X_test)

In [None]:
y_pred_train = pt.inverse_transform(y_pred_train.reshape(-1, 1))[:, 0]
y_pred_test = pt.inverse_transform(y_pred_test.reshape(-1, 1))[:, 0]

In [None]:
base_perf_train = calc_model_performance(y_train, y_pred_train)

In [None]:
base_perf_train

In [None]:
base_perf_test = calc_model_performance(y_test, y_pred_test)

In [None]:
base_perf_test

In [None]:
residuals_train = y_train - y_pred_train
residuals_test = y_test - y_pred_test

In [None]:
fig = sm.qqplot(
    residuals_train,
    fit=True,
    line='45'
)

In [None]:
fig = sm.qqplot(
    residuals_test,
    fit=True,
    line='45'
)

In [None]:
plot_residuals(y_true=y_train, y_pred=y_pred_train)

In [None]:
px.scatter(x=y_train, y=residuals_train)

In [None]:
px.scatter(x=y_test, y=residuals_test)

# **Non Linear Model**

# **Data Pre-Processing**

In [None]:
# Re-create the train/test split from the full feature set X; the earlier
# X_train / X_test were reduced and encoded for the linear model.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [None]:
ohe = OneHotEncoder(use_cat_names=True)
X_train = ohe.fit_transform(X_train)
X_test = ohe.transform(X_test)

In [None]:
rfe = RFE(estimator=XGBRegressor())
xgb = XGBRegressor()

In [None]:
steps = [
    ('rfe', rfe),
    ('xgb', xgb)
]

In [None]:
pipe = Pipeline(steps)

In [None]:
# Hyper-parameter ranges for the Bayesian search. Keys follow the Pipeline's
# '<step>__<parameter>' convention ('rfe' and 'xgb' are the step names).
num_features = X_train.shape[1]
search_spaces = dict(
    rfe__n_features_to_select=Integer(1, num_features),
    xgb__n_estimators=Integer(1, 500),
    xgb__max_depth=Integer(2, 8),
    xgb__reg_lambda=Integer(1, 200),
    xgb__learning_rate=Real(0, 1),
    xgb__gamma=Real(0, 2000),
)

In [None]:
xgb_bs_cv = BayesSearchCV(
    estimator = pipe,
    search_spaces = search_spaces,
    scoring  = 'neg_root_mean_squared_error',
    n_iter  = 75,
    cv = 3,
    n_jobs = -1,
    verbose = 1,
    random_state = 0
)

In [None]:
xgb_bs_cv.fit(
    X_train,
    y_train
)

# Model Evaluation

In [None]:
cv_results = pd.DataFrame(xgb_bs_cv.cv_results_).sort_values('rank_test_score')

In [None]:
cv_results

In [None]:
y_pred_train_xgb = xgb_bs_cv.predict(X_train)
y_pred_test_xgb = xgb_bs_cv.predict(X_test)

In [None]:
xgb_perf_train = calc_model_performance(y_train, y_pred_train_xgb)

In [None]:
xgb_perf_train

In [None]:
xgb_perf_test = calc_model_performance(y_test, y_pred_test_xgb)

In [None]:
xgb_perf_test

In [None]:
perf_comp_train = compare_model_performance(base_perf_train, xgb_perf_train)
perf_comp_test = compare_model_performance(base_perf_test,xgb_perf_test)

In [None]:
perf_comp_train

In [None]:
perf_comp_test

In [None]:
calc_preds_in_residual_range(
    y_true=y_test,
    y_pred=y_pred_test_xgb,
    range_=2000
)

In [None]:
calc_preds_in_residual_perc_range(
    y_true=y_test,
    y_pred=y_pred_test_xgb,
    perc_range=20
)