In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# plt.rcParams.update({'figure.max_open_warning': 0}) # Removes max 20 graph limit
import seaborn as sns
import time as time

# EDA
import plotly.express as px
from sklearn.feature_selection import chi2, RFECV
import scipy.stats as ss
from pandas_profiling import ProfileReport
# import phik
# from phik import resources, report

# Evaluation Metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error ,r2_score ,mean_squared_log_error,mean_absolute_percentage_error

# Data preprocessing
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer ,SimpleImputer ,IterativeImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler ,OneHotEncoder,FunctionTransformer , MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold, learning_curve ,cross_val_score ,GridSearchCV,KFold,RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.decomposition import PCA

#Models
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso ,SGDRegressor ,ElasticNet ,BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor
from sklearn.svm import LinearSVR ,SVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.neighbors import KNeighborsRegressor

ModuleNotFoundError: No module named 'pandas_profiling'

In [4]:
# Read the insurance records from the CSV in the working directory and preview the first rows
df_raw = pd.read_csv(filepath_or_buffer='insurance.csv')
df_raw.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [5]:
# Hold out 20% of the rows as the test set; the remaining rows form the training set
# (the training set is split again into train and validation after EDA)
train_df, test_df = train_test_split(
    df_raw,
    random_state=1,
    test_size=0.2,
)
print(f'Shape of train_df:{train_df.shape}\nShape of test_df:{test_df.shape}')

Shape of train_df:(1070, 7)
Shape of test_df:(268, 7)


# EDA

In [None]:
# `pandas-profiling` was renamed to `ydata-profiling`; prefer the current package name
# and fall back to the legacy one so the cell works with either installation.
try:
    from ydata_profiling import ProfileReport
except ImportError:
    from pandas_profiling import ProfileReport

# Build the EDA report on the training split only, show it inline and save it as HTML
profile = ProfileReport(train_df, title="Pandas Profiling Report")
profile.to_widgets()
profile.to_file("your_report.html")

# Data processing

In [7]:
# Names of every numeric column in the raw frame
# (per the displayed output this list also contains the target column `charges`)
numeric = list(df_raw.select_dtypes(include=np.number).columns)
numeric

['age', 'bmi', 'children', 'charges']

In [8]:
# Standardise the numeric columns on a copy so that df_raw itself is left untouched
df_scaled = df_raw.copy()
scaler = StandardScaler()
scaler.fit(df_scaled[numeric])
df_scaled[numeric] = scaler.transform(df_scaled[numeric])
df_scaled.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,-1.438764,female,-0.45332,-0.908614,yes,southwest,0.298584
1,-1.509965,male,0.509621,-0.078767,no,southeast,-0.953689
2,-0.797954,male,0.383307,1.580926,no,southeast,-0.728675
3,-0.441948,male,-1.305531,-0.908614,no,northwest,0.719843
4,-0.513149,male,-0.292556,-0.908614,no,northwest,-0.776802


In [9]:
# One-hot encode the non-numeric columns, dropping the first level of each category
df_onehot = pd.get_dummies(data=df_scaled, drop_first=True)
df_onehot.head()

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,-1.438764,-0.45332,-0.908614,0.298584,0,1,0,0,1
1,-1.509965,0.509621,-0.078767,-0.953689,1,0,0,1,0
2,-0.797954,0.383307,1.580926,-0.728675,1,0,0,1,0
3,-0.441948,-1.305531,-0.908614,0.719843,1,0,1,0,0
4,-0.513149,-0.292556,-0.908614,-0.776802,1,0,1,0,0


# Model Selection

We will now try out models from several different model families and select the best candidate models.

1. Linear Models ( Linear Regression , BayesianRidge , SGDRegressor )

2. Distance Based models (KNeighborsRegressor)

3. DecisionTree ( DecisionTreeRegressor )

4. Ensemble Tree Models (RandomForest, GradientBoosting)

5. SVMs with different (linear, rbf, poly, sigmoid) kernels

This will be done on the training set using cross validation.

In [None]:
## Quick Eval function to fit and score all models at once
def _mean_cv_score(cv_hist, metric):
    """Return the mean cross-validated score for `metric`, or NaN if it was not computed."""
    key = f"test_{metric}"
    return cv_hist[key].mean() if key in cv_hist else np.nan


def quick_evalutaion(
    models,
    X_train,
    X_test,
    y_train,
    y_test,
    metrics=["neg_mean_squared_error","neg_mean_absolute_error","neg_mean_absolute_percentage_error"],
    curve = True,
    cv = KFold(n_splits=5,shuffle=True,random_state=1),
):
    """Fit, score and optionally plot a learning curve for every model class.

    Parameters
    ----------
    models : iterable of estimator classes
        Classes, not instances. Each one is instantiated with ``random_state=42``
        when its constructor accepts that argument, otherwise with no arguments.
    X_train, y_train
        Data used to fit each model, to cross-validate it and to draw its
        learning curve.
    X_test, y_test
        Data the fitted model is scored on (MAPE, MAE and MSE).
    metrics : list of str
        Scorer names forwarded to ``cross_validate``. The list is never mutated.
        The summary reports the three default metrics; one that is not
        requested is reported as NaN.
    curve : bool
        When true, a learning curve (scored with ``neg_mean_absolute_error``)
        is plotted for every model.
    cv
        Cross-validation splitter used for both ``cross_validate`` and
        ``learning_curve``.

    Returns
    -------
    pandas.DataFrame
        One row per model, indexed by the class name, with the columns
        ``fit_time``, ``score_time``, ``cv_MAE``, ``MAE_test_score``,
        ``cv_MAPE``, ``MAPE_test_score``, ``cv_MSE`` and ``MSE_test_score``.
        The ``cv_*`` columns are means of the ``neg_*`` scorers (negated
        errors), whereas the ``*_test_score`` columns are plain errors.
    """
    hist = {}

    for model in models:
        # Seed estimators that accept `random_state`; constructors that do not
        # accept it raise TypeError and are built with their defaults instead.
        try:
            clf = model(random_state=42)
        except TypeError:
            clf = model()
        clf.fit(X_train, y_train)
        test_prediction = clf.predict(X_test)  # Scoring on the supplied test/validation data

        MAPE_test = mean_absolute_percentage_error(y_test, test_prediction)
        MAE_test = mean_absolute_error(y_test, test_prediction)
        MSE_test = mean_squared_error(y_test, test_prediction)

        # Cross-validate with the same splitter that the learning curve uses
        cv_hist = cross_validate(clf, X_train, y_train, scoring=metrics, cv=cv, verbose=1)

        # Record the performance
        hist[model.__name__] = dict(
            fit_time = cv_hist['fit_time'].mean(),
            score_time = cv_hist['score_time'].mean(),
            cv_MAE = _mean_cv_score(cv_hist, 'neg_mean_absolute_error'),
            MAE_test_score = MAE_test,
            cv_MAPE = _mean_cv_score(cv_hist, 'neg_mean_absolute_percentage_error'),
            MAPE_test_score = MAPE_test,
            cv_MSE = _mean_cv_score(cv_hist, 'neg_mean_squared_error'),
            MSE_test_score = MSE_test
        )

        # Plot the learning curve of each model using neg_mean_absolute_error
        if curve:
            fig, ax = plt.subplots(figsize=(10, 8))
            train_sizes = np.linspace(.1, 1.0, 10)
            train_sizes, train_scores, test_scores = learning_curve(
                clf, X_train, y_train, cv=cv, n_jobs=-1,
                train_sizes=train_sizes, scoring="neg_mean_absolute_error",
            )
            # The score arrays have one row per train size and one column per
            # fold, so a row-major flatten() lists all folds of the first size,
            # then all folds of the second size, ... -> each size is repeated.
            scores = pd.DataFrame({
                "Train Sizes": np.repeat(train_sizes, train_scores.shape[1]),
                "Train Scores": train_scores.flatten(),
                "Test Scores": test_scores.flatten(),
            }).melt(value_vars=["Train Scores", "Test Scores"], var_name="Score Type", value_name="Scores", id_vars=["Train Sizes"])
            sns.lineplot(data=scores, x="Train Sizes", y="Scores", hue="Score Type", ax=ax, palette=['#3DD5E2', '#A045B5'])
            ax.set_title(f"Learning Curve of {model.__name__}")
            ax.set_ylabel("neg_mean_absolute_error")
            ax.set_xlabel("Train Sizes")
            plt.show()
            plt.close(fig)  # release the figure so many models do not pile up open figures

    results = pd.DataFrame(hist).T
    return results

In [None]:
# Tuned RandomForest Parameters: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 10, 'bootstrap': True}


In [10]:
import pickle

# Seed the forest so that the persisted model can be reproduced on a re-run
# NOTE(review): X_train / y_train are not defined in any visible cell — confirm where they come from
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Save the fitted model; the context manager closes the file even if pickling fails
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)


SyntaxError: invalid syntax (472428151.py, line 1)

In [None]:
# Load the persisted model from the same relative path it was saved to
from joblib import load

model = load('model.pkl')

# Extract one row from X_test (sliced, so it stays a single-row DataFrame)
x_test = X_test.iloc[0:1]

# Predict on that one sample
model.predict(x_test)

In [1]:
# Sample request payload with one value per model feature.
# The name `json` is kept because a later cell passes it to preProcess;
# note that it shadows the standard-library module of the same name.
json = dict(
    age=1,
    sex="male",
    bmi=0.4,
    children=0.6,
    smoker="no",
    region="southeast",
)

In [3]:
# Reload the persisted model from 'model.pkl' (path is relative to the working directory).
# NOTE(review): an earlier cell writes this file with pickle.dump while it is read here with
# joblib.load, and the recorded run of this cell ended in `KeyError: 255` — confirm that the
# file on disk is a valid, uncorrupted model dump before relying on this cell.
from joblib import load

model = load('model.pkl')


KeyError: 255

In [None]:
from sklearn.preprocessing import OneHotEncoder
 

def preProcess(json_data,main_df=df_raw):
    '''
    Turn a single json object (one value per feature) into a model-ready row.

    The scaler and the one hot encoder are fitted on `main_df` on every call and
    then applied to the single input row:
    - numeric features are standardised with StandardScaler
    - categorical features are one hot encoded (first level dropped,
      unseen categories encoded as all zeros)

    Which features are numeric or categorical, and the order of the output
    columns, are taken from `main_df`, so the key order of `json_data` does
    not matter. Columns of `main_df` that are missing from `json_data`
    (e.g. the target) are ignored.

    Parameters:
    - json_data: dict mapping feature name -> scalar value
    - main_df: reference dataframe the scaler and encoder are fitted on

    Returns a tuple (df_final, df):
    - df_final: single row dataframe with the scaled numeric columns followed
      by the one hot encoded columns
    - df: single row dataframe holding the unprocessed input

    Raises KeyError if `json_data` contains a feature that is not a column
    of `main_df`.
    '''
    # Create a single row dataframe from the json object
    df = pd.DataFrame(json_data,index=[0])

    unknown = [col for col in df.columns if col not in main_df.columns]
    if unknown:
        raise KeyError(f"Feature(s) not present in main_df: {unknown}")

    # Follow main_df's column order and dtypes rather than those of the json,
    # so the result does not depend on key order or on how values were typed
    features = [col for col in main_df.columns if col in df.columns]
    numeric = main_df[features].select_dtypes(include=['number']).columns.tolist()
    categorical = main_df[features].select_dtypes(exclude=['number']).columns.tolist()

    # Scale numeric values with statistics learned from main_df
    scaler = StandardScaler()
    df_scaled = df.copy()
    scaler.fit(main_df[numeric])
    df_scaled[numeric] = scaler.transform(df_scaled[numeric])

    # One hot encode categorical values with categories learned from main_df
    ohe = OneHotEncoder(handle_unknown='ignore',drop='first')
    ohe.fit(main_df[categorical])

    # The `sparse` keyword was renamed to `sparse_output` in newer scikit-learn;
    # rely on the default output and densify it so this runs on either version
    encoded = ohe.transform(df[categorical])
    if hasattr(encoded, 'toarray'):
        encoded = encoded.toarray()

    # Reuse df's index so that the concat below aligns row for row
    df_ohe = pd.DataFrame(encoded,columns=ohe.get_feature_names_out(categorical),index=df.index)

    # Concatenate the scaled numeric and one hot encoded categorical values
    df_final = pd.concat([df_scaled[numeric],df_ohe],axis=1)

    # Return the final dataframe and original dataframe
    return df_final, df

   

In [None]:
# Convert the sample payload into a model-ready single-row frame;
# `original` keeps the same row as it was before scaling and encoding.
prediction_input, original = preProcess(json)

In [None]:
# Predict for the single preprocessed row and print the scalar result.
# NOTE(review): the scaling cell standardises `charges` together with the features; if the
# model was trained on that scaled target, this value is in standardised units rather than
# currency — TODO confirm how y_train was built and inverse-transform if needed.
prediction = model.predict(prediction_input)
output = prediction[0]
print(output)