In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

In [2]:
df = pd.read_csv("cleaned_insurance_data_2.csv")

df.drop(["Unnamed: 0", "sex"], inplace=True, axis=1)

In [3]:
df = df[['charges', 'smoker', 'age', 'children', 'bmi', 'region']]

In [4]:
def categorize_bmi(bmi):
    if bmi < 18.5:
        return 'under_weight'
    elif 18.5 <= bmi < 25:
        return 'normal_weight'
    elif 25 <= bmi < 30:
        return 'over_weight'
    else:
        return 'obese'

df['bmi_category'] = df['bmi'].apply(categorize_bmi)

In [5]:
df['child_stat'] = df['children'].apply(lambda x : 0 if x == 0 else 1)

In [6]:
# Feature Engineer

df['not_smoker'] = df["smoker"].apply(lambda x: 1 if x == 0 else 0)

df['age_children'] = df['age'] * df['children']

df['smoker_child'] = df['children'] * (df['smoker'] == 1)
df['non_smoker_child'] = df['children'] * (df['not_smoker'] == 1)

df['smoker_age'] = df['smoker'] * df['age']
df['not_smoker_age'] = df['not_smoker'] * df['age']

In [7]:
df.head(4)

Unnamed: 0,charges,smoker,age,children,bmi,region,bmi_category,child_stat,not_smoker,age_children,smoker_child,non_smoker_child,smoker_age,not_smoker_age
0,16884.92,1,19,0,27.9,southwest,over_weight,0,0,0,0,0,19,0
1,1725.55,0,18,1,33.77,southeast,obese,1,1,18,0,1,0,18
2,4449.46,0,28,3,33.0,southeast,obese,1,1,84,0,3,0,28
3,21984.47,0,33,0,22.7,northwest,normal_weight,0,1,0,0,0,0,33


In [10]:
X = df.drop("charges", axis=1)
y = df[["charges"]]

In [11]:
X = pd.get_dummies(X, columns=['region', 'bmi_category']).astype(int)

In [58]:
X.head(4)

Unnamed: 0,smoker,age,children,bmi,child_stat,not_smoker,age_children,smoker_child,non_smoker_child,smoker_age,not_smoker_age,region_northeast,region_northwest,region_southeast,region_southwest,bmi_category_normal_weight,bmi_category_obese,bmi_category_over_weight,bmi_category_under_weight
0,1,19,0,27,0,0,0,0,0,19,0,0,0,0,1,0,0,1,0
1,0,18,1,33,1,1,18,0,1,0,18,0,0,1,0,0,1,0,0
2,0,28,3,33,1,1,84,0,3,0,28,0,0,1,0,0,1,0,0
3,0,33,0,22,0,1,0,0,0,0,33,0,1,0,0,1,0,0,0


In [14]:
y.shape

(1337, 1)

In [15]:
X.shape

(1337, 19)

### Feature scaling and selection

In [16]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

minmax_scale = MinMaxScaler()

X_transformed = minmax_scale.fit_transform(X)

stand_scaler = StandardScaler()

X_scaled = stand_scaler.fit_transform(X_transformed)

In [17]:
from sklearn.feature_selection import f_regression

f_statistic, p_values = f_regression(X_scaled, y)

  y = column_or_1d(y, warn=True)


In [18]:
ver_features = ['smoker', 'age', 'children', 'bmi',	'child_stat', 'not_smoker', 
                'age_children',	'smoker_child',	'non_smoker_child', 'smoker_age',
                'not_smoker_age', 'region_northeast',	'region_northwest',	'region_southeast',
                'region_southwest',	'bmi_category_normal_weight', 'bmi_category_obese', 'bmi_category_over_weight', 'bmi_category_under_weight']

selected_features = pd.DataFrame({"features": ver_features,
                            "f_statistic": f_statistic,
                             "p-values": p_values})

selected_features.sort_values(by='f_statistic', ascending= False, inplace=True)

In [19]:
selected_features.reset_index(inplace=True, drop=True)

In [20]:
selected_features

Unnamed: 0,features,f_statistic,p-values
0,smoker_age,2205.359058,5.144833e-285
1,not_smoker,2175.736862,1.406722e-282
2,smoker,2175.736862,1.406722e-282
3,smoker_child,507.54062,1.625889e-95
4,not_smoker_age,332.768801,1.482141e-66
5,age,130.402967,6.975762e-29
6,non_smoker_child,81.225502,6.800093999999999e-19
7,bmi_category_obese,55.82607,1.425685e-13
8,bmi,53.467709,4.515294e-13
9,age_children,23.468005,1.418501e-06


In [21]:
X_model = X.copy()

In [85]:
# Define the columns to drop based on p-value threshold

# columns_to_drop = ['bmi_category_under_weight', 'region_northwest', 'region_southwest',
#                    'region_northeast', 'dependency_ratio']

# Drop the columns
# X_model = X_model.drop(columns=columns_to_drop, axis=1)

X_model_select = X_model[['smoker', 'age', 'smoker_child',
                           'bmi_category_obese', 'bmi_category_over_weight']]

# smoker_age
# not_smoker_age
# child_stat
# non_smoker_child

#['smoker', 'age', 'children', 'bmi',	'child_stat', 'not_smoker', 
#               'age_children',	'smoker_child',	'non_smoker_child', 'smoker_age',
#                'not_smoker_age', 'region_northeast',	'region_northwest',	'region_southeast',
 #               'region_southwest',	'bmi_category_normal_weight', 'bmi_category_obese', 'bmi_category_over_weight', 'bmi_category_under_weight']


In [87]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import LinearSVR

# Split data
x_train, x_test, y_train, y_test = train_test_split(
    X_model_select, y, test_size=0.15, random_state=42, stratify=X['smoker']
)

# Define categorical and numerical features
numerical_features = ['smoker', 'age',  
                    'smoker_child',	
                    'bmi_category_obese', 'bmi_category_over_weight']

# Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features)
    ]
)

# Pipeline
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=2)),
    ('lsvr', LinearSVR())
])

# GridSearchCV

params = {'lsvr__loss': ['epsilon_insensitive', 'squared_epsilon_insensitive'],
              'lsvr__epsilon': np.arange(0.0, 15)
             }

grid = GridSearchCV(model_pipeline, param_grid=params, cv=5, error_score='raise')
grid.fit(x_train, y_train)

# Best parameters and score
print("Best Params:", grid.best_params_)
print("Best Score:", grid.best_score_)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

Best Params: {'lsvr__epsilon': np.float64(1.0), 'lsvr__loss': 'squared_epsilon_insensitive'}
Best Score: 0.8449481978350433


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [88]:
grid.score(x_test, y_test)

0.9223038525117997

### KNN

In [103]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor

# Split data
x_train, x_test, y_train, y_test = train_test_split(
    X_model_select, y, test_size=0.15, random_state=37, stratify=X['smoker']
)

# Define categorical and numerical features
numerical_features = ['smoker', 'age',  
                    'smoker_child',	
                    'bmi_category_obese', 'bmi_category_over_weight']

# Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features)
    ]
)

# Pipeline
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=2)),
    ('knn', KNeighborsRegressor())
])

# GridSearchCV

params = {'knn__n_neighbors' : np.arange(1, 20),
              'knn__weights': ['uniform', 'distance'],
              'knn__metric': ['sokalmichener', 'cityblock', 'nan_euclidean', 'chebyshev',
                         'euclidean', 'manhattan', 'cosine'],
              'knn__p': np.arange(2, 8)
              }

grid_knn = GridSearchCV(model_pipeline, param_grid=params, cv=5, error_score='raise')
grid_knn.fit(x_train, y_train)

# Best parameters and score
print("Best Params:", grid_knn.best_params_)
print("Best Score:", grid_knn.best_score_)



Best Params: {'knn__metric': 'cosine', 'knn__n_neighbors': np.int64(16), 'knn__p': np.int64(2), 'knn__weights': 'uniform'}
Best Score: 0.8353859738635065


In [102]:
grid_knn.score(x_test, y_test)

0.8980802029712635

In [None]:
# from sklearn.linear_model import LinearRegression

# linear_model = LinearRegression()
# linear_model.fit(x_train,y_train)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import LinearSVR

# Split data
x_train, x_test, y_train, y_test = train_test_split(
    X_model_select, y, test_size=0.15, random_state=42, stratify=X['smoker']
)

# Define categorical and numerical features
numerical_features = ['smoker', 'age',  
                    'smoker_child',	
                    'bmi_category_obese', 'bmi_category_over_weight']

# Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features)
    ]
)

# Pipeline
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=2)),
    ('lasso', Lasso())
])

# GridSearchCV

params = {"lasso__alpha": [0.1, 1, 10],
     "lasso__selection": ["cyclic", "random"]}

grid = GridSearchCV(model_pipeline, param_grid=params, cv=5, error_score='raise')
grid.fit(x_train, y_train)

# Best parameters and score
print("Best Params:", grid.best_params_)
print("Best Score:", grid.best_score_)

### Dropped based on p-values (strict)


In [17]:
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import StandardScaler, PolynomialFeatures
# from sklearn.linear_model import Lasso
# from sklearn.model_selection import train_test_split, GridSearchCV

# # Split data
# x_train, x_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.2, random_state=42, stratify=X['smoker']
# )

# # Define categorical and numerical features
# numerical_features = ['smoker_age', 'smoker', 'not_smoker', 'smoker_child',	'not_smoker_age', 'age', 'non_smoker_child',
#                       'bmi_category_obese', 'bmi', 'age_children','bmi_category_over_weight', 'bmi_category_normal_weight', 
#                       'children', 'region_southeast', 'child_stat']

# # Preprocessor
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', StandardScaler(), numerical_features)
#     ]
# )

# # Pipeline
# model_pipeline = Pipeline([
#     ('preprocessor', preprocessor),
#     ('poly', PolynomialFeatures(degree=2)),
#     ('lasso', Lasso())
# ])

# # GridSearchCV
# params = {
#     "lasso__alpha": [0.1, 1, 10],
#     "lasso__selection": ["cyclic", "random"]
# }

# grid = GridSearchCV(model_pipeline, param_grid=params, cv=5, error_score='raise')
# grid.fit(x_train, y_train)

# # Best parameters and score
# print("Best Params:", grid.best_params_)
# print("Best Score:", grid.best_score_)