In [1]:
import statsmodels.api as sm
import pandas as pd

In [2]:
df_OHE = pd.read_csv('../DataSet/RegressionData/healthinsurance_OHE.csv')
df_LE = pd.read_csv('../DataSet/RegressionData/healthinsurance_LE.csv')

# Train-Test Split 80-20 

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score

# Define features and target
X = df_OHE.drop(columns=['claim'])  # Features
y = df_OHE['claim']  # Target variable

# Split data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### --- 1. Standard Linear Regression (No Regularization) --- ###
model = LinearRegression()
model.fit(X_train, y_train)

# Get Top 10 Coefficients
coefficients = pd.Series(model.coef_, index=X.columns)
print("🔹 Top 10 Features - Linear Regression:")
print(coefficients.nlargest(10))

# Make predictions
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Evaluate performance
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

print(f"Linear Regression R² on TRAIN set: {r2_train:.4f}")
print(f"Linear Regression R² on TEST set: {r2_test:.4f}")
print("--------------------------------------------------")

### --- 2. Ridge Regression (L2 Regularization) --- ###
ridge = Ridge(alpha=1.0)  # Adjust alpha for more regularization (higher alpha = more shrinkage)
ridge.fit(X_train, y_train)

# Make predictions
y_pred_ridge = ridge.predict(X_test)

# Evaluate Ridge Regression performance
r2_ridge_test = r2_score(y_test, y_pred_ridge)
print(f"🔹 Ridge Regression R² on TEST set: {r2_ridge_test:.4f}")
print("--------------------------------------------------")

### --- 3. Lasso Regression (L1 Regularization - Feature Selection) --- ###
lasso = Lasso(alpha=0.1)  # Higher alpha increases sparsity (more zero coefficients)
lasso.fit(X_train, y_train)

# Make predictions
y_pred_lasso = lasso.predict(X_test)

# Evaluate Lasso Regression performance
r2_lasso_test = r2_score(y_test, y_pred_lasso)
print(f"🔹 Lasso Regression R² on TEST set: {r2_lasso_test:.4f}")
print("--------------------------------------------------")

# Print non-zero coefficients (Lasso does automatic feature selection)
lasso_coefficients = pd.Series(lasso.coef_, index=X.columns)
print("🔹 Features Retained in Lasso (Non-Zero Coefficients):")
print(lasso_coefficients[lasso_coefficients != 0])


🔹 Top 10 Features - Linear Regression:
city_Chicago         6.764224e+12
city_Warwick         6.764224e+12
city_Eureka          6.764224e+12
city_Waterloo        6.764224e+12
city_Huntsville      6.764224e+12
city_Lovelock        6.764224e+12
city_Orlando         6.764224e+12
city_Oceanside       6.764224e+12
city_Indianapolis    6.764224e+12
city_IowaCity        6.764224e+12
dtype: float64
Linear Regression R² on TRAIN set: 0.7659
Linear Regression R² on TEST set: 0.7855
--------------------------------------------------
🔹 Ridge Regression R² on TEST set: 0.7855
--------------------------------------------------
🔹 Lasso Regression R² on TEST set: 0.7855
--------------------------------------------------
🔹 Features Retained in Lasso (Non-Zero Coefficients):
age                      271.134282
sex                     -186.040038
weight                   -33.506527
bmi                      191.683014
no_of_dependents         467.045583
                           ...     
job_title_Police

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score

# Define features and target
X = df_LE.drop(columns=['claim'])  # Features
y = df_LE['claim']  # Target variable

# Split data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### --- 1. Standard Linear Regression (No Regularization) --- ###
model = LinearRegression()
model.fit(X_train, y_train)

# Get Top 5 Coefficients
coefficients = pd.Series(model.coef_, index=X.columns)
print("🔹 Top 5 Features - Linear Regression:")
print(coefficients.nlargest(5))

# Make predictions
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Evaluate model performance (R-squared)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

print(f"Linear Regression R² on TRAIN set: {r2_train:.4f}")
print(f"Linear Regression R² on TEST set: {r2_test:.4f}")
print("--------------------------------------------------")

### --- 2. Ridge Regression (L2 Regularization) --- ###
ridge = Ridge(alpha=1.0)  # Adjust alpha


🔹 Top 5 Features - Linear Regression:
smoker              21164.597940
diabetes             1335.868471
no_of_dependents      466.167610
age                   276.833118
bmi                   231.247057
dtype: float64
Linear Regression R² on TRAIN set: 0.7408
Linear Regression R² on TEST set: 0.7667
--------------------------------------------------


# Now compare with bootstrap version

In [5]:
df_OHE = pd.read_csv('../DataSet/RegressionData/healthinsurance_OHE_bootstrapp.csv')
df_LE = pd.read_csv('../DataSet/RegressionData/healthinsurance_LE_bootstrapp.csv')

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score

# Define features and target
X = df_OHE.drop(columns=['claim'])  # Features
y = df_OHE['claim']  # Target variable

# Split data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### --- 1. Standard Linear Regression (No Regularization) --- ###
model = LinearRegression()
model.fit(X_train, y_train)

# Get Top 10 Coefficients
coefficients = pd.Series(model.coef_, index=X.columns)
print("🔹 Top 10 Features - Linear Regression:")
print(coefficients.nlargest(10))

# Make predictions
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Evaluate performance
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

print(f"Linear Regression R² on TRAIN set: {r2_train:.4f}")
print(f"Linear Regression R² on TEST set: {r2_test:.4f}")
print("--------------------------------------------------")

### --- 2. Ridge Regression (L2 Regularization) --- ###
ridge = Ridge(alpha=1.0)  # Adjust alpha for more regularization (higher alpha = more shrinkage)
ridge.fit(X_train, y_train)

# Make predictions
y_pred_ridge = ridge.predict(X_test)

# Evaluate Ridge Regression performance
r2_ridge_test = r2_score(y_test, y_pred_ridge)
print(f"🔹 Ridge Regression R² on TEST set: {r2_ridge_test:.4f}")
print("--------------------------------------------------")

### --- 3. Lasso Regression (L1 Regularization - Feature Selection) --- ###
lasso = Lasso(alpha=0.1)  # Higher alpha increases sparsity (more zero coefficients)
lasso.fit(X_train, y_train)

# Make predictions
y_pred_lasso = lasso.predict(X_test)

# Evaluate Lasso Regression performance
r2_lasso_test = r2_score(y_test, y_pred_lasso)
print(f"🔹 Lasso Regression R² on TEST set: {r2_lasso_test:.4f}")
print("--------------------------------------------------")

# Print non-zero coefficients (Lasso does automatic feature selection)
lasso_coefficients = pd.Series(lasso.coef_, index=X.columns)
print("🔹 Features Retained in Lasso (Non-Zero Coefficients):")
print(lasso_coefficients[lasso_coefficients != 0])



🔹 Top 10 Features - Linear Regression:
smoker                              13545.683926
hereditary_diseases_HeartDisease     8833.747200
city_Warwick                         3677.057340
city_Waterloo                        3646.073870
hereditary_diseases_Cancer           3545.239993
job_title_CA                         3490.211589
job_title_Manager                    3430.215487
city_Indianapolis                    3179.646834
job_title_Engineer                   2677.150741
hereditary_diseases_Alzheimer        2621.446132
dtype: float64
Linear Regression R² on TRAIN set: 0.6407
Linear Regression R² on TEST set: 0.6590
--------------------------------------------------
🔹 Ridge Regression R² on TEST set: 0.6588
--------------------------------------------------
🔹 Lasso Regression R² on TEST set: 0.6590
--------------------------------------------------
🔹 Features Retained in Lasso (Non-Zero Coefficients):
age                      264.935679
sex                     -203.588344
weight    

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score

# Define features and target
X = df_LE.drop(columns=['claim'])  # Features
y = df_LE['claim']  # Target variable

# Split data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### --- 1. Standard Linear Regression (No Regularization) --- ###
model = LinearRegression()
model.fit(X_train, y_train)

# Get Top 5 Coefficients
coefficients = pd.Series(model.coef_, index=X.columns)
print("🔹 Top 5 Features - Linear Regression:")
print(coefficients.nlargest(5))

# Make predictions
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Evaluate model performance (R-squared)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

print(f"Linear Regression R² on TRAIN set: {r2_train:.4f}")
print(f"Linear Regression R² on TEST set: {r2_test:.4f}")
print("--------------------------------------------------")

### --- 2. Ridge Regression (L2 Regularization) --- ###
ridge = Ridge(alpha=1.0)  # Adjust alpha


🔹 Top 5 Features - Linear Regression:
smoker              14436.983087
diabetes             1728.311669
no_of_dependents      512.950801
age                   266.550839
bmi                    39.992413
dtype: float64
Linear Regression R² on TRAIN set: 0.5954
Linear Regression R² on TEST set: 0.5860
--------------------------------------------------


Take-aways:
- Non boostrapp version has a lower R^2 surprisingly
- OHE works better for LS