In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file under it.
input_root = '/kaggle/input'
for dirname, _, filenames in os.walk(input_root):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd

# Location of the medical-insurance CSV inside the Kaggle input mount.
path = (
    "/kaggle/input/datasets/mohankrishnathalla/medical-insurance-cost-prediction/medical_insurance.csv"
)

# Read the raw table; later cells derive their working frames from `df`.
df = pd.read_csv(filepath_or_buffer=path)

# Preview the first rows as the cell output.
df.head()


In [None]:
# Display the column labels of the raw frame to see which fields are available.
df.columns


In [None]:
# Response variable for the cost model.
target = "annual_medical_cost"

# Columns treated as leakage and removed before modelling: premiums, claim
# aggregates, procedure counts and the two risk/procedure flags.
leakage_vars = [
    "annual_premium", "monthly_premium",
    "claims_count", "avg_claim_amount", "total_claims_paid",
    "proc_imaging_count", "proc_surgery_count", "proc_physio_count",
    "proc_consult_count", "proc_lab_count",
    "had_major_procedure", "is_high_risk",
]
df_clean = df.drop(leakage_vars, axis=1)

# Split the cleaned frame into response and predictors.
y = df_clean[target]
X = df_clean.drop(columns=target)

# Report the size of the predictor matrix.
n_observations, n_features = X.shape
print("Final feature count:", n_features)
print("Observations:", n_observations)


In [None]:
# Display the column labels that remain in df_clean after the leakage columns were dropped.
df_clean.columns


In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

# Fit a Gamma GLM with a log link for annual medical cost.
# Assumes `df_clean` is the leakage-free dataframe built earlier in the notebook.

# 1. Separate predictors and target. `person_id` is excluded from the predictors.
#    `predicted_cost` is written back to `df_clean` at the end of this cell, so
#    it is dropped here (when present) to keep a re-run of the cell from feeding
#    the previous fit's predictions back in as a feature.
target = "annual_medical_cost"
X = df_clean.drop(columns=[target, "person_id"])
X = X.drop(columns=["predicted_cost"], errors="ignore")
y = df_clean[target]

# 2. One-hot encode categorical columns. drop_first=True removes one level per
#    categorical column; that level is the all-zeros case of the remaining dummies.
X = pd.get_dummies(X, drop_first=True)

# 3. Convert boolean columns to 0/1 integers.
bool_cols = X.select_dtypes(include='bool').columns
X[bool_cols] = X[bool_cols].astype(int)

# 4. Force all remaining columns to numeric; values that cannot be parsed become NaN.
X = X.apply(pd.to_numeric, errors='coerce')

# 5. Replace any remaining NaNs with 0.
#    NOTE(review): 0 may not be a neutral value for every feature — confirm
#    whether a mean/median fill is more appropriate for this dataset.
X = X.fillna(0)

# 6. Cast everything to float64 before handing the matrix to statsmodels.
X = X.astype(np.float64)

# 7. Add a constant column for the intercept.
X_const = sm.add_constant(X)

# 8. Fit the Gamma GLM with a log link. `links.Log()` is the current class name;
#    the lowercase `links.log()` alias is deprecated in recent statsmodels releases.
#    NOTE(review): a Gamma family presumes a strictly positive target — confirm
#    that `annual_medical_cost` contains no zeros or negatives.
glm_gamma = sm.GLM(
    y.astype(np.float64),          # ensure target is float
    X_const,
    family=sm.families.Gamma(link=sm.families.links.Log()),
)

result = glm_gamma.fit()
print(result.summary())

# 9. Predict on the training design matrix and show predictions next to actuals.
df_clean['predicted_cost'] = result.predict(X_const)
df_clean[['person_id', 'annual_medical_cost', 'predicted_cost']].head()


In [None]:
# Turn the fitted GLM coefficients into multiplicative relativities.
# The model uses a log link, so exp(coefficient) is the factor each term
# applies to the expected cost.
relativities = pd.DataFrame(
    {'feature': result.params.index, 'coef': result.params.values}
).assign(relativity=lambda tbl: np.exp(tbl['coef']))

# Largest relativities first.
relativities = relativities.sort_values(by='relativity', ascending=False)
relativities.head(20)


In [None]:
# Narrow the relativity table to a handful of key risk factors.
# A feature is kept when its name contains any of these substrings.
# NOTE(review): plain substring matching means 'age' also matches names that
# merely contain those letters — confirm no unrelated column is picked up.
key_tokens = ('age', 'smoker', 'plan_type', 'chronic_count')
key_features = [
    name for name in relativities['feature']
    if any(token in name for token in key_tokens)
]
relativities[relativities['feature'].isin(key_features)]


In [None]:
# Build a simple rating table
# Baseline policy: the column-wise median of the design matrix (intercept included).
baseline = X_const.median()


def predict_cost(modified_features):
    """Return the fitted model's expected cost for the baseline policy with overrides.

    Parameters
    ----------
    modified_features : dict
        Maps design-matrix column names to the value that should replace the
        baseline median for that column. Names that are not columns of the
        design matrix cannot influence the prediction; they are reported with
        a warning and otherwise ignored.

    Returns
    -------
    scalar
        The single predicted value returned by ``result.predict`` for the
        modified baseline row.
    """
    import warnings

    # Surface typos / missing dummy columns instead of silently returning the
    # unmodified baseline prediction.
    unknown = [col for col in modified_features if col not in baseline.index]
    if unknown:
        warnings.warn(
            f"predict_cost: ignoring features not in the design matrix: {unknown}",
            stacklevel=2,
        )

    # Work on a copy so the shared baseline is never mutated between calls.
    row = baseline.copy()
    for col, val in modified_features.items():
        if col in row.index:
            row[col] = val

    # The model expects a 2-D array: one row, one column per design-matrix term.
    return result.predict(row.to_numpy().reshape(1, -1))[0]


# Example: two 30-year-olds that differ only in the smoker dummies.
pred1 = predict_cost({'age':30, 'smoker_Former':0, 'smoker_Never':1})
# Both smoker dummies at 0 selects the level removed by drop_first —
# presumably the current-smoker level; confirm against the raw categories.
pred2 = predict_cost({'age':30, 'smoker_Former':0, 'smoker_Never':0})
print("Predicted cost non-smoker:", pred1)
print("Predicted cost smoker:", pred2)


In [None]:
# Build a pricing-style table: one row per model term with its relativity.
# The intercept row ('const') is left out; every other term is kept.
is_model_term = relativities['feature'] != 'const'
rating_table = relativities.loc[is_model_term, ['feature', 'relativity']]

# Show the 20 largest relativities.
rating_table.sort_values(by='relativity', ascending=False).head(20)
