## Imports

In [None]:
# System and file handling
import os

# Data handling and visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Model selection and evaluation
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor

# Advanced model
from xgboost import XGBRegressor

# Saving models
import joblib

# Date
from datetime import datetime


## Clean Dataset

In [None]:
# Load the raw dataset.
df = pd.read_csv("Insurance Premium Prediction Dataset.csv")

# Every fill below assigns the result back to the column. Calling
# fillna(inplace=True) on df[col] is chained assignment: it is deprecated in
# pandas 2.x and leaves df untouched once Copy-on-Write is active.

# Numeric columns: replace missing values with the column median.
# NOTE(review): "Premium Amount" is used as the model target later in this
# file; imputing a target with its median is questionable - consider dropping
# those rows instead.
median_fill_cols = [
    "Age",
    "Annual Income",
    "Number of Dependents",
    "Credit Score",
    "Health Score",
    "Premium Amount",
]
for col in median_fill_cols:
    df[col] = df[col].fillna(df[col].median())

# Categorical columns: a dedicated "Unknown" level for Occupation, the most
# frequent value (mode) for the others.
df["Occupation"] = df["Occupation"].fillna("Unknown")
for col in ["Customer Feedback", "Marital Status"]:
    df[col] = df[col].fillna(df[col].mode()[0])

# Missing Previous Claims is assumed to mean "no claims".
df["Previous Claims"] = df["Previous Claims"].fillna(0.0)

# Persist the cleaned data; the next cell reads this file.
df.to_csv("cleaned_insurance_data.csv", index=False)

# Report the outcome, including any columns that still contain gaps.
print("✅ Dataset cleaned successfully.")
print("Shape:", df.shape)
print("Remaining missing values:")
print(df.isnull().sum())

# Fix data types

In [None]:
# Re-read the cleaned CSV written by the cleaning cell.
df = pd.read_csv("cleaned_insurance_data.csv")

# The two count-like columns become integers in a single astype call.
df = df.astype({"Number of Dependents": int, "Previous Claims": int})

# Parse the policy start date; values that cannot be parsed become NaT
# because of errors="coerce".
policy_start = pd.to_datetime(df["Policy Start Date"], errors="coerce")
df["Policy Start Date"] = policy_start

# Confirm the resulting dtypes.
print("✅ Data types corrected successfully.")
print("\nUpdated Data Types:\n", df.dtypes)

# Write the typed dataset; the skew-handling cell reads this file.
df.to_csv("Typed_Insurance_Dataset.csv", index=False)


# Address skewed distributions for numerical features

In [None]:
# Load the typed dataset.
df = pd.read_csv("Typed_Insurance_Dataset.csv")

# Numerical columns to check for skew.
numeric_cols = [
    "Annual Income", "Credit Score", "Premium Amount",
    "Health Score", "Age", "Number of Dependents", "Previous Claims"
]

# Sample skewness of each column.
skew_vals = df[numeric_cols].skew()

print("🔍 Skewness of numerical features:")
print(skew_vals)

# A column counts as highly skewed when |skew| exceeds this threshold.
threshold = 1
high_skew_cols = skew_vals[skew_vals.abs() > threshold].index.tolist()
print("\n📌 Highly skewed features (|skew| > 1):")
print(high_skew_cols)

# Add a log1p-transformed copy ("<col>_log") of each highly skewed column.
# Columns that were actually transformed are tracked separately, because a
# skipped column has no "_log" counterpart to plot below.
transformed_cols = []
for col in high_skew_cols:
    # log1p yields NaN/-inf for values <= -1, so columns containing negative
    # values are left untransformed. Missing values are not checked here;
    # they simply stay NaN in the transformed column.
    if (df[col] < 0).any():
        print(f"⚠️ Skipping {col} due to negative values.")
        continue
    df[col + "_log"] = np.log1p(df[col])
    transformed_cols.append(col)
    print(f"✅ Transformed {col} → {col}_log")

# Preview the effect on the first column that was really transformed.
if transformed_cols:
    first = transformed_cols[0]
    sns.histplot(df[first], kde=True)
    plt.title(f"{first} - Before Log Transform")
    plt.show()

    sns.histplot(df[first + "_log"], kde=True)
    plt.title(f"{first} - After Log Transform")
    plt.show()

# Save the result; the message reports the file that is actually written.
output_file = "transformed.csv"
df.to_csv(output_file, index=False)
print(f"\n✅ Skewed features transformed and saved to '{output_file}'")


# On to EDA

# First step involved performing Univariate Analysis

In [None]:
# Univariate EDA: one histogram per numerical feature and one count plot per
# categorical feature, each saved as a PNG under plots/univariate.
df = pd.read_csv("transformed.csv")

sns.set(style="whitegrid")

# Destination folder for every figure produced by this cell.
output_dir = "plots/univariate"
os.makedirs(output_dir, exist_ok=True)

# ----------- Numerical Features -----------
numerical_features = [
    "Age",
    "Annual Income",
    "Number of Dependents",
    "Health Score",
    "Previous Claims",
    "Vehicle Age",
    "Credit Score",
    "Insurance Duration",
    "Premium Amount",
]

print("🔢 Summary Statistics:")
print(df[numerical_features].describe())

# Histogram with a KDE overlay for each numerical feature.
for feature in numerical_features:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.histplot(df[feature], kde=True, color="skyblue", ax=ax)
    ax.set_title(f"Distribution of {feature}")
    ax.set_xlabel(feature)
    ax.set_ylabel("Frequency")
    fig.tight_layout()
    # File names use the lower-cased feature name with spaces as underscores.
    slug = "_".join(feature.lower().split(" "))
    fig.savefig(f"{output_dir}/hist_{slug}.png")
    plt.close(fig)

# ----------- Categorical Features -----------
categorical_features = [
    "Gender",
    "Marital Status",
    "Education Level",
    "Occupation",
    "Location",
    "Policy Type",
    "Customer Feedback",
    "Smoking Status",
    "Exercise Frequency",
    "Property Type",
]

# Count plot for each categorical feature, bars ordered by frequency.
for feature in categorical_features:
    by_frequency = df[feature].value_counts().index
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.countplot(data=df, x=feature, palette="Set2", order=by_frequency, ax=ax)
    ax.set_title(f"Count Plot of {feature}")
    plt.xticks(rotation=45)
    fig.tight_layout()
    slug = "_".join(feature.lower().split(" "))
    fig.savefig(f"{output_dir}/count_{slug}.png")
    plt.close(fig)

print(f"✅ Univariate plots saved in: {output_dir}/")


# Second step involved performing Bivariate Analysis

In [None]:
# Bivariate EDA: numerical/numerical, categorical/numerical and
# categorical/categorical relationships, saved as PNGs under plots/.
df = pd.read_csv("transformed.csv")

# Make sure the output folder exists so this cell does not depend on an
# earlier cell having created it.
os.makedirs("plots", exist_ok=True)

# ========== 1. NUMERICAL vs NUMERICAL ==========
# Correlation heatmap of a few numerical columns and the premium.
plt.figure(figsize=(10, 8))
corr = df[["Annual Income", "Health Score", "Credit Score", "Premium Amount"]].corr()
sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Correlation Heatmap")
plt.tight_layout()
plt.savefig("plots/bivariate_correlation_heatmap.png")
plt.close()

# Scatterplot: Income vs Premium (drawn on a new default-sized figure).
sns.scatterplot(data=df, x="Annual Income", y="Premium Amount", alpha=0.4)
plt.title("Annual Income vs Premium Amount")
plt.tight_layout()
plt.savefig("plots/scatter_income_premium.png")
plt.close()

# ========== 2. CATEGORICAL vs NUMERICAL ==========
# One box plot of the numerical column per level of the categorical column.
cat_num_pairs = [
    ("Gender", "Premium Amount"),
    ("Education Level", "Premium Amount"),
    ("Occupation", "Premium Amount"),
    ("Smoking Status", "Premium Amount"),
]

for cat, num in cat_num_pairs:
    plt.figure(figsize=(10, 6))
    sns.boxplot(data=df, x=cat, y=num, palette="Set3")
    plt.xticks(rotation=45)
    plt.title(f"{cat} vs {num}")
    plt.tight_layout()
    plt.savefig(f"plots/box_{cat.lower().replace(' ', '_')}_vs_{num.lower().replace(' ', '_')}.png")
    plt.close()

# ========== 3. CATEGORICAL vs CATEGORICAL ==========
# Heatmap of the cross-tabulated counts for each pair.
cat_cat_pairs = [
    ("Gender", "Smoking Status"),
    ("Marital Status", "Exercise Frequency"),
    ("Occupation", "Property Type"),
]

for cat1, cat2 in cat_cat_pairs:
    cross_tab = pd.crosstab(df[cat1], df[cat2])
    plt.figure(figsize=(10, 6))
    # fmt="d" because the cross-tab cells are integer counts.
    sns.heatmap(cross_tab, annot=True, fmt="d", cmap="YlGnBu")
    plt.title(f"{cat1} vs {cat2}")
    plt.tight_layout()
    plt.savefig(f"plots/heatmap_{cat1.lower().replace(' ', '_')}_vs_{cat2.lower().replace(' ', '_')}.png")
    plt.close()

print("✅ Bivariate analysis completed and plots saved to 'plots/' folder.")


# Third step involved performing Multivariate Analysis

In [None]:
# Multivariate EDA: correlation heatmap, pair plot, box plots of the premium by
# category, grouped count plots and categorical co-occurrence heatmaps, all
# saved as PNGs under plots/multivariate.
df = pd.read_csv("transformed.csv")

# Set seaborn style
sns.set(style="white")

# Create output directory
output_dir = "plots/multivariate"
os.makedirs(output_dir, exist_ok=True)

# Numerical columns for correlation
numerical_features = ["Age", "Annual Income", "Number of Dependents", "Health Score",
                      "Previous Claims", "Vehicle Age", "Credit Score", "Insurance Duration",
                      "Premium Amount"]

# ----------- 1. Correlation Heatmap -----------
plt.figure(figsize=(10, 8))
corr_matrix = df[numerical_features].corr()
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5)
plt.title("Correlation Heatmap (Numerical Features)")
plt.tight_layout()
plt.savefig(f"{output_dir}/correlation_heatmap.png")
plt.close()

# ----------- 2. Pairplot (Sampled for speed) -----------
# Cap the sample at the number of available rows: DataFrame.sample raises
# ValueError when asked for more rows than exist (sampling without replacement).
pairplot_rows = min(2000, len(df))
sample_df = df[numerical_features].sample(pairplot_rows, random_state=42)
sns.pairplot(sample_df)
plt.savefig(f"{output_dir}/pairplot_numeric.png")
plt.close()

# ----------- 3. Box plots vs. Premium Amount for selected categorical -----------
categorical_features = ["Gender", "Marital Status", "Education Level", "Occupation",
                        "Policy Type", "Smoking Status", "Exercise Frequency", "Property Type"]

for cat in categorical_features:
    plt.figure(figsize=(10, 5))
    sns.boxplot(data=df, x=cat, y="Premium Amount", palette="pastel")
    plt.xticks(rotation=45)
    plt.title(f"Premium Amount by {cat}")
    plt.tight_layout()
    safe_name = cat.lower().replace(" ", "_")
    plt.savefig(f"{output_dir}/premium_by_{safe_name}.png")
    plt.close()

# ----------- 4. Grouped Bar Plots (Categorical vs Categorical with Hue) -----------
# Each entry is (x_col, y_col, hue_col).
# NOTE(review): y_col only appears in the title and the file name; the count
# plot itself shows x_col split by hue_col. Confirm whether y_col was meant to
# be plotted (e.g. as a facet) or should be dropped from the title.
grouped_combinations = [
    ("Smoking Status", "Exercise Frequency", "Policy Type"),
    ("Education Level", "Marital Status", "Gender"),
    ("Location", "Property Type", "Policy Type"),
]

for x_col, y_col, hue_col in grouped_combinations:
    plt.figure(figsize=(10, 6))
    sns.countplot(data=df, x=x_col, hue=hue_col, order=df[x_col].value_counts().index, palette="Set2")
    plt.title(f"{x_col} vs {y_col} grouped by {hue_col}")
    plt.xticks(rotation=45)
    plt.tight_layout()
    safe_name = f"{x_col}_{y_col}_by_{hue_col}".lower().replace(" ", "_")
    plt.savefig(f"{output_dir}/groupedbar_{safe_name}.png")
    plt.close()

# ----------- 5. Heatmaps for Categorical Co-occurrence -----------
categorical_pairs = [
    ("Gender", "Policy Type"),
    ("Education Level", "Occupation"),
    ("Location", "Property Type"),
    ("Customer Feedback", "Smoking Status")
]

for row_col, col_col in categorical_pairs:
    # Counts of every (row level, column level) combination.
    cross_tab = pd.crosstab(df[row_col], df[col_col])

    plt.figure(figsize=(10, 6))
    sns.heatmap(cross_tab, annot=True, fmt="d", cmap="YlGnBu", linewidths=.5)
    plt.title(f"Co-occurrence Heatmap: {row_col} vs {col_col}")
    plt.xlabel(col_col)
    plt.ylabel(row_col)
    plt.tight_layout()

    safe_name = f"heatmap_{row_col}_{col_col}".lower().replace(" ", "_")
    plt.savefig(f"{output_dir}/{safe_name}.png")
    plt.close()

print(f"✅ Multivariate plots saved to: {output_dir}/")


## Identified correlations and trends that impact Premium Amount.

# Premium Amount vs. each individual feature

In [None]:
# Bivariate EDA against the target: a correlation heatmap, box plots of the
# target per low-cardinality categorical column, and scatter plots of the
# target against every numerical column.
df = pd.read_csv("transformed.csv")

# Folder that receives every figure from this cell.
output_dir = "output_graphs/bivariate"
os.makedirs(output_dir, exist_ok=True)

sns.set(style="whitegrid")
target_col = "Premium Amount"

# Split the columns by dtype; the target is kept out of the numerical list.
numerical_cols = [
    name
    for name in df.select_dtypes(include=["int64", "float64"]).columns
    if name != target_col
]
categorical_cols = list(df.select_dtypes(include=["object", "category"]).columns)

# --- 1. Correlation heatmap (numerical columns plus the target) ---
plt.figure(figsize=(10, 8))
corr = df[[*numerical_cols, target_col]].corr()
sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f", square=True)
plt.title("Correlation Heatmap")
plt.tight_layout()
plt.savefig(f"{output_dir}/correlation_heatmap.png")
plt.close()

# --- 2. Boxplots: categorical column vs target ---
for cat_col in categorical_cols:
    # Columns with more than 10 distinct values are skipped to keep the
    # plots readable.
    if df[cat_col].nunique() > 10:
        continue
    plt.figure(figsize=(8, 6))
    sns.boxplot(data=df, x=cat_col, y=target_col)
    plt.title(f"{cat_col} vs {target_col}")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(f"{output_dir}/boxplot_{cat_col}.png")
    plt.close()

# --- 3. Scatterplots: numerical column vs target ---
for num_col in numerical_cols:
    plt.figure(figsize=(8, 6))
    sns.scatterplot(data=df, x=num_col, y=target_col, alpha=0.5)
    plt.title(f"{num_col} vs {target_col}")
    plt.tight_layout()
    plt.savefig(f"{output_dir}/scatterplot_{num_col}.png")
    plt.close()

print("✅ Bivariate EDA plots saved to:", output_dir)


# Premium Amount vs. multiple features (regression plots and box plots)

In [None]:
# Configuration for this cell: target column, input file, output folder.
TARGET = 'Premium Amount'
INPUT_FILE = 'transformed.csv'
OUTPUT_DIR = 'output_graphs/multivariate'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load the data and report its size.
df = pd.read_csv(INPUT_FILE)
print(f"✅ Data loaded: {df.shape[0]} rows, {df.shape[1]} columns")

# Column groups by dtype; the target is removed from the numerical group
# (errors='ignore' keeps this safe if the target is not numeric).
numeric_index = df.select_dtypes(include=['float64', 'int64']).columns
numerical_cols = numeric_index.drop(TARGET, errors='ignore').tolist()
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

### 🔷 NUMERICAL FEATURES vs TARGET
# Correlation of every numerical column with the target, strongest positive
# correlation first; the target's own row is dropped.
target_corr = df[numerical_cols + [TARGET]].corr()[[TARGET]]
corr = target_corr.drop(TARGET).sort_values(by=TARGET, ascending=False)
print("📊 Numerical correlations:\n", corr)

# One regression plot per numerical column, in correlation order.
for num_col in corr.index:
    plt.figure(figsize=(6, 4))
    sns.regplot(
        data=df,
        x=num_col,
        y=TARGET,
        scatter_kws={'s': 10},
        line_kws={'color': 'red'},
    )
    plt.title(f'{num_col} vs {TARGET}')
    plt.tight_layout()
    plt.savefig(f'{OUTPUT_DIR}/{num_col}_vs_{TARGET}_regplot.png')
    plt.close()

### 🔶 CATEGORICAL FEATURES vs TARGET
for cat_col in categorical_cols:
    # Columns with 50 or more distinct values are skipped.
    if df[cat_col].nunique() >= 50:
        continue
    plt.figure(figsize=(8, 4))
    sns.boxplot(data=df, x=cat_col, y=TARGET)
    plt.xticks(rotation=45)
    plt.title(f'{cat_col} vs {TARGET}')
    plt.tight_layout()
    plt.savefig(f'{OUTPUT_DIR}/{cat_col}_vs_{TARGET}_boxplot.png')
    plt.close()

print(f"✅ Multivariate plots saved to: {OUTPUT_DIR}")


## Feature Engineering

In [None]:
# Load data
df = pd.read_csv("transformed.csv")

# Step 1: Feature Engineering
# Replace the policy start date by the number of calendar years since it.
# NOTE(review): datetime.now() makes this feature depend on when the cell is
# run; consider a fixed reference date for reproducible results.
df['Policy Start Date'] = pd.to_datetime(df['Policy Start Date'], errors='coerce')
df['Policy_Years_Since'] = datetime.now().year - df['Policy Start Date'].dt.year
df = df.drop(columns=['Policy Start Date'])

# Step 2: Separate target and features
target = 'Premium Amount'
y = df[target]
# The skew-handling cell adds "<column>_log" for highly skewed columns, and
# its column list includes the target. A log copy of the target would leak the
# answer into the features, so it is removed here when present.
X = df.drop(columns=[target]).drop(columns=[f"{target}_log"], errors='ignore')

# Step 3: Identify column types
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Step 4: Impute missing values (mean for numeric, mode for categorical)
# NOTE(review): the imputers, the scaler and the feature-selection forest below
# are fit on all rows, while the train/test split happens in the training
# cell - test rows therefore influence the preprocessing.
num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent')

X[numerical_cols] = num_imputer.fit_transform(X[numerical_cols])
X[categorical_cols] = cat_imputer.fit_transform(X[categorical_cols])

# Step 5: One-hot encode categoricals (first level of each column dropped)
encoder = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')
encoded_cat = encoder.fit_transform(X[categorical_cols])
encoded_cat_df = pd.DataFrame(encoded_cat, columns=encoder.get_feature_names_out(categorical_cols))

# Step 6: Standardize numericals
scaler = StandardScaler()
scaled_num = scaler.fit_transform(X[numerical_cols])
scaled_num_df = pd.DataFrame(scaled_num, columns=numerical_cols)

# Step 7: Combine processed features side by side (both frames are re-indexed
# from 0 so the rows line up positionally)
X_processed = pd.concat([scaled_num_df.reset_index(drop=True), encoded_cat_df.reset_index(drop=True)], axis=1)

# Step 8: Random Forest for Feature Selection
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_processed, y)

# Rank features by the forest's importance scores, highest first
importances = rf.feature_importances_
feature_names = X_processed.columns
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Select top K features (fewer are kept if fewer than k exist)
k = 30  # Change this as needed
top_features = importance_df['Feature'].iloc[:k].tolist()
# Copy the selection so adding the target column below modifies an
# independent frame rather than a slice of X_processed.
X_selected_df = X_processed[top_features].copy()

# Save selected data with target; the training cell reads this file
X_selected_df[target] = y.reset_index(drop=True)
X_selected_df.to_csv("processed_rf_selected_data.csv", index=False)


## Model Training

In [None]:
# Read the feature-selected dataset produced by the feature-engineering cell.
data = pd.read_csv("processed_rf_selected_data.csv")

# Target column on one side, every other column as features on the other.
y = data["Premium Amount"]
X = data.drop(columns="Premium Amount")

# Hold out 20% of the rows for testing (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Persist the four splits as .npy files so the tuning cell can reload them.
for split_file, split_data in (
    ("X_train.npy", X_train),
    ("X_test.npy", X_test),
    ("y_train.npy", y_train),
    ("y_test.npy", y_test),
):
    np.save(split_file, split_data)

# Candidate regressors, keyed by display name.
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(alpha=1.0),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=100, random_state=42, verbosity=0),
}

# Fit each model on the training split and score it on the test split.
for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    mae, mse, r2 = (
        metric(y_test, preds)
        for metric in (mean_absolute_error, mean_squared_error, r2_score)
    )
    print(f"{name} - MAE: {mae:.2f}, MSE: {mse:.2f}, R²: {r2:.4f}")

    # Only the Random Forest is written to disk (chosen manually as best).
    if name != "Random Forest":
        continue
    joblib.dump(model, "random_forest_model.pkl")


# Hyperparameter Tuning

In [None]:
# Load the train/test splits saved by the training cell.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects; it
# is kept as-is here, but should only be used on files this notebook wrote.
X_train = np.load("X_train.npy", allow_pickle=True)
X_test = np.load("X_test.npy", allow_pickle=True)
y_train = np.load("y_train.npy", allow_pickle=True)
y_test = np.load("y_test.npy", allow_pickle=True)

# --- Random Forest Hyperparameter Tuning ---
print("\n🔍 Tuning Random Forest...")

rf_params = {
    "n_estimators": [100, 200],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5],
}

# Exhaustive grid search with 3-fold cross-validation, scored by R².
rf = RandomForestRegressor(random_state=42)
rf_grid = GridSearchCV(rf, rf_params, cv=3, scoring='r2', n_jobs=-1, verbose=1)
rf_grid.fit(X_train, y_train)

print("✅ Best Random Forest Params:", rf_grid.best_params_)
rf_best = rf_grid.best_estimator_

# --- XGBoost Hyperparameter Tuning ---
print("\n🔍 Tuning XGBoost...")

xgb_params = {
    "n_estimators": [100, 200],
    "max_depth": [3, 6],
    "learning_rate": [0.05, 0.1],
}

xgb = XGBRegressor(random_state=42, verbosity=0)
xgb_grid = GridSearchCV(xgb, xgb_params, cv=3, scoring='r2', n_jobs=-1, verbose=1)
xgb_grid.fit(X_train, y_train)

print("✅ Best XGBoost Params:", xgb_grid.best_params_)
xgb_best = xgb_grid.best_estimator_

# --- Evaluation ---
# Each tuned model is paired with the file it is saved to. The files use the
# ".pkl" extension, matching the model saved by the training cell.
tuned_models = {
    "Random Forest (Tuned)": (rf_best, "random_forest_tuned.pkl"),
    "XGBoost (Tuned)": (xgb_best, "xgboost_tuned.pkl"),
}

print("\n📊 Final Evaluation on Test Set:")
for name, (model, filename) in tuned_models.items():
    preds = model.predict(X_test)
    mae = mean_absolute_error(y_test, preds)
    mse = mean_squared_error(y_test, preds)
    r2 = r2_score(y_test, preds)

    print(f"\n{name}:")
    print(f"MAE: {mae:.2f}, MSE: {mse:.2f}, R²: {r2:.4f}")

    # Save model
    joblib.dump(model, filename)
