In [23]:
# 📌 Import Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
import os
import joblib

# 📌 Load Dataset
# Read the raw claims table; the path is relative to the notebook's folder.
df = pd.read_csv('../Data/raw_data.csv')
print("✅ Data Loaded:", df.shape)

# 📌 Standardize Claim_Status values
# Cast to str, trim surrounding whitespace and title-case so that variants
# such as " approved" or "REJECTED" collapse onto the mapping keys below.
df['Claim_Status'] = df['Claim_Status'].astype(str).str.strip().str.title()

# 📌 Show unique values before mapping
print("🧾 Unique Claim_Status values:", df['Claim_Status'].unique())

# 📌 Map Target Column
target_mapping = {'Approved': 1, 'Rejected': 0}
# Keep only rows whose label is a known key. The explicit .copy() makes the
# filtered frame independent of the original, so the column assignment
# below cannot trigger pandas' SettingWithCopyWarning when rows are dropped.
known_label_mask = df['Claim_Status'].isin(list(target_mapping))
df = df[known_label_mask].copy()
df['Claim_Status'] = df['Claim_Status'].map(target_mapping)

# 📌 Print Class Distribution
print("✅ Target class distribution:\n", df['Claim_Status'].value_counts())

# 📌 Balance the Dataset
# Downsample both classes to the size of the smaller one, then shuffle the
# combined rows. Every sampling step uses the same fixed seed (42).
mask_0 = df['Claim_Status'].eq(0)
mask_1 = df['Claim_Status'].eq(1)
df_0 = df[mask_0]
df_1 = df[mask_1]
min_count = min(len(df_0), len(df_1))

# An empty class makes balancing impossible; fail loudly instead.
if not min_count:
    raise ValueError("❌ One class has 0 records. Check raw data.")

sampled_0 = df_0.sample(n=min_count, random_state=42)
sampled_1 = df_1.sample(n=min_count, random_state=42)
df = pd.concat([sampled_0, sampled_1])
df = df.sample(frac=1, random_state=42)  # frac=1 -> full shuffle

print("✅ After Balancing:\n", df['Claim_Status'].value_counts())

# 📌 Encode Categorical Features
# One LabelEncoder per categorical column, keyed by column name so the
# fitted encoders stay available after the loop.
categorical_cols = ['Gender', 'Smoker', 'Type_of_Claim']
label_encoders = {col: LabelEncoder() for col in categorical_cols}

for col, le in label_encoders.items():
    # Replace the column in place with its integer codes.
    df[col] = le.fit_transform(df[col])
    # Show which original class maps to which code.
    class_to_code = dict(zip(le.classes_, le.transform(le.classes_)))
    print(f"🔤 {col} mapping:", class_to_code)

# 📌 Define Features and Target
# Model input columns, in the order they are scaled and written out.
features = [
    'Age',
    'Gender',
    'BMI',
    'Smoker',
    'Number_of_Dependents',
    'Type_of_Claim',
    'Claim_Amount',
    'Number_of_Previous_Claims',
    'Hospital_Stay_Duration',
    'Doctor_Visits',
]

y = df['Claim_Status']
X = df[features]

# 📌 Scale Features
# Fit the scaler on the selected feature columns and transform them.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 📌 Final DataFrame
# Rebuild a frame from the scaled values and append the target. y.values is
# used so the target lines up by position with the new default row index.
df_scaled = pd.DataFrame(X_scaled, columns=features).assign(
    Claim_Status=y.values
)

# 📌 Save Preprocessed Data, Scaler & Label Encoders
# Create the output folders if they do not exist yet.
os.makedirs('../Data', exist_ok=True)
os.makedirs('../App/model', exist_ok=True)

# Scaled features plus target, written without the row index.
df_scaled.to_csv('../Data/preprocessed_data.csv', index=False)

# Persist the fitted scaler together with the fitted label encoders.
# Previously only the scaler was saved, so the categorical encoding used
# here could not be reloaded alongside it.
joblib.dump(scaler, '../App/model/scaler.pkl')
joblib.dump(label_encoders, '../App/model/label_encoders.pkl')

print("✅ Preprocessed data saved as 'preprocessed_data.csv'")
print("✅ Scaler saved as 'scaler.pkl'")
print("✅ Label encoders saved as 'label_encoders.pkl'")


✅ Data Loaded: (2000, 11)
🧾 Unique Claim_Status values: ['Rejected' 'Approved']
✅ Target class distribution:
 0    1000
1    1000
Name: Claim_Status, dtype: int64
✅ After Balancing:
 1    1000
0    1000
Name: Claim_Status, dtype: int64
🔤 Gender mapping: {'Female': 0, 'Male': 1}
🔤 Smoker mapping: {'No': 0, 'Yes': 1}
🔤 Type_of_Claim mapping: {'Accident': 0, 'Fire': 1, 'Health': 2, 'Theft': 3}
✅ Preprocessed data saved as 'preprocessed_data.csv'
✅ Scaler saved as 'scaler.pkl'
