In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense


In [2]:

# --- 1. CONFIGURATION ---
# Input CSV; a relative path, so it is resolved against the working directory.
FILE_PATH = 'classification.csv'

# The 54 expected column names, in the order declared by the original author.
# The section labels are informal groupings inferred from the column names.
COLUMN_HEADERS = [
    # identifier and demographics
    'person_id',
    'age', 'sex', 'region', 'urban_rural', 'income', 'education',
    'marital_status', 'employment_status', 'household_size', 'dependents',
    # lifestyle and utilisation
    'bmi', 'smoker', 'alcohol_freq',
    'visits_last_year', 'hospitalizations_last_3yrs',
    'days_hospitalized_last_3yrs', 'medication_count',
    # clinical measurements
    'systolic_bp', 'diastolic_bp', 'ldl', 'hba1c',
    # policy details
    'plan_type', 'network_tier', 'deductible', 'copay',
    'policy_term_years', 'policy_changes_last_2yrs', 'provider_quality',
    # risk, cost and claims
    'risk_score', 'annual_medical_cost', 'annual_premium', 'monthly_premium',
    'claims_count', 'avg_claim_amount', 'total_claims_paid',
    # condition indicators
    'chronic_count',
    'hypertension', 'diabetes', 'asthma', 'copd', 'cardiovascular_disease',
    'cancer_history', 'kidney_disease', 'liver_disease', 'arthritis',
    'mental_health',
    # procedure counts
    'proc_imaging_count', 'proc_surgery_count', 'proc_physio_count',
    'proc_consult_count', 'proc_lab_count',
    # outcome flags
    'is_high_risk', 'had_major_procedure',
]


In [3]:

# --- 2. DATA LOADING ---
# Read the CSV into `df`. Any failure is reported on stdout and then re-raised,
# so execution stops here instead of continuing without data.
try:
    # header=0: the first row of the file supplies the column names
    df = pd.read_csv(FILE_PATH, header=0)
except FileNotFoundError:
    print("--- ERROR ---")
    print(f"File not found at: '{FILE_PATH}'")
    print("Please make sure the file is in the same directory or update the path.")
    raise
except Exception as load_error:
    print(f"An error occurred loading the file: {load_error}")
    raise
else:
    print(f"Successfully loaded data from '{FILE_PATH}'. Shape: {df.shape}")



Successfully loaded data from 'classification.csv'. Shape: (100000, 54)


In [4]:
# --- 3. MANUAL ORDINAL MAPPING ---
def _encode_ordinal(frame, column, mapping, fallback=0):
    """Return `frame[column]` converted to integer ranks using `mapping`.

    Parameters:
        frame: DataFrame holding the column.
        column: name of the column to encode.
        mapping: dict from label to integer rank.
        fallback: rank given to labels missing from `mapping` (NaN included).

    Returns:
        An integer Series aligned with `frame`, or the column unchanged when
        it is already numeric.

    The numeric guard makes the cell safe to re-run: without it, a second run
    would look up the integer codes in `mapping`, miss every time, and collapse
    the whole column to `fallback`. Unmapped labels are printed so the
    substitution is visible instead of silent.
    """
    values = frame[column]
    if pd.api.types.is_numeric_dtype(values):
        return values
    encoded = values.map(mapping)
    unmapped = encoded.isna()
    if unmapped.any():
        labels = values[unmapped].unique().tolist()
        print(
            f"Warning: {int(unmapped.sum())} rows in '{column}' had unmapped "
            f"labels {labels}; using {fallback}."
        )
    return encoded.fillna(fallback).astype(int)


# Two spellings are accepted for each of the two lowest education levels.
education_map = {
    'No HS': 0, 'Below High School': 0, 'HS': 1, 'High School': 1,
    'Some College': 2, "Bachelor's": 3, "Master's": 4, 'Doctorate': 5
}
df['education'] = _encode_ordinal(df, 'education', education_map)

network_tier_map = {'Bronze': 0, 'Silver': 1, 'Gold': 2, 'Platinum': 3}
df['network_tier'] = _encode_ordinal(df, 'network_tier', network_tier_map)
print("Completed ordinal mapping.")

# --- 4. DEFINE FEATURE GROUPS ---
TARGET_COLUMN = 'is_high_risk'
DROP_COLUMNS = ['person_id', 'had_major_procedure']

# Categorical columns that will be one-hot encoded.
ONE_HOT_COLS = [
    'sex', 'region', 'urban_rural', 'marital_status', 'employment_status',
    'smoker', 'alcohol_freq', 'plan_type'
]
# Indicator columns that are fed to the model unchanged.
BINARY_COLS = [
    'hypertension', 'diabetes', 'asthma', 'copd', 'cardiovascular_disease',
    'cancer_history', 'kidney_disease', 'liver_disease', 'arthritis',
    'mental_health'
]
# Every declared column except the target and the dropped ones.
all_feature_cols = [
    col for col in COLUMN_HEADERS if col not in [TARGET_COLUMN] + DROP_COLUMNS
]
# Whatever is neither one-hot nor binary is treated as numeric; this includes
# the two ordinal columns encoded above.
# NOTE(review): 'risk_score' stays in the feature set; if 'is_high_risk' is
# derived from it, the model sees the answer (target leakage) - TODO confirm.
NUMERIC_COLS = [
    col for col in all_feature_cols if col not in ONE_HOT_COLS + BINARY_COLS
]

# Filter lists to only include columns that actually exist in the dataframe
NUMERIC_COLS = [col for col in NUMERIC_COLS if col in df.columns]
ONE_HOT_COLS = [col for col in ONE_HOT_COLS if col in df.columns]
BINARY_COLS = [col for col in BINARY_COLS if col in df.columns]
print("Defined feature groups for preprocessing.")



Completed ordinal mapping.
Defined feature groups for preprocessing.


In [5]:
# --- 5. CREATE X AND y, THEN SPLIT DATA ---
# Target: the label column, cast to int so it has one consistent numeric dtype.
y = df[TARGET_COLUMN].astype(int)

# Features: every column except the label and the explicitly dropped ones.
X = df.drop(columns=[TARGET_COLUMN, *DROP_COLUMNS])

# 80/20 train/test partition, stratified on y, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
print(f"Split data into Train ({X_train.shape}) and Test ({X_test.shape}) sets.")



Split data into Train ((80000, 51)) and Test ((20000, 51)) sets.


In [6]:
# --- 6. CREATE PREPROCESSING PIPELINE ---
# Scaler applied to the numeric group.
numeric_transformer = StandardScaler()
# Dense (non-sparse) one-hot encoder; handle_unknown='ignore' keeps transform
# from raising on categories that were not present when it was fitted.
one_hot_transformer = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Route each feature group to its transformer. Binary indicators, and any
# column not named in a group (remainder), are passed through untouched.
preprocessor = ColumnTransformer(
    [
        ('num', numeric_transformer, NUMERIC_COLS),
        ('ohe', one_hot_transformer, ONE_HOT_COLS),
        ('bin', 'passthrough', BINARY_COLS),
    ],
    remainder='passthrough',
)



In [7]:
# --- 7. APPLY PREPROCESSING ---
# The preprocessor is fitted on the training split only and then reused on the
# test split, so no test-set statistics enter the scaler or the encoder.
print("Fitting preprocessor and transforming X_train...")
X_train_processed = preprocessor.fit_transform(X_train)

print("Transforming X_test...")
X_test_processed = preprocessor.transform(X_test)

# Get the final number of features AFTER one-hot encoding
INPUT_DIM = X_train_processed.shape[1]
print(f"Preprocessing complete. Input dimension for ANN is: {INPUT_DIM}")
print("-" * 30)

# --- 8. DEFINE ANN MODEL ---
print("Building ANN model...")
# Width of each tanh hidden layer, in order.
HIDDEN_LAYER_UNITS = (10, 10, 10, 10)

# The input shape is declared with an explicit Input object rather than the
# `input_dim=` argument on the first Dense layer, which Keras warns against
# in Sequential models. The stack ends in one sigmoid unit for the binary label.
model = Sequential(
    [tf.keras.Input(shape=(INPUT_DIM,))]
    + [Dense(units, activation="tanh") for units in HIDDEN_LAYER_UNITS]
    + [Dense(1, activation="sigmoid")]
)
print("Model built successfully.")



Fitting preprocessor and transforming X_train...
Transforming X_test...
Preprocessing complete. Input dimension for ANN is: 73
------------------------------
Building ANN model...
Model built successfully.


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
# --- 9. MANUAL XAVIER WEIGHT INITIALIZATION ---
print("Manually setting model weights using Xavier Initialization...")

# Seeded generator so the starting weights are identical on every run.
# This fixes the initial weights only; it does not seed anything else.
WEIGHT_INIT_SEED = 42
weight_rng = np.random.default_rng(WEIGHT_INIT_SEED)

# Rebuild every array returned by the model, taking shapes from the model
# itself so this cell keeps working if the layer widths change.
# Assumes the model holds only Dense layers: each 2-D array is a kernel of
# shape (fan_in, fan_out) and each 1-D array is a bias.
initial_weights = []
for current in model.get_weights():
    if current.ndim == 2:
        # Kernel: zero-mean normal with variance 1 / fan_in (the fan-in-only
        # variant of Xavier scaling).
        fan_in = current.shape[0]
        fresh = weight_rng.standard_normal(current.shape) * np.sqrt(1 / fan_in)
    else:
        # Bias: start at zero
        fresh = np.zeros(current.shape)
    # Match the dtype of the array being replaced
    initial_weights.append(fresh.astype(current.dtype))

# Set the new weights to the model
model.set_weights(initial_weights)
print("Manual weights set.")
print("-" * 30)



Manually setting model weights using Xavier Initialization...
Manual weights set.
------------------------------


In [10]:
# --- 10. COMPILE AND TRAIN MODEL ---
print("Compiling model...")
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Layer-by-layer overview of the network
model.summary()
print("\n--- Starting Model Training ---")

# NOTE(review): the test split doubles as validation data here, so the
# per-epoch val_* figures and the final test figures come from the same rows.
# NOTE(review): near-perfect accuracy from epoch 1 may indicate a feature that
# encodes the target - TODO confirm.
history = model.fit(
    X_train_processed,
    y_train,
    batch_size=32,
    epochs=10,
    verbose=2,  # one summary line per epoch
    validation_data=(X_test_processed, y_test),
)

print("\n--- Model Training Complete ---")

# --- 11. EVALUATE MODEL ---
# evaluate() yields the loss followed by the compiled metric (accuracy).
loss, accuracy = model.evaluate(X_test_processed, y_test, verbose=0)
print(
    "-" * 30,
    f"Final Test Accuracy: {accuracy*100:.2f}%",
    f"Final Test Loss: {loss:.4f}",
    "-" * 30,
    sep="\n",
)


Compiling model...



--- Starting Model Training ---
Epoch 1/10
2500/2500 - 4s - 2ms/step - accuracy: 0.9984 - loss: 0.0040 - val_accuracy: 0.9987 - val_loss: 0.0037
Epoch 2/10
2500/2500 - 2s - 992us/step - accuracy: 0.9990 - loss: 0.0027 - val_accuracy: 0.9998 - val_loss: 5.2490e-04
Epoch 3/10
2500/2500 - 2s - 992us/step - accuracy: 0.9988 - loss: 0.0037 - val_accuracy: 0.9991 - val_loss: 0.0025
Epoch 4/10
2500/2500 - 3s - 1ms/step - accuracy: 0.9982 - loss: 0.0046 - val_accuracy: 0.9995 - val_loss: 0.0018
Epoch 5/10
2500/2500 - 2s - 974us/step - accuracy: 0.9987 - loss: 0.0041 - val_accuracy: 0.9973 - val_loss: 0.0069
Epoch 6/10
2500/2500 - 3s - 1ms/step - accuracy: 0.9986 - loss: 0.0041 - val_accuracy: 0.9991 - val_loss: 0.0023
Epoch 7/10
2500/2500 - 2s - 977us/step - accuracy: 0.9987 - loss: 0.0034 - val_accuracy: 0.9990 - val_loss: 0.0030
Epoch 8/10
2500/2500 - 2s - 959us/step - accuracy: 0.9987 - loss: 0.0036 - val_accuracy: 0.9992 - val_loss: 0.0030
Epoch 9/10
2500/2500 - 2s - 938us/step - accuracy