In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
import joblib
import json

# STEP 1: LOAD AND CLEAN DATA

In [10]:
print("Loading dataset...")
df = pd.read_csv('Sleep_Health_and_Lifestyle_Dataset.csv')

# 1.1 Drop columns that are not used as features.
# 'Person ID' is only a row label, and 'Occupation' lists jobs that
# students do not hold. errors='ignore' makes this a no-op for any column
# that is already absent.
df = df.drop(columns=['Person ID', 'Occupation'], errors='ignore')

# 1.2 Split blood pressure into two integer columns.
# "126/83" -> SystolicBP: 126, DiastolicBP: 83
# The website form therefore needs two separate input fields for this value.
if 'Blood Pressure' in df.columns:
    df[['SystolicBP', 'DiastolicBP']] = (
        df['Blood Pressure'].str.split('/', expand=True).astype(int)
    )
    df = df.drop(columns=['Blood Pressure'])

Loading dataset...


# STEP 2: CREATE THE TARGET

In [11]:
# We turn "Sleep Disorder" text into a generic "Risk" number.
# 0 = No Risk, 1 = High Risk
def categorize_risk(val):
    """Collapse a 'Sleep Disorder' value into a binary risk flag.

    Args:
        val: A single cell value from the 'Sleep Disorder' column.

    Returns:
        0 when the value is missing (NaN/None) or the literal string 'None',
        1 for any other value.
    """
    no_disorder = pd.isna(val) or val == 'None'
    return 0 if no_disorder else 1

# Derive the binary target, then remove the text column it was built from
# so it cannot be used as a feature.
df['Risk_Label'] = df['Sleep Disorder'].apply(categorize_risk)
del df['Sleep Disorder']

# STEP 3: MANUAL MAPPING

In [12]:
# Encode the categorical columns with explicit, hand-written mappings
# instead of an automatic encoder, so the numeric codes are fixed and the
# React form can send exactly the same values.

# Map Gender
gender_map = {'Male': 1, 'Female': 0}
df['Gender'] = df['Gender'].map(gender_map)

# Map BMI Category (standardize spelling first)
# The dataset can contain both 'Normal Weight' and 'Normal'; merge them.
df['BMI Category'] = df['BMI Category'].replace({'Normal Weight': 'Normal'})
bmi_map = {'Normal': 0, 'Overweight': 1, 'Obese': 2}
df['BMI Category'] = df['BMI Category'].map(bmi_map)

# Any value missing from a map above becomes NaN. Drop those rows (and any
# other row containing NaN), but report how many were removed so that data
# loss is never silent.
rows_before = len(df)
df = df.dropna()
rows_dropped = rows_before - len(df)
if rows_dropped:
    print(f"Warning: dropped {rows_dropped} of {rows_before} rows with unmapped or missing values")

print("Data Mapped Successfully. Gender: Male=1, Female=0")

Data Mapped Successfully. Gender: Male=1, Female=0


# STEP 4: SPLIT & SCALE

In [13]:
# Separate the target column from the feature matrix.
y = df['Risk_Label']
X = df.drop(columns=['Risk_Label'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize the features (e.g., Age 20 vs Heart Rate 80 end up on a
# comparable scale). The scaler is fitted on the training rows only and
# then applied unchanged to the test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Persist the fitted scaler: the same transformation must be applied to
# user input at prediction time.
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']

# STEP 5: TRAIN MODELS

In [14]:
# Fixed seed so that re-running this cell rebuilds the same models and
# reports the same metrics.
MODEL_SEED = 42

models = {
    "Logistic Regression": LogisticRegression(),
    # probability=True enables predict_proba for % confidence later; the seed
    # fixes the internal data shuffling used to fit those probabilities.
    "SVM": SVC(probability=True, random_state=MODEL_SEED),
    # Without random_state the forest is different on every run.
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=MODEL_SEED)
}

# One metrics dict per model; the full list is written to JSON for the dashboard.
benchmark_data = []

print("\nTraining Results:")

for name, model in models.items():
    # Train on the scaled training split
    model.fit(X_train_scaled, y_train)

    # Predict on the held-out split
    y_pred = model.predict(X_test_scaled)

    # Calculate all metrics.
    # average='weighted' averages the per-class scores weighted by class
    # support, so the same code keeps working if more classes are added
    # (e.g., Low, Med, High risk).
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Store the metrics as percentages rounded to one decimal place
    model_result = {
        "algorithm": name,
        "accuracy": round(accuracy * 100, 1),
        "precision": round(precision * 100, 1),
        "recall": round(recall * 100, 1),
        "f1_score": round(f1 * 100, 1),
        "status": "Ready"  # Default status
    }

    benchmark_data.append(model_result)

    # Save the fitted model, e.g. "Random Forest" -> random_forest_model.pkl
    filename = f"{name.replace(' ', '_').lower()}_model.pkl"
    joblib.dump(model, filename)
    print(f"Saved {filename} (F1-Score: {f1:.2f})")

# Mark the model with the highest (rounded) F1 score as 'Deployed'.
# On a tie, max() keeps the first such model in `models` order.
best_model = max(benchmark_data, key=lambda x: x['f1_score'])
best_model['status'] = 'Deployed'

# Save the benchmark data to a JSON file for the dashboard
with open('model_benchmarks.json', 'w') as f:
    json.dump(benchmark_data, f, indent=4)

print("\nBenchmarking data saved to 'model_benchmarks.json'")


Training Results:
Saved logistic_regression_model.pkl (F1-Score: 0.93)
Saved svm_model.pkl (F1-Score: 0.96)
Saved random_forest_model.pkl (F1-Score: 0.96)

Benchmarking data saved to 'model_benchmarks.json'
