In [None]:
# -*- coding: utf-8 -*-
"""train_anxiety_model.ipynb

This notebook preprocesses the teen phone addiction dataset,
engineers features, and then trains an XGBoost Regressor model
to predict 'Anxiety_Level'. The trained model is saved as 'anxiety_model.pkl'.
"""

# 1. Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
# import RandomForestRegressor # Removed
from xgboost import XGBRegressor # Added XGBoost
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# --- Configuration ---
# Both paths are relative, so they are resolved against the current working
# directory. The defaults assume the notebook is run from 'teenaddiction/model/'
# with the CSV stored in 'teenaddiction/data/'; adjust them otherwise.
DATASET_PATH = '../data/teen_phone_addiction_dataset.csv'
MODEL_OUTPUT_PATH = 'anxiety_model.pkl'  # Lands next to the notebook when run from its own directory

# --- 2. Load your dataset ---
# Only the read itself sits in the `try`, so the handler can only ever be
# triggered by a missing CSV and never by the summary prints.
try:
    df = pd.read_csv(DATASET_PATH)
except FileNotFoundError:
    print(f"Error: Dataset not found at {DATASET_PATH}.")
    print("Please ensure 'teen_phone_addiction_dataset.csv' is in the 'data/' directory.")
    # Abort with a non-zero status. The previous `exit()` call relied on a
    # helper that the `site` module injects for interactive sessions, and when
    # run as a script it ended the process with status 0 (success) even though
    # nothing had been loaded.
    raise SystemExit(1) from None
else:
    print("Dataset loaded successfully.")
    print(f"Initial shape: {df.shape}")
    print("Initial columns:", df.columns.tolist())

# --- 3. Data Cleaning and Preprocessing (Consistent with main training) ---


def _present_in(frame, names):
    """Return the entries of *names* that are column labels of *frame*, in the given order."""
    return [name for name in names if name in frame.columns]


# Columns removed up front (identifiers and potentially redundant/leaky
# target-related columns). 'Anxiety_Level' is the target, so it is kept.
# 'Depression_Level' and 'Addiction_Level' are not dropped here either; they are
# simply left out of the feature list selected in step 5.
columns_to_drop_initial = [
    "ID", "Name", "Location",
    "Depression_Class", "Depression_Level_Class", "Anxiety_Class",
    "Usage_Bin", "Usage_Group",
]
# Restrict the drop to columns that actually exist so DataFrame.drop cannot raise.
existing_columns_to_drop = _present_in(df, columns_to_drop_initial)

if not existing_columns_to_drop:
    print("\nNo specified initial columns to drop were found in the DataFrame (or already dropped).")
else:
    df.drop(columns=existing_columns_to_drop, inplace=True)
    print(f"\nDropped columns: {existing_columns_to_drop}")

# Missing values: any row containing a NaN is discarded (kept simple on purpose).
# The per-column NaN counts are printed before and after for comparison.
print("\nMissing values before dropping NaNs:\n", df.isna().sum())
df.dropna(inplace=True)
print("\nMissing values after dropping NaNs:\n", df.isna().sum())

# One-hot encode the categorical columns. drop_first=True omits the first level
# of each column, so a column with k levels yields k-1 indicator columns.
categorical_cols_to_encode = ['Gender', 'School_Grade', 'Phone_Usage_Purpose']
existing_categorical_cols = _present_in(df, categorical_cols_to_encode)

if not existing_categorical_cols:
    print("\nNo specified categorical columns to encode were found.")
else:
    df = pd.get_dummies(df, columns=existing_categorical_cols, drop_first=True)
    print(f"\nOne-hot encoded columns: {existing_categorical_cols}")

# --- 4. Feature Engineering (Consistent with main training) ---
# Raw columns that every derived feature below is computed from.
required_base_cols = [
    'Daily_Usage_Hours', 'Screen_Time_Before_Bed', 'Time_on_Social_Media',
    'Time_on_Education', 'Time_on_Gaming', 'Weekend_Usage_Hours',
    'Phone_Checks_Per_Day', 'Apps_Used_Daily', 'Sleep_Hours'
]

if set(required_base_cols).issubset(df.columns):
    # Each derived feature reads raw columns only, so all of them can be
    # computed first and attached afterwards, in this order.
    # The "+ 1" terms keep the ratio denominators away from zero when a raw value is 0.
    engineered = {
        'Night_Usage': df['Daily_Usage_Hours'] - df['Screen_Time_Before_Bed'],
        'Social_to_Edu_Ratio': (df['Time_on_Social_Media'] + 1) / (df['Time_on_Education'] + 1),
        'Gaming_to_Social_Ratio': (df['Time_on_Gaming'] + 1) / (df['Time_on_Social_Media'] + 1),
        'Weekend_Overuse': df['Weekend_Usage_Hours'] - df['Daily_Usage_Hours'],
        'Phone_Obsessiveness': df['Phone_Checks_Per_Day'] / (df['Apps_Used_Daily'] + 1),
        # Shortfall relative to an 8-hour reference; negative when Sleep_Hours exceeds 8.
        'Sleep_Deficit': 8 - df['Sleep_Hours'],
    }
    for feature_name, feature_values in engineered.items():
        df[feature_name] = feature_values
    print("\nEngineered new features.")
else:
    # Nothing is added in this case; the column check in step 5 will then fail,
    # because the feature list there includes the engineered columns.
    print("\nWarning: Some base columns for feature engineering are missing. Skipping feature engineering.")

# --- 5. Define features (X) and target (y) for Anxiety Model ---
# X deliberately leaves out Addiction_Level and Depression_Level (they are not
# direct inputs for Anxiety prediction), even though both columns are still in df.
# The order of these features MUST match 'features_for_sub_models' in your Streamlit app.
features_for_anxiety_model = [
    'Age', 'Daily_Usage_Hours', 'Sleep_Hours', 'Academic_Performance',
    'Social_Interactions', 'Exercise_Hours', 'Self_Esteem', 'Parental_Control',
    'Screen_Time_Before_Bed', 'Phone_Checks_Per_Day', 'Apps_Used_Daily',
    'Time_on_Social_Media', 'Time_on_Gaming', 'Time_on_Education',
    'Family_Communication', 'Weekend_Usage_Hours',
    'Gender_Male', 'Gender_Other',
    'School_Grade_11th', 'School_Grade_12th', 'School_Grade_7th',
    'School_Grade_8th', 'School_Grade_9th',
    'Phone_Usage_Purpose_Education', 'Phone_Usage_Purpose_Gaming',
    'Phone_Usage_Purpose_Other', 'Phone_Usage_Purpose_Social Media',
    'Night_Usage', 'Social_to_Edu_Ratio', 'Gaming_to_Social_Ratio',
    'Weekend_Overuse', 'Phone_Obsessiveness', 'Sleep_Deficit'
]

# Work out exactly which required columns are absent so that a failure is
# actionable. The one-hot names above are hard-coded, so a category level that
# never occurs in the CSV (or skipped feature engineering) shows up here.
missing_anxiety_cols = [
    col for col in ['Anxiety_Level', *features_for_anxiety_model]
    if col not in df.columns
]
if missing_anxiety_cols:
    print("Error: Required columns for Anxiety model not found in DataFrame.")
    print(f"Missing columns: {missing_anxiety_cols}")
    # Non-zero status: the previous `exit()` call reported success (status 0)
    # and depended on a helper intended for interactive sessions only.
    raise SystemExit(1)

# Selecting with the list fixes the column order of X to the order declared above.
X_anxiety = df[features_for_anxiety_model]
y_anxiety = df['Anxiety_Level']
print(f"\nFeatures (X_anxiety) shape: {X_anxiety.shape}")
print(f"Target (y_anxiety) shape: {y_anxiety.shape}")
print("\nFeatures used for Anxiety model training (X_anxiety.columns.tolist()):")
print(X_anxiety.columns.tolist()) # CRITICAL: Verify this list matches Streamlit app's 'features_for_sub_models'

# --- 6. Train/test split ---
# 20% of the rows are held out for evaluation; the fixed random_state makes the
# split repeatable between runs.
anxiety_split = train_test_split(
    X_anxiety,
    y_anxiety,
    test_size=0.2,
    random_state=42,
)
X_train_anx, X_test_anx, y_train_anx, y_test_anx = anxiety_split
print(f"\nTraining data shape: {X_train_anx.shape}, Test data shape: {X_test_anx.shape}")

# --- 7. Train model ---
print("\nTraining XGBoost Regressor for Anxiety_Level...")
# 100 boosting rounds with a fixed seed; every other hyperparameter is left at
# the XGBRegressor default.
xgb_params = {"n_estimators": 100, "random_state": 42}
anxiety_model = XGBRegressor(**xgb_params)
anxiety_model.fit(X_train_anx, y_train_anx)
print("Anxiety model training complete.")

# --- 8. Evaluate model ---
# Predict on the held-out rows and report R-squared (the only metric printed),
# rounded to two decimals.
preds_anx = anxiety_model.predict(X_test_anx)
r2_anx = r2_score(y_true=y_test_anx, y_pred=preds_anx)

print(
    "\nAnxiety Model Evaluation:",
    f"  R-squared: {r2_anx:.2f}",
    sep="\n",
)

# --- 9. Save model ---
# Serialize the fitted regressor with joblib. MODEL_OUTPUT_PATH is used as given,
# so a relative path is resolved against the current working directory.
joblib.dump(anxiety_model, MODEL_OUTPUT_PATH)
print(f"\n✅ Anxiety model saved as '{MODEL_OUTPUT_PATH}'")

# Closing banner and follow-up instructions, emitted one per line.
closing_lines = (
    "\n--- Anxiety Model Training Script Finished ---",
    "Next steps:",
    f"1. The '{MODEL_OUTPUT_PATH}' file is now saved in your 'teenaddiction/model/' directory.",
    "2. Ensure this model is placed in the 'model/' directory of your Streamlit app.",
)
print(*closing_lines, sep="\n")
