# In [None]:
# -*- coding: utf-8 -*-
"""train_model.ipynb

This notebook will preprocess the teen_phone_addiction_dataset.csv,
engineer new features, and then train a RandomForestRegressor model.
Finally, it will save the trained model as 'addiction_model.pkl'.
"""

# 1. Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib
from sklearn.preprocessing import LabelEncoder # For initial categorical encoding if needed, or for reference

# --- Configuration ---
# Absolute Windows locations of the input CSV and of the pickled model.
# Adjust both to match where the project lives on your machine.
# They are written as raw strings on purpose: in a normal string literal the
# sequences "\t" (tab) and "\a" (bell) inside these paths are interpreted as
# control characters, which silently corrupts the path.
DATASET_PATH = r'D:\c++\teenaddiction\data\teen_phone_addiction_dataset.csv'
MODEL_OUTPUT_PATH = r'D:\c++\teenaddiction\model\addiction_model.pkl'

# --- 2. Load your dataset ---
# Only the read itself is guarded; the success messages live in the ``else``
# branch so that nothing else can be mistaken for a missing file.
try:
    df = pd.read_csv(DATASET_PATH)
except FileNotFoundError:
    print(f"Error: Dataset not found at {DATASET_PATH}.")
    print("Please ensure 'teen_phone_addiction_dataset.csv' is in the 'data/' directory.")
    # Stop with a non-zero status. ``exit()`` is a helper the ``site`` module
    # adds for interactive sessions and, called without arguments, ends the
    # process with status 0, which would make this failure look like success.
    raise SystemExit(1)
else:
    print("Dataset loaded successfully.")
    print(f"Initial shape: {df.shape}")
    print("Initial columns:", df.columns.tolist())

# --- 3. Data Cleaning and Preprocessing ---

# Identifier columns plus columns that may be redundant with, or leak, the target.
columns_to_drop_initial = [
    "ID",
    "Name",
    "Location",
    "Depression_Class",
    "Depression_Level_Class",
    "Anxiety_Class",
    "Usage_Bin",
    "Usage_Group",
]
# Restrict the list to columns that are actually present in the DataFrame.
existing_columns_to_drop = [
    name for name in columns_to_drop_initial if name in df.columns
]

if not existing_columns_to_drop:
    print("\nNo specified initial columns to drop were found in the DataFrame (or already dropped).")
else:
    df.drop(columns=existing_columns_to_drop, inplace=True)
    print(f"\nDropped columns: {existing_columns_to_drop}")

# Missing values: rows containing any NaN are removed (no imputation is done).
# The per-column NaN counts are reported before and after for inspection.
print("\nMissing values before dropping NaNs:\n", df.isna().sum())
df.dropna(inplace=True)
print("\nMissing values after dropping NaNs:\n", df.isna().sum())

# One-hot encode the raw categorical columns.
categorical_cols_to_encode = ['Gender', 'School_Grade', 'Phone_Usage_Purpose']
# Only columns that exist in the DataFrame are passed on to the encoder.
existing_categorical_cols = [
    name for name in categorical_cols_to_encode if name in df.columns
]

if not existing_categorical_cols:
    print("\nNo specified categorical columns to encode were found.")
else:
    # drop_first=True omits one dummy per category to avoid multicollinearity.
    df = pd.get_dummies(
        df,
        columns=existing_categorical_cols,
        drop_first=True,
    )
    print(f"\nOne-hot encoded columns: {existing_categorical_cols}")

# --- 4. Feature Engineering ---
# Every derived feature below needs all of these base columns; if any one is
# missing, feature engineering is skipped entirely.
required_base_cols = [
    'Daily_Usage_Hours', 'Screen_Time_Before_Bed', 'Time_on_Social_Media',
    'Time_on_Education', 'Time_on_Gaming', 'Weekend_Usage_Hours',
    'Phone_Checks_Per_Day', 'Apps_Used_Daily', 'Sleep_Hours'
]
missing_base_cols = [col for col in required_base_cols if col not in df.columns]

if missing_base_cols:
    print("\nWarning: Some base columns for feature engineering are missing. Skipping feature engineering.")
    print("Missing base columns:", missing_base_cols)
else:
    daily_hours = df['Daily_Usage_Hours']
    # The ratio features add 1 to both numerator and denominator, so a zero
    # in the denominator column cannot cause a division by zero.
    social_plus_one = df['Time_on_Social_Media'] + 1

    df['Night_Usage'] = daily_hours - df['Screen_Time_Before_Bed']
    df['Social_to_Edu_Ratio'] = social_plus_one / (df['Time_on_Education'] + 1)
    df['Gaming_to_Social_Ratio'] = (df['Time_on_Gaming'] + 1) / social_plus_one
    df['Weekend_Overuse'] = df['Weekend_Usage_Hours'] - daily_hours
    # Here only the denominator is shifted by 1.
    df['Phone_Obsessiveness'] = df['Phone_Checks_Per_Day'] / (df['Apps_Used_Daily'] + 1)
    # 8 hours is taken as the ideal amount of sleep.
    df['Sleep_Deficit'] = 8 - df['Sleep_Hours']
    print("\nEngineered new features.")


# --- 5. Define features (X) and target (y) ---
# The target variable is 'Addiction_Level'; every other remaining column is
# used as a feature.
if 'Addiction_Level' not in df.columns:
    print("Error: 'Addiction_Level' column not found in the DataFrame. Cannot define target.")
    # Stop with a non-zero status. ``exit()`` is a helper the ``site`` module
    # adds for interactive sessions and, called without arguments, ends the
    # process with status 0, which would hide this failure from callers.
    raise SystemExit(1)

X = df.drop(columns=['Addiction_Level'])
y = df['Addiction_Level']
print(f"\nFeatures (X) shape: {X.shape}")
print(f"Target (y) shape: {y.shape}")
print("\nFeatures used for training (X.columns.tolist()):")
print(X.columns.tolist()) # This list is crucial for your Streamlit app

# --- 6. Train/test split ---
# 20% of the rows are held out for evaluation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f"\nTraining data shape: {X_train.shape}, Test data shape: {X_test.shape}")

# --- 7. Train model ---
print("\nTraining RandomForestRegressor model...")
# n_estimators can be tuned; random_state fixes the forest's randomness.
# fit() returns the estimator itself, so `model` is the fitted forest.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
print("Model training complete.")

# --- 8. Evaluate model ---
preds = model.predict(X_test)
# RMSE is taken as the square root of the MSE rather than via
# `squared=False`: that keyword was deprecated in scikit-learn 1.4 and
# removed in 1.6, where passing it raises a TypeError.
rmse = mean_squared_error(y_test, preds) ** 0.5
r2 = r2_score(y_test, preds) # R-squared
print("\nModel Evaluation:")
print(f"  RMSE: {rmse:.2f}")
print(f"  R-squared: {r2:.2f}")

# --- 9. Save model ---
# Persist the fitted estimator to the configured output path.
joblib.dump(model, MODEL_OUTPUT_PATH)
print(f"\n✅ Model saved as '{MODEL_OUTPUT_PATH}'")

# Closing summary with follow-up instructions, printed one line at a time.
closing_lines = (
    "\n--- Training Script Finished ---",
    "Next steps:",
    f"1. The '{MODEL_OUTPUT_PATH}' file is now saved in your 'teenaddiction/model/' directory.",
    "2. Ensure your Streamlit app ('app/app.py') uses the exact same feature list and order for prediction.",
    "   The feature list your model expects is printed above (X.columns.tolist()).",
)
for closing_line in closing_lines:
    print(closing_line)
