# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import Ridge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.base import clone
import warnings
# Ignore every warning raised from here on.
warnings.filterwarnings(action='ignore')

# Load data: read the train and test tables with the same reader call.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/mc-datathon-2025-phone-addiction/train.csv',
        '/kaggle/input/mc-datathon-2025-phone-addiction/test.csv',
    )
)

# --- Feature Engineering ---
def advanced_feature_engineering(df):
    """Return a copy of ``df`` with 18 derived columns appended.

    The caller's frame is never modified. The derived columns are ratios,
    sums, products and 0/1 threshold flags computed from the raw usage,
    sleep, mental-health, social and age columns. Ratio denominators get
    a 0.1 offset so a zero in the raw column cannot divide by zero.

    Raises ``KeyError`` if any of the raw columns read below is missing.
    """
    out = df.copy()
    offset = 0.1  # added to every ratio denominator

    # Raw inputs, named once so the formulas below read like the definitions.
    usage = out["Daily_Usage_Hours"]
    sleep = out["Sleep_Hours"]
    weekend = out["Weekend_Usage_Hours"]
    gaming = out["Time_on_Gaming"]
    social_media = out["Time_on_Social_Media"]
    education = out["Time_on_Education"]
    anxiety = out["Anxiety_Level"]
    depression = out["Depression_Level"]
    esteem = out["Self_Esteem"]
    interactions = out["Social_Interactions"]
    academics = out["Academic_Performance"]
    apps = out["Apps_Used_Daily"]
    checks = out["Phone_Checks_Per_Day"]
    bed_screen = out["Screen_Time_Before_Bed"]
    age = out["Age"]
    exercise = out["Exercise_Hours"]

    distraction = gaming + social_media
    recovery = exercise + sleep

    # Insertion order of this dict is the column order of the result.
    derived = {
        "usage_sleep_ratio": usage / (sleep + offset),
        "weekend_weekday_ratio": weekend / (usage + offset),
        "distraction_time": distraction,
        "focus_ratio": education / (distraction + offset),
        "total_screen_time": social_media + gaming + education,
        "mental_health_score": anxiety + depression - esteem,
        "social_academic_balance": interactions * academics / 100,
        "usage_per_app": usage / (apps + offset),
        "checks_per_hour": checks / 24,
        "intensive_usage": (checks > 100).astype(int),
        "sleep_screen_conflict": bed_screen * usage,
        # Hours short of 8; never negative.
        "sleep_deficit": np.maximum(0, 8 - sleep),
        "age_usage_interaction": age * usage,
        # 1 for ages 15 through 17 inclusive.
        "teenage_peak": age.between(15, 17).astype(int),
        "high_anxiety_low_esteem": ((anxiety > 7) & (esteem < 4)).astype(int),
        "social_isolation": (interactions < 3).astype(int),
        "wellness_score": recovery - anxiety,
        "lifestyle_balance": recovery / (usage + offset),
    }
    for column_name, values in derived.items():
        out[column_name] = values
    return out

def preprocess_data(train_df, test_df):
    """Clean, encode and prune the raw train and test frames.

    Steps, applied to both frames:

    1. Remember the test ids (``0..n-1`` when the test frame has no ``id``).
    2. Drop the ``id``, ``Location`` and ``Name`` columns where present.
    3. Replace ``School_Grade`` with ``Grade_Numeric``, the first run of
       digits in the grade string as a float (NaN when there is none).
    4. Append the columns from ``advanced_feature_engineering``.
    5. Label-encode ``Gender``, ``Phone_Usage_Purpose`` and, if present,
       ``Parental_Control``. Encoders are fitted on train only; a test
       value never seen in train is mapped to the first train class.
    6. Drop every column whose absolute correlation with
       ``Addiction_Level`` on the train frame is below 0.1, except a
       fixed list of engineered features that is always kept.

    Args:
        train_df: Raw training frame; must contain ``Addiction_Level``.
        test_df: Raw test frame with the same feature columns.

    Returns:
        ``(train_df, test_df, test_ids)`` — the processed frames and the
        ids to use in the submission.
    """
    # Capture the ids before the column is dropped below.
    test_ids = test_df.get('id', range(len(test_df)))

    columns_to_drop = ['id', 'Location', 'Name']
    train_df = train_df.drop(columns=columns_to_drop, errors='ignore')
    test_df = test_df.drop(columns=columns_to_drop, errors='ignore')

    if 'School_Grade' in train_df.columns:
        # Raw string: '\d' inside a normal literal is an invalid escape.
        # expand=False returns a Series rather than a one-column DataFrame.
        grade_pattern = r'(\d+)'
        train_df['Grade_Numeric'] = (
            train_df['School_Grade'].str.extract(grade_pattern, expand=False).astype(float)
        )
        test_df['Grade_Numeric'] = (
            test_df['School_Grade'].str.extract(grade_pattern, expand=False).astype(float)
        )
        train_df = train_df.drop('School_Grade', axis=1)
        test_df = test_df.drop('School_Grade', axis=1)

    train_df = advanced_feature_engineering(train_df)
    test_df = advanced_feature_engineering(test_df)

    categorical_columns = ['Gender', 'Phone_Usage_Purpose']
    if 'Parental_Control' in train_df.columns:
        categorical_columns.append('Parental_Control')

    for col in categorical_columns:
        le = LabelEncoder()
        train_df[col] = le.fit_transform(train_df[col].astype(str))
        # Categories unseen in train cannot be encoded; fall back to the
        # first train class so transform() does not raise.
        test_values = test_df[col].astype(str)
        test_values = test_values.where(test_values.isin(le.classes_), le.classes_[0])
        test_df[col] = le.transform(test_values)

    # numeric_only=True keeps corr() from raising on pandas >= 2.0 when a
    # non-numeric column is still present; such columns are simply left out
    # of the correlation and therefore never dropped here.
    target_corr = train_df.corr(numeric_only=True)["Addiction_Level"].abs()
    important_features = ['usage_sleep_ratio', 'mental_health_score', 'focus_ratio',
                          'weekend_weekday_ratio', 'social_academic_balance']
    protected = set(important_features) | {'Addiction_Level'}
    low_corr_cols = [
        col for col in target_corr[target_corr < 0.1].index
        if col not in protected
    ]

    train_df = train_df.drop(columns=low_corr_cols)
    test_df = test_df.drop(columns=low_corr_cols, errors='ignore')

    return train_df, test_df, test_ids

# Preprocess (copies are passed in, so the raw frames stay as loaded)
train_processed, test_processed, test_ids = preprocess_data(train_df.copy(), test_df.copy())
X = train_processed.drop("Addiction_Level", axis=1)
y = train_processed["Addiction_Level"]
# Everything downstream is fitted against log1p(target).
y_log = np.log1p(y)

# Scale
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)
# Index the test frame by the training columns so both matrices share the
# same feature order; a missing column fails here with a clear KeyError.
test_scaled = scaler.transform(test_processed[X.columns])

# Select features
# Correlation pruning may leave fewer than 20 columns, so cap k at the
# number of features actually available.
selector = SelectKBest(score_func=f_regression, k=min(20, X.shape[1]))
X_selected = selector.fit_transform(X_scaled, y_log)
test_selected = selector.transform(test_scaled)
# Names of the surviving columns, in the order of X_selected's columns.
feature_names = np.array(X.columns)[selector.get_support()]

# KFold stacking
def generate_meta_features(models, X, y, X_test, n_folds=5):
    """Build out-of-fold stacking features for a set of base models.

    For every model, a fresh clone is fitted on each training fold. Its
    predictions on the held-out fold fill that model's column of
    ``meta_train``; its predictions on ``X_test`` are averaged over the
    folds to fill the matching column of ``meta_test``.

    Args:
        models: Mapping of name -> unfitted estimator. Column order of the
            outputs follows the mapping's iteration order.
        X: Training feature matrix, indexed by row position.
        y: Training target, same length as ``X``.
        X_test: Test feature matrix with the same columns as ``X``.
        n_folds: Number of shuffled K-fold splits (seeded with 42).

    Returns:
        ``(meta_train, meta_test)`` with shapes ``(len(X), len(models))``
        and ``(len(X_test), len(models))``.
    """
    # The fold indices are row positions. pandas objects would interpret
    # `obj[idx]` as a label lookup, so convert them to arrays first; plain
    # arrays and sparse matrices pass through untouched.
    X, y, X_test = (
        data.to_numpy() if hasattr(data, "to_numpy") else data
        for data in (X, y, X_test)
    )

    meta_train = np.zeros((X.shape[0], len(models)))
    meta_test = np.zeros((X_test.shape[0], len(models)))

    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    # Materialise the splits once; every model is evaluated on these folds.
    folds = list(kf.split(X, y))

    for i, (name, model) in enumerate(models.items()):
        print(f"Fitting {name}...")
        test_preds = np.zeros((X_test.shape[0], n_folds))
        for j, (train_idx, val_idx) in enumerate(folds):
            # Clone so the caller's estimator is never fitted in place.
            model_clone = clone(model)
            model_clone.fit(X[train_idx], y[train_idx])
            meta_train[val_idx, i] = model_clone.predict(X[val_idx])
            test_preds[:, j] = model_clone.predict(X_test)
        meta_test[:, i] = test_preds.mean(axis=1)
    return meta_train, meta_test

# Base models
# These are unfitted prototypes; generate_meta_features clones one per fold.
base_models = {
    'xgb': XGBRegressor(n_estimators=600, max_depth=4, learning_rate=0.05, subsample=0.8, colsample_bytree=0.8, random_state=42),
    # LightGBM only applies `subsample` when bagging is enabled through a
    # non-zero `subsample_freq`; without it the 0.8 row sampling is ignored.
    'lgb': LGBMRegressor(n_estimators=600, max_depth=4, learning_rate=0.05, subsample=0.8, subsample_freq=1, colsample_bytree=0.8, random_state=42),
    'gb': GradientBoostingRegressor(n_estimators=400, max_depth=4, learning_rate=0.05, subsample=0.8, random_state=42),
    'cat': CatBoostRegressor(iterations=600, learning_rate=0.03, depth=6, verbose=0, random_seed=42)
}

# y_log is passed as a plain array so fold indices address rows by position.
meta_train, meta_test = generate_meta_features(base_models, X_selected, y_log.values, test_selected)

# Meta model: ridge regression on the base models' out-of-fold predictions.
meta_model = Ridge(alpha=1.0).fit(meta_train, y_log)
# NOTE: this error is measured in log space, on the same rows the meta model
# was just fitted on.
mse = mean_squared_error(y_true=y_log, y_pred=meta_model.predict(meta_train))
print(f"📉 Final stacked model log-MSE: {mse:.6f}")

# Predict and submit
final_predictions_log = meta_model.predict(meta_test)
# Invert the log1p applied to the training target.
final_predictions = np.expm1(final_predictions_log)
submission = pd.DataFrame(
    data={
        'id': test_ids,
        'Addiction_Level': final_predictions,
    }
)
submission.to_csv('submission.csv', index=False)
print("Submission saved as 'submission.csv'")

# Feature importance: fit the CatBoost entry of base_models on all training
# rows and rank the selected features by its importance scores.
cat_model = base_models['cat']
cat_model.fit(X_selected, y_log)

importance_table = {
    'feature': feature_names,
    'importance': cat_model.get_feature_importance(),
}
importance_df = pd.DataFrame(importance_table)
importance_df = importance_df.sort_values(by='importance', ascending=False)

top_ten = importance_df.head(10)
print("\nTop 10 CatBoost Features:")
print(top_ten.to_string(index=False))
