In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import xgboost as xgb
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

def encode_categorical(series, is_train=True, mapping=None):
    """Integer-encode a categorical Series.

    Args:
        series: pandas Series holding the category values.
        is_train: When True, a fresh mapping is built from ``series`` and any
            ``mapping`` argument is ignored. When False, ``mapping`` is reused.
        mapping: Dict of value -> integer code produced by an earlier training
            call. ``None`` is treated as an empty mapping.

    Returns:
        Tuple ``(encoded, mapping)``: the Series with every value replaced by
        its integer code, and the mapping that was used. Every value missing
        from the mapping receives the same code, ``max(mapping.values()) + 1``
        (``1`` when the mapping is empty).
    """
    if is_train:
        # Codes are assigned in order of first appearance in the series.
        mapping = {val: idx for idx, val in enumerate(series.unique())}
    elif mapping is None:
        # No mapping supplied at inference: every value is "unknown" instead
        # of failing with AttributeError on None.get inside the lambda.
        mapping = {}
    unknown_code = max(mapping.values()) + 1 if mapping else 1
    return series.map(lambda x: mapping.get(x, unknown_code)), mapping

# Reserved keys under which the fitted imputer and KMeans model are carried in
# the ``encoders`` dict, so the function's 4-tuple return shape is unchanged.
_IMPUTER_KEY = '__imputer__'
_KMEANS_KEY = '__kmeans__'


def preprocess_data(df, is_train=True, encoders=None, scaler=None, pca=None):
    """Encode, impute, engineer features, scale and PCA-reduce a DataFrame.

    On training data every transformer is fitted and returned. On inference
    data (``is_train=False``) the previously fitted transformers are applied
    without refitting, so train and test rows end up in the same feature
    space with the same number of PCA components.

    Args:
        df: Raw input frame. 'Depression' (target), 'id' and 'Name' columns
            are removed from the features when present.
        is_train: True to fit all transformers on ``df``; False to reuse the
            ones passed in.
        encoders: Dict returned by a training call. Maps each categorical
            column to its value -> code mapping and also holds the fitted
            imputer and KMeans model under the reserved keys
            ``'__imputer__'`` and ``'__kmeans__'``. If those keys are missing
            at inference time, the imputer / KMeans are fitted on ``df``.
        scaler: Fitted ``StandardScaler`` from a training call.
        pca: Fitted ``PCA`` from a training call.

    Returns:
        Tuple ``(df_final, encoders, scaler, pca)``. ``df_final`` holds the PCA
        components (integer column labels, fresh RangeIndex) plus a
        'Depression' column when the input had one.

    Raises:
        ValueError: If ``is_train`` is False and ``encoders``, ``scaler`` or
            ``pca`` is None.
    """
    df = df.copy()

    # Separate target variable if exists
    target = None
    if 'Depression' in df.columns:
        target = df['Depression']
        df = df.drop('Depression', axis=1)

    # Drop unnecessary columns
    cols_to_drop = ['id', 'Name']
    df = df.drop([col for col in cols_to_drop if col in df.columns], axis=1)

    # Initialize encoders and scalers if training
    if is_train:
        encoders = {}
        scaler = StandardScaler()
        pca = PCA(n_components=0.95)  # Retain 95% variance
    elif encoders is None or scaler is None or pca is None:
        raise ValueError(
            "encoders, scaler and pca from a training call are required "
            "when is_train=False"
        )

    # Process categorical columns
    categorical_columns = df.select_dtypes(include=['object']).columns
    for column in categorical_columns:
        df[column], mapping = encode_categorical(
            df[column].astype(str), is_train=is_train, mapping=encoders.get(column)
        )
        if is_train:
            encoders[column] = mapping

    # Handle numeric variables. Selected after encoding, so the integer-coded
    # categorical columns are included as well.
    numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
    imputer = None if is_train else encoders.get(_IMPUTER_KEY)
    if imputer is None:
        imputer = SimpleImputer(strategy='median')
        df[numeric_columns] = imputer.fit_transform(df[numeric_columns])
        if is_train:
            encoders[_IMPUTER_KEY] = imputer
    else:
        # Fill test gaps with the medians learned on the training data.
        df[numeric_columns] = imputer.transform(df[numeric_columns])

    # Feature Engineering
    # 1. Polynomial Features (learns no data statistics, so refitting is safe)
    poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
    poly_features = poly.fit_transform(df[numeric_columns])
    poly_columns = [f'poly_{i}' for i in range(poly_features.shape[1])]
    # Reuse df's index so concat aligns row-for-row even when the caller's
    # frame does not carry a default RangeIndex.
    df_poly = pd.DataFrame(poly_features, columns=poly_columns, index=df.index)
    df = pd.concat([df, df_poly], axis=1)

    # 2. Clustering Features
    kmeans = None if is_train else encoders.get(_KMEANS_KEY)
    if kmeans is None:
        kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
        df['cluster'] = kmeans.fit_predict(df[numeric_columns])
        if is_train:
            encoders[_KMEANS_KEY] = kmeans
    else:
        # Assign test rows to the training centroids so cluster ids keep the
        # meaning the downstream model was trained on.
        df['cluster'] = kmeans.predict(df[numeric_columns])

    # 3. Ratio Features
    if {'SleepHrs', 'WorkHrs'}.issubset(df.columns):
        df['Sleep_Work_Ratio'] = df['SleepHrs'] / (df['WorkHrs'] + 1)

    # 4. Standard Scaling and 5. Dimensionality Reduction.
    # Fit only on training data; at inference apply the fitted transforms so
    # the component count and meaning match what the model saw.
    if is_train:
        df_scaled = scaler.fit_transform(df)
        df_reduced = pca.fit_transform(df_scaled)
    else:
        df_scaled = scaler.transform(df)
        df_reduced = pca.transform(df_scaled)
    df_final = pd.DataFrame(df_reduced)

    # Reattach target variable if present
    if target is not None:
        df_final['Depression'] = target.values

    return df_final, encoders, scaler, pca

def train_model(train_data):
    """Fit an XGBoost binary classifier and print hold-out validation metrics.

    Args:
        train_data: DataFrame of feature columns plus a 'Depression' label
            column.

    Returns:
        The fitted ``xgb.XGBClassifier``.
    """
    labels = train_data['Depression']
    features = train_data.drop(columns='Depression')

    # 80/20 hold-out split with a fixed seed; the validation part also drives
    # early stopping during fitting.
    X_train, X_val, y_train, y_val = train_test_split(
        features, labels, test_size=0.2, random_state=42
    )

    model = xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        max_depth=5,
        learning_rate=0.005,
        subsample=0.9,
        colsample_bytree=0.9,
        n_estimators=5000,
        verbosity=0,
        early_stopping_rounds=50,
    )
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=100)

    val_predictions = model.predict(X_val)
    report = classification_report(y_val, val_predictions)
    val_accuracy = accuracy_score(y_val, val_predictions)

    print("\nValidation Metrics:")
    print(report)
    print(f"Validation Accuracy: {val_accuracy:.4f}")

    return model

def main():
    """Run the pipeline end to end: load, preprocess, train, predict, save.

    Reads 'train.csv' and 'test.csv' from the working directory, writes the
    predictions to 'submission_xgboost.csv', prints the ten largest feature
    importances and saves the model to 'depression_prediction_xgboost.model'.
    """
    train_df = pd.read_csv('train.csv')
    test_df = pd.read_csv('test.csv')

    # Fit the preprocessing on the training data and train the classifier.
    train_ready, encoders, scaler, pca = preprocess_data(train_df, is_train=True)
    model = train_model(train_ready)

    # Preprocess the test data with the objects returned by the training call;
    # only the transformed frame is needed here.
    test_ready = preprocess_data(
        test_df, is_train=False, encoders=encoders, scaler=scaler, pca=pca
    )[0]

    # Restrict and order the test columns to match the training features.
    train_cols = train_ready.drop(columns='Depression').columns
    test_predictions = model.predict(test_ready[train_cols])

    submission = pd.DataFrame({'id': test_df['id'], 'Depression': test_predictions})
    submission.to_csv('submission_xgboost.csv', index=False)

    importances = pd.DataFrame(
        {'feature': train_cols, 'importance': model.feature_importances_}
    )
    top_ten = importances.sort_values('importance', ascending=False).head(10)
    print("\nTop 10 Most Important Features:")
    print(top_ten)

    model.save_model('depression_prediction_xgboost.model')
    print("\nModel saved as: depression_prediction_xgboost.model")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()


[0]	validation_0-logloss:0.47728
[100]	validation_0-logloss:0.32039
[200]	validation_0-logloss:0.25159
[300]	validation_0-logloss:0.21537
[400]	validation_0-logloss:0.19471
[500]	validation_0-logloss:0.18240
[600]	validation_0-logloss:0.17474
[700]	validation_0-logloss:0.16975
[800]	validation_0-logloss:0.16646
[900]	validation_0-logloss:0.16411
[1000]	validation_0-logloss:0.16231
[1100]	validation_0-logloss:0.16088
[1200]	validation_0-logloss:0.15976
[1300]	validation_0-logloss:0.15887
[1400]	validation_0-logloss:0.15820
[1500]	validation_0-logloss:0.15761
[1600]	validation_0-logloss:0.15715
[1700]	validation_0-logloss:0.15678
[1800]	validation_0-logloss:0.15645
[1900]	validation_0-logloss:0.15616
[2000]	validation_0-logloss:0.15595
[2100]	validation_0-logloss:0.15577
[2200]	validation_0-logloss:0.15561
[2300]	validation_0-logloss:0.15546
[2400]	validation_0-logloss:0.15533
[2500]	validation_0-logloss:0.15524
[2600]	validation_0-logloss:0.15520
[2700]	validation_0-logloss:0.15514
[280




Model saved as: depression_prediction_xgboost.model
