In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import xgboost as xgb
from sklearn.impute import SimpleImputer

def encode_categorical(series, is_train=True, mapping=None):
    """Integer-encode a categorical series using a value -> code mapping.

    In training mode a fresh mapping is built from the series itself: each
    distinct value is numbered from 0 in the order ``series.unique()`` returns
    it, and any ``mapping`` argument is ignored. In inference mode the supplied
    ``mapping`` is applied unchanged.

    Args:
        series: pandas Series of hashable category values.
        is_train: When True, derive the mapping from ``series``.
        mapping: Existing ``{value: int_code}`` dict. Required when
            ``is_train`` is False.

    Returns:
        Tuple ``(encoded, mapping)`` where ``encoded`` is the series with every
        value replaced by its code and ``mapping`` is the dict that was used.

    Raises:
        ValueError: If ``is_train`` is False and ``mapping`` is None.
    """
    if is_train:
        mapping = {value: code for code, value in enumerate(series.unique())}
    elif mapping is None:
        # Previously this surfaced as an AttributeError on ``None.get``.
        raise ValueError(
            "encode_categorical: a fitted mapping is required when is_train=False"
        )

    # Every value missing from the mapping shares a single code one past the
    # largest known code (1 when the mapping is empty).
    unseen_code = (max(mapping.values()) if mapping else 0) + 1
    return series.map(lambda value: mapping.get(value, unseen_code)), mapping

# Reserved key in the ``encoders`` dict under which the training-set medians
# of the numeric columns are stored for reuse at inference time.
_NUMERIC_MEDIANS_KEY = '__numeric_medians__'


def preprocess_data(df, is_train=True, encoders=None):
    """Turn a raw train/test frame into an all-float feature frame.

    Steps, in order:
      1. Work on a copy; drop 'id' and 'Name' (plus 'Depression' when
         ``is_train`` is False) if present.
      2. Integer-encode every object-dtype column except 'Depression' via
         ``encode_categorical`` after casting its values to ``str``.
      3. Fill missing values in int64/float64 columns with per-column medians.
         The medians are computed on the training frame and stored in the
         returned ``encoders`` dict; at inference the stored medians are
         reused so test data is not imputed with its own statistics. Columns
         without a stored median fall back to the median of ``df`` itself.
      4. Cast every column to float.

    Args:
        df: Raw input DataFrame; it is not modified.
        is_train: True to fit encoders/medians on ``df``, False to apply
            previously fitted ones.
        encoders: Dict returned by an earlier ``is_train=True`` call. Ignored
            (and replaced by a fresh dict) when ``is_train`` is True.

    Returns:
        Tuple ``(processed_df, encoders)``. In training mode ``encoders`` maps
        each encoded column name to its value -> code dict and additionally
        holds the numeric medians under ``'__numeric_medians__'``. In
        inference mode the ``encoders`` argument is returned unchanged.

    Raises:
        ValueError: If ``is_train`` is False and a categorical column has no
            fitted mapping in ``encoders``.
    """
    df = df.copy()

    cols_to_drop = ['id', 'Name']
    if not is_train:
        # The target must never reach the model as a feature.
        cols_to_drop.append('Depression')
    df = df.drop(columns=[col for col in cols_to_drop if col in df.columns])

    if is_train:
        encoders = {}
    # Read-only view that tolerates ``encoders=None`` at inference time.
    fitted = encoders if encoders is not None else {}

    categorical_columns = df.select_dtypes(include=['object']).columns
    for column in categorical_columns:
        if column == 'Depression':
            continue
        mapping = fitted.get(column)
        if not is_train and mapping is None:
            raise ValueError(
                f"No fitted encoder for categorical column {column!r}; "
                "pass the encoders returned by the training call"
            )
        df[column], mapping = encode_categorical(
            df[column].astype(str),
            is_train=is_train,
            mapping=mapping,
        )
        if is_train:
            encoders[column] = mapping

    # Selected after encoding, so the freshly encoded integer columns are
    # included here as well (they contain no missing values).
    numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
    if len(numeric_columns) > 0:
        medians = df[numeric_columns].median().to_dict()
        if is_train:
            encoders[_NUMERIC_MEDIANS_KEY] = dict(medians)
        else:
            # Prefer training statistics; keep the local median only for
            # columns the training call never saw.
            stored = fitted.get(_NUMERIC_MEDIANS_KEY) or {}
            medians.update(
                {col: value for col, value in stored.items() if col in medians}
            )
        # An all-missing column has a NaN median; leave such columns as-is.
        fill_values = {
            col: value for col, value in medians.items() if pd.notna(value)
        }
        if fill_values:
            df[numeric_columns] = df[numeric_columns].fillna(fill_values)

    df = df.astype(float)

    return df, encoders

def train_model(train_data):
    """Fit an XGBoost binary classifier and report hold-out metrics.

    The frame is split 80/20 (seed 42) into training and validation parts.
    The validation part serves as the evaluation set during fitting (log-loss,
    early stopping after 40 rounds) and is then used to print a
    classification report and the accuracy.

    Args:
        train_data: DataFrame holding the features and a 'Depression' target
            column.

    Returns:
        The fitted ``xgb.XGBClassifier``.
    """
    target = train_data['Depression']
    features = train_data.drop(columns='Depression')

    X_train, X_val, y_train, y_val = train_test_split(
        features,
        target,
        test_size=0.2,
        random_state=42,
    )

    # Hyper-parameters are passed straight to the estimator constructor.
    model = xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        max_depth=5,
        learning_rate=0.01,
        subsample=0.9,
        colsample_bytree=0.9,
        n_estimators=2000,
        verbosity=0,
        early_stopping_rounds=40,
    )
    # verbose=100 prints the evaluation metric every 100 boosting rounds.
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=100)

    val_predictions = model.predict(X_val)
    report = classification_report(y_val, val_predictions)
    accuracy = accuracy_score(y_val, val_predictions)
    print("\nValidation Metrics:")
    print(report)
    print(f"Validation Accuracy: {accuracy:.4f}")

    return model

def main():
    """Run the full pipeline: load, preprocess, train, predict, and save.

    Reads 'train.csv' and 'test.csv' from the working directory, fits the
    model on the preprocessed training data, writes test predictions to
    'submission7.csv', prints the ten largest feature importances, and saves
    the model to 'depression_prediction_xgboost.model'.

    Any exception is caught at this top-level boundary and reported with its
    traceback on stdout instead of propagating.
    """
    submission_path = 'submission7.csv'
    model_path = 'depression_prediction_xgboost.model'

    try:
        # Load data
        print("Loading and preprocessing data...")
        train_data = pd.read_csv('train.csv')
        test_data = pd.read_csv('test.csv')

        # Print initial column names for debugging
        print("\nTraining data columns:", train_data.columns.tolist())
        print("Test data columns:", test_data.columns.tolist())

        # Preprocess training data
        processed_train, encoders = preprocess_data(train_data, is_train=True)

        # Train model
        print("\nTraining model...")
        model = train_model(processed_train)

        # Process test data using the encoders fitted on the training data
        processed_test, _ = preprocess_data(test_data, is_train=False, encoders=encoders)

        # Select the training feature columns, in training order
        train_cols = processed_train.drop('Depression', axis=1).columns
        processed_test = processed_test[train_cols]

        # Make predictions on test data
        print("\nMaking predictions on test data...")
        test_predictions = model.predict(processed_test)

        # Create submission file; ids come from the raw test frame because
        # preprocessing drops the 'id' column.
        submission = pd.DataFrame({
            'id': test_data['id'],
            'Depression': test_predictions,
        })
        submission.to_csv(submission_path, index=False)
        # Report the path actually written (the message used to name a
        # different file than the one created).
        print(f"\nSubmission file created: {submission_path}")

        # Print feature importance
        feature_importance = pd.DataFrame({
            'feature': train_cols,
            'importance': model.feature_importances_,
        })
        print("\nTop 10 Most Important Features:")
        print(feature_importance.sort_values('importance', ascending=False).head(10))

        # Save model
        model.save_model(model_path)
        print(f"\nModel saved as: {model_path}")

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        import traceback
        print("\nDetailed error information:")
        print(traceback.format_exc())

# Run the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

Loading and preprocessing data...

Training data columns: ['id', 'Name', 'Gender', 'Age', 'City', 'Working Professional or Student', 'Profession', 'Academic Pressure', 'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction', 'Sleep Duration', 'Dietary Habits', 'Degree', 'Have you ever had suicidal thoughts ?', 'Work/Study Hours', 'Financial Stress', 'Family History of Mental Illness', 'Depression']
Test data columns: ['id', 'Name', 'Gender', 'Age', 'City', 'Working Professional or Student', 'Profession', 'Academic Pressure', 'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction', 'Sleep Duration', 'Dietary Habits', 'Degree', 'Have you ever had suicidal thoughts ?', 'Work/Study Hours', 'Financial Stress', 'Family History of Mental Illness']

Training model...
[0]	validation_0-logloss:0.47469
[100]	validation_0-logloss:0.25120
[200]	validation_0-logloss:0.19236
[300]	validation_0-logloss:0.17090
[400]	validation_0-logloss:0.16153
[500]	validation_0-logloss:0.15668
[6