In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
import lightgbm as lgb
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import VotingClassifier
from sklearn.feature_selection import SelectFromModel
import optuna
import warnings
warnings.filterwarnings('ignore')

def encode_categorical(series, is_train=True, mapping=None):
    """
    Encodes categorical variables by mapping each unique value to an integer.
    
    Parameters:
    - series: The categorical column to encode.
    - is_train: If True, create a new mapping. If False, use the provided mapping.
    - mapping: Dictionary mapping categorical values to integers (used for test data).
    
    Returns:
    - Encoded series with integer values.
    - The mapping dictionary (useful for encoding test data consistently).
    """
    if is_train:
        # Create a mapping for unique values
        unique_values = series.unique()
        mapping = {val: idx for idx, val in enumerate(unique_values)}
    
    # Transform using mapping, assign max value + 1 for unseen categories
    max_val = max(mapping.values()) if mapping else 0
    return series.map(lambda x: mapping.get(x, max_val + 1)), mapping

def create_features(df):
    """Generate new features that might help the model."""
    df = df.copy()
    
    # Identify numeric columns for feature engineering
    numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    
    # Remove target if it exists in numeric_cols
    if 'Depression' in numeric_cols:
        numeric_cols.remove('Depression')
    
    # Create interaction features for important numeric columns
    # This will be helpful if there are relationships between variables
    if len(numeric_cols) >= 2:
        for i in range(len(numeric_cols)):
            for j in range(i+1, len(numeric_cols)):
                col1, col2 = numeric_cols[i], numeric_cols[j]
                # Multiplication interaction
                df[f'{col1}_x_{col2}'] = df[col1] * df[col2]
                # Division interaction (with error handling)
                df[f'{col1}_div_{col2}'] = df[col1] / (df[col2] + 1e-5)  # Avoid division by zero
                # Addition
                df[f'{col1}_plus_{col2}'] = df[col1] + df[col2]
                # Subtraction
                df[f'{col1}_minus_{col2}'] = df[col1] - df[col2]
    
    # Square and cube transformations for numeric columns
    for col in numeric_cols:
        df[f'{col}_squared'] = df[col] ** 2
        df[f'{col}_cubed'] = df[col] ** 3
        df[f'{col}_log'] = np.log1p(np.abs(df[col]) + 1e-5)  # Log transform (with handling for zeros and negatives)
    
    return df

def preprocess_data(df, is_train=True, encoders=None, scaler=None, feature_selector=None):
    # Create a copy of the dataframe
    df = df.copy()
    
    # Store target variable if exists
    target = None
    if 'Depression' in df.columns and is_train:
        target = df['Depression'].copy()
    
    # Drop ID and Name columns
    cols_to_drop = ['id', 'Name']
    if not is_train and 'Depression' in df.columns:
        # For test data, also drop the Depression column if it exists
        cols_to_drop.append('Depression')
    df = df.drop([col for col in cols_to_drop if col in df.columns], axis=1)
    
    # Initialize encoders dictionary if training
    if is_train:
        encoders = {}
    
    # Process categorical columns
    categorical_columns = df.select_dtypes(include=['object']).columns
    for column in categorical_columns:
        df[column], mapping = encode_categorical(
            df[column].astype(str),
            is_train=is_train,
            mapping=encoders.get(column)
        )
        if is_train:
            encoders[column] = mapping
    
    # Handle missing values in numeric variables
    numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
    if len(numeric_columns) > 0:
        if is_train:
            imputer = SimpleImputer(strategy='median')
            imputer.fit(df[numeric_columns])
        df[numeric_columns] = imputer.transform(df[numeric_columns])
    
    # Create additional features
    df = create_features(df)
    
    # Standardize features
    if is_train:
        scaler = StandardScaler()
        scaler.fit(df)
    
    scaled_df = pd.DataFrame(scaler.transform(df), columns=df.columns, index=df.index)
    
    # Apply feature selection if needed and if in training mode
    if is_train and feature_selector is None:
        # Train a basic model for feature selection
        temp_model = XGBClassifier(n_estimators=100, random_state=42)
        temp_model.fit(scaled_df, target)
        
        # Select important features
        feature_selector = SelectFromModel(temp_model, threshold='median')
        feature_selector.fit(scaled_df, target)
    
    if feature_selector is not None:
        feature_mask = feature_selector.get_support()
        selected_features = scaled_df.columns[feature_mask]
        scaled_df = scaled_df[selected_features]
    
    # Add back the target for training data
    if is_train and target is not None:
        scaled_df['Depression'] = target
    
    return scaled_df, encoders, scaler, feature_selector

def objective(trial, X_train, y_train, X_val, y_val):
    """Optuna objective function for XGBoost hyperparameter tuning."""
    
    # XGBoost parameters to optimize
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 3000),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 100, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 100, log=True),
        'gamma': trial.suggest_float('gamma', 1e-8, 10, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'random_state': 42,
        'early_stopping_rounds':5,
    }
    
    # Train model
    model = XGBClassifier(**params)
    model.fit(X_train, y_train, 
              eval_set=[(X_val, y_val)],
              
              verbose=False)
    
    # Predict and evaluate
    preds = model.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, preds)
    
    return auc

def train_optimized_model(train_data):
    # Separate features and target
    X = train_data.drop('Depression', axis=1)
    y = train_data['Depression']
    
    # Split the data
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    
    # Hyperparameter optimization with Optuna
    print("Optimizing hyperparameters...")
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, X_train, y_train, X_val, y_val), n_trials=5)
    
    best_params = study.best_params
    print(f"Best parameters: {best_params}")
    
    # Train XGBoost with best parameters
    xgb_model = XGBClassifier(
        objective='binary:logistic',
        **best_params,
        random_state=42
    )
    
    # Train LightGBM model for ensemble
    lgb_params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.8,
        'random_state': 42
    }
    
    lgb_model = lgb.LGBMClassifier(**lgb_params)
    
    # Create voting ensemble
    ensemble_model = VotingClassifier(
        estimators=[
            ('xgb', xgb_model),
            ('lgb', lgb_model)
        ],
        voting='soft'
    )
    
    # Train the ensemble on the full training data
    print("Training ensemble model...")
    ensemble_model.fit(X, y)
    
    # Evaluate model
    val_predictions = ensemble_model.predict(X_val)
    print("\nValidation Metrics:")
    print(classification_report(y_val, val_predictions))
    print(f"Validation Accuracy: {accuracy_score(y_val, val_predictions):.4f}")
    
    # Cross-validation to get a more robust estimate
    print("\nPerforming cross-validation...")
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = []
    
    for train_idx, val_idx in skf.split(X, y):
        X_cv_train, X_cv_val = X.iloc[train_idx], X.iloc[val_idx]
        y_cv_train, y_cv_val = y.iloc[train_idx], y.iloc[val_idx]
        
        cv_model = XGBClassifier(**best_params, random_state=42)
        cv_model.fit(X_cv_train, y_cv_train)
        
        y_cv_pred = cv_model.predict(X_cv_val)
        cv_score = accuracy_score(y_cv_val, y_cv_pred)
        cv_scores.append(cv_score)
    
    print(f"Cross-validation scores: {cv_scores}")
    print(f"Mean CV accuracy: {np.mean(cv_scores):.4f}")
    
    return ensemble_model

def main():
    try:
        # Load data
        print("Loading data...")
        train_data = pd.read_csv('train.csv')
        test_data = pd.read_csv('test.csv')
        
        # Print initial column names for debugging
        print("\nTraining data columns:", train_data.columns.tolist())
        print("Test data columns:", test_data.columns.tolist())
        
        # Preprocess training data
        print("\nPreprocessing data with advanced feature engineering...")
        processed_train, encoders, scaler, feature_selector = preprocess_data(train_data, is_train=True)
        
        # Train optimized model
        print("\nTraining optimized model...")
        model = train_optimized_model(processed_train)
        
        # Process test data using the same transformations
        processed_test, _, _, _ = preprocess_data(
            test_data, 
            is_train=False, 
            encoders=encoders,
            scaler=scaler,
            feature_selector=feature_selector
        )
        
        # Ensure columns match between train and test
        train_cols = processed_train.drop('Depression', axis=1).columns
        if not all(col in processed_test.columns for col in train_cols):
            missing_cols = [col for col in train_cols if col not in processed_test.columns]
            print(f"Warning: Missing columns in test data: {missing_cols}")
            # Add missing columns with zeros
            for col in missing_cols:
                processed_test[col] = 0
        
        processed_test = processed_test[train_cols]
        
        # Make predictions on test data
        print("\nMaking predictions on test data...")
        test_predictions = model.predict(processed_test)
        
        # Create submission file
        submission = pd.DataFrame({
            'id': test_data['id'],
            'Depression': test_predictions
        })
        submission.to_csv('submission6.csv', index=False)
        print("\nSubmission file created: submission_optimized.csv")
        
        # Print feature importance (if available)
        if hasattr(model, 'named_estimators_') and hasattr(model.named_estimators_['xgb'], 'feature_importances_'):
            feature_importance = pd.DataFrame({
                'feature': train_cols,
                'importance': model.named_estimators_['xgb'].feature_importances_
            })
            print("\nTop 10 Most Important Features:")
            print(feature_importance.sort_values('importance', ascending=False).head(10))
        
    except Exception as e:
       S print(f"An error occurred: {str(e)}")
        import traceback
        print("\nDetailed error information:")
        print(traceback.format_exc())

if __name__ == "__main__":
    main()

SyntaxError: invalid syntax (2703949019.py, line 314)