In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

def load_and_preprocess_data(train_path, test_path):
    """Load and preprocess the training and test datasets."""
    # Load data
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    
    # Store test IDs for submission
    test_ids = test_df['id'].copy()
    
    # Function to preprocess a single dataframe
    def preprocess_df(df):
        # Convert age to numeric, removing any non-numeric characters
        df['Age'] = pd.to_numeric(df['Age'].astype(str).str.replace('[^0-9.]', ''), errors='coerce')
        
        # Handle missing values in CGPA
        df['CGPA'] = pd.to_numeric(df['CGPA'].astype(str).str.replace('[^0-9.]', ''), errors='coerce')
        
        # Create binary features for Working Professional/Student
        df['is_working_professional'] = (df['Working Professional or Student'] == 'Working Professional').astype(int)
        
        # Create features from city
        df['is_metro_city'] = df['City'].isin(['Mumbai', 'Delhi', 'Bangalore', 'Kolkata', 'Chennai', 'Hyderabad']).astype(int)
        
        # Encode gender
        df['gender_encoded'] = (df['Gender'] == 'Male').astype(int)
        
        # Create profession categories
        df['is_teacher'] = df['Profession'].str.contains('Teacher|Professor|Educator', na=False, case=False).astype(int)
        df['is_tech'] = df['Profession'].str.contains('Engineer|Developer|Programmer|IT', na=False, case=False).astype(int)
        df['is_medical'] = df['Profession'].str.contains('Doctor|Nurse|Medical|Healthcare', na=False, case=False).astype(int)
        
        # Calculate pressure score (combined academic and work pressure)
        df['total_pressure'] = df['Academic Pressure'] + df['Work Pressure']
        
        # Select features for model
        features = ['Age', 'gender_encoded', 'is_working_professional', 'is_metro_city',
                   'is_teacher', 'is_tech', 'is_medical', 'Academic Pressure', 
                   'Work Pressure', 'total_pressure', 'CGPA']
        
        return df[features]
    
    # Preprocess both datasets
    X_train = preprocess_df(train_df)
    X_test = preprocess_df(test_df)
    
    # Get target variable for training data
    y_train = train_df['Depression']
    
    return X_train, y_train, X_test, test_ids

def create_model():
    """Create and return the model pipeline."""
    # Create pipeline with imputer, scaler, and model
    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('model', GradientBoostingClassifier(
            n_estimators=200,
            learning_rate=0.1,
            max_depth=5,
            random_state=42
        ))
    ])
    
    return pipeline

def train_and_evaluate_model(X_train, y_train, model):
    """Train the model and evaluate its performance using cross-validation."""
    # Perform cross-validation
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    print(f"Cross-validation scores: {cv_scores}")
    print(f"Mean CV accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
    
    # Train final model on full training data
    model.fit(X_train, y_train)
    
    return model

def generate_submission(model, X_test, test_ids, submission_path):
    """Generate submission file with predictions."""
    # Make predictions on test set
    predictions = model.predict(X_test)
    
    # Create submission dataframe
    submission_df = pd.DataFrame({
        'id': test_ids,
        'Depression': predictions
    })
    
    # Save to CSV
    submission_df.to_csv(submission_path, index=False)
    print(f"Submission file saved to {submission_path}")

def main():
    # File paths
    train_path = 'train.csv'
    test_path = 'test.csv'
    submission_path = 'submission.csv'
    
    # Load and preprocess data
    print("Loading and preprocessing data...")
    X_train, y_train, X_test, test_ids = load_and_preprocess_data(train_path, test_path)
    
    # Create and train model
    print("Creating and training model...")
    model = create_model()
    trained_model = train_and_evaluate_model(X_train, y_train, model)
    
    # Generate submission file
    print("Generating submission file...")
    generate_submission(trained_model, X_test, test_ids, submission_path)

if __name__ == "__main__":
    main()

Loading and preprocessing data...
Creating and training model...




Cross-validation scores: [0.89889837 0.90042644 0.89861407 0.8996091  0.90024876]
Mean CV accuracy: 0.8996 (+/- 0.0014)




Generating submission file...




Submission file saved to submission.csv
