In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc, RocCurveDisplay

# Load the dataset
try:
    df = pd.read_csv('mental_health_analysis.csv')
    print("Dataset loaded successfully.")
    print("Dataset head:")
    print(df.head())
    print("\nDataset info:")
    df.info()
except FileNotFoundError:
    print("Error: mental_health_analysis.csv not found. Please ensure the file is in the correct directory.")
    exit()

# --- 1. Data Loading and Preprocessing ---

# Identify target variable (assuming 'mental_health_condition' as target for classification)
# If your target variable is different, please adjust 'target_variable'.
# If it's a regression task, specify that as well.
target_variable = 'mental_health_condition' # This is an assumption, please adjust if needed

# Check if the target variable exists in the DataFrame
if target_variable not in df.columns:
    print(f"Error: Target variable '{target_variable}' not found in the dataset columns.")
    print(f"Available columns: {df.columns.tolist()}")
    print("Please specify the correct target variable.")
    exit()

# Separate features (X) and target (y)
X = df.drop(columns=[target_variable])
y = df[target_variable]

# Identify categorical and numerical features
categorical_features = X.select_dtypes(include=['object', 'category']).columns
numerical_features = X.select_dtypes(include=['number']).columns

# Preprocessing pipelines for numerical and categorical features
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Create a preprocessor using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y if y.nunique() > 2 else None)
# stratify is used for classification to maintain class distribution

print(f"\nTraining set shape: {X_train.shape}, Test set shape: {X_test.shape}")

# --- 2. Baseline Model Training ---
print("\n--- Baseline Model Training (RandomForestClassifier) ---")

# Define the baseline model pipeline
# Using RandomForestClassifier as an example.
# If you used a different model (e.g., LogisticRegression, SVC) in Task 3, replace it here.
baseline_model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                          ('classifier', RandomForestClassifier(random_state=42))])

# Train the baseline model
baseline_model_pipeline.fit(X_train, y_train)

# Evaluate baseline model
y_pred_baseline = baseline_model_pipeline.predict(X_test)
print("\nBaseline Model Classification Report:")
print(classification_report(y_test, y_pred_baseline))

# Store baseline performance for comparison
baseline_report = classification_report(y_test, y_pred_baseline, output_dict=True)
baseline_accuracy = baseline_report['accuracy']
print(f"Baseline Model Accuracy: {baseline_accuracy:.4f}")

# --- 3. Hyperparameter Tuning (GridSearchCV) ---
print("\n--- Hyperparameter Tuning (GridSearchCV for RandomForestClassifier) ---")

# Define the model to be tuned (same as baseline, but now with tuning)
tuned_model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                        ('classifier', RandomForestClassifier(random_state=42))])

# Define the hyperparameter grid for RandomForestClassifier
# You can customize these parameters based on your understanding of the model and dataset
param_grid = {
    'classifier__n_estimators': [100, 200, 300], # Number of trees in the forest
    'classifier__max_features': ['sqrt', 'log2'], # Number of features to consider when looking for the best split
    'classifier__max_depth': [10, 20, 30, None], # Maximum number of levels in a tree
    'classifier__min_samples_split': [2, 5, 10], # Minimum number of data points placed in a node before the node is split
    'classifier__min_samples_leaf': [1, 2, 4] # Minimum number of data points allowed in a leaf node
}

# Set up GridSearchCV
# n_jobs=-1 uses all available CPU cores for parallel processing
grid_search = GridSearchCV(tuned_model_pipeline, param_grid, cv=5, n_jobs=-1, verbose=2, scoring='accuracy')

print("Starting GridSearchCV...")
grid_search.fit(X_train, y_train)

print("\nGridSearchCV complete.")
print(f"Best hyperparameters found: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")

# --- 4. Improved Model Training with Best Hyperparameters ---
print("\n--- Improved Model Training with Best Hyperparameters ---")

optimized_model = grid_search.best_estimator_
y_pred_optimized = optimized_model.predict(X_test)

print("\nOptimized Model Classification Report:")
print(classification_report(y_test, y_pred_optimized))

# Store optimized performance
optimized_report = classification_report(y_test, y_pred_optimized, output_dict=True)
optimized_accuracy = optimized_report['accuracy']
print(f"Optimized Model Accuracy: {optimized_accuracy:.4f}")

# --- 5. Advanced Evaluation Metrics and Visualizations ---
print("\n--- Advanced Evaluation Metrics and Visualizations ---")

# 5.1. Confusion Matrix
print("\nConfusion Matrix (Optimized Model):")
cm = confusion_matrix(y_test, y_pred_optimized)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=optimized_model.classes_, yticklabels=optimized_model.classes_)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix for Optimized Model')
plt.show()

# 5.2. ROC Curve and AUC (for binary classification)
# If your target variable has more than 2 unique values, this section needs adjustment for multi-class ROC.
# For simplicity, if it's multi-class, we'll focus on macro/weighted average AUC from classification report.
if len(np.unique(y)) == 2:
    print("\nROC Curve (Optimized Model):")
    y_prob_optimized = optimized_model.predict_proba(X_test)[:, 1] # Probability of the positive class

    fpr, tpr, thresholds = roc_curve(y_test, y_prob_optimized, pos_label=optimized_model.classes_[1])
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.show()

    # Alternative using RocCurveDisplay (more modern approach)
    print("\nROC Curve using RocCurveDisplay (Optimized Model):")
    fig, ax = plt.subplots(figsize=(8, 6))
    RocCurveDisplay.from_estimator(optimized_model, X_test, y_test, ax=ax, name='Optimized Model')
    ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='navy', label='Chance', alpha=.8)
    ax.set_title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.show()

else:
    print("\nSkipping ROC Curve: ROC curve is typically for binary classification. For multi-class, consider one-vs-rest or micro/macro averaging AUC.")


# 5.3. Residual Plots (if it were a regression task)
# This part is commented out as mental health analysis is often classification.
# If your task is regression, uncomment and adapt.
# if is_regression_task: # You'd define this flag earlier based on your target variable type
#     print("\nResidual Plots (Optimized Model - if regression):")
#     # Assuming 'y_pred_optimized' are continuous predictions and 'y_test' is continuous
#     residuals = y_test - y_pred_optimized
#     plt.figure(figsize=(10, 6))
#     sns.scatterplot(x=y_pred_optimized, y=residuals)
#     plt.axhline(y=0, color='r', linestyle='--')
#     plt.xlabel('Predicted Values')
#     plt.ylabel('Residuals')
#     plt.title('Residual Plot')
#     plt.show()
#
#     plt.figure(figsize=(8, 6))
#     sns.histplot(residuals, kde=True)
#     plt.xlabel('Residuals')
#     plt.ylabel('Frequency')
#     plt.title('Distribution of Residuals')
#     plt.show()


# 5.4. Time-series prediction plots (if applicable)
# This section assumes a time-series component, which is not directly evident from `mental_health_analysis.csv`.
# If your data has a 'Date' or 'Time' column and you are performing time-series forecasting,
# you would:
# 1. Ensure your train/test split respects the time order.
# 2. Plot actual vs. predicted values over time.
# Example (conceptual):
# if 'Date' in df.columns: # And you've done time-series splitting
#     # Assuming 'X_test_original' stores the original index/time from X_test
#     # and y_pred_optimized aligns with X_test_original's time order
#     plt.figure(figsize=(12, 6))
#     plt.plot(X_test_original['Date'], y_test, label='Actual')
#     plt.plot(X_test_original['Date'], y_pred_optimized, label='Predicted')
#     plt.xlabel('Date')
#     plt.ylabel('Mental Health Condition')
#     plt.title('Time Series Prediction (Optimized Model)')
#     plt.legend()
#     plt.xticks(rotation=45)
#     plt.tight_layout()
#     plt.show()
print("\nTime-series prediction plots are not generated as no time-series component was explicitly identified in the dataset for forecasting.")


# --- 6. Comparison of Baseline Model Performance with Optimized Model ---
print("\n--- Comparison of Baseline vs. Optimized Model Performance ---")

metrics_to_compare = ['accuracy', 'precision', 'recall', 'f1-score'] # You can add more

print(f"{'Metric':<15} {'Baseline':<15} {'Optimized':<15}")
print("-" * 45)

# For classification metrics (assuming 'weighted avg' for multi-class or overall accuracy)
for metric in metrics_to_compare:
    if metric == 'accuracy':
        baseline_value = baseline_accuracy
        optimized_value = optimized_accuracy
    else:
        # For precision, recall, f1-score, we often look at 'weighted avg' or 'macro avg' for multi-class
        # For simplicity, taking 'weighted avg' if available, else 'accuracy'
        baseline_value = baseline_report['weighted avg'][metric] if 'weighted avg' in baseline_report and metric in baseline_report['weighted avg'] else np.nan
        optimized_value = optimized_report['weighted avg'][metric] if 'weighted avg' in optimized_report and metric in optimized_report['weighted avg'] else np.nan

    print(f"{metric:<15} {baseline_value:<15.4f} {optimized_value:<15.4f}")

# Visual comparison (Bar Chart)
labels = ['Baseline', 'Optimized']
accuracy_scores = [baseline_accuracy, optimized_accuracy]

x = np.arange(len(labels))
width = 0.35

fig, ax = plt.subplots(figsize=(8, 6))
rects1 = ax.bar(x, accuracy_scores, width, label='Accuracy', color=['skyblue', 'lightcoral'])

ax.set_ylabel('Accuracy')
ax.set_title('Model Accuracy Comparison: Baseline vs. Optimized')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()

def autolabel(rects):
    for rect in rects:
        height = rect.get_height()
        ax.annotate(f'{height:.4f}',
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom')

autolabel(rects1)
plt.tight_layout()
plt.show()


print("\n--- Task 4: Model Optimization & Advanced Evaluation Complete ---")

Dataset loaded successfully.
Dataset head:
   User_ID  Age Gender  Social_Media_Hours  Exercise_Hours  Sleep_Hours  \
0        1   16      F            9.654486        2.458001     5.198926   
1        2   17      M            9.158143        0.392095     8.866097   
2        3   15      M            5.028755        0.520119     4.943095   
3        4   17      F            7.951103        1.022630     5.262773   
4        5   17      F            1.357459        1.225462     6.196080   

   Screen_Time_Hours  Survey_Stress_Score  Wearable_Stress_Score  \
0           8.158189                    3               0.288962   
1           5.151993                    5               0.409446   
2           9.209325                    2               0.423837   
3           9.823658                    5               0.666021   
4          11.338990                    5               0.928060   

  Support_System Academic_Performance  
0       Moderate            Excellent  
1       Moderate 

KeyError: "['mental_health_condition'] not found in axis"