In [None]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (
    accuracy_score, confusion_matrix, ConfusionMatrixDisplay,
    mean_squared_error, mean_absolute_error, precision_score,
    recall_score, f1_score
)
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Load the dataset
file_path = "Student Depression Dataset.csv"
data = pd.read_csv(file_path)

# Dataset Overview
print("Dataset Overview")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the report.
data.info()
print("\nFirst 5 Rows of the Data:")
print(data.head())
print("\nSummary Statistics:")
print(data.describe())

# Data Preprocessing
# Rows without a label cannot be used for training or evaluation, and imputing
# the target would corrupt it (a mean yields fractional class labels), so such
# rows are dropped before any filling happens.
target_column = "Depression"
if target_column in data.columns:
    data.dropna(subset=[target_column], inplace=True)

# Fill missing values
# Numeric columns: replace NaN with the column mean.
numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].mean())

# Categorical columns: replace NaN with the most frequent value of the column.
# mode() returns an empty frame when there are no categorical columns (or no
# non-missing values), so the first-row lookup is guarded.
categorical_columns = data.select_dtypes(include=['object']).columns
categorical_modes = data[categorical_columns].mode()
if not categorical_modes.empty:
    data[categorical_columns] = data[categorical_columns].fillna(categorical_modes.iloc[0])

# Encode categorical columns
# A separate encoder is fitted per column and kept in `label_encoders`, so the
# integer codes of every column can later be mapped back to the original
# categories (a single shared encoder only remembers the last column it saw).
label_encoders = {}
for col in categorical_columns:
    label_encoder = LabelEncoder()
    data[col] = label_encoder.fit_transform(data[col])
    label_encoders[col] = label_encoder

# Data Visualization
# Font sizes shared by both figures in this section.
title_size = 16
axis_label_size = 12

# Depression Distribution: one bar per value of the "Depression" column.
plt.figure(figsize=(10, 6))
sns.countplot(data=data, x="Depression", hue="Depression", palette="coolwarm")
plt.title("Depression Distribution", fontsize=title_size)
plt.xlabel("Depression (0: No, 1: Yes)", fontsize=axis_label_size)
plt.ylabel("Count", fontsize=axis_label_size)
plt.show()

# Correlation Heatmap: pairwise correlations of every column currently in
# `data`, with each cell annotated to two decimals.
correlation_matrix = data.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, cmap="coolwarm", annot=True, fmt=".2f", cbar=True)
plt.title("Correlation Heatmap", fontsize=title_size)
plt.show()

# Feature Selection
# The 'id', 'City' and 'Profession' columns are not used as model inputs.
unused_columns = ['id', 'City', 'Profession']
data.drop(columns=unused_columns, inplace=True)

# Prepare data for modeling: "Depression" is the target, every other
# remaining column is a feature.
y = data["Depression"]
X = data.drop(columns="Depression")

# Split data into training and testing sets: 30% is held out for testing and
# the split is stratified on the target.
split_options = {"test_size": 0.3, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Feature scaling: the scaler is fitted on the training split only and then
# applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Model Training and Evaluation
# Each classifier below is fitted on the scaled training split and scored on
# the scaled test split.
# NOTE(review): SVC(probability=True) makes fit() run an additional internal
# cross-validation to calibrate probabilities, yet predict_proba() is never
# called in this section -- consider dropping the flag if nothing else needs it.
models = {
    "Support Vector Machines": SVC(probability=True, kernel='rbf', random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(weights='distance', metric='euclidean'),
    "Logistic Regression": LogisticRegression(random_state=42, solver='liblinear'),
    "Random Forest": RandomForestClassifier(random_state=42),
}

for name, model in models.items():
    print(f"\n{name}")

    # Fit the model
    model.fit(X_train, y_train)

    # Predictions: computed once and reused by every metric and by the
    # confusion matrix plot below.
    y_pred = model.predict(X_test)

    # Evaluation Metrics (average='binary' scores the positive class only)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='binary')
    recall = recall_score(y_test, y_pred, average='binary')
    f1 = f1_score(y_test, y_pred, average='binary')

    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")

    # Error metrics, computed on the predicted class labels.
    # NOTE(review): assuming 0/1 labels, MSE and MAE both reduce to the
    # misclassification rate (1 - accuracy).
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)

    print(f"Mean Squared Error (MSE): {mse:.2f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
    print(f"Mean Absolute Error (MAE): {mae:.2f}")

    # Confusion matrix, built from the predictions already in hand rather than
    # asking the estimator to predict the whole test set a second time.
    disp = ConfusionMatrixDisplay.from_predictions(
        y_test, y_pred, cmap='Blues'
    )
    disp.ax_.set_title(f"Confusion Matrix: {name}")
    plt.show()

# Unsupervised Learning - KMeans Clustering
# Clustering uses the full feature matrix, so it gets its own scaler:
# refitting `scaler` here would overwrite the statistics it learned from the
# training split of the supervised models.
cluster_scaler = StandardScaler()
X_scaled = cluster_scaler.fit_transform(X)

# Elbow Method to find optimal number of clusters
# For each candidate k the fitted model's inertia_ is recorded.
inertia = []
range_n_clusters = list(range(2, 11))

for n_clusters in range_n_clusters:
    # n_init is set explicitly: its default differs between scikit-learn
    # releases, which would otherwise change results (and emit a
    # FutureWarning on some versions).
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    kmeans.fit(X_scaled)
    inertia.append(kmeans.inertia_)

# Plotting the Elbow Method
plt.figure(figsize=(8, 6))
plt.plot(range_n_clusters, inertia, marker='o', linestyle='-', color='b')
plt.title("Elbow Method: Optimal Number of Clusters")
plt.xlabel("Number of Clusters (k)")
plt.ylabel("Inertia")
plt.xticks(range_n_clusters)
plt.grid(True)
plt.show()

# KMeans Clustering with optimal k
optimal_k = 3  # Determined from elbow plot
# n_init is set explicitly: its default differs between scikit-learn releases,
# which would otherwise change the clustering from one version to another.
kmeans_optimal = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
cluster_labels_optimal = kmeans_optimal.fit_predict(X_scaled)

# Visualize clusters using PCA
# The scaled features are projected onto two components for plotting only;
# the clustering above was done on the full scaled feature matrix.
# random_state matches the other estimators in this script so the projection
# is reproducible when a randomized SVD solver is selected.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# Create DataFrame with PCA components and cluster labels
X_pca_df = pd.DataFrame(X_pca, columns=['PCA1', 'PCA2'])
X_pca_df['Cluster'] = cluster_labels_optimal

# Plot the clusters, one colour per cluster label
plt.figure(figsize=(10, 8))
sns.scatterplot(
    data=X_pca_df, x='PCA1', y='PCA2',
    hue='Cluster', palette="Set1",
    s=100, edgecolor='black'
)
plt.title(f"KMeans Clustering with {optimal_k} Clusters (PCA)", fontsize=16)
plt.xlabel('PCA Component 1', fontsize=12)
plt.ylabel('PCA Component 2', fontsize=12)
plt.show()