In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Regression baseline: fit a linear model on every df_pca column except the
# target, then score it on a held-out split.
# NOTE(review): errors="ignore" only silences the drop itself; the y lookup on
# the following line still raises KeyError if "Depression" is absent.
X = df_pca.drop(columns=["Depression"], errors="ignore")
y = df_pca["Depression"]

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# partition reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# fit() hands back the fitted estimator, so construction and training chain.
model = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out rows only.
y_pred = model.predict(X_test)

# Report hold-out error (MSE) and explained variance (R²).
print("\n--- Regression Evaluation ---")
test_mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", test_mse)
test_r2 = r2_score(y_test, y_pred)
print("R² Score:", test_r2)

In [None]:
# -------------------------------
# STEP: Regression Visualization
# -------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# Diagnostic figures for the fitted regression. Each figure gets its own
# explicit Figure/Axes pair, and labels are applied after the seaborn call so
# they replace any axis labels seaborn sets on its own.

# 1. Actual vs predicted, with the y = x identity line as the "perfect fit" reference
fig, ax = plt.subplots(figsize=(7, 6))
sns.scatterplot(x=y_test, y=y_pred, alpha=0.7, ax=ax)
lo, hi = y_test.min(), y_test.max()  # identity line spans the observed target range
ax.plot([lo, hi], [lo, hi], 'r--', lw=2)
ax.set_title("Actual vs Predicted Depression Scores")
ax.set_xlabel("Actual Depression")
ax.set_ylabel("Predicted Depression")
plt.show()

# 2. Residuals (actual minus predicted) against the predicted values;
#    the dashed line marks zero error.
residuals = y_test - y_pred
fig, ax = plt.subplots(figsize=(7, 6))
sns.scatterplot(x=y_pred, y=residuals, alpha=0.7, ax=ax)
ax.axhline(0, color='red', linestyle='--')
ax.set_title("Residuals vs Predicted Values")
ax.set_xlabel("Predicted Depression")
ax.set_ylabel("Residuals")
plt.show()

# 3. Histogram of the residuals with a KDE overlay
fig, ax = plt.subplots(figsize=(7, 6))
sns.histplot(residuals, kde=True, color="purple", bins=20, ax=ax)
ax.set_title("Distribution of Residuals")
ax.set_xlabel("Residual")
ax.set_ylabel("Frequency")
plt.show()
