In [1]:
import os, json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.model_selection import (
    train_test_split, StratifiedKFold, cross_val_score, cross_validate
)
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, f1_score, classification_report,
    confusion_matrix
)
from sklearn.linear_model import LogisticRegression

In [3]:
import joblib

%matplotlib inline
# Apply seaborn's default theme and remap matplotlib's single-letter colour
# codes ("b", "g", "r", ...) to the seaborn palette. `set_theme` is the
# preferred name; `sns.set` is only kept as an alias for it.
sns.set_theme(color_codes=True)

In [4]:
# Mount Google Drive inside the Colab VM. Every path configured below is
# rooted under this mount point ('/content/drive'), so this cell must run
# before any data is read or any artifact is written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# --- Data location and split configuration -------------------------------
# Project root on the mounted Google Drive.
BASE_PATH = '/content/drive/MyDrive/StudentStressLevelMonitoring'

# CSV that this notebook loads into `df`.
INPUT_CSV = os.path.join(
    BASE_PATH,
    'results/outputs',
    'final_selected_features_dataset.csv',
)

# Name of the label column inside the CSV.
TARGET = 'stress_level'

# Fraction of rows held out as the test set.
TEST_SIZE = 0.2

# Seed passed as `random_state` to the train/test split and the CV splitter.
RANDOM_SEED = 42

In [6]:
# --- Dimensionality-reduction configuration -------------------------------
# Upper bound on how many features the SelectKBest step keeps.
K_TOP = 10

# Upper bound on how many principal components the PCA step produces.
N_COMPONENTS = 5

In [7]:
# Destination folders on Drive: one for tabular/model artifacts, one for figures.
OUTPUTS_DRIVE = os.path.join(BASE_PATH, 'results/outputs')
OUTPUTS_DRIVE_EDA = os.path.join(BASE_PATH, 'results/eda_visualizations')

# Create both folders up front; `exist_ok=True` makes re-running this cell a no-op.
for _out_dir in (OUTPUTS_DRIVE, OUTPUTS_DRIVE_EDA):
    os.makedirs(_out_dir, exist_ok=True)


In [8]:
# Read the CSV at INPUT_CSV and show its shape plus the first rows.
df = pd.read_csv(INPUT_CSV)
print("Loaded:", INPUT_CSV, "| Shape:", df.shape)
display(df.head())

# Stop here if the label column is absent: every later cell depends on it.
target_is_present = TARGET in df.columns
if not target_is_present:
    raise ValueError(f"Target column '{TARGET}' not found in dataset.")

Loaded: /content/drive/MyDrive/StudentStressLevelMonitoring/results/outputs/final_selected_features_dataset.csv | Shape: (793, 14)


Unnamed: 0,health_index,academic_stress,social_environment,self_esteem,headache,blood_pressure,sleep_quality,safety,basic_needs,academic_performance,teacher_student_relationship,future_career_concerns,bullying,stress_level
0,4.5,2.666667,2.333333,0.666667,0.4,0.0,0.4,0.6,0.4,0.6,0.6,0.6,0.4,1.0
1,6.75,3.333333,3.333333,0.266667,1.0,1.0,0.2,0.4,0.4,0.2,0.2,1.0,1.0,2.0
2,4.75,2.333333,2.333333,0.6,0.4,0.0,0.4,0.6,0.4,0.4,0.6,0.4,0.4,1.0
3,6.25,3.333333,3.333333,0.4,0.8,1.0,0.2,0.4,0.4,0.4,0.2,0.8,1.0,2.0
4,3.25,3.0,3.666667,0.933333,0.4,1.0,1.0,0.8,0.6,0.8,0.2,0.4,1.0,1.0


In [9]:
# Build the feature matrix X and the label vector y from the loaded frame.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

# Features are the numeric columns minus the target AND minus any
# "Unnamed: N" column. pandas assigns that name to a column whose header is
# empty, which is what a row index written by `to_csv` looks like on re-read.
# It is numeric, so without this filter it is silently trained on and the
# saved pipeline would demand it at inference time.
feature_cols = [
    c for c in numeric_cols
    if c != TARGET and not str(c).startswith("Unnamed:")
]

if not feature_cols:
    raise ValueError("No numeric feature columns available for training.")

X = df[feature_cols].copy()
y = df[TARGET].copy()

# Remove rows with missing y (a stratified split cannot handle NaN labels).
mask = ~y.isnull()
X, y = X[mask], y[mask]

In [10]:
# Hold out TEST_SIZE of the rows for final evaluation. Stratifying on y keeps
# the class proportions the same in both partitions.
_split_parts = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_SEED,
    stratify=y,
)
X_train, X_test, y_train, y_test = _split_parts

print("Train:", X_train.shape, "Test:", X_test.shape)


Train: (634, 13) Test: (159, 13)


In [19]:
# Resolve the effective sizes of the two reduction steps.
# SelectKBest cannot keep more columns than the data has, and PCA only ever
# sees the columns SelectKBest lets through, so the component count must be
# clamped to the *selected* width, not the raw feature count. Otherwise
# lowering K_TOP below N_COMPONENTS would make PCA raise at fit time.
n_selected = min(K_TOP, X_train.shape[1])
n_components = min(N_COMPONENTS, n_selected)

# Full preprocessing + model pipeline. Each step is fitted on training data
# only when the pipeline is fitted or cross-validated:
#   imputer -> median-fill missing values
#   scaler  -> rescale each feature to [0, 1]
#   select  -> keep the n_selected features with the highest ANOVA F-score
#   pca     -> project the selected features onto n_components components
#   clf     -> logistic regression classifier
pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", MinMaxScaler()),
    ("select", SelectKBest(score_func=f_classif, k=n_selected)),
    ("pca", PCA(n_components=n_components)),
    ("clf", LogisticRegression(max_iter=2000, n_jobs=None))
])

In [20]:
# 5-fold stratified cross-validation on the training partition.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

# Score both metrics in a single pass: `cross_validate` fits the pipeline once
# per fold and evaluates every requested scorer on that same fitted model,
# instead of refitting all folds once per metric.
cv_results = cross_validate(
    pipe, X_train, y_train, cv=cv, scoring=["accuracy", "f1_macro"]
)
cv_acc = cv_results["test_accuracy"]
cv_f1  = cv_results["test_f1_macro"]

print(f"CV Accuracy: mean={cv_acc.mean():.4f} ± {cv_acc.std():.4f}")
print(f"CV F1-macro: mean={cv_f1.mean():.4f} ± {cv_f1.std():.4f}")

CV Accuracy: mean=0.9448 ± 0.0086
CV F1-macro: mean=0.9359 ± 0.0137


In [21]:
# Fit on the full training partition, then predict the held-out test rows.
# `Pipeline.fit` returns the pipeline itself, so the two calls can be chained.
y_pred = pipe.fit(X_train, y_train).predict(X_test)

# Headline metrics plus the per-class breakdown on the test set.
acc = accuracy_score(y_test, y_pred)
f1m = f1_score(y_test, y_pred, average="macro")
report_text = classification_report(y_test, y_pred)

print(f"\nTEST Accuracy: {acc:.4f}")
print(f"TEST F1-macro: {f1m:.4f}")
print("\nClassification Report:\n", report_text)


TEST Accuracy: 0.9748
TEST F1-macro: 0.9711

Classification Report:
               precision    recall  f1-score   support

         0.0       0.97      0.97      0.97        64
         1.0       1.00      0.98      0.99        64
         2.0       0.94      0.97      0.95        31

    accuracy                           0.97       159
   macro avg       0.97      0.97      0.97       159
weighted avg       0.98      0.97      0.97       159



In [22]:
# Confusion matrix of test-set predictions (rows = actual, columns = predicted).
cm = confusion_matrix(y_test, y_pred)

# Draw on an explicit figure/axes pair rather than pyplot's implicit current figure.
fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_title("Confusion Matrix (Test)")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
fig.tight_layout()

# Write the figure to every configured output folder, then release it.
for outdir in [OUTPUTS_DRIVE_EDA]:
    p = os.path.join(outdir, "confusion_matrix.png")
    fig.savefig(p, dpi=150, bbox_inches="tight")
    print("Saved:", p)
plt.close(fig)

Saved: /content/drive/MyDrive/StudentStressLevelMonitoring/results/eda_visualizations/confusion_matrix.png


In [23]:
# Persist the fitted pipeline (preprocessing steps + classifier) as a single
# joblib file in the Drive outputs folder.
MODEL_PATH_DRIVE = os.path.join(OUTPUTS_DRIVE, "trained_pipeline.joblib")
joblib.dump(value=pipe, filename=MODEL_PATH_DRIVE)

print("\nSaved trained pipeline to:")
print(" -", MODEL_PATH_DRIVE)


Saved trained pipeline to:
 - /content/drive/MyDrive/StudentStressLevelMonitoring/results/outputs/trained_pipeline.joblib
