In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest, f_classif

In [3]:
# Render matplotlib figures inline in the notebook output area.
%matplotlib inline
# Apply seaborn's default theme; color_codes=True remaps matplotlib's
# single-letter color shorthands ("b", "g", "r", ...) to the seaborn palette.
sns.set(color_codes=True)

In [7]:
# Colab-only step: mount Google Drive so that the project files referenced by
# the path constants below are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
# --- Project locations (all under the Drive mount point) ---
BASE_PATH = '/content/drive/MyDrive/StudentStressLevelMonitoring'

# CSV artefacts and EDA figures go to two sibling folders; create both up front
# so later cells can write without checking for their existence.
OUTPUTS_DRIVE = os.path.join(BASE_PATH, 'results/outputs')
OUTPUTS_DRIVE_EDA = os.path.join(BASE_PATH, 'results/eda_visualizations')
for _out_dir in (OUTPUTS_DRIVE, OUTPUTS_DRIVE_EDA):
    os.makedirs(_out_dir, exist_ok=True)

# The input table is read from the same folder the final CSV is written to.
INPUT_CSV = os.path.join(OUTPUTS_DRIVE, 'scaled_dataset_with_engineered.csv')

# --- Column roles ---
TARGET = "stress_level"   # adjust if your label column has a different name
ENGINEERED = ["health_index", "academic_stress", "social_environment"]

In [9]:
# Read the input CSV into the working DataFrame used by every later cell.
df = pd.read_csv(INPUT_CSV)

# Sanity check: report the source path and dimensions, then show the first rows.
print("Loaded:", INPUT_CSV, "| Shape:", df.shape)
display(df.head(n=5))


Loaded: /content/drive/MyDrive/StudentStressLevelMonitoring/results/outputs/scaled_dataset_with_engineered.csv | Shape: (793, 24)


Unnamed: 0,anxiety_level,self_esteem,mental_health_history,depression,headache,blood_pressure,sleep_quality,breathing_problem,noise_level,living_conditions,...,teacher_student_relationship,future_career_concerns,social_support,peer_pressure,extracurricular_activities,bullying,stress_level,health_index,academic_stress,social_environment
0,0.666667,0.666667,0.0,0.407407,0.4,0.0,0.4,0.8,0.333333,0.666667,...,0.6,0.6,0.666667,0.6,0.6,0.4,1.0,4.5,2.666667,2.333333
1,0.714286,0.266667,1.0,0.555556,1.0,1.0,0.2,0.8,0.666667,0.0,...,0.2,1.0,0.333333,0.8,1.0,1.0,2.0,6.75,3.333333,3.333333
2,0.571429,0.6,1.0,0.518519,0.4,0.0,0.4,0.4,0.333333,0.333333,...,0.6,0.4,0.666667,0.6,0.4,0.4,1.0,4.75,2.333333,2.333333
3,0.761905,0.4,1.0,0.555556,0.8,1.0,0.2,0.6,1.0,0.333333,...,0.2,0.8,0.333333,0.8,0.8,1.0,2.0,6.25,3.333333,3.333333
4,0.761905,0.933333,0.0,0.259259,0.4,1.0,1.0,0.2,0.666667,0.333333,...,0.2,0.4,0.333333,1.0,0.0,1.0,1.0,3.25,3.0,3.666667


In [10]:
# The label column is mandatory: fail fast with a clear message if it is absent.
all_columns = list(df.columns)
if TARGET not in all_columns:
    raise ValueError(f"Target column '{TARGET}' not found in the dataset.")

# Engineered columns that actually exist in the file, in the order declared in ENGINEERED.
engineered_present = [col for col in ENGINEERED if col in all_columns]

# Every remaining column (neither engineered nor the label) is a candidate
# "original" feature; file column order is preserved.
excluded = set(engineered_present) | {TARGET}
original_features = [col for col in all_columns if col not in excluded]

print("\nEngineered features present:", engineered_present)
print("Candidate ORIGINAL features:", len(original_features))


Engineered features present: ['health_index', 'academic_stress', 'social_environment']
Candidate ORIGINAL features: 20


In [11]:
# A row is usable only if every candidate feature AND the label are present.
features_complete = df[original_features].notna().all(axis=1)
label_present = df[TARGET].notna()
valid_mask = features_complete & label_present

# Feature matrix and label vector restricted to the usable rows.
X = df.loc[valid_mask, original_features]
y = df.loc[valid_mask, TARGET]
print("After NA filter -> X:", X.shape, "| y:", y.shape)

After NA filter -> X: (793, 20) | y: (793,)


In [12]:
# Upper bound on how many original features to keep (tune here).
TOP_K_FEATURES = 10

k = min(TOP_K_FEATURES, X.shape[1])  # auto-adjust if fewer features are available
if k == 0:
    raise ValueError("No original features available for selection.")

# Guard the degenerate inputs explicitly instead of letting them surface as an
# opaque sklearn error or, worse, as a silently meaningless selection.
if X.shape[0] == 0:
    raise ValueError("No rows left after NA filtering; cannot score features.")

# The ANOVA F-test compares between-class to within-class variance, so it needs
# at least two distinct classes. With a single class every F-score is NaN and
# SelectKBest would still return k (arbitrary) columns.
n_classes = y.nunique()
if n_classes < 2:
    raise ValueError(
        f"Target '{y.name}' has {n_classes} distinct value(s); "
        "ANOVA F-test feature selection needs at least 2 classes."
    )

# NOTE(review): scores are computed on every valid row. If a train/test split
# happens downstream, the selector should be fit on the training rows only to
# avoid leaking label information into the held-out set.
selector = SelectKBest(score_func=f_classif, k=k)
# X_new (the reduced matrix) is not used by the cells visible here; it is kept
# so the notebook namespace is unchanged for any interactive follow-up.
X_new = selector.fit_transform(X, y)

# get_support() is a boolean mask over X's columns, so the selected names come
# back in original column order, not ranked by score.
selected_original = X.columns[selector.get_support()].tolist()
print(f"\nTop {k} selected ORIGINAL features:", selected_original)


Top 10 selected ORIGINAL features: ['self_esteem', 'headache', 'blood_pressure', 'sleep_quality', 'safety', 'basic_needs', 'academic_performance', 'teacher_student_relationship', 'future_career_concerns', 'bullying']


In [13]:
# Column layout of the exported table: engineered features first, then the
# selected original features, with the label last.
final_cols = [*engineered_present, *selected_original, TARGET]

# Keep only the rows that passed the NA filter; .copy() detaches the result
# from `df` so later edits to one never affect the other.
df_final = df.loc[valid_mask, final_cols].copy()

print("\nFinal dataset preview:")
display(df_final.head(n=5))


Final dataset preview:


Unnamed: 0,health_index,academic_stress,social_environment,self_esteem,headache,blood_pressure,sleep_quality,safety,basic_needs,academic_performance,teacher_student_relationship,future_career_concerns,bullying,stress_level
0,4.5,2.666667,2.333333,0.666667,0.4,0.0,0.4,0.6,0.4,0.6,0.6,0.6,0.4,1.0
1,6.75,3.333333,3.333333,0.266667,1.0,1.0,0.2,0.4,0.4,0.2,0.2,1.0,1.0,2.0
2,4.75,2.333333,2.333333,0.6,0.4,0.0,0.4,0.6,0.4,0.4,0.6,0.4,0.4,1.0
3,6.25,3.333333,3.333333,0.4,0.8,1.0,0.2,0.4,0.4,0.4,0.2,0.8,1.0,2.0
4,3.25,3.0,3.666667,0.933333,0.4,1.0,1.0,0.8,0.6,0.8,0.2,0.4,1.0,1.0


In [14]:
# One row per scored feature; features whose F-score is NaN are dropped.
scores_df = pd.DataFrame({
    "feature": X.columns,
    "f_score": selector.scores_
}).dropna()

# Restrict to the selected features and rank them, strongest first.
is_selected = scores_df["feature"].isin(selected_original)
scores_sel = scores_df.loc[is_selected].sort_values("f_score", ascending=False)

# Horizontal bar chart; the height grows with the number of bars (minimum 3 in).
fig, ax = plt.subplots(figsize=(10, max(3, 0.4*len(scores_sel))))
ax.barh(scores_sel["feature"], scores_sel["f_score"])
ax.invert_yaxis()  # put the highest-scoring feature at the top
ax.set_xlabel("ANOVA F-score")
ax.set_title("Selected Features — ANOVA F-scores")
fig.tight_layout()

# Write the figure to the EDA folder, then close it: the chart is saved to
# disk rather than displayed inline.
path = os.path.join(OUTPUTS_DRIVE_EDA, "selected_features_f_scores.png")
fig.savefig(path, dpi=150, bbox_inches="tight")
print("Saved:", path)
plt.close(fig)

Saved: /content/drive/MyDrive/StudentStressLevelMonitoring/results/eda_visualizations/selected_features_f_scores.png


In [15]:
# Columns shown in the heatmap: selected originals, engineered features, label.
heat_cols = [*selected_original, *engineered_present, TARGET]
corr_df = df.loc[valid_mask, heat_cols].copy()

# If target is not numeric, convert to codes for visualization only
target_is_numeric = np.issubdtype(corr_df[TARGET].dtype, np.number)
if not target_is_numeric:
    corr_df[TARGET] = pd.Categorical(corr_df[TARGET]).codes

# Pairwise correlations over the numeric columns only.
corr = corr_df.corr(numeric_only=True)

# Scale the canvas with the number of variables (minimum 8 x 6 inches).
n_vars = len(corr.columns)
fig, ax = plt.subplots(figsize=(max(8, 0.6*n_vars), max(6, 0.6*n_vars)))
sns.heatmap(corr, annot=False, cmap="vlag", center=0, ax=ax)
ax.set_title("Correlation Heatmap — Selected + Engineered + Target")
fig.tight_layout()

# Save to the EDA folder and close the figure (saved to disk, not shown inline).
path = os.path.join(OUTPUTS_DRIVE_EDA, "correlation_heatmap.png")
fig.savefig(path, dpi=150, bbox_inches="tight")
print("Saved:", path)
plt.close(fig)

Saved: /content/drive/MyDrive/StudentStressLevelMonitoring/results/eda_visualizations/correlation_heatmap.png


In [16]:
# Cap on the number of per-feature boxplots written to disk.
MAX_BOX_PLOTS = 10
plot_features = selected_original[:MAX_BOX_PLOTS]

# One figure per feature: distribution of the feature within each target level.
for feat in plot_features:
    if feat not in df.columns:
        continue

    fig, ax = plt.subplots(figsize=(7, 5))
    try:
        sns.boxplot(
            x=df.loc[valid_mask, TARGET],
            y=df.loc[valid_mask, feat],
            ax=ax,
        )
        ax.set(xlabel=TARGET, ylabel=feat, title=f"{feat} vs {TARGET}")
        fig.tight_layout()

        path = os.path.join(OUTPUTS_DRIVE_EDA, f"box_{feat}_vs_{TARGET}.png")
        fig.savefig(path, dpi=150, bbox_inches="tight")
        print("Saved:", path)
    except Exception as e:
        # Best effort: a feature that cannot be plotted is reported and skipped
        # so the remaining plots are still produced.
        print(f"[skip] Boxplot failed for {feat}: {e}")
    # Close every figure, plotted or not, so open figures do not pile up.
    plt.close(fig)

Saved: /content/drive/MyDrive/StudentStressLevelMonitoring/results/eda_visualizations/box_self_esteem_vs_stress_level.png
Saved: /content/drive/MyDrive/StudentStressLevelMonitoring/results/eda_visualizations/box_headache_vs_stress_level.png
Saved: /content/drive/MyDrive/StudentStressLevelMonitoring/results/eda_visualizations/box_blood_pressure_vs_stress_level.png
Saved: /content/drive/MyDrive/StudentStressLevelMonitoring/results/eda_visualizations/box_sleep_quality_vs_stress_level.png
Saved: /content/drive/MyDrive/StudentStressLevelMonitoring/results/eda_visualizations/box_safety_vs_stress_level.png
Saved: /content/drive/MyDrive/StudentStressLevelMonitoring/results/eda_visualizations/box_basic_needs_vs_stress_level.png
Saved: /content/drive/MyDrive/StudentStressLevelMonitoring/results/eda_visualizations/box_academic_performance_vs_stress_level.png
Saved: /content/drive/MyDrive/StudentStressLevelMonitoring/results/eda_visualizations/box_teacher_student_relationship_vs_stress_level.png
S

In [17]:
# Destination of the final table (engineered + selected features + label).
FINAL_CSV_DRIVE = os.path.join(OUTPUTS_DRIVE, "final_selected_features_dataset.csv")

# index=False omits the DataFrame index column from the written file.
df_final.to_csv(path_or_buf=FINAL_CSV_DRIVE, index=False)

print("\nSaved:")
print(" -", FINAL_CSV_DRIVE)



Saved:
 - /content/drive/MyDrive/StudentStressLevelMonitoring/results/outputs/final_selected_features_dataset.csv
