In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set(color_codes=True)

In [2]:
# Colab-only: attach Google Drive so the project folders are reachable
# under /content/drive for the rest of the notebook.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [11]:
# Project root on the mounted Google Drive; every other path is derived from it.
BASE_PATH = '/content/drive/MyDrive/StudentStressLevelMonitoring'

# Dataset read by this notebook.
INPUT_CSV = os.path.join(BASE_PATH, 'results/outputs', 'cleaned_dropna_no_duplicates.csv')

# Destination for tabular outputs (CSVs, text reports).
OUTPUTS_DRIVE = os.path.join(BASE_PATH, 'results/outputs')
os.makedirs(OUTPUTS_DRIVE, exist_ok=True)

# Destination for saved figures. Create this directory itself: the previous
# version re-created OUTPUTS_DRIVE here, so the EDA folder was never made and
# saving a plot failed on a fresh Drive.
OUTPUTS_DRIVE_EDA = os.path.join(BASE_PATH, 'results/eda_visualizations')
os.makedirs(OUTPUTS_DRIVE_EDA, exist_ok=True)

In [12]:
# Load the input dataset into memory; `df` is the untouched source frame that
# the later cells read from.
df = pd.read_csv(filepath_or_buffer=INPUT_CSV)

# Quick confirmation of what was read and how large it is.
print("Loaded:", INPUT_CSV)
print("Shape:", df.shape)


Loaded: /content/drive/MyDrive/StudentStressLevelMonitoring/results/outputs/cleaned_dropna_no_duplicates.csv
Shape: (1100, 21)


In [17]:
def save_boxplot(dataframe, cols, title, save_path):
    """Draw one boxplot per column and write the figure to disk.

    Defined at top level (not inside a conditional branch) so that the cells
    calling it later never hit a NameError.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the data to plot.
    cols : list of str
        Columns of ``dataframe`` to include. If empty, nothing is drawn.
    title : str
        Figure title.
    save_path : str
        Destination image path. Its parent directory is created if missing.

    Returns
    -------
    None
        The figure is saved and closed; it is not displayed inline.
    """
    if len(cols) == 0:
        # A boxplot of zero columns is meaningless; skip instead of failing.
        print(f"No columns to plot. Skipped: {save_path}")
        return

    # Make sure the target folder exists so savefig cannot fail on a missing directory.
    out_dir = os.path.dirname(save_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Widen the figure with the number of columns so the x labels stay readable.
    fig, ax = plt.subplots(figsize=(max(12, len(cols) * 0.7), 6))
    dataframe[cols].boxplot(ax=ax, rot=90)
    ax.set_title(title)
    fig.tight_layout()
    fig.savefig(save_path, dpi=150, bbox_inches="tight")
    # Close explicitly so figures do not accumulate in memory across calls.
    plt.close(fig)
    print(f"Saved: {save_path}")


# Names of all columns with a numeric dtype; outlier handling only looks at these.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

if not numeric_cols:
    print("No numeric columns found. Skipping outlier removal.")
    # Still save a passthrough dataset and a small report
    passthrough_path_drive = os.path.join(OUTPUTS_DRIVE, 'dataset_no_outliers.csv')
    df.to_csv(passthrough_path_drive, index=False)
    with open(os.path.join(OUTPUTS_DRIVE, "outlier_report.txt"), "w") as f:
        f.write("No numeric columns; no outlier removal performed.\n")


In [19]:
 # Snapshot of the numeric distributions before any rows are filtered out.
 save_boxplot(
     dataframe=df,
     cols=numeric_cols,
     title="Numeric Columns — Before Outlier Removal (IQR)",
     save_path=os.path.join(OUTPUTS_DRIVE_EDA, "box_before_outliers.png"),
 )

Saved: /content/drive/MyDrive/StudentStressLevelMonitoring/results/eda_visualizations/box_before_outliers.png


In [22]:
# Fence multiplier for the IQR rule: a value is an outlier when it lies
# below Q1 - k*IQR or above Q3 + k*IQR.
IQR_MULTIPLIER = 1.5

# Per-column quartiles and interquartile range over the numeric columns.
Q1 = df[numeric_cols].quantile(0.25)
Q3 = df[numeric_cols].quantile(0.75)
IQR = Q3 - Q1

# Columns with non-zero IQR (only these will be used for outlier filtering).
# With IQR == 0 both fences collapse onto the quartile, so any value that
# differs from it would be flagged.
usable_cols = IQR[IQR > 0].index.tolist()

if not usable_cols:
    print("All numeric columns have zero IQR (constant values). Skipping outlier removal.")
    df_cleaned = df.copy()
    outlier_rows_removed = 0
else:
    lower_bounds = Q1[usable_cols] - IQR_MULTIPLIER * IQR[usable_cols]
    upper_bounds = Q3[usable_cols] + IQR_MULTIPLIER * IQR[usable_cols]

    candidate_values = df[usable_cols]
    # Comparisons against NaN evaluate to False, which would silently drop every
    # row with a missing numeric value and count it as an "outlier". A missing
    # value is not an outlier, so treat it as within bounds.
    missing = candidate_values.isna()

    # Build a boolean mask for rows **within** bounds across usable columns
    # (axis=1 aligns the per-column bounds with the frame's columns).
    within_lower = candidate_values.ge(lower_bounds, axis=1) | missing
    within_upper = candidate_values.le(upper_bounds, axis=1) | missing
    within_all = within_lower & within_upper

    # Keep rows that are within bounds for **all** usable numeric columns
    condition = within_all.all(axis=1)

    # .copy() detaches the result from `df`, which is left unmodified.
    df_cleaned = df[condition].copy()
    outlier_rows_removed = len(df) - len(df_cleaned)

print("Original shape:", df.shape)
print("After removing outliers:", df_cleaned.shape)
print(f"Rows removed as outliers: {outlier_rows_removed}")


Original shape: (1100, 21)
After removing outliers: (793, 21)
Rows removed as outliers: 307


In [24]:
# Same plot as the "before" figure, drawn from the filtered frame, so the two
# images can be compared side by side.
save_boxplot(
    dataframe=df_cleaned,
    cols=numeric_cols,
    title="Numeric Columns — After Outlier Removal (IQR)",
    save_path=os.path.join(OUTPUTS_DRIVE_EDA, "box_after_outliers.png"),
)

Saved: /content/drive/MyDrive/StudentStressLevelMonitoring/results/eda_visualizations/box_after_outliers.png


In [25]:
# Write the outlier-filtered frame to the outputs folder.
# index=False keeps the pandas row index out of the file.
out_csv_drive = os.path.join(OUTPUTS_DRIVE, "dataset_no_outliers.csv")
df_cleaned.to_csv(path_or_buf=out_csv_drive, index=False)

print("Saved no-outlier CSVs to:")
print(" -", out_csv_drive)

Saved no-outlier CSVs to:
 - /content/drive/MyDrive/StudentStressLevelMonitoring/results/outputs/dataset_no_outliers.csv
