In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


In [2]:
# Read both raw survey exports (pre-departure first, then post-arrival)
# from the notebook's working directory into DataFrames.
pre_df, post_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        "PRE_DEPARTURE_STUDENTS_MENTAL_HEALTH_SURVEY_EXTENDED.csv",
        "POST-ARRIVAL_STUDENT_MENTAL_HEALTH_SURVEY_EXTENDED.csv",
    )
)


In [5]:
def preprocess_dataset(df):
    """Clean, impute, encode and standardise a survey DataFrame.

    Steps, in order:
      1. Drop the ``Timestamp`` column (if present) and exact duplicate rows.
      2. Drop columns that contain no data at all.
      3. Classify every remaining column as binary (yes/no answers, matched
         case-insensitively and ignoring surrounding whitespace), numerical
         (numeric dtype) or categorical (everything else).
      4. Impute missing values: median for numerical columns, most frequent
         value for categorical and binary columns.
      5. Encode binary columns as 1 (yes) / 0 (no) and one-hot encode
         categorical columns with the first level dropped.
      6. Standardise the numerical columns with ``StandardScaler``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw survey responses. The caller's frame is never modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the transformations above applied. If the input has
        no rows (or no columns) left after de-duplication it is returned
        as-is, without encoding or scaling.

    Notes
    -----
    The scaler is fitted on the frame passed in and then discarded, so two
    datasets processed by separate calls are scaled independently.
    """
    df = df.copy()

    # Remove Timestamp
    if 'Timestamp' in df.columns:
        df = df.drop(columns=['Timestamp'])

    # Remove duplicates
    # NOTE(review): this runs after Timestamp is dropped, so two respondents
    # who gave identical answers collapse into one row - confirm intended.
    df = df.drop_duplicates()

    # Nothing to impute, encode or scale; the steps below would fail on an
    # empty frame (no mode to take, scaler rejects zero samples).
    if df.empty:
        return df

    # Columns without a single value carry no information and have no
    # mode/median to impute from, so drop them up front.
    df = df.dropna(axis=1, how='all')

    yes_no_codes = {'yes': 1, 'no': 0}

    categorical_cols = []
    binary_cols = []
    numerical_cols = []

    for col in df.columns:
        unique_vals = df[col].dropna().unique()

        # The explicit length check matters: all() over nothing is True.
        if len(unique_vals) > 0 and all(
            isinstance(val, str) and val.strip().lower() in yes_no_codes
            for val in unique_vals
        ):
            binary_cols.append(col)
        elif pd.api.types.is_numeric_dtype(df[col]):
            numerical_cols.append(col)
        else:
            # Object, string, category, datetime, ... - anything that cannot
            # be fed to median()/StandardScaler is treated as categorical.
            categorical_cols.append(col)

    # Handle missing values
    for col in numerical_cols:
        if df[col].isna().any():
            df[col] = df[col].fillna(df[col].median())

    for col in categorical_cols:
        df[col] = df[col].fillna(df[col].mode()[0])

    # Encode binary columns first and impute afterwards, so that spelling
    # variants ('Yes' / 'yes') are counted together when taking the mode.
    for col in binary_cols:
        encoded = (
            df[col].str.strip().str.lower().map(yes_no_codes).astype(float)
        )
        df[col] = encoded.fillna(encoded.mode()[0]).astype('int64')

    # One-hot encoding
    if categorical_cols:
        df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

    # Scaling (StandardScaler raises when given zero feature columns)
    if numerical_cols:
        scaler = StandardScaler()
        df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

    return df


In [6]:
# Run the same preprocessing routine over each survey, pre-departure first.
preprocessed_pre_df, preprocessed_post_df = map(
    preprocess_dataset, (pre_df, post_df)
)


In [7]:
# Write each preprocessed frame to its own CSV, leaving out the row index.
preprocessed_pre_df.to_csv(
    path_or_buf="preprocessed_pre_departure.csv",
    index=False,
)
preprocessed_post_df.to_csv(
    path_or_buf="preprocessed_post_arrival.csv",
    index=False,
)

# Confirmation message once both files have been written.
print("✅ Preprocessing completed and files saved successfully.")


✅ Preprocessing completed and files saved successfully.
