In [16]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer


In [17]:
df = pd.read_csv("POST-ARRIVAL_STUDENT_MENTAL_HEALTH_SURVEY_EXTENDED.csv")

print(df.shape)
df.head()


(148, 31)


Unnamed: 0,Timestamp,1. Age Group,2. Gender,3.Academic Level,4. Country you are currently studying in.,5. How often do you feel overwhelmed by your responsibilities abroad?,6.How often do you feel nervous or on edge since arriving abroad?,7. How difficult is it for you to relax after your daily activities while living abroad?,8. How often do you feel low energy or lack motivation due to stress?,9. How often do you struggle with concentration because of mental pressure?,...,21. How stressed are you about managing living expenses abroad?,22. Do financial difficulties affect your emotional well-being?,23. How anxious are you about long-term visa or immigration matters?,24. How confident are you in finding stable work or financial support abroad?,25. Do you feel emotionally supported by your family while living abroad?,26. How often do you feel homesick?,27. How comfortable are you sharing emotional struggles with family or friends?,28. Do you get sufficient sleep (7–8 hours)?,29. How often do you engage in physical activity or exercise?,"30. Do you practice relaxation activities (prayer, meditation, breathing exercises)?"
0,2025/12/08 11:15:55 PM GMT+6,22-26,Male,Masters,USA,2.0,4.0,Yes,4.0,4.0,...,4.0,Yes,5.0,2.0,Yes,2.0,4.0,No,1.0,No
1,2025/12/15 10:01:13 PM GMT+6,18-22,Male,Masters,Australia,1.0,2.0,1,1.0,2.0,...,3.0,Yes,2.0,4.0,Yes,4.0,4.0,No,4.0,Yes
2,2025/12/15 10:03:03 PM GMT+6,26-30,Female,Masters,usa,4.0,4.0,3,4.0,4.0,...,4.0,Yes,4.0,2.0,No,4.0,4.0,No,4.0,No
3,2025/12/15 10:06:44 PM GMT+6,26-30,Female,Masters,Australia,4.0,4.0,4,3.0,4.0,...,4.0,Yes,4.0,3.0,Yes,4.0,4.0,No,4.0,Yes
4,2025/12/15 10:06:50 PM GMT+6,18-22,Male,Masters,Sweeden,2.0,2.0,2,2.0,4.0,...,3.0,Yes,3.0,4.0,Yes,3.0,4.0,Yes,4.0,Yes


In [18]:
df = df.drop(df.index[149:175])


In [19]:
depression_cols = [
    "5. How often do you feel overwhelmed by academic workload after arrival?",
    "6. How often do you feel nervous or anxious in the new environment?",
    "8. How often do you worry about coping with academic pressure?",
    "9. How often do you feel low energy or lack motivation after arrival?",
    "11. How anxious are you about financial or living expenses?",
    "12. How often do you overthink problems related to studies or生活 abroad?",
    "14. How stressed do you feel about cultural adjustment?",
    "20. How often do you feel pressure to perform well academically?",
    "22. How concerned are you about social isolation after arrival?",
    "24. Do you get sufficient sleep (7_8_ hours) after arrival?",
    "30. How stressed are you about managing studies and personal life?"
]


In [20]:
# -------------------------------
# AUTO-DETECT + FIX DEPRESSION COLUMNS (POST-ARRIVAL)
# -------------------------------

# keywords that identify depression-related questions
keywords = [
    "overwhelmed",
    "nervous",
    "anxious",
    "worry",
    "low energy",
    "motivation",
    "stressed",
    "pressure",
    "isolated",
    "sleep"
]

# auto-select matching columns
depression_cols = [
    col for col in df.columns
    if any(k.lower() in col.lower() for k in keywords)
]

print("Detected Depression Columns:")
for c in depression_cols:
    print(c)

# convert to numeric
df[depression_cols] = df[depression_cols].apply(
    pd.to_numeric, errors="coerce"
)

# reverse-code sleep (if present)
sleep_cols = [c for c in depression_cols if "sleep" in c.lower()]
for c in sleep_cols:
    df[c] = 6 - df[c]

# fill missing values
for col in depression_cols:
    df[col].fillna(df[col].median(), inplace=True)

# create depression score
df["Depression_Score"] = df[depression_cols].mean(axis=1)

# create multiclass label
def depression_level(score):
    if score <= 2.0:
        return "No/Minimal"
    elif score <= 3.0:
        return "Mild"
    elif score <= 4.0:
        return "Moderate"
    else:
        return "Severe"

df["Depression_Level"] = df["Depression_Score"].apply(depression_level)

print("\nDepression Level Distribution:")
print(df["Depression_Level"].value_counts())



Detected Depression Columns:
  5. How often do you feel overwhelmed by your responsibilities abroad?  
6.How often do you feel nervous or on edge since arriving abroad?  
  8. How often do you feel low energy or lack motivation due to stress?   
 9. How often do you struggle with concentration because of mental pressure?  
 12. How often do you feel socially isolated or lonely abroad?  
  17. Do you feel pressure to achieve high academic results?  
  19. How often do you feel stressed before exams or deadlines?   
  21. How stressed are you about managing living expenses abroad?  
  23. How anxious are you about long-term visa or immigration matters?  
  28. Do you get sufficient sleep (7–8 hours)?  

Depression Level Distribution:
Depression_Level
Moderate      92
No/Minimal    31
Mild          25
Name: count, dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)


In [21]:
X = df.drop(columns=["Depression_Level", "Depression_Score"])
y = df["Depression_Level"]


In [22]:
# Drop target and helper column
X = df.drop(columns=["Depression_Level", "Depression_Score"])
y = df["Depression_Level"]

print(X.shape, y.shape)


(148, 31) (148,)


In [24]:
categorical_cols = X.select_dtypes(include=["object"]).columns
numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns

print("Categorical features:", len(categorical_cols))
print("Numeric features:", len(numeric_cols))


Categorical features: 10
Numeric features: 21


In [25]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [26]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols)
    ]
)
