In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.ensemble import RandomForestClassifier


In [2]:
df = pd.read_csv("PRE_DEPARTURE_STUDENTS_MENTAL_HEALTH_SURVEY_EXTENDED.csv")

print(df.shape)
df.head()


(302, 31)


Unnamed: 0,Timestamp,1. Age Group,2. Gender,3. Academic Level,Destination Country,5. How often do you feel overwhelmed by your responsibilities during pre departure preparation?,6. How often do you feel nervous or on edge about going abroad?,7. Do you find it difficult to relax when thinking about your upcoming move?,8. How often do you worry about your ability to cope with challenges abroad?,9. How often do you feel low energy or lack motivation due to pre_departure stress?,...,21. How comfortable are you discussing your worries with friends or family?,22. How concerned are you about becoming socially isolated abroad?,23. How confident are you about making new friends abroad?,24. Do you get sufficient sleep (7_8_ hours) during the preparation period?,25. How often do you engage in physical activity or exercise?,"26. Do you use relaxation techniques (prayer, meditation, deep breathing)?","27. How prepared do you feel to live independently abroad (cooking, budgeting, managing tasks)?","28. How confident are you about handling unexpected issues abroad (illness, document problems, emergencies)?",29. How strongly do you believe you can manage homesickness after moving abroad?,30. How stressed are you about leaving your family and friends?
0,2025/12/08 11:05:08 PM GMT+6,23–26,Male,Undergraduate,USA,3.0,3.0,No,4.0,5.0,...,2.0,3.0,2.0,No,1.0,No,3.0,2.0,3.0,4.0
1,2025/12/08 11:38:12 PM GMT+6,23–27,Male,Undergraduate,Cypras,3.0,3.0,Yes,2.0,3.0,...,3.0,3.0,2.0,No,2.0,No,2.0,3.0,3.0,2.0
2,2025/12/08 11:41:23 PM GMT+6,23–28,Male,Undergraduate,Australia,3.0,3.0,Yes,3.0,4.0,...,,,,,,,,,,
3,2025/12/08 11:43:46 PM GMT+6,23–29,Male,Undergraduate,Anywhere with a good research program,3.0,1.0,No,1.0,1.0,...,5.0,1.0,5.0,No,4.0,Yes,5.0,5.0,3.0,2.0
4,2025/12/08 11:49:38 PM GMT+6,23–30,Male,Undergraduate,USA,4.0,4.0,Yes,1.0,4.0,...,4.0,3.0,4.0,yes,4.0,Yes,3.0,4.0,4.0,3.0


In [13]:
depression_cols = [
    "5. How often do you feel overwhelmed by your responsibilities during pre departure preparation?",
    "6. How often do you feel nervous or on edge about going abroad?",
    "8. How often do you worry about your ability to cope with challenges abroad?",
    "9. How often do you feel low energy or lack motivation due to pre_departure stress?",
    "11. How anxious are you about visa delays or possible rejection?",
    "12. How often do you overthink potential problems that might happen abroad?",
    "14. How stressed do you feel about adapting to a new cultural and academic environment?",
    "20. How often do you feel pressured by your family expectations?",
    "22. How concerned are you about becoming socially isolated abroad?",
    "24. Do you get sufficient sleep (7_8_ hours) during the preparation period?",
    "30. How stressed are you about leaving your family and friends?"
]


In [14]:
df[depression_cols] = df[depression_cols].apply(
    pd.to_numeric, errors="coerce"
)


In [16]:

sleep_col = "24. Do you get sufficient sleep (7_8_ hours) during the preparation period?"
df[sleep_col] = 6 - df[sleep_col]


In [17]:
for col in depression_cols:
    df[col].fillna(df[col].median(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)


In [18]:
df["Depression_Score"] = df[depression_cols].mean(axis=1)

df["Depression_Score"].describe()


count    302.000000
mean       3.140397
std        0.706324
min        1.500000
25%        2.525000
50%        3.300000
75%        3.700000
max        4.200000
Name: Depression_Score, dtype: float64

In [19]:
def depression_level(score):
    if score <= 2.0:
        return "No/Minimal"
    elif score <= 3.0:
        return "Mild"
    elif score <= 4.0:
        return "Moderate"
    else:
        return "Severe"

df["Depression_Level"] = df["Depression_Score"].apply(depression_level)

df["Depression_Level"].value_counts()


Depression_Level
Moderate      156
Mild           99
Severe         25
No/Minimal     22
Name: count, dtype: int64

In [20]:
df_model = df.drop(columns=depression_cols + ["Depression_Score"])


In [21]:
df_model = df.drop(columns=depression_cols + ["Depression_Score"])


In [22]:
X = df_model.drop(columns=["Depression_Level"])
y = df_model["Depression_Level"]


In [23]:
categorical_cols = X.select_dtypes(include=["object"]).columns
numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns


In [24]:
numeric_transformer = Pipeline(steps=[
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols)
    ]
)


In [25]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [26]:
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(
        n_estimators=200,
        random_state=42,
        class_weight="balanced"
    ))
])


In [27]:
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(
        n_estimators=200,
        random_state=42,
        class_weight="balanced"
    ))
])
