In [9]:
import pandas as pd
import numpy as np

# Load the Excel file
file_path = "Student Mental health.csv.xlsx"
df = pd.read_excel(file_path)

# Show the structure and first few rows
df.info(), df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Data columns (total 11 columns):
 #   Column                                        Non-Null Count  Dtype         
---  ------                                        --------------  -----         
 0   Timestamp                                     101 non-null    datetime64[ns]
 1   Choose your gender                            101 non-null    object        
 2   Age                                           100 non-null    float64       
 3   What is your course?                          101 non-null    object        
 4   Your current year of Study                    101 non-null    object        
 5   What is your CGPA?                            101 non-null    object        
 6   Marital status                                101 non-null    object        
 7   Do you have Depression?                       101 non-null    object        
 8   Do you have Anxiety?                          101 non-null    object  

(None,
             Timestamp Choose your gender   Age What is your course?  \
 0 2020-07-08 12:02:00             Female  18.0          Engineering   
 1 2020-07-08 12:04:00               Male  21.0    Islamic education   
 2 2020-07-08 12:05:00               Male  19.0                  BIT   
 3 2020-07-08 12:06:00             Female  22.0                 Laws   
 4 2020-07-08 12:13:00               Male  23.0         Mathemathics   
 
   Your current year of Study What is your CGPA? Marital status  \
 0                     year 1        3.00 - 3.49             No   
 1                     year 2        3.00 - 3.49             No   
 2                     Year 1        3.00 - 3.49             No   
 3                     year 3        3.00 - 3.49            Yes   
 4                     year 4        3.00 - 3.49             No   
 
   Do you have Depression? Do you have Anxiety? Do you have Panic attack?  \
 0                     Yes                   No                       Yes   
 

In [5]:
df.info()
df.describe(include='all')
df.isnull().sum()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Data columns (total 11 columns):
 #   Column                                        Non-Null Count  Dtype         
---  ------                                        --------------  -----         
 0   Timestamp                                     101 non-null    datetime64[ns]
 1   Choose your gender                            101 non-null    object        
 2   Age                                           100 non-null    float64       
 3   What is your course?                          101 non-null    object        
 4   Your current year of Study                    101 non-null    object        
 5   What is your CGPA?                            101 non-null    object        
 6   Marital status                                101 non-null    object        
 7   Do you have Depression?                       101 non-null    object        
 8   Do you have Anxiety?                          101 non-null    object  

Timestamp                                       0
Choose your gender                              0
Age                                             1
What is your course?                            0
Your current year of Study                      0
What is your CGPA?                              0
Marital status                                  0
Do you have Depression?                         0
Do you have Anxiety?                            0
Do you have Panic attack?                       0
Did you seek any specialist for a treatment?    0
dtype: int64

In [17]:
df.replace(['', ' ', '-', 'N/A', None], np.nan, inplace=True)

In [23]:
df = df.dropna()

In [25]:
df.isnull().sum()

Timestamp                                       0
Choose your gender                              0
Age                                             0
What is your course?                            0
Your current year of Study                      0
What is your CGPA?                              0
Marital status                                  0
Do you have Depression?                         0
Do you have Anxiety?                            0
Do you have Panic attack?                       0
Did you seek any specialist for a treatment?    0
dtype: int64

In [34]:
num_rows = df.shape[0]
print(f"Total rows: {num_rows}")
num_rows = df.shape[1]
print(f"Total columns: {num_rows}")

Total rows: 100
Total columns: 11


## At risk evaluation

In [37]:
# Convert text to binary (1 = Yes, 0 = No)
df['Depression'] = df['Do you have Depression?'].map({'Yes': 1, 'No': 0})
df['Anxiety'] = df['Do you have Anxiety?'].map({'Yes': 1, 'No': 0})
df['Panic'] = df['Do you have Panic attack?'].map({'Yes': 1, 'No': 0})
df['Sought_Treatment'] = df['Did you seek any specialist for a treatment?'].map({'Yes': 1, 'No': 0})

# Define risk label: has mental health condition(s) but did not seek help
df['Risk_Label'] = (
    ((df['Depression'] == 1) | (df['Anxiety'] == 1) | (df['Panic'] == 1)) &
    (df['Sought_Treatment'] == 0)
).astype(int)

# Check the distribution
df['Risk_Label'].value_counts()


Risk_Label
1    58
0    42
Name: count, dtype: int64

In [39]:
# Step 1: Prepare dataset (after Risk_Label creation)
features = ['Depression', 'Anxiety', 'Panic', 'Sought_Treatment']
X = df[features]
y = df['Risk_Label']

# Step 2: Train/Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Model Training
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Step 4: Prediction & Evaluation
from sklearn.metrics import classification_report, confusion_matrix

y_pred = model.predict(X_test)

print("🔍 Classification Report:\n")
print(classification_report(y_test, y_pred))

print("📊 Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


🔍 Classification Report:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        11
           1       1.00      1.00      1.00         9

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20

📊 Confusion Matrix:
[[11  0]
 [ 0  9]]


In [43]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Step 1: Train Logistic Regression
log_model = LogisticRegression()
log_model.fit(X_train, y_train)

# Step 2: Predict
y_pred_log = log_model.predict(X_test)

# Step 3: Evaluate
print("🔍 Logistic Regression - Classification Report:\n")
print(classification_report(y_test, y_pred_log))

print("📊 Logistic Regression - Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_log))


🔍 Logistic Regression - Classification Report:

              precision    recall  f1-score   support

           0       1.00      0.73      0.84        11
           1       0.75      1.00      0.86         9

    accuracy                           0.85        20
   macro avg       0.88      0.86      0.85        20
weighted avg       0.89      0.85      0.85        20

📊 Logistic Regression - Confusion Matrix:
[[8 3]
 [0 9]]


In [45]:
import joblib

# Save the logistic regression model
joblib.dump(log_model, 'suicide_risk_model.pkl')


['suicide_risk_model.pkl']