In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.decomposition import PCA
from scipy.stats import ttest_1samp, chi2_contingency
from statsmodels.stats.weightstats import ztest

In [27]:
from google.colab import drive

In [28]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [29]:
# Load the dataset
file_path = "/content/drive/MyDrive/Data_Analysis_using_python/Student_Mental_Stress_and_Coping_Mechanisms.csv"
df = pd.read_csv(file_path)

In [30]:
# Display basic info and first few rows
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 760 entries, 0 to 759
Data columns (total 20 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   Student ID                          760 non-null    object
 1   Age                                 760 non-null    int64 
 2   Gender                              760 non-null    object
 3   Academic Performance (GPA)          760 non-null    int64 
 4   Study Hours Per Week                760 non-null    int64 
 5   Social Media Usage (Hours per day)  760 non-null    int64 
 6   Sleep Duration (Hours per night)    760 non-null    int64 
 7   Physical Exercise (Hours per week)  760 non-null    int64 
 8   Family Support                      760 non-null    int64 
 9   Financial Stress                    760 non-null    int64 
 10  Peer Pressure                       760 non-null    int64 
 11  Relationship Stress                 760 non-null    int64 

In [31]:
df.head()

Unnamed: 0,Student ID,Age,Gender,Academic Performance (GPA),Study Hours Per Week,Social Media Usage (Hours per day),Sleep Duration (Hours per night),Physical Exercise (Hours per week),Family Support,Financial Stress,Peer Pressure,Relationship Stress,Mental Stress Level,Counseling Attendance,Diet Quality,Stress Coping Mechanisms,Cognitive Distortions,Family Mental Health History,Medical Condition,Substance Use
0,802-17-3671,22,Female,2,9,2,12,2,1,1,3,5,9,No,1,Walking or Nature Walks,4,No,Yes,1
1,871-12-8572,25,Female,0,28,0,6,0,1,1,1,2,9,Yes,3,Meditation,2,Yes,No,1
2,495-13-2672,24,Female,0,45,3,12,10,3,3,1,4,9,Yes,5,Reading,1,Yes,Yes,3
3,365-77-2496,20,Male,2,8,7,7,4,1,3,2,5,1,No,1,Social Media Engagement,2,Yes,No,4
4,664-76-5622,28,Male,0,14,6,8,1,2,4,4,2,7,Yes,1,Exercise,1,Yes,No,3


In [32]:
# Drop unnecessary columns
df.drop(columns=["Student ID"], inplace=True)

In [33]:
# Encode categorical variables
categorical_columns = ["Gender", "Counseling Attendance", "Stress Coping Mechanisms",
                        "Family Mental Health History", "Medical Condition"]
label_encoders = {col: LabelEncoder() for col in categorical_columns}
for col in categorical_columns:
    df[col] = label_encoders[col].fit_transform(df[col])

In [34]:
# Extract features and target variable
X = df.drop(columns=["Mental Stress Level"])
y = df["Mental Stress Level"]

In [35]:
# Standardize numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [36]:
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [37]:
# Initialize and train models
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC()
}

results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    results[name] = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, average="weighted"),
        "Recall": recall_score(y_test, y_pred, average="weighted"),
        "F1-Score": f1_score(y_test, y_pred, average="weighted")
    }

In [38]:
# Feature Importance from Random Forest
rf_model = models["Random Forest"]
feature_importance = rf_model.feature_importances_
feature_names = X.columns
sorted_features = sorted(zip(feature_names, feature_importance), key=lambda x: x[1], reverse=True)
top_features = sorted_features[:3]

In [39]:
# Apply PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

In [43]:
# One-sample T-test (Hypothesis: Mean = 5)
t_stat, p_value_ttest = ttest_1samp(df["Mental Stress Level"], 5)

In [44]:
# Z-test for study hours based on counseling attendance
group1 = df[df["Counseling Attendance"] == 1]["Study Hours Per Week"]
group2 = df[df["Counseling Attendance"] == 0]["Study Hours Per Week"]
z_stat, p_value_ztest = ztest(group1, group2)

In [45]:
# Chi-square test (Gender vs. Counseling Attendance)
cross_tab = pd.crosstab(df["Gender"], df["Counseling Attendance"])
chi2_stat, p_value_chi2, _, _ = chi2_contingency(cross_tab)

In [46]:
# Print results
print("Model Performance:")
for model, metrics in results.items():
    print(f"{model}: {metrics}")

Model Performance:
Logistic Regression: {'Accuracy': 0.046052631578947366, 'Precision': 0.05072778099093888, 'Recall': 0.046052631578947366, 'F1-Score': 0.04768185973618911}
Random Forest: {'Accuracy': 0.1118421052631579, 'Precision': 0.11969898705096074, 'Recall': 0.1118421052631579, 'F1-Score': 0.11245119740899104}
SVM: {'Accuracy': 0.06578947368421052, 'Precision': 0.05992771978772786, 'Recall': 0.06578947368421052, 'F1-Score': 0.05917014469646048}


In [47]:
print("\nTop 3 Important Features:", top_features)
print("\nT-test p-value:", p_value_ttest)
print("\nZ-test p-value:", p_value_ztest)
print("\nChi-square test p-value:", p_value_chi2)


Top 3 Important Features: [('Study Hours Per Week', np.float64(0.0998295252406086)), ('Age', np.float64(0.07703860353413254)), ('Physical Exercise (Hours per week)', np.float64(0.07570636133360892))]

T-test p-value: 0.0005939960520346076

Z-test p-value: 0.2395012959973023

Chi-square test p-value: 0.25916012793906545


#1. Conclusion from the T-test (Mental Stress Levels)
The T-test compares the mean Mental Stress Level against a hypothesized mean of 5.

If p-value < 0.05, we reject the null hypothesis, meaning the average stress level is significantly different from 5.

If p-value ≥ 0.05, we fail to reject the null hypothesis, meaning there is no significant difference from 5.

Interpretation: Check the p-value from the script output to determine if student stress levels deviate significantly from the hypothesized mean.

#2. Conclusion from the Z-test (Study Hours & Counseling Attendance)
The Z-test checks whether students who attended counseling have significantly different study hours compared to those who did not.

If p-value < 0.05, there is a significant difference in study hours between both groups.

If p-value ≥ 0.05, there is no significant difference in study hours.

Interpretation: Look at the p-value from the script. If it’s below 0.05, students who attended counseling study significantly more or less than those who didn’t.

#3. Conclusion from the Chi-square test (Gender & Counseling Attendance)
The Chi-square test checks if there is a relationship between gender and counseling attendance.

If p-value < 0.05, there is a significant association, meaning gender influences counseling attendance.

If p-value ≥ 0.05, there is no significant association, meaning gender and counseling attendance are independent.

Interpretation: Based on the p-value, determine if gender plays a role in students seeking counseling.