In [1]:
import pandas as pd

# Load the dataset
file_path = "/content/Student_Mental_Stress_and_Coping_Mechanisms.csv"
df = pd.read_csv(file_path)

# Display basic information
print(df.info())

# Show the first few rows
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 760 entries, 0 to 759
Data columns (total 20 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   Student ID                          760 non-null    object
 1   Age                                 760 non-null    int64 
 2   Gender                              760 non-null    object
 3   Academic Performance (GPA)          760 non-null    int64 
 4   Study Hours Per Week                760 non-null    int64 
 5   Social Media Usage (Hours per day)  760 non-null    int64 
 6   Sleep Duration (Hours per night)    760 non-null    int64 
 7   Physical Exercise (Hours per week)  760 non-null    int64 
 8   Family Support                      760 non-null    int64 
 9   Financial Stress                    760 non-null    int64 
 10  Peer Pressure                       760 non-null    int64 
 11  Relationship Stress                 760 non-null    int64 

In [12]:
from sklearn.preprocessing import LabelEncoder

# Identify categorical columns
categorical_cols = ['Gender', 'Counseling Attendance', 'Stress Coping Mechanisms',
                    'Family Mental Health History', 'Medical Condition']

# Encode categorical variables
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])  # Convert categories to numbers
    label_encoders[col] = le

# Now split dataset again
X = df.drop(columns=['Mental Stress Level'])  # Features
y = df['Mental Stress Level']  # Target variable

# Train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Check unique values in the target variable
print("Unique values in target (Mental Stress Level):", y_train.nunique())

# If it's multi-class, set the right parameters for Logistic Regression
if y_train.nunique() > 2:
    log_reg = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=500)
else:
    log_reg = LogisticRegression()

# Train Logistic Regression
log_reg.fit(X_train, y_train)
y_pred_log = log_reg.predict(X_test)

# Train Random Forest Classifier
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)

# Train Support Vector Machine (SVM)
svm_clf = SVC()
svm_clf.fit(X_train, y_train)
y_pred_svm = svm_clf.predict(X_test)

# Model Evaluation Function
def evaluate_model(y_test, y_pred, model_name):
    print(f"\n{model_name} Performance:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
    print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
    print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")

# Evaluate each model
evaluate_model(y_test, y_pred_log, "Logistic Regression")
evaluate_model(y_test, y_pred_rf, "Random Forest")
evaluate_model(y_test, y_pred_svm, "SVM")



Unique values in target (Mental Stress Level): 10


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Logistic Regression Performance:
Accuracy: 0.0526
Precision: 0.0519
Recall: 0.0526
F1 Score: 0.0516

Random Forest Performance:
Accuracy: 0.0987
Precision: 0.1090
Recall: 0.0987
F1 Score: 0.1005

SVM Performance:
Accuracy: 0.1118
Precision: 0.0242
Recall: 0.1118
F1 Score: 0.0398


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [13]:
import numpy as np

# Get feature importances from Random Forest
feature_importances = rf_clf.feature_importances_
features = X.columns
sorted_indices = np.argsort(feature_importances)[::-1]

# Display top 3 features
print("\nTop 3 Important Features:")
for i in range(3):
    print(f"{features[sorted_indices[i]]}: {feature_importances[sorted_indices[i]]:.4f}")



Top 3 Important Features:
Student ID: 0.0909
Study Hours Per Week: 0.0908
Age: 0.0695


In [14]:
from sklearn.decomposition import PCA

# Apply PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# Explained variance
print("\nExplained Variance Ratio:", pca.explained_variance_ratio_)



Explained Variance Ratio: [0.99196641 0.00669523]


In [15]:
from scipy.stats import ttest_1samp

# Hypothesis: Mean Mental Stress Level is 5
t_stat, p_value = ttest_1samp(y, 5)

print("\nT-test Results:")
print(f"T-statistic: {t_stat:.4f}, P-value: {p_value:.4f}")

if p_value < 0.05:
    print("Reject the null hypothesis: The mean stress level is significantly different from 5.")
else:
    print("Fail to reject the null hypothesis: No significant difference from 5.")



T-test Results:
T-statistic: 3.4489, P-value: 0.0006
Reject the null hypothesis: The mean stress level is significantly different from 5.


In [16]:
from statsmodels.stats.weightstats import ztest

# Compare study hours of students who attended vs. did not attend counseling
attended = df[df['Counseling Attendance'] == 1]['Study Hours Per Week']
not_attended = df[df['Counseling Attendance'] == 0]['Study Hours Per Week']

z_stat, p_val = ztest(attended, not_attended)

print("\nZ-test Results:")
print(f"Z-statistic: {z_stat:.4f}, P-value: {p_val:.4f}")

if p_val < 0.05:
    print("Significant difference in study hours between students who attended and did not attend counseling.")
else:
    print("No significant difference in study hours.")



Z-test Results:
Z-statistic: 1.1762, P-value: 0.2395
No significant difference in study hours.


In [17]:
from scipy.stats import chi2_contingency

# Create contingency table
contingency_table = pd.crosstab(df['Gender'], df['Counseling Attendance'])

# Perform Chi-Square test
chi2_stat, p_chi, dof, expected = chi2_contingency(contingency_table)

print("\nChi-Square Test Results:")
print(f"Chi-Square Statistic: {chi2_stat:.4f}, P-value: {p_chi:.4f}")

if p_chi < 0.05:
    print("There is a significant relationship between gender and counseling attendance.")
else:
    print("No significant relationship between gender and counseling attendance.")



Chi-Square Test Results:
Chi-Square Statistic: 8.9103, P-value: 0.2592
No significant relationship between gender and counseling attendance.
