In [5]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import mutual_info_classif

# Load the dataset
data = pd.read_csv("student_mental_health_1.csv")

# Data Preprocessing

# Feature Engineering

# 1. Create Age Groups based on Age
data['Age'] = pd.cut(data['Age'], bins=[0, 19, 29, 39, float('inf')],
                          labels=['Teen', 'Young Adult', 'Adult', 'Senior'])

# 2. Categorize Credit Load into Low, Medium, and High
data['Semester_Credit_Load'] = pd.cut(data['Semester_Credit_Load'], bins=[15, 20, 25, float('inf')],
                                      labels=['Low', 'Medium', 'High'])

# 3. Categorize CGPA into Low, Medium, High, and Excellent
data['CGPA'] = pd.cut(data['CGPA'], bins=[0, 2.7, 3, 3.5, float('inf')],
                              labels=['Low', 'Medium', 'High', 'Excellent'])

# Handle Missing Data

# Handle missing values in numerical columns by filling with the mean
numerical_columns = ['Age', 'CGPA', 'Stress_Level', 'Depression_Score', 'Anxiety_Score', 'Financial_Stress', 'Semester_Credit_Load']
data[numerical_columns] = data[numerical_columns].fillna(data[numerical_columns].mean())

# Handle missing values in categorical columns by filling with the mode (most frequent value)
categorical_columns = [
    'Course', 'Gender', 'Sleep_Quality', 'Physical_Activity', 'Diet_Quality', 'Social_Support',
    'Relationship_Status', 'Substance_Use', 'Counseling_Service_Use', 'Family_History',
    'Chronic_Illness', 'Extracurricular_Involvement', 'Residence_Type',
    'Age', 'Semester_Credit_Load', 'CGPA']

for col in categorical_columns:
    data[col].fillna(data[col].mode()[0], inplace=True)

# Encode Categorical Data

from sklearn.preprocessing import LabelEncoder


Age = LabelEncoder()
data['Age'] = Age.fit_transform(data['Age'])
data["Age"].unique()

Semester_Credit_Load = LabelEncoder()
data['Semester_Credit_Load'] = Semester_Credit_Load.fit_transform(data['Semester_Credit_Load'])
data["Semester_Credit_Load"].unique()

#print(data["CGPA"].unique())
CGPA = LabelEncoder()
data['CGPA'] = CGPA.fit_transform(data['CGPA'])
data["CGPA"].unique()
#print(data["CGPA"].unique())

Course = LabelEncoder()
data['Course'] = Course.fit_transform(data['Course'])
data["Course"].unique()


Gender = LabelEncoder()
data['Gender'] =  Gender.fit_transform(data['Gender'])
data["Gender"].unique()

Sleep_Quality = LabelEncoder()
data['Sleep_Quality'] = Sleep_Quality.fit_transform(data['Sleep_Quality'])
data["Sleep_Quality"].unique()

Physical_Activity= LabelEncoder()
data['Physical_Activity'] = Physical_Activity.fit_transform(data['Physical_Activity'])
data["Physical_Activity"].unique()

Diet_Quality= LabelEncoder()
data['Diet_Quality'] = Diet_Quality.fit_transform(data['Diet_Quality'])
data["Diet_Quality"].unique()

Social_Support = LabelEncoder()
data['Social_Support'] = Social_Support.fit_transform(data['Social_Support'])
data["Social_Support"].unique()

Relationship_Status = LabelEncoder()
data['Relationship_Status'] = Relationship_Status.fit_transform(data['Relationship_Status'])
data["Relationship_Status"].unique()

Substance_Use= LabelEncoder()
data['Substance_Use'] = Substance_Use.fit_transform(data['Substance_Use'])
data["Substance_Use"].unique()

Counseling_Service_Use= LabelEncoder()
data['Counseling_Service_Use'] = Counseling_Service_Use.fit_transform(data['Counseling_Service_Use'])
data["Counseling_Service_Use"].unique()

Family_History= LabelEncoder()
data['Family_History'] = Family_History.fit_transform(data['Family_History'])
data["Family_History"].unique()

Chronic_Illness= LabelEncoder()
data['Chronic_Illness'] = Chronic_Illness.fit_transform(data['Chronic_Illness'])
data["Chronic_Illness"].unique()

Extracurricular_Involvement= LabelEncoder()
data['Extracurricular_Involvement'] =  Extracurricular_Involvement.fit_transform(data['Extracurricular_Involvement'])
data["Extracurricular_Involvement"].unique()

Residence_Type = LabelEncoder()
data['Residence_Type'] = Residence_Type .fit_transform(data['Residence_Type'])
data["Residence_Type"].unique()

data = data.apply(lambda x: x.str.lower() if x.dtype == 'object' else x)
data_types =data.dtypes
# Separate numeric and categorical columns
numeric_columns = data_types[data_types == 'float64'] + data_types[data_types == 'int64']
categorical_columns = data_types[data_types == 'object']

# Display the numeric and categorical columns
print("Numeric Columns:")
print(numeric_columns)

print("\nCategorical Columns:")
print(categorical_columns)


# Remove duplicate records
data = data.drop_duplicates()

# Handle Outliers
print(data)
# Calculate summary statistics for the dataset
summary = data.describe()
Q1 = summary.loc['25%']
Q3 = summary.loc['75%']
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Identify and remove outliers using the IQR method
is_outlier = ((data < lower_bound) | (data > upper_bound)).any(axis=1)
data = data[~is_outlier]

# Map Stress_Level to binary values

# Map stress level values to binary categories
stress_level_mapping = {0: 0, 1: 1, 2: 1, 3: 2, 4: 2, 5: 3}
data['Stress_Level'] = data['Stress_Level'].replace(stress_level_mapping)

# Drop the "Semester_Credit_Load" column
data = data.drop(columns=['Semester_Credit_Load'])

# Split the data into features (X) and the target (y)
X = data.drop(columns=['Stress_Level'])
y = data['Stress_Level']

# Split the data into a training set and a testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Find Feature Dependencies

# Calculate the correlation matrix
correlation_matrix = data.corr()

# Get the correlation of all columns with "Stress_Level"
correlation_with_stress_level = correlation_matrix['Stress_Level']

# Sort the columns by their correlation with "Stress_Level"
sorted_correlation = correlation_with_stress_level.abs().sort_values(ascending=False)

print("Correlation with Stress_Level:\n", sorted_correlation)

# Calculate Mutual Information for Feature Selection

# Calculate mutual information between features and the target variable
mutual_info = mutual_info_classif(X, y)
mutual_info_series = pd.Series(mutual_info, index=X.columns)
mutual_info_series.sort_values(ascending=False, inplace=True)
print(mutual_info_series)

# Random Forest Classifier Model

# Hyperparameter Tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# GridSearchCV is commented out because the best parameters are already known
# grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=3, n_jobs=-1)
# grid_search.fit(X_train, y_train)
# best_params = grid_search.best_params_

# Manually set the best hyperparameters based on the grid search results
best_params = {'max_depth': 5,
 'min_samples_leaf': 4,
 'min_samples_split': 10,
 'n_estimators': 100}

# Train a random forest classifier model with the best hyperparameters
best_model = RandomForestClassifier(random_state=42, **best_params)
best_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Model Accuracy:", accuracy)
print("Best Hyperparameters:", best_params)
print("Classification Report:\n", report)

  data[numerical_columns] = data[numerical_columns].fillna(data[numerical_columns].mean())


Numeric Columns:
Stress_Level        NaN
Depression_Score    NaN
Anxiety_Score       NaN
Financial_Stress    NaN
dtype: object

Categorical Columns:
Series([], dtype: object)
      Age  Course  Gender  CGPA  Stress_Level  Depression_Score  \
0       2       5       1     0             3                 3   
1       2       2       0     2             3                 3   
2       1       0       0     0             3                 0   
3       1       1       1     0             2                 1   
4       1       0       1     1             3                 3   
...   ...     ...     ...   ...           ...               ...   
7017    2       3       0     0             2                 1   
7018    2       4       0     0             3                 4   
7019    2       5       1     0             3                 4   
7020    2       4       1     0             3                 3   
7021    2       4       0     0             3                 4   

      Anxiety_Score 

In [6]:
import numpy as np
X = np.array([['Adult','Others', 'Male', 'Medium', 3, 2, 'Good', 'Moderate', 'Good', 'Moderate', 'Married', 'Never', 'Never', 'No', 'No', 2, 'Moderate', 'On-Campus']])
X[:,0] = Age.transform(X[:,0])
X[:,1] = Course.transform(X[:,1])
X[:,2] = Gender.transform(X[:,2])
X[:,3] = CGPA.transform(X[:,3])
X[:, 6] = Sleep_Quality.transform(X[:,6])
X[:, 7] = Physical_Activity.transform(X[:,7])
X[:, 8] = Diet_Quality.transform(X[:,8])
X[:, 9] = Social_Support.transform(X[:,9])
X[:, 10] = Relationship_Status.transform(X[:,10])
X[:, 11] = Substance_Use.transform(X[:,11])
X[:, 12] = Counseling_Service_Use.transform(X[:,12])
X[:, 13] = Family_History.transform(X[:,13])
X[:, 14] = Chronic_Illness.transform(X[:,14])
X[:, 16] = Extracurricular_Involvement.transform(X[:,16])
#X[:, 17] = Semester_Credit_Load.transform(X[:,17])
X[:, 17] = Residence_Type.transform(X[:,17])
X = X.astype(float)
X

y_pred = best_model.predict(X)
y_pred




array([2], dtype=int64)

In [7]:
import pickle
data = {"model": best_model,"Age":Age,"Course":Course,"Gender":Gender,"CGPA":CGPA,"Sleep_Quality":Sleep_Quality,"Physical_Activity":Physical_Activity,"Diet_Quality":Diet_Quality,"Social_Support":Social_Support,"Relationship_Status":Relationship_Status,"Substance_Use":Substance_Use,"Counseling_Service_Use":Counseling_Service_Use,"Family_History":Family_History,"Chronic_Illness":Chronic_Illness,"Extracurricular_Involvement":Extracurricular_Involvement,"Semester_Credit_Load":Semester_Credit_Load,"Residence_Type":Residence_Type}
with open('steps.pkl', 'wb') as file:
    pickle.dump(data, file)



In [8]:
with open('steps.pkl', 'rb') as file:
    data = pickle.load(file)

best_model = data["model"]
Age= data["Age"]
Course= data["Course"]
CGPA= data["CGPA"]
Gender= data["Gender"]
Sleep_Quality= data["Sleep_Quality"]
Physical_Activity= data["Physical_Activity"]
Diet_Quality= data["Diet_Quality"]
Social_Support= data["Social_Support"]
Relationship_Status= data["Relationship_Status"]
Substance_Use= data["Substance_Use"]
Counseling_Service_Use= data["Counseling_Service_Use"]
Family_History= data["Family_History"]
Chronic_Illness= data["Chronic_Illness"]
Extracurricular_Involvement= data["Extracurricular_Involvement"]
#Semester_Credit_Load= data["Semester_Credit_Load"]
Residence_Type = data["Residence_Type"]


