**Objective :**
To classify mental health–related text data using machine learning techniques in order to detect risk patterns and support timely intervention.



Import all Required Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Load dataset

**Problem Statement:**
Mental health-related text data is:

Unstructured

Noisy (contains slang, stopwords, typos)

Difficult to analyze manually at scale


In [None]:
df = pd.read_csv("/content/mental_health.csv")

In [None]:
df.info() # Information about data

Load dataset using pandas

Inspect shape, columns, and missing values

In [None]:
df.head() # Top 5 Rows

In [None]:
df.duplicated().sum() # Total duplicates present in data

In [None]:
df.drop_duplicates(inplace = True) #Removing duplicates in place

In [None]:
df.isna().sum() # Checking Null values

In [None]:
df.isna().sum()

In [None]:
df["label"].value_counts() # check whether the target classes are balanced or not

In [None]:
# Recode the numeric target as readable class names in a single pass.
# `replace` rebuilds the whole column as strings, whereas writing a string
# into an integer column through .loc is an incompatible-dtype assignment
# that newer pandas versions warn about or reject.
# NOTE(review): both codes are mapped here, so the following cell that sets
# label == 1 to "YES" no longer finds any rows to change.
df["label"] = df["label"].replace({0: "NO", 1: "YES"})

In [None]:
df.loc[df["label"] == 1,["label"]] = "YES"

In [None]:
df[df["label"] == "YES"]

In [None]:
df["label"].value_counts(normalize = True) * 100

In [None]:
y = df["label"]

In [None]:
X = df["text"]

In [None]:
y.head()

In [None]:
y.value_counts()

In [None]:
X.shape

In [None]:
y.shape

Split data into training and testing sets

Maintained proper separation to avoid data leakage


In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% for testing. Stratifying on the target keeps the YES/NO
# proportions the same in both parts, which a purely random split does not
# guarantee when the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=8, stratify=y
)

In [None]:
X_test.head()

In [None]:
y_train.head()

In [None]:
y_train.head()

In [None]:
y_test.head()

The dataset consists of textual user inputs related to mental health

Target variable represents risk classification (Yes / No)

Data contains:

Stopwords

Punctuation

Irrelevant words

Class imbalance

In [None]:
import nltk
import re
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
def Clean(texts):
    """Normalise raw documents so they can be passed to a text vectoriser.

    Each document has every character that is not an ASCII letter or
    whitespace removed, is lowercased, tokenised with NLTK, stripped of
    English stopwords and finally lemmatised with WordNet.

    Parameters
    ----------
    texts : iterable
        Documents to clean. Entries are normally ``str``. ``None`` and float
        NaN are treated as empty documents; any other non-string value is
        converted with ``str``.

    Returns
    -------
    list of str
        One space-joined string of cleaned tokens per input document, in the
        same order as ``texts``.
    """
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    cleaned_texts = []

    for doc in texts:
        # re.sub raises TypeError on non-string input, so a single missing
        # value would abort the whole fit/predict call. NaN is the only
        # float that is not equal to itself.
        if doc is None or (isinstance(doc, float) and doc != doc):
            doc = ""
        elif not isinstance(doc, str):
            doc = str(doc)

        # Drop digits, punctuation and any other non-letter characters.
        doc = re.sub(r"[^a-zA-Z\s]", "", doc)
        doc = doc.lower()
        tokens = nltk.word_tokenize(doc)
        # Filter stopwords and lemmatise in one pass over the tokens.
        tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]
        cleaned_texts.append(" ".join(tokens))

    return cleaned_texts


Used TF-IDF Vectorization to convert text into numerical features

Why TF-IDF?

Captures word importance

Reduces impact of frequent but unimportant words

Works well for text classification tasks

In [159]:
# Create a standalone TF-IDF vectorizer (TfidfVectorizer, not CountVectorizer).
# NOTE(review): `vectorizer` is not fitted in this cell; the model cells that
# follow each build their own TfidfVectorizer inside a Pipeline.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
# Manual inspection snippet, left disabled (fit_transform is called without any documents):
#tfidf = vectorizer.fit_transform()
#print("shape of bow",tfidf.shape)
#print("Vocabulary:",vectorizer.get_feature_names_out())

**KNeighbors Classifier Algorithm:**


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

# k-NN baseline: clean the raw text, turn it into TF-IDF features, then
# classify with the default KNeighborsClassifier settings.
knn_pipe = Pipeline([
    ("clean", FunctionTransformer(Clean, validate=False)),
    ("tfidf", TfidfVectorizer()),
    ("knn", KNeighborsClassifier()),
])

knn_pipe.fit(X_train, y_train)

Model Prediction

In [None]:
y_pre = knn_pipe.predict(X_test)

Model Evaluation

In [None]:
from sklearn.metrics import accuracy_score
score = accuracy_score(y_pre,y_test)
print("KNeighbors Classifier Score:",score)

Hyper Parameter Tuning

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV

# Encode the string labels as integers for the search.
encoder = LabelEncoder()
y_train_encoded = encoder.fit_transform(y_train)

# Candidates: odd k from 1 to 9 and Minkowski power p in {1, 2}.
# p=3 is left out on purpose: the TF-IDF step yields a sparse matrix, and the
# brute-force neighbour search accepts only Manhattan (p=1) and Euclidean
# (p=2) from the Minkowski family for sparse input, so every p=3 candidate
# would fail to fit and be scored as NaN.
tuned_parameters = [{"knn__n_neighbors": list(range(1, 10, 2)), "knn__p": [1, 2]}]
grid_model_knn = GridSearchCV(
    estimator=knn_pipe,
    param_grid=tuned_parameters,
    scoring="accuracy",
    cv=5,
    return_train_score=True,
    verbose=True,
)
grid_model_knn.fit(X_train, y_train_encoded)

To see best Parameters

In [None]:
# Report the winning k-NN configuration. The search object is named
# `grid_model_knn` (single underscore before "knn").
print("Best KNN Params:", grid_model_knn.best_params_)
print("Best KNN Score:", grid_model_knn.best_score_)

**Naive Bayes Algorithm :**

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes on TF-IDF features; Clean is passed to the
# FunctionTransformer directly rather than through a lambda.
nb_pipe = Pipeline([
    ("clean", FunctionTransformer(Clean, validate=False)),
    ("tfidf", TfidfVectorizer()),
    ("nb", MultinomialNB()),
])
nb_pipe.fit(X_train, y_train)

Model Prediction

In [None]:
y_pred = nb_pipe.predict(X_test)

Model Evaluation

In [None]:
from sklearn.metrics import accuracy_score
# Map y_test to numeric values to match the predictions in y_pred
#y_test_numeric = y_test.map({'NO': 0, 'YES': 1})
score = accuracy_score(y_test, y_pred)
print("Naive Bayes score :",score)

Hyper Parameter Tuning

In [None]:

# MultinomialNB is tuned through its additive-smoothing strength `alpha`.
# `var_smoothing` is a GaussianNB parameter and is rejected as invalid for
# the "nb" step of this pipeline.
param_grid = {"nb__alpha": [0.01, 0.1, 0.5, 1.0]}

grid = GridSearchCV(
    estimator=nb_pipe,
    param_grid=param_grid,
    cv=5,
    scoring="accuracy",
)

grid.fit(X_train, y_train)

To see Best Parameters

In [None]:
# The Naive Bayes search object is `grid`; `grid_fit` was never assigned.
print("Best Naive Bayes Params:", grid.best_params_)
print("Best Naive Bayes Score:", grid.best_score_)

**Decision Tree Classifier Algorithm:**

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree on TF-IDF features, depth capped at 28.
# random_state pins the tree's internal randomness so that re-running the
# notebook reproduces the same score (same seed as the train/test split).
dt_pipe = Pipeline(steps=[
    ("clean", FunctionTransformer(Clean, validate=False)),
    ("tfidf", TfidfVectorizer()),
    ("dt", DecisionTreeClassifier(max_depth=28, random_state=8)),
])
dt_pipe.fit(X_train, y_train)

Model Prediction

In [None]:
y_pred = dt_pipe.predict(X_test)

Model Evaluation

In [None]:
from sklearn.metrics import accuracy_score
score = accuracy_score(y_pred,y_test)
print("Decison Tree classifier score",score)

Hyper Parameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Search tree depths 1..20.
tuned_parameters = [{"dt__max_depth": list(range(1, 21))}]

clf = GridSearchCV(
    estimator=dt_pipe,
    param_grid=tuned_parameters,
    scoring="accuracy",
    cv=5,
    return_train_score=True,
    verbose=1,  # optional
)
# dt_pipe cleans and vectorises internally, so it is fitted on the raw
# training text. `X_train_trans` is not defined in the cells shown.
clf.fit(X_train, y_train)

To get Best Parameters

In [None]:
# The decision-tree search object is `clf`; `clf_fit` was never assigned.
print("Best Decision Tree Params:", clf.best_params_)
print("Best Decision Tree Score:", clf.best_score_)

**Logistic Regression :**

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with default settings on TF-IDF features; Clean is
# handed to the FunctionTransformer directly rather than via a lambda.
pipe_lr = Pipeline([
    ("clean", FunctionTransformer(Clean, validate=False)),
    ("tfidf", TfidfVectorizer()),
    ("lr", LogisticRegression()),
])
pipe_lr.fit(X_train, y_train)

Model Prediction and Evaluation

In [None]:
y_pred = pipe_lr.predict(X_test)
score = accuracy_score(y_pred,y_test)
print("Logistic Regression score :",score)

Hyper Parameter

In [None]:
# Tune the TF-IDF vocabulary settings together with the regularisation
# strength of the logistic regression.
param_grid = {
    "tfidf__ngram_range": [(1, 1), (1, 2)],
    "tfidf__max_df": [0.9, 0.95],
    "tfidf__min_df": [2, 5],

    "lr__C": [0.1, 1, 10],
    "lr__penalty": ["l2"],
    "lr__solver": ["lbfgs"],
}
# This statement must start at column 0: an indented top-level statement is
# an IndentationError and the cell would not run.
grid = GridSearchCV(
    estimator=pipe_lr,
    param_grid=param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
)

grid.fit(X_train, y_train)

To get Best Parameters

In [None]:
# The logistic-regression search object is `grid`; `grid_fit` was never assigned.
print("Best Logistic Regression Params:", grid.best_params_)
print("Best Logistic Regression Score:", grid.best_score_)

**Support Vector Machine :**
Support Vector Classifier Building Using Pipeline

In [None]:
from sklearn.svm import SVC
# Every pipeline step must be a (name, estimator) tuple. The "clean" step was
# previously passed as two loose list items, which Pipeline cannot unpack.
pipe_svm = Pipeline(steps=[
    ("clean", FunctionTransformer(Clean, validate=False)),
    ("tfidf", TfidfVectorizer()),
    ("svm", SVC()),
])
pipe_svm.fit(X_train, y_train)

Model Prediction and Evaluation

In [None]:
y_pred = pipe_svm.predict(X_test)
score = accuracy_score(y_pred,y_test)
print("Support Vector Classifier score :",score)

Classes in the Target Column

In [None]:
# To access the classes, you need to reference the 'svm' step within the pipeline
pipe_svm.named_steps['svm'].classes_

Hyper Parameter Tuning

In [None]:
# Two independent grids are searched: the first varies C with the kernel left
# at its default, the second varies the kernel with C left at its default.
# Combinations of a non-default C with a non-default kernel are not tried.
tuned_parameters = [{"svm__C":[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]},{"svm__kernel":["linear","poly","rbf"]}]
from sklearn.model_selection import GridSearchCV
# 5-fold cross-validated accuracy; training-fold scores are also recorded.
model = GridSearchCV(
    estimator = pipe_svm,
    param_grid = tuned_parameters,
    cv = 5,
    scoring = "accuracy",
    verbose = 1,
    return_train_score = True
)
model.fit(X_train,y_train)

To get Best Score

In [None]:
# `model` is the SVM grid search, so label the output accordingly.
print("Best SVM Params:", model.best_params_)
print("Best SVM Score:", model.best_score_)

**Random Forest :**
Random Forest Classifier using Pipeline

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest on TF-IDF features.
# random_state fixes the bootstrap samples and feature subsets so the
# reported score is reproducible across notebook runs.
pipe_rf = Pipeline(steps=[
    ("clean", FunctionTransformer(Clean, validate=False)),
    ("tfidf", TfidfVectorizer()),
    ("rf", RandomForestClassifier(random_state=8)),
])
pipe_rf.fit(X_train, y_train)

Model Prediction and Evaluation

In [None]:
y_pred = pipe_rf.predict(X_test)
score = accuracy_score(y_pred,y_test)
print("Random Forest Classifier score :",score)

Hyper Parameter Tuning

In [None]:
# Search space: TF-IDF vocabulary settings plus the main forest
# size/shape hyper-parameters.
param_grid = dict(
    tfidf__ngram_range=[(1, 1), (1, 2)],
    tfidf__max_df=[0.9, 0.95],
    tfidf__min_df=[2, 5],
    rf__n_estimators=[100, 200],
    rf__max_depth=[None, 10, 20],
    rf__min_samples_split=[2, 5],
    rf__min_samples_leaf=[1, 2],
    rf__max_features=["sqrt", "log2"],
)
# 5-fold accuracy, using every available core.
grid = GridSearchCV(
    pipe_rf,
    param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)
grid.fit(X_train, y_train)


To get Best Parameters and score

In [None]:
# At this point `grid` holds the random-forest search, so label it as such.
print("Best Random Forest Params:", grid.best_params_)
print("Best Random Forest Score:", grid.best_score_)

**Voting Classifier :** Building a Model using Hyper parameter

In [None]:
from sklearn.ensemble import VotingClassifier
# Base learners for the ensemble. All three expose predict_proba, which
# soft voting needs. The tree is seeded so ensemble scores are reproducible.
estimators = [
    ("knn", KNeighborsClassifier()),
    ("nb", MultinomialNB()),
    ("dt", DecisionTreeClassifier(random_state=8)),
]
# Soft-voting ensemble over the shared cleaning + TF-IDF features.
pipe_vc = Pipeline(steps=[
    ("clean", FunctionTransformer(Clean, validate=False)),
    ("tfidf", TfidfVectorizer()),
    ("vc", VotingClassifier(estimators, voting="soft")),
])
pipe_vc.fit(X_train, y_train)

Model Prediction and Evaluation

In [None]:
y_pred = pipe_vc.predict(X_test)
score = accuracy_score(y_pred,y_test)
print("Voting Classifier score",score)

Hyper Parameter Tuning

In [None]:
# Only estimators that are actually in the voting ensemble ("knn", "nb",
# "dt") can be tuned. There is no "lr" member, so a "vc__lr__C" entry is
# rejected as an invalid parameter. k-NN's neighbour count is tuned instead.
param_grid = {
    "tfidf__ngram_range": [(1, 1), (1, 2)],
    "vc__knn__n_neighbors": [3, 5, 7],
    "vc__nb__alpha": [0.1, 0.5, 1.0],
}
grid = GridSearchCV(
    pipe_vc,
    param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
)

grid.fit(X_train, y_train)


To get Parameters and Score

In [None]:
# At this point `grid` holds the voting-classifier search, so label it as such.
print("Best Voting Classifier Params:", grid.best_params_)
print("Best Voting Classifier Score:", grid.best_score_)

**Stacking Classifier :**
Building Classifier Model Using Pipeline

In [None]:
from sklearn.ensemble import StackingClassifier
# Stacking ensemble built from the same `estimators` list as the voting
# model, with StackingClassifier's default final estimator.
pipe_sc = Pipeline([
    ("clean", FunctionTransformer(Clean, validate=False)),
    ("tfidf", TfidfVectorizer()),
    ("sc", StackingClassifier(estimators)),
])
pipe_sc.fit(X_train, y_train)

Model Prediction and Evaluation

In [None]:
y_pred = pipe_sc.predict(X_test)
score = accuracy_score(y_pred,y_test)
print("Stacking Classifier score",score)

Hyper Parameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Only base estimators that are actually in the stack ("knn", "nb", "dt")
# can be addressed as sc__<name>__<param>. There is no "lr" base estimator,
# so a "sc__lr__C" entry is rejected as an invalid parameter.
param_grid = {
    "tfidf__ngram_range": [(1, 1), (1, 2)],
    "sc__knn__n_neighbors": [3, 5, 7],
    "sc__nb__alpha": [0.1, 0.5, 1.0],
}

grid = GridSearchCV(
    pipe_sc,
    param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
)

grid.fit(X_train, y_train)


Best Parameters and Score

In [None]:
# At this point `grid` holds the stacking-classifier search, so label it as such.
print("Best Stacking Classifier Params:", grid.best_params_)
print("Best Stacking Classifier Score:", grid.best_score_)

**Ada Boost Classifier :**

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost over depth-1 trees (decision stumps): 200 rounds, learning rate 0.5.
# random_state makes the boosting run reproducible across notebook runs.
pipe_ada = Pipeline(steps=[
    ("clean", FunctionTransformer(Clean, validate=False)),
    ("tfidf", TfidfVectorizer()),
    ("ada", AdaBoostClassifier(
        estimator=DecisionTreeClassifier(max_depth=1),
        n_estimators=200,
        learning_rate=0.5,
        random_state=8,
    )),
])
pipe_ada.fit(X_train, y_train)

Model Prediction and Evaluation

In [None]:
y_pred = pipe_ada.predict(X_test)
score = accuracy_score(y_pred,y_test)
print("Ada Boost Classifier score :",score)

Hyper Parameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# pipe_ada builds AdaBoost with the `estimator=` keyword, so the nested tree
# is addressed as ada__estimator__<param>. The older `base_estimator` name
# does not refer to that tree, and the search would reject it.
param_grid = {
    "tfidf__ngram_range": [(1, 1), (1, 2)],
    "ada__n_estimators": [50, 100, 200],
    "ada__learning_rate": [0.01, 0.1, 1],
    "ada__estimator__max_depth": [1, 2],
}

grid = GridSearchCV(
    pipe_ada,
    param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
)

grid.fit(X_train, y_train)


parameters and score

In [None]:
# At this point `grid` holds the AdaBoost search, so label it as such.
print("Best AdaBoost Params:", grid.best_params_)
print("Best AdaBoost Score:", grid.best_score_)

**Gradient Boosting Classifier :**

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting: 100 trees of depth 3, learning rate 0.1.
# random_state makes the fitted model reproducible across notebook runs.
pipe_gb = Pipeline(steps=[
    ("clean", FunctionTransformer(Clean, validate=False)),
    ("tfidf", TfidfVectorizer()),
    ("gb", GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=3,
        random_state=8,
    )),
])
pipe_gb.fit(X_train, y_train)

Model Prediction and Evaluation

In [None]:
y_pred = pipe_gb.predict(X_test)
score = accuracy_score(y_pred,y_test)
print("Gradient Boosting Classifier score :",score)

Hyper Parameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space: n-gram range plus the main boosting hyper-parameters.
param_grid = dict(
    tfidf__ngram_range=[(1, 1), (1, 2)],
    gb__n_estimators=[100, 200, 300],
    gb__learning_rate=[0.01, 0.1, 0.2],
    gb__max_depth=[3, 5, 7],
    gb__subsample=[0.8, 1.0],
)

# 5-fold accuracy, using every available core.
grid = GridSearchCV(
    pipe_gb,
    param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)

grid.fit(X_train, y_train)


Best Parameters and Score

In [None]:
# At this point `grid` holds the gradient-boosting search, so label it as such.
print("Best Gradient Boosting Params:", grid.best_params_)
print("Best Gradient Boosting Score:", grid.best_score_)

**Extreme Gradient Boosting Classifier :**

In [None]:
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

# XGBClassifier needs integer class ids (0, 1), so the string labels are
# encoded first; `le` is kept to map predictions back afterwards.
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)

# Same cleaning + TF-IDF front end as the other models, with a default
# XGBClassifier as the final step.
pipe_xgb = Pipeline([
    ("clean", FunctionTransformer(Clean, validate=False)),
    ("tfidf", TfidfVectorizer()),
    ("xgb", XGBClassifier()),
])
pipe_xgb.fit(X_train, y_train_encoded)

Model Prediction and Evaluation

In [None]:
# 1. Predict numeric values (0, 1)
y_pred_num = pipe_xgb.predict(X_test)

# 2. Convert numeric predictions back to original 'NO'/'YES' labels
y_pred = le.inverse_transform(y_pred_num)

# 3. Calculate score using the converted labels
from sklearn.metrics import accuracy_score
score = accuracy_score(y_test, y_pred)
print(f"XGBoost Accuracy: {score}")
print(f"Sample predictions: {y_pred[:5]}")

Hyper parameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV
import numpy as np

# Search space: n-gram range plus the main XGBoost hyper-parameters.
param_grid = {
    "tfidf__ngram_range": [(1, 1), (1, 2)],
    "xgb__n_estimators": [100, 200, 300],
    "xgb__learning_rate": [0.01, 0.1, 0.2],
    "xgb__max_depth": [3, 5, 7],
    "xgb__subsample": [0.8, 1.0],
    "xgb__colsample_bytree": [0.8, 1.0],
}

grid = GridSearchCV(
    pipe_xgb,
    param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
)

# XGBoost requires numeric labels, so the search is fitted on the encoded
# target (as pipe_xgb itself was), not on the "NO"/"YES" strings in y_train.
grid.fit(X_train, y_train_encoded)


To get Best Score and Parameters

In [None]:
# At this point `grid` holds the XGBoost search, so label it as such.
print("Best XGBoost Params:", grid.best_params_)
print("Best XGBoost Score:", grid.best_score_)

**Pickle:**
Pickle is a Python module used to serialize and deserialize machine learning objects such as models and pipelines. In machine learning projects, pickle allows us to save a trained model to a file and reuse it later without retraining, which saves time and computational resources.

In [None]:
import pickle
# Serialise the fitted SVM pipeline to "pipe_svm.pkl" in the working directory.
# NOTE(review): this saves `pipe_svm` as fitted in its own cell, not the
# grid-searched `model`. Pickle stores the `Clean` function by reference, so
# the code that loads this file must define or import `Clean` first.
with open("pipe_svm.pkl","wb") as file:
  pickle.dump(pipe_svm,file)