In [None]:
pip install datasets


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import dataloader
from datasets import Dataset
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
import kagglehub

# Kaggle handle of the dataset this notebook works with.
DATASET_HANDLE = "suchintikasarkar/sentiment-analysis-for-mental-health"

# Download the latest version and report where the files were placed.
path = kagglehub.dataset_download(DATASET_HANDLE)
print("Path to dataset files:", path)

In [None]:
import os

# Read from the directory that kagglehub.dataset_download() returned (`path`)
# instead of a hard-coded "/kaggle/input/..." location, so the notebook also
# works on runtimes where the download is stored somewhere else.
dataset_path = path

# Load the CSV file into a pandas DataFrame.
data = pd.read_csv(os.path.join(dataset_path, 'Combined Data.csv'))
data

In [None]:
# Remove rows with missing values and the 'Unnamed: 0' column (presumably the
# row index that was written into the CSV - confirm).
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell re-runnable: dropping the column in place raised KeyError on
# a second execution because the column was already gone.
data = data.dropna().drop(columns=['Unnamed: 0'], errors='ignore')
data

In [None]:
# Work on a fixed-size, seeded random subset to keep fine-tuning time manageable.
SAMPLE_SIZE = 6000
SAMPLE_SEED = 42

data = data.sample(n=SAMPLE_SIZE, random_state=SAMPLE_SEED)
data = data.reset_index(drop=True)
data.shape

DATA PREPROCESSING

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK stop-word corpus (needed before stopwords.words() can be read).
nltk.download('stopwords')

# Keep the English stop words in a set so membership tests in clean_statement are cheap.
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

def clean_statement(text):
  """Normalize a raw statement before tokenization.

  The text is lower-cased, apostrophes are deleted, every other character that
  is not an ASCII letter or whitespace is replaced by a space, and words found
  in the module-level `stop_words` set are dropped.

  Args:
    text: The raw statement (str).

  Returns:
    The remaining words joined by single spaces.
  """
  text=text.lower()

  # Delete apostrophes so contractions stay one token ("don't" -> "dont").
  text=re.sub(r"['’]",'',text)

  # Replace the remaining non-letters with a space rather than nothing, so
  # words separated only by punctuation or digits ("sad,tired") are not fused
  # into a single token ("sadtired").
  text=re.sub(r"[^a-zA-Z\s]",' ',text)

  # NOTE(review): NLTK's English stop list is believed to contain negations
  # such as "not" - confirm that dropping them is intended for sentiment data.
  words=[word for word in text.split() if word not in stop_words]

  return " ".join(words)

# Clean every statement element-wise and store the result back in the same column.
cleaned_statements = data['statement'].map(clean_statement)
data['statement'] = cleaned_statements
data

In [None]:
# Number of rows per status class, to see how imbalanced the sample is.
status_counts = data['status'].value_counts()
status_counts

The classes are imbalanced, so the dataset needs to be balanced. Two common ways to do this are:
1) oversampling
2) undersampling

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Separate the features from the target column before resampling.
y = data['status']
x = data.drop(columns='status')

# Randomly resample until the classes are balanced.
# NOTE(review): this runs on the full frame before the train/test split further
# down, so copies of the same statement can end up in both splits - consider
# splitting first and oversampling only the training part.
ros = RandomOverSampler(sampling_strategy='auto', random_state=42)
x_resampled, y_resampled = ros.fit_resample(x, y)

# Stitch features and target back together into one frame.
data = pd.concat([x_resampled, y_resampled], axis=1)

print(data['status'].value_counts())

Encoding

In [None]:
# Learn the status -> integer id mapping, then add the ids as a 'label' column.
label_encoder = LabelEncoder().fit(data['status'])
data['label'] = label_encoder.transform(data['status'])
data


In [None]:
# Hold out 20% of the rows for testing.
# random_state makes the split reproducible across kernel restarts (the sampling
# steps above are seeded too), and stratify keeps the class proportions the
# same in both splits.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data['statement'],
    data['label'],
    test_size=0.2,
    random_state=42,
    stratify=data['label'],
)

Tokenization (LLM): using the pre-trained BERT tokenizer

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Same tokenizer settings for both splits: pad, and truncate to at most 200 tokens.
encode_kwargs = dict(padding=True, truncation=True, max_length=200)

train_encodings = tokenizer(list(train_texts), **encode_kwargs)
test_encodings = tokenizer(list(test_texts), **encode_kwargs)

In [None]:
def _build_dataset(encodings, labels):
  """Wrap tokenizer output and integer labels in a `datasets.Dataset`."""
  columns = {
      'input_ids': encodings['input_ids'],
      'attention_mask': encodings['attention_mask'],
      'labels': labels.to_list(),
  }
  return Dataset.from_dict(columns)


train_dataset = _build_dataset(train_encodings, train_labels)
test_dataset = _build_dataset(test_encodings, test_labels)

Fine Tuning the Model

In [None]:
# Pre-trained BERT with a fresh classification head, one output per status class.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_encoder.classes_),
)

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",           # evaluate once per epoch ...
    save_strategy="epoch",           # ... and checkpoint on the same schedule (needed for load_best_model_at_end)
    learning_rate=2e-5,
    # Mixed precision only works on a GPU; enabling it unconditionally makes
    # TrainingArguments raise on a CPU-only runtime, so gate it on CUDA.
    fp16=torch.cuda.is_available(),
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=1,
    dataloader_num_workers=2,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    lr_scheduler_type="linear",
    warmup_steps=500,
    load_best_model_at_end=True,     # restore the checkpoint with the lowest eval_loss after training
    metric_for_best_model="eval_loss",
    save_total_limit=3
)

# NOTE(review): the test split doubles as the evaluation set used to pick the
# best checkpoint, so the later test metrics are optimistic - consider carving
# out a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
trainer.train()

In [None]:
# Score the test split and pick the highest-scoring class for every row.
prediction_output = trainer.predict(test_dataset)
predictions = prediction_output.predictions
predicted_labels = np.argmax(predictions, axis=1)

class_names = label_encoder.classes_
print(classification_report(test_labels, predicted_labels, target_names=class_names))

# Confusion matrix: rows are the true classes, columns the predicted ones.
cm = confusion_matrix(test_labels, predicted_labels)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
# Directory that receives the fine-tuned model.
model_output_dir = "/content/drive/MyDrive/saved_mental_bert"
trainer.save_model(model_output_dir)

In [None]:
# Directory that receives the tokenizer files (kept separate from the model directory).
tokenizer_output_dir = "/content/drive/MyDrive/saved_mental_status_bert"
tokenizer.save_pretrained(tokenizer_output_dir)

In [None]:
import pickle

from transformers import AutoModelForSequenceClassification, AutoTokenizer

LABEL_ENCODER_PATH = '/content/drive/MyDrive/label_encoder.pkl'

# Persist the fitted label encoder. The `with` block guarantees the file is
# flushed and closed; the bare open() used before never closed its handle.
with open(LABEL_ENCODER_PATH, 'wb') as encoder_file:
  pickle.dump(label_encoder, encoder_file)

# Reload everything from disk to check that the saved artefacts are usable on their own.
model = AutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/saved_mental_bert")
tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/saved_mental_status_bert")

# SECURITY: pickle.load can execute arbitrary code - only load files this notebook wrote.
with open(LABEL_ENCODER_PATH, 'rb') as encoder_file:
  label_encoder = pickle.load(encoder_file)

We are now testing the model


In [None]:
import torch
def dectection(text):
  """Predict the status label for a single raw statement.

  The statement is cleaned with `clean_statement`, tokenized with the
  module-level `tokenizer`, scored by the module-level `model`, and the
  arg-max class id is mapped back to its name with `label_encoder`.

  Args:
    text: The raw statement (str).

  Returns:
    The predicted status label as stored in `label_encoder`.
  """
  text=clean_statement(text)
  inputs=tokenizer(text, padding=True, truncation=True, max_length=200, return_tensors='pt')
  # The tokenizer output lives on the CPU; move it to wherever the model is so
  # this also works when `model` sits on the GPU.
  inputs=inputs.to(model.device)

  # Inference only: switch off dropout and skip building the autograd graph.
  model.eval()
  with torch.no_grad():
    logits=model(**inputs).logits
  predicted_label=torch.argmax(logits, dim=1).item()

  return label_encoder.inverse_transform([predicted_label])[0]

# Smoke test: classify one hand-written statement.
text = "i am not okay i dont feel nice"
dectection(text)