In [9]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
import torch
from torch.utils.data import DataLoader
from imblearn.over_sampling import RandomOverSampler
import os


from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix , classification_report
from transformers import Trainer , TrainingArguments , BertTokenizer , BertForSequenceClassification

import re
from nltk.corpus import stopwords

stopwords = set(stopwords.words("english"))

from transformers import AutoModelForSequenceClassification , AutoTokenizer

os.environ["WANDB_MODE"] = "offline"


In [None]:
path = r"/kaggle/input/sentiment-analysis-for-mental-health/Combined Data.csv"

if os.path.exists(path) :
    df = pd.read_csv(path)
    print("Successfully loaded file.")

else :
    print("Sorry, your file was not found.")
    

In [None]:
df.head(10)

In [None]:
df.shape

In [None]:
df = df.dropna()

In [None]:
df

In [None]:
df = df.sample(n=6000 , random_state=42).reset_index(drop=True)
df

In [None]:
df.shape

In [None]:
df = df.drop(columns=["Unnamed: 0"] , axis=1)
df

In [None]:
df.describe(include="all").T

In [None]:
df.info()

In [None]:
pd.DataFrame(df["status"].value_counts())

# Preprocessing

In [None]:
def preprocessing(text) :
    
    text = text.lower()

    text = re.sub(r"[^\w\s]" , "" , text)

    text = re.sub(r"\d+" , "" , text)

    words = text.split()

    words = [w for w in words if w not in stopwords]

    preprocessing = " ".join(words)

    return preprocessing



df["cleaned_text"] = df["statement"].apply(preprocessing)

df["statement_length"] = df["statement"].apply(len)

df["cleaned_text_length"] = df["cleaned_text"].apply(len)

In [None]:
df.head(10)

In [None]:
df["cleaned_text"][10]

In [None]:
df['status'].value_counts()

In [None]:
# Split
X = df.drop(columns=["status"] , axis=1)
y = df["status"]


In [None]:
Ra_ov_sam = RandomOverSampler(random_state=42) 
X_train_res, y_train_res = Ra_ov_sam.fit_resample(X, y) 

df = pd.concat([X_train_res, y_train_res] , axis=1)

In [None]:
pd.DataFrame(df['status'].value_counts())

In [None]:
encode = LabelEncoder()

df["status"] = encode.fit_transform(df["status"])


X = df["statement"]

y = df["status"]

In [None]:
num_labels = len(df["status"].unique())
num_labels

In [None]:
X_train , X_test , y_train , y_test = train_test_split(X , y , test_size=0.2 , random_state=42)


print(f"X Train Shape is = {X_train.shape}")
print(f"X Test Shape is = {X_test.shape}")
print(f"y Train Shape is = {y_train.shape}")
print(f"y Test Shape is = {y_test.shape}")

In [None]:
y_train.unique()

In [None]:
max([len(c) for c in df["statement"]])

# Tokenizer 

In [None]:
# Tokenization using BERT tokenizer

tok = BertTokenizer.from_pretrained("bert-base-uncased")

train_encoding = tok(list(X_train) , padding=True , truncation=True , max_length=200)
test_encodeing = tok(list(X_test) , padding=True , truncation=True , max_length=200)


In [None]:
# Convert to Dataset format

from datasets import Dataset

df_train = Dataset.from_dict({"input_ids" : train_encoding["input_ids"] ,
                              "attention_mask" : train_encoding["attention_mask"] , 
                              "labels" : y_train.tolist()})



df_test = Dataset.from_dict({"input_ids" : test_encodeing["input_ids"] , 
                             "attention_mask" : test_encodeing["attention_mask"] , 
                             "labels" : y_test.tolist()})



# Fine Tuning Bert

In [None]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased" , num_labels=num_labels)


training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    lr_scheduler_type="linear",
    warmup_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    save_total_limit=3,
    gradient_accumulation_steps=2 , 
    report_to="none"  # Disables WandB logging
)


# Trainer
trainer = Trainer(
    model=model , 
    args=training_args , 
    train_dataset=df_train , 
    eval_dataset=df_test
)


# Fine-tuning the model
trainer.train()

#  Model Evaluation


In [None]:
pred , label , _ = trainer.predict(df_test)

pred_labels = np.argmax(pred , axis=1)


In [None]:
cm = confusion_matrix(pred_labels , y_test)
cm

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(cm , annot=True , cbar=True , cmap="Blues" , xticklabels=num_labels , yticklabels=num_labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

In [None]:
print(classification_report(pred_labels , y_test , target_names=encode.classes_))

# Save and Load Model and Tokenizer

In [1]:
trainer.save_model("saved_mental_status_bert")

tok.save_pretrained("saved_mental_status_bert")

import pickle

pickle.dump(encode , open("encode.pkl" , "wb"))


NameError: name 'trainer' is not defined

In [10]:
model = AutoModelForSequenceClassification.from_pretrained("saved_mental_status_bert")

tok = AutoTokenizer.from_pretrained("saved_mental_status_bert")

encode = pickle.load(open("encode.pkl" , "rb"))


OSError: We couldn't connect to 'https://huggingface.co' to load this file, couldn't find it in the cached files and it looks like saved_mental_status_bert is not the path to a directory containing a file named config.json.
Checkout your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'.

# Detection System

In [None]:
def detection_text(text) :
    cleaned_text = preprocessing(text)

    inputs = tok(cleaned_text , return_tensors="pt" , padding=True , truncation=True , max_length=512)
    outputs = model(**inputs)
    logists = outputs.logits  # Purpose: Extracts the logits from the model outputs. Logits are the raw, unnormalized scores for each class before applying softmax.
    pred_classes = torch.argmax(logists , dim=1).item()
    return encode.inverse_transform([pred_classes])[0]


sample_texts = [
    "I feel perfectly fine today, nothing to worry about.",
    "I can't stop thinking about what will happen if things go wrong.",
    "Lately, I've been on a high, feeling like I can do anything!",
    "I'm so sad, I just can't seem to get out of bed anymore.",
    "I'm constantly thinking about how much better everyone else is doing than me.",
    "I don't think I can keep going, everything feels so hopeless.",
    "I had a really good day, spent some time with my friends.",
    "I'm overwhelmed by the idea that I might lose everything.",
    "I feel like nothing matters anymore, I just want to give up.",
    "I'm okay today, but sometimes I get really anxious for no reason."
]


for t in sample_texts :
    pred_classes = detection_text(t)
    print(f"Sentence: {t}\nPredicted class: ( {pred_classes} )\n")


# End 