In [1]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv("Combined Data.csv")

In [4]:
df

Unnamed: 0.1,Unnamed: 0,statement,status
0,0,oh my gosh,Anxiety
1,1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,3,I've shifted my focus to something else but I'...,Anxiety
4,4,"I'm restless and restless, it's been a month n...",Anxiety
...,...,...,...
53038,53038,Nobody takes me seriously I’ve (24M) dealt wit...,Anxiety
53039,53039,"selfishness ""I don't feel very good, it's lik...",Anxiety
53040,53040,Is there any way to sleep better? I can't slee...,Anxiety
53041,53041,"Public speaking tips? Hi, all. I have to give ...",Anxiety


In [5]:
df = df.iloc[:,1:]

In [6]:
df.head()

Unnamed: 0,statement,status
0,oh my gosh,Anxiety
1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,I've shifted my focus to something else but I'...,Anxiety
4,"I'm restless and restless, it's been a month n...",Anxiety


In [7]:
# Convert text labels to numerical IDs
label_mapping = {label: i for i, label in enumerate(df["status"].unique())}
label_mapping

{'Anxiety': 0,
 'Normal': 1,
 'Depression': 2,
 'Suicidal': 3,
 'Stress': 4,
 'Bipolar': 5,
 'Personality disorder': 6}

In [8]:
df["status"] = df["status"].map(label_mapping) 

In [9]:
df.head()

Unnamed: 0,statement,status
0,oh my gosh,0
1,"trouble sleeping, confused mind, restless hear...",0
2,"All wrong, back off dear, forward doubt. Stay ...",0
3,I've shifted my focus to something else but I'...,0
4,"I'm restless and restless, it's been a month n...",0


In [10]:
df.isnull().sum()

statement    362
status         0
dtype: int64

In [11]:
df[df["statement"].isna()]

Unnamed: 0,statement,status
293,,0
572,,0
595,,0
1539,,1
2448,,1
...,...,...
52838,,0
52870,,0
52936,,0
53010,,0


In [12]:
df = df.dropna()

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 52681 entries, 0 to 53042
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   statement  52681 non-null  object
 1   status     52681 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.2+ MB


In [14]:
df["statement"] = df["statement"].str.lower()

In [15]:
from datasets import Dataset

In [16]:
dataset = Dataset.from_pandas(df)

In [17]:
dataset = dataset.train_test_split(test_size=0.2)

In [18]:
train_data = dataset['train']
test_data = dataset["test"]

In [19]:
from transformers import AutoTokenizer

In [20]:
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

In [23]:
def tokenize_data(example):
    tokenized_inputs = tokenizer(example["statement"], padding="max_length", truncation=True)
    tokenized_inputs["labels"] = example["status"]  # Ensure labels are included
    return tokenized_inputs

In [24]:
train_data = train_data.map(tokenize_data, batched=True)
test_data = test_data.map(tokenize_data, batched=True)

Map:   0%|          | 0/42144 [00:00<?, ? examples/s]

Map:   0%|          | 0/10537 [00:00<?, ? examples/s]

In [25]:
from transformers import AutoModelForSequenceClassification

In [26]:
num_labels = len(label_mapping)

In [27]:
# Load RoBERTa model with classification head
model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=num_labels)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
from transformers import TrainingArguments, Trainer





In [29]:
print(train_data[0].keys())

dict_keys(['statement', 'status', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'])


In [30]:
import torch

print(torch.cuda.is_available())  # True if GPU is available
print(torch.cuda.device_count())  # Number of GPUs available
print(torch.cuda.get_device_name(0))  # Name of the GPU


True
1
NVIDIA GeForce RTX 4050 Laptop GPU


In [31]:
# for CPU
# training_args = TrainingArguments(
#     output_dir="./results",
#     eval_strategy="epoch",
#     save_strategy="epoch",
#     learning_rate=2e-5,
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
#     num_train_epochs=3,
#     weight_decay=0.01,
# )

# for GPU
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=6,
    weight_decay=0.01,
    logging_dir="./logs",
    fp16=True,  # Enable mixed precision (optional, speeds up training)
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
)

In [32]:
device = torch.device("cuda")
model.to(device)  # Move model to GPU

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [33]:
from accelerate import Accelerator
from accelerate.state import AcceleratorState

AcceleratorState._reset_state() 
accelerator = Accelerator()

In [34]:
trainer.train()

Step,Training Loss
500,0.8181
1000,0.6166
1500,0.559
2000,0.5396
2500,0.5094
3000,0.4433
3500,0.4209
4000,0.4204
4500,0.4236
5000,0.4049


TrainOutput(global_step=15804, training_loss=0.3191522855572808, metrics={'train_runtime': 10046.6464, 'train_samples_per_second': 25.169, 'train_steps_per_second': 1.573, 'total_flos': 6.65343006916608e+16, 'train_loss': 0.3191522855572808, 'epoch': 6.0})

In [35]:
model.to("cpu")

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [36]:
import torch

def predict_emotion(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    predicted_label = torch.argmax(probs).item()
    return list(label_mapping.keys())[predicted_label]  # Convert back to label

In [2]:
while(1):
    a = input("Enter text: ")
    if a == "exit":
        break
    print(f"the statment is {predict_emotion(a)}")
    

KeyboardInterrupt: Interrupted by user

In [39]:
model.save_pretrained("mental_health_analysis")
tokenizer.save_pretrained("mental_health_analysis")

('mental_health_analysis\\tokenizer_config.json',
 'mental_health_analysis\\special_tokens_map.json',
 'mental_health_analysis\\vocab.json',
 'mental_health_analysis\\merges.txt',
 'mental_health_analysis\\added_tokens.json',
 'mental_health_analysis\\tokenizer.json')

In [40]:
import json

with open("label_map.json", "w") as f:
    json.dump(label_mapping, f)