In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
!pip install datasets
from datasets import Dataset, DatasetDict
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
!pip install accelerate -U



In [2]:
df = pd.read_csv('Combined Data.csv')

In [3]:
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix

In [4]:
def compute_metric(logits_and_labels):
  logits, labels=logits_and_labels
  predictions=np.argmax(logits, axis=-1)
  acc=np.mean(predictions==labels)
  f1=f1_score(labels,predictions,average="macro")
  return({"acc":acc,"f1":f1})

### Pre-processing

In [5]:
df.head()

Unnamed: 0,index,statement,status
0,0,oh my gosh,Anxiety
1,1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,3,I've shifted my focus to something else but I'...,Anxiety
4,4,"I'm restless and restless, it's been a month n...",Anxiety


Statistics of the data

In [6]:
status_counts = df['status'].value_counts()
print(status_counts)

status
Normal                  16351
Depression              15404
Suicidal                10653
Anxiety                  3888
Bipolar                  2877
Stress                   2669
Personality disorder     1201
Name: count, dtype: int64


Converting sentiments into numbers

In [7]:
status_mapping = {'Normal': 0, 'Depression': 1, 'Suicidal': 2, 'Anxiety': 3, 'Bipolar': 4, 'Stress': 5, 'Personality disorder': 6}
df['status'] = df['status'].map(status_mapping)
df.head()

Unnamed: 0,index,statement,status
0,0,oh my gosh,3
1,1,"trouble sleeping, confused mind, restless hear...",3
2,2,"All wrong, back off dear, forward doubt. Stay ...",3
3,3,I've shifted my focus to something else but I'...,3
4,4,"I'm restless and restless, it's been a month n...",3


Removing special characters to clean the data

In [8]:
# Function to remove special characters
def preprocess_text(text):
    #convert to string
    text = str(text)

    # Remove special characters and punctuation
    text = re.sub(r"[^\w\s]", " ", text)

    # Remove single characters
    text = re.sub(r"\b[a-zA-Z]\b", " ", text)

    # Remove HTML tags
    text = re.sub(r"<[^>]*>", " ", text)

    # Lowercase the text
    text = text.lower()

    # Remove extra whitespace
    text = re.sub(r"\s+", " ", text)

    # Trim leading and trailing spaces
    text = text.strip()

    return text

# Clean the data to remove special characters and HTML tags
df['cleaned_text'] = df['statement'].apply(preprocess_text)

# Rename rows to text and label
df.rename(columns={'cleaned_text': 'text', 'status': 'label'}, inplace=True)
df.head()

Unnamed: 0,index,statement,label,text
0,0,oh my gosh,3,oh my gosh
1,1,"trouble sleeping, confused mind, restless hear...",3,trouble sleeping confused mind restless heart ...
2,2,"All wrong, back off dear, forward doubt. Stay ...",3,all wrong back off dear forward doubt stay in ...
3,3,I've shifted my focus to something else but I'...,3,ve shifted my focus to something else but stil...
4,4,"I'm restless and restless, it's been a month n...",3,restless and restless it been month now boy wh...


## Training the model

Splitting the data into train and test datasets

In [9]:
text_column = 'text'
label_column = 'label'
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

Creating the dataset

In [10]:
train_dataset = Dataset.from_pandas(train_df[:15000])
test_dataset = Dataset.from_pandas(test_df[:3000])
dataset = DatasetDict({'train': train_dataset, 'test': test_dataset})

Tokenizing the data

In [12]:
tokenizer =AutoTokenizer.from_pretrained('distilbert-base-cased')

def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Loading a pre-trained model

In [13]:
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-cased', num_labels=len(status_mapping))

model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Defining training arguments

In [14]:
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy='epoch',
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,

)

Training and evaluating

In [15]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metric
)

trainer.train()

Epoch,Training Loss,Validation Loss,Acc,F1
1,0.6713,0.60734,0.765,0.698519
2,0.4657,0.603406,0.791667,0.74079
3,0.3349,0.6272,0.792,0.744758


TrainOutput(global_step=5625, training_loss=0.5336220153808594, metrics={'train_runtime': 2543.307, 'train_samples_per_second': 17.693, 'train_steps_per_second': 2.212, 'total_flos': 5961564472320000.0, 'train_loss': 0.5336220153808594, 'epoch': 3.0})

Evaluating

In [16]:
results = trainer.evaluate()
print(results)

{'eval_loss': 0.6272001266479492, 'eval_acc': 0.792, 'eval_f1': 0.7447581368737406, 'eval_runtime': 53.4093, 'eval_samples_per_second': 56.17, 'eval_steps_per_second': 7.021, 'epoch': 3.0}
