<a href="https://colab.research.google.com/github/Muyiiwaa/machine_learning_notes/blob/master/natural_language_processing_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
import kagglehub
import torch
from torch import nn, optim
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import f1_score, precision_score, recall_score
import wandb
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [2]:
# Fetch the latest version of the Kaggle dataset and report where it landed.
path = kagglehub.dataset_download(
    "suchintikasarkar/sentiment-analysis-for-mental-health"
)

print(f"Path to dataset files: {path}")

Path to dataset files: /kaggle/input/sentiment-analysis-for-mental-health


In [3]:
# Locate the dataset's CSV explicitly. os.listdir() returns entries in
# arbitrary order, so indexing [0] is only safe while the folder holds
# exactly one file.
csv_files = sorted(name for name in os.listdir(path) if name.lower().endswith('.csv'))
if not csv_files:
    raise FileNotFoundError(f'No CSV file found in {path!r}')
data_url = os.path.join(path, csv_files[0])
data_url

'/kaggle/input/sentiment-analysis-for-mental-health/Combined Data.csv'

In [4]:
# Read the raw CSV into a DataFrame and preview the first rows.
data = pd.read_csv(filepath_or_buffer=data_url)
data.head(5)

Unnamed: 0.1,Unnamed: 0,statement,status
0,0,oh my gosh,Anxiety
1,1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,3,I've shifted my focus to something else but I'...,Anxiety
4,4,"I'm restless and restless, it's been a month n...",Anxiety


In [5]:
# Distinct class names present in the raw `status` column.
pd.unique(data['status'])

array(['Anxiety', 'Normal', 'Depression', 'Suicidal', 'Stress', 'Bipolar',
       'Personality disorder'], dtype=object)

In [6]:
# Keep only the text column and its class column.
data = data.loc[:, ['statement', 'status']]
data.head()

Unnamed: 0,statement,status
0,oh my gosh,Anxiety
1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,I've shifted my focus to something else but I'...,Anxiety
4,"I'm restless and restless, it's been a month n...",Anxiety


In [7]:
# Use the generic column names `text` / `label` from here on.
column_names = {'statement': 'text', 'status': 'label'}
data = data.rename(columns=column_names)

data.head()

Unnamed: 0,text,label
0,oh my gosh,Anxiety
1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,I've shifted my focus to something else but I'...,Anxiety
4,"I'm restless and restless, it's been a month n...",Anxiety


In [8]:
# Replace the class names in `label` with integer ids; `encoder` keeps the
# fitted mapping.
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(data['label'])

data = data.assign(label=encoded_labels)
data.head()

Unnamed: 0,text,label
0,oh my gosh,0
1,"trouble sleeping, confused mind, restless hear...",0
2,"All wrong, back off dear, forward doubt. Stay ...",0
3,I've shifted my focus to something else but I'...,0
4,"I'm restless and restless, it's been a month n...",0


In [9]:
# Distinct integer ids now stored in `label`.
pd.unique(data['label'])

array([0, 3, 2, 6, 5, 1, 4])

In [10]:
# Load the tokenizer belonging to the checkpoint named by `model_uri`.
model_uri = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_uri)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [11]:
# Hold out 20% of the rows for evaluation, stratified on the encoded label.

train_df, test_df = train_test_split(
    data,
    test_size=0.2,
    random_state=23,
    stratify=data['label'],
)

In [45]:
# setup the data object


class MentalData(Dataset):
  """Map-style dataset that tokenizes one text/label row per lookup.

  Args:
    dataframe: frame with a `text` column and a `label` column.
    tokenizer: callable tokenizer; it is invoked with `return_tensors='pt'`
      and must return a mapping holding `input_ids` and `attention_mask`.
    max_length: every sequence is padded or truncated to this many tokens.
  """

  def __init__(self, dataframe:pd.DataFrame, tokenizer, max_length=128):
    # astype('str') guarantees the tokenizer only ever receives strings; note
    # that non-string entries (including missing values) become literal text.
    self.texts = dataframe['text'].astype('str').to_list()
    self.labels = dataframe['label'].to_list()
    self.tokenizer = tokenizer
    self.max_length = max_length

  def __len__(self):
    """Return the number of rows in the dataset."""
    return len(self.texts)

  def __getitem__(self, index):
    """Tokenize row `index` and return its tensors.

    Returns:
      dict with 1-D `input_ids` and `attention_mask` tensors and a scalar
      `label` tensor of dtype `torch.long`.
    """
    text = self.texts[index]
    label = self.labels[index]

    # Call the tokenizer directly: `encode_plus` is deprecated in favour of
    # `__call__`, which accepts the same keyword arguments.
    encoding = self.tokenizer(
        text,
        add_special_tokens = True,
        return_tensors = 'pt',
        padding = 'max_length',
        truncation = True,
        max_length = self.max_length
    )

    # The tokenizer returns a leading batch axis of size 1; drop it so the
    # DataLoader can add its own batch axis when collating.
    input_ids = encoding['input_ids'].squeeze(0)
    attention_mask = encoding['attention_mask'].squeeze(0)

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'label': torch.tensor(label, dtype = torch.long)
    }

In [46]:
# Wrap each split in a MentalData dataset that shares the same tokenizer.
train_data, test_data = (
    MentalData(split_frame, tokenizer) for split_frame in (train_df, test_df)
)

In [47]:
# Spot-check one item: expect `input_ids`, `attention_mask` and `label` tensors.
train_data[17]

{'input_ids': tensor([  101,  2893, 15035,  1077,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

In [48]:
# Show the first raw text for comparison with the tokenized sample.
data['text'].iloc[0]

'oh my gosh'

In [49]:
# init the model and model hyperparameters.

# Seed torch before building the model: the classification head is newly
# initialized rather than loaded from the checkpoint, so without a seed each
# fresh kernel starts from different weights.
torch.manual_seed(23)

# Size the classification head from the fitted label encoder instead of
# hard-coding the class count.
num_labels = len(encoder.classes_)

model = AutoModelForSequenceClassification.from_pretrained(model_uri, num_labels = num_labels)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
optimizer = optim.AdamW(params = model.parameters(), lr = 1e-4, weight_decay = 0.01)
# StepLR multiplies the learning rate by `gamma` every `step_size` calls to
# scheduler.step().
scheduler = optim.lr_scheduler.StepLR(optimizer=optimizer, step_size = 2, gamma = 0.1)
EPOCHS = 5
BATCH_SIZE = 12

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [50]:
# Shuffle only the training batches; evaluation gains nothing from a random
# order, and a fixed order keeps the per-batch test losses reproducible.
train_loader = DataLoader(dataset = train_data, batch_size=BATCH_SIZE, shuffle = True)
test_loader = DataLoader(dataset = test_data, batch_size=BATCH_SIZE, shuffle = False)

In [51]:
# setup the training and validation loop

def training(epoch: int, average: str = 'macro') -> tuple[float, float, float, float]:
  """Run one optimisation pass over `train_loader` and report its metrics.

  Uses the notebook-level `model`, `optimizer`, `scheduler`, `train_loader`
  and `device`.

  Args:
    epoch: zero-based epoch index; shown one-based in the progress bar and
      in the printed summary.
    average: averaging mode passed to the sklearn metrics. The labels are
      multiclass, so sklearn's default `'binary'` would raise a ValueError.

  Returns:
    (mean batch loss, f1, precision, recall) for the epoch.
  """
  epoch = epoch + 1
  loss_list, pred_list, label_list = [],[],[]
  model.train()
  train_progress = tqdm(train_loader, desc= f'Training epoch: {epoch}')
  for batch in train_progress:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    label = batch['label'].to(device)

    # Passing `labels` makes the model return the loss alongside the logits.
    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=label)
    loss = outputs.loss

    # backprop
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # unpack the important metrics
    batch_loss = loss.item()
    loss_list.append(batch_loss)
    label_list.extend(label.tolist())
    pred_list.extend(outputs.logits.argmax(dim=1).tolist())

    train_progress.set_postfix(loss=batch_loss)

  # compute the metrics
  final_loss = sum(loss_list)/len(loss_list)
  # zero_division=0 scores a class that was never predicted as 0 instead of
  # emitting a warning for it.
  f1 = f1_score(label_list, pred_list, average=average, zero_division=0)
  precision = precision_score(label_list, pred_list, average=average, zero_division=0)
  recall = recall_score(label_list, pred_list, average=average, zero_division=0)

  # The learning-rate schedule advances once per epoch, not once per batch.
  scheduler.step()

  print(f""" Completed training epoch: {epoch}: \n
  ==============================================
  epoch loss: {final_loss}
  epoch_f1: {f1}
  epoch_precision: {precision}
  epoch_recall: {recall}
  """)

  return final_loss, f1, precision, recall


In [52]:
# define the validation loop

def validate(epoch: int, average: str = 'macro') -> tuple[float, float, float, float]:
  """Evaluate the model on `test_loader` without updating any weights.

  Uses the notebook-level `model`, `test_loader` and `device`.

  Args:
    epoch: zero-based epoch index; shown one-based in the progress bar and
      in the printed summary.
    average: averaging mode passed to the sklearn metrics. The labels are
      multiclass, so sklearn's default `'binary'` would raise a ValueError.

  Returns:
    (mean batch loss, f1, precision, recall) over the test loader.
  """
  epoch = epoch + 1
  model.eval()
  loss_list, pred_list, label_list = [],[],[]
  # no_grad skips gradient tracking, which evaluation does not need.
  with torch.no_grad():
    test_progress = tqdm(test_loader, desc= f'testing epoch: {epoch}')
    for batch in test_progress:
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      label = batch['label'].to(device)

      # Passing `labels` makes the model return the loss alongside the logits.
      outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=label)
      batch_loss = outputs.loss.item()

      loss_list.append(batch_loss)
      label_list.extend(label.tolist())
      pred_list.extend(outputs.logits.argmax(dim=1).tolist())

      test_progress.set_postfix(loss=batch_loss)

  # compute the metrics
  final_loss = sum(loss_list)/len(loss_list)
  # zero_division=0 scores a class that was never predicted as 0 instead of
  # emitting a warning for it.
  f1 = f1_score(label_list, pred_list, average=average, zero_division=0)
  precision = precision_score(label_list, pred_list, average=average, zero_division=0)
  recall = recall_score(label_list, pred_list, average=average, zero_division=0)

  print(f""" Completed testing epoch: {epoch}: \n
  ==============================================
  epoch loss: {final_loss}
  epoch_f1: {f1}
  epoch_precision: {precision}
  epoch_recall: {recall}
  """)

  return final_loss, f1, precision, recall

In [None]:
# Train and evaluate for EPOCHS epochs, logging each epoch's metrics to
# Weights & Biases.

# set up Weights & Biases tracking
# NOTE(review): 'Classificatio' in the project name looks like a typo; it is
# left unchanged so runs keep landing in the same W&B project. Confirm before
# renaming.
run = wandb.init(
    project = "Mental Health Text Classificatio Project",
    config = {
        'model' : model_uri,
        # log the device as plain text rather than as a torch.device object
        'device' : str(device),
        'optimizer' : 'adamW',
        'scheduler' : '{step_size : 2, gamma : 0.1}',
        'epochs': EPOCHS,
        'batch_size': BATCH_SIZE,
        'learning rate': 1e-4
    })

# Close the run even if an epoch fails, so it is not left open.
try:
  for epoch in range(EPOCHS):
    train_loss, train_f1, train_precision, train_recall = training(epoch=epoch)
    print(f'Now testing...')
    test_loss, test_f1, test_precision, test_recall = validate(epoch=epoch)
    run.log(
        {
            'epoch':epoch,
            'train_loss': train_loss,
            'train_f1': train_f1,
            'train_recall': train_recall,
            'train_precision': train_precision,
            'test_loss': test_loss,
            'test_f1': test_f1,
            'test_recall': test_recall,
            'test_precision': test_precision,
        })
finally:
  run.finish()

https://www.youtube.com/playlist?list=PLf43guw17cen5G8c3-Emt5v_GGck4LcIn