In [None]:
!pip install accelerate -U

In [None]:
!pip install -q transformers datasets accelerate evaluate

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer,pipeline
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import TrainingArguments, Trainer
import torch, os
import pandas as pd
from torch.utils.data import Dataset

In [None]:
from torch import cuda

# Prefer the GPU when one is visible to torch; otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

In [None]:
# Load the raw training file and give its columns the names used by the rest of
# the notebook: 'labels' -> 'category' (class name) and 'tweets' -> 'text'.
column_renames = {'labels': 'category', 'tweets': 'text'}
df_org = pd.read_csv("train.csv").rename(columns=column_renames)
df_org.head(10)

In [None]:
# Collect the distinct class names. Whitespace is stripped BEFORE de-duplicating so
# that values differing only in surrounding spaces (e.g. "Anxious" and "Anxious ")
# collapse into a single label instead of producing duplicate entries, which would
# inflate the label count and leave unused class ids.
labels = df_org['category'].str.strip().unique().tolist()
labels

In [None]:
# Print one class name per line as a quick sanity check (the index is not needed).
for label_name in labels:
    print(label_name)

In [None]:
# Number of entries in `labels`; passed to the model as the size of its output head.
NUM_LABELS = len(labels)

# Integer class id -> class name, where the id is the position within `labels`.
id2label = dict(enumerate(labels))

# Class name -> integer class id (inverse lookup of the mapping above).
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
def _category_to_id(name):
    """Return the integer class id for a category name, ignoring surrounding whitespace."""
    return label2id[name.strip()]


# Add the integer-encoded target column next to the textual category.
df_org["labels"] = df_org["category"].map(_category_to_id)
df_org.head(5)

In [None]:
# Pre-trained checkpoint to fine-tune. The classification head is re-created with
# NUM_LABELS outputs; ignore_mismatched_sizes lets the checkpoint load even though
# its original head has a different number of classes.
model_path = 'finiteautomata/bertweet-base-emotion-analysis'
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=NUM_LABELS, id2label=id2label, label2id=label2id,
    ignore_mismatched_sizes=True)

model.to(device)

# `model_max_length` (not `max_length`) is the tokenizer setting that
# `truncation=True` falls back to when no explicit length is given.
# BERTweet was pre-trained with 128-token sequences, so longer inputs would index
# past its position embeddings; cap the tokenizer accordingly.
# NOTE(review): 128 is taken from the BERTweet documentation - confirm against
# this checkpoint's `config.max_position_embeddings`.
MAX_SEQ_LENGTH = 128
tokenizer = AutoTokenizer.from_pretrained(model_path, model_max_length=MAX_SEQ_LENGTH)

In [None]:
# Positional 50% / 25% / 25% split into train / validation / test.
# NOTE(review): rows are taken in file order without shuffling - confirm that
# train.csv is not sorted by label, otherwise the splits will be skewed.
SIZE = df_org.shape[0]
train_end = SIZE // 2
val_end = (3 * SIZE) // 4

train_rows = slice(None, train_end)
val_rows = slice(train_end, val_end)
test_rows = slice(val_end, None)

text_column = df_org["text"]
label_column = df_org["labels"]

train_texts = list(text_column.iloc[train_rows])
val_texts = list(text_column.iloc[val_rows])
test_texts = list(text_column.iloc[test_rows])

train_labels = list(label_column.iloc[train_rows])
val_labels = list(label_column.iloc[val_rows])
test_labels = list(label_column.iloc[test_rows])

In [None]:
# Tokenize each split with the same options: truncate over-long tweets and pad
# every sequence within a split to a common length.
tokenize_options = {"truncation": True, "padding": True}

train_encodings = tokenizer(train_texts, **tokenize_options)
val_encodings = tokenizer(val_texts, **tokenize_options)
test_encodings = tokenizer(test_texts, **tokenize_options)

In [None]:
class DataLoader(Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Despite its name, this is a ``torch.utils.data.Dataset`` subclass (one example
    per index), not a batching loader.
    """

    def __init__(self, encodings, labels):
        """Keep references to the encodings mapping and the parallel label sequence."""
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        The dict holds one tensor per field of ``encodings`` plus a ``'labels'``
        tensor built from ``labels[idx]``.
        """
        sample = {}
        for field, rows in self.encodings.items():
            sample[field] = torch.tensor(rows[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
# Wrap each split's encodings and labels in the dataset class defined above.
train_dataloader = DataLoader(encodings=train_encodings, labels=train_labels)

val_dataloader = DataLoader(encodings=val_encodings, labels=val_labels)

test_dataset = DataLoader(encodings=test_encodings, labels=test_labels)

In [None]:
def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision, recall and F1.

    Args:
        pred: Object exposing ``label_ids`` (the true class ids) and
            ``predictions`` (per-class scores; the arg-max over the last axis is
            taken as the predicted class id).

    Returns:
        dict with the keys ``'Accuracy'``, ``'F1'``, ``'Precision'`` and ``'Recall'``.
    """
    true_ids = pred.label_ids
    predicted_ids = pred.predictions.argmax(-1)

    # zero_division=0 yields the same scores as the default but without emitting an
    # undefined-metric warning on every evaluation when some class is never
    # predicted (common in the first evaluation steps).
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_ids, predicted_ids, average='macro', zero_division=0)
    acc = accuracy_score(true_ids, predicted_ids)

    return {
        'Accuracy': acc,
        'F1': f1,
        'Precision': precision,
        'Recall': recall
    }


In [None]:
# Hyper-parameters and bookkeeping for the fine-tuning run.
training_args = TrainingArguments(
    output_dir='/content/',            # checkpoints are written here
    do_train=True,
    do_eval=True,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    warmup_steps=10,
    weight_decay=0.01,
    logging_strategy='steps',
    logging_dir='./log',
    logging_steps=50,
    # NOTE(review): recent transformers releases rename this argument to
    # `eval_strategy`; confirm against the installed version.
    evaluation_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    # Mixed precision needs a CUDA device. Enabling it unconditionally fails on a
    # CPU-only runtime, which this notebook otherwise supports via its CPU fallback.
    fp16=torch.cuda.is_available()
)

In [None]:
# Bundle the model, the training configuration, the train/validation datasets and
# the metric callback, then hand them to the Trainer in one go.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataloader,
    eval_dataset=val_dataloader,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

In [None]:
# Run the fine-tuning loop; the call's return value is shown as the cell output.
trainer.train()

In [None]:
# Evaluate the fine-tuned model on every split and tabulate the results,
# one row per split, keeping only the first five metric columns.
splits_by_name = {
    "train": train_dataloader,
    "val": val_dataloader,
    "test": test_dataset,
}
q = [trainer.evaluate(eval_dataset=split) for split in splits_by_name.values()]

pd.DataFrame(q, index=list(splits_by_name)).iloc[:, :5]

## Save the model

In [None]:
# Directory that receives the fine-tuned artefacts (note: this rebinds `model_path`,
# which earlier held the pre-trained checkpoint name).
model_path = "mental-health-tweets-classification-model-1"
# Persist the model via the trainer, then the tokenizer into the same directory.
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

## Load the model

In [None]:
# Reload the artefacts from the directory this notebook saves to
# ("...-model-1"); the previous "...-model-2" path is never written by the notebook,
# so a fresh Restart & Run All would fail or silently load a stale model.
saved_model_dir = "mental-health-tweets-classification-model-1"
tokenizer = AutoTokenizer.from_pretrained(saved_model_dir)
model = AutoModelForSequenceClassification.from_pretrained(saved_model_dir)

# Text-classification pipeline wrapping the reloaded model and tokenizer.
nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

## Testing some tweets

In [None]:
# Pull a few tweets out of the frame to use as inference examples.
# NOTE(review): the variable names suggest rows 0-3 carry these labels - verify
# against the table displayed below.
stressed_example = df_org.loc[0, 'text']
anxious_example = df_org.loc[1, 'text']
normal_example = df_org.loc[2, 'text']
lonely_example = df_org.loc[3, 'text']
df_org.head(5)