## Google Colaboratory Notebook

In [None]:
import io
from google.colab import drive, files
import pandas as pd
import gspread
from gspread_dataframe import get_as_dataframe, set_with_dataframe
from google.colab import auth
auth.authenticate_user()  # verify your account to read files which you have access to. Make sure you have permission to read the file!
from oauth2client.client import GoogleCredentials
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn.functional as F
from torchsummary import summary
from tqdm import tqdm

from google.colab import auth
auth.authenticate_user()

import gspread
from google.auth import default
creds, _ = default()

gc = gspread.authorize(creds)

!pip install transformers
!pip install nlp
!pip install sentencepiece
!pip install emoji
!pip install pytorch_lightning
import transformers
from transformers import AutoTokenizer

In [None]:
# Mount Google Drive at the path '/content/drive'.
# (`drive` is already imported in the first cell; repeating the import is harmless.)
from google.colab import drive
drive.mount('/content/drive')

Mount Google Drive as a folder (the cell above mounts it at `/content/drive`).

In [None]:
# Upload files through Colab's upload helper; `uploaded` holds whatever
# files.upload() returns.
# NOTE(review): presumably this is how the two CSV exports read in the following
# cells are provided (they are opened by bare file name) — confirm.
from google.colab import files
uploaded = files.upload()

In [None]:
# Sample 1: read the CSV export, keep the first 696 rows, and keep only the text
# column plus Hanin's annotation column, renamed to "Labels".
hanin_label_col = "Personal (1=yes, 0=no) Hanin"
df_1 = (
    pd.read_csv("Labellisation data personnels long covid.xlsx - sample 1.csv")
    .iloc[:696]
    .loc[:, ["text", hanin_label_col]]
    .rename(columns={hanin_label_col: "Labels"})
)

In [None]:
# Sample 2: read the CSV export, keep the first 700 rows, and keep only the text
# column plus Charline's annotation column, renamed to "Labels".
charline_label_col = 'Personal (1=yes, 0=no) Charline'
df_2 = (
    pd.read_csv("Labellisation data personnels long covid.xlsx - sample 2.csv")
    .iloc[:700]
    .loc[:, ["text", charline_label_col]]
    .rename(columns={charline_label_col: "Labels"})
)

In [None]:
# Merge both annotated samples into one frame and shuffle the rows.
# `ignore_index=True` gives the concatenated frame a fresh 0..n-1 index (this
# replaces the separate reset_index call), and the fixed `random_state` makes
# the shuffle reproducible across kernel restarts.
df = (
    pd.concat([df_1, df_2], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Drop every row that contains a missing value.
df = df.dropna()
# Inspect the label types by POSITION: after dropna() the index labels 0 and 1
# may no longer exist, so the label-based lookup df["Labels"][0] could raise a
# KeyError.
print(type(df["Labels"].iloc[0]), type(df["Labels"].iloc[1]))
# Store the labels as integers.
df = df.astype({"Labels": int})

In [None]:
# Show how many rows carry each label value.
label_counts = df["Labels"].value_counts()
print(label_counts)

In [None]:
# Shuffle the rows and rebuild a contiguous 0..n-1 index.
# The fixed random_state keeps the resulting row order reproducible between runs.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Normalisation for BertTweet
from nltk.tokenize import TweetTokenizer
from emoji import demojize
import re

# Use NLTK's TweetTokenizer for the sampled Reddit text, because that text is
# similar in nature to tweets.
# normalizeTweet() looks up this module-level `tokenizer` name each time it is called.
tokenizer = TweetTokenizer()

# https://huggingface.co/vinai/bertweet-base
def normalizeToken(token):
    """Return the normalised form of a single token.

    Rules, checked in this order:
      * the typographic apostrophe "’" becomes "'" and the ellipsis "…" becomes "..."
      * tokens starting with "@" become "@USER"
      * tokens starting with "http" or "www" (case-insensitive) become "HTTPURL"
      * any other single-character token is passed through ``demojize``
      * everything else is returned unchanged

    The punctuation rules are checked first because both characters have
    length 1; when they were tested after the ``len(token) == 1`` branch they
    could never be reached.
    """
    if token == "’":
        return "'"
    if token == "…":
        return "..."
    if token.startswith("@"):
        return "@USER"
    lowered = token.lower()
    if lowered.startswith(("http", "www")):
        return "HTTPURL"
    if len(token) == 1:
        return demojize(token)
    return token

def normalizeTweet(tweet):
    """Normalise a text string for BERTweet and return it as one string.

    The text is tokenised with the module-level ``tokenizer``, each token is
    mapped through ``normalizeToken``, and the tokens are re-joined with single
    spaces. A fixed sequence of string replacements and three regex
    substitutions is then applied, and finally all runs of whitespace are
    collapsed to single spaces.
    """
    cleaned = tweet.replace("’", "'").replace("…", "...")
    pieces = [normalizeToken(piece) for piece in tokenizer.tokenize(cleaned)]
    normTweet = " ".join(pieces)

    # Literal replacements; they are applied strictly in this order.
    literal_rules = (
        ("cannot ", "can not "),
        ("n't ", " n't "),
        ("n 't ", " n't "),
        ("ca n't", "can't"),
        ("ai n't", "ain't"),
        ("'m ", " 'm "),
        ("'re ", " 're "),
        ("'s ", " 's "),
        ("'ll ", " 'll "),
        ("'d ", " 'd "),
        ("'ve ", " 've "),
        (" p . m .", "  p.m."),
        (" p . m ", " p.m "),
        (" a . m .", " a.m."),
        (" a . m ", " a.m "),
    )
    for old, new in literal_rules:
        normTweet = normTweet.replace(old, new)

    # Re-attach digit groups that tokenisation split around ",", "/" and "-".
    regex_rules = (
        (r",([0-9]{2,4}) , ([0-9]{2,4})", r",\1,\2"),
        (r"([0-9]{1,3}) / ([0-9]{2,4})", r"\1/\2"),
        (r"([0-9]{1,3})- ([0-9]{2,4})", r"\1-\2"),
    )
    for pattern, replacement in regex_rules:
        normTweet = re.sub(pattern, replacement, normTweet)

    return " ".join(normTweet.split())

Adapted from the Hugging Face fine-tuning tutorial: https://huggingface.co/transformers/training.html

In [None]:
from transformers import BertForSequenceClassification, AutoTokenizer
from sklearn.model_selection import train_test_split
#from nlp import load_dataset
from transformers import DistilBertTokenizerFast
import torch

data = df #df_old, #df2
text = data["text"].map(normalizeTweet).values.tolist()
labels = data["Labels"].values.tolist()

# Hold out 33% of the data as the test set, then take the validation set out of
# the REMAINING training portion only. Splitting the full `text`/`labels` a
# second time would put test samples into the train and validation sets
# (data leakage) and make the test score meaningless.
# Fixed random_state values keep the three splits reproducible.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    text, labels, test_size=0.33, random_state=42
)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.2, random_state=42
)
print("Train: {}".format(len(train_texts)))
print("Val: {}".format(len(val_texts)))
print("Test: {}".format(len(test_texts)))

# Keep the Hugging Face tokenizer under its own name: normalizeTweet() reads the
# module-level `tokenizer` (the NLTK TweetTokenizer) at call time, so rebinding
# that name here would make a re-run of this cell normalise with the wrong
# tokenizer.
bertweet_tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")
#tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# truncation=True + padding=True pad all sentences of a split to the same length
# and truncate them so they are no longer than the maximum input length
# => allows feeding batches of sequences.
# max_length is given explicitly so truncation does not depend on the tokenizer
# config shipping a usable model_max_length.
# NOTE(review): 128 is taken to be BERTweet's maximum sequence length — confirm
# against the model card.
MAX_SEQ_LENGTH = 128
train_encodings = bertweet_tokenizer(train_texts, truncation=True, padding=True, max_length=MAX_SEQ_LENGTH)
val_encodings = bertweet_tokenizer(val_texts, truncation=True, padding=True, max_length=MAX_SEQ_LENGTH)
test_encodings = bertweet_tokenizer(test_texts, truncation=True, padding=True, max_length=MAX_SEQ_LENGTH)

# Number of encoded samples per split (len() of the encoding object itself would
# count its keys, not the samples).
print(len(train_encodings["input_ids"]))
print(len(test_encodings["input_ids"]))
print(len(val_encodings["input_ids"]))
print(val_encodings.keys())
#model = BertForSequenceClassification.from_pretrained("vinai/bertweet-base", num_labels=2) # create BERT Model with weights and randomly initializes sequence classification head with output size  2
#model.train() # puts model in train mode

In [None]:
# Wrap encodings + labels in a PyTorch Dataset (provides __len__ and __getitem__)
class TweetDataSet(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with their labels.

    ``encodings`` must offer ``.items()`` yielding ``(key, values)`` pairs where
    ``values`` can be indexed by sample position; ``labels`` is a sequence
    indexed by the same positions.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of samples is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per encoding field for sample `idx`, plus its label under 'labels'.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Build one TweetDataSet per split, then print the size of each
# (train, validation, test — in that order).
train_dataset = TweetDataSet(train_encodings, train_labels)
val_dataset = TweetDataSet(val_encodings, val_labels)
test_dataset = TweetDataSet(test_encodings, test_labels)
for split_dataset in (train_dataset, val_dataset, test_dataset):
    print(len(split_dataset))


There are two ways to fine-tune the model now that the datasets are ready:

1) Using Trainer
2) Native PyTorch

In [None]:
# 1) Trainer 
import os
os.environ["WANDB_DISABLED"] = "true"
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for a prediction object.

    Reads the true labels from ``pred.label_ids`` and derives the predicted
    classes by taking the argmax over the last axis of ``pred.predictions``.
    Precision, recall and F1 use ``average='binary'``.

    Returns a dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    prf_scores = precision_recall_fscore_support(y_true, y_pred, average='binary')
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': prf_scores[2],
        'precision': prf_scores[0],
        'recall': prf_scores[1],
    }
# NOTE: a bare `CUDA_LAUNCH_BLOCKING=1` statement only binds a Python variable;
# it does not set the environment variable, so it has been dropped. To debug
# device-side CUDA errors, set os.environ["CUDA_LAUNCH_BLOCKING"] = "1" before
# any CUDA work is done.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=10,             # total number of training epochs
    per_device_train_batch_size=64,  # batch size per device during training
    per_device_eval_batch_size=32,   # batch size for evaluation
    # NOTE(review): 500 warmup steps may exceed the total number of optimisation
    # steps for a training set of this size — confirm.
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10
)

# AutoModelForSequenceClassification adds a classification layer on top of the
# pretrained encoder; num_labels=2 states the binary task explicitly.
model = AutoModelForSequenceClassification.from_pretrained("vinai/bertweet-base", num_labels=2)

trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    compute_metrics=compute_metrics,     # metrics reported whenever an evaluation runs
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()

In [None]:
# Evaluate the trained model on the test dataset and print the returned metrics.
eval_output = trainer.evaluate(eval_dataset=test_dataset)
print(eval_output)

Save the fine-tuned model

In [None]:
# Save the fine-tuned model to the given path (relative to the current working directory).
# NOTE(review): presumably this resolves inside the Drive mounted at
# /content/drive — confirm the working directory is /content when this runs.
trainer.save_model("drive/MyDrive/Data_Files/Fine-tuned_models/Own_long_covid_classifier_06072022")