<a href="https://colab.research.google.com/github/MeLLL-UFF/bambas/blob/feat%2Fbertweet/bertweet/bertweet_ptc_feature_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install -q transformers pandas matplotlib torch numpy datasets scikit_learn notebook matplotlib emoji==0.6.0 nltk bitsandbytes==0.40.2 accelerate==0.21.0

In [2]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer, BitsAndBytesConfig
from datasets import Dataset
from torch.utils.data import DataLoader
from emoji import demojize
from nltk.tokenize import TweetTokenizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, multilabel_confusion_matrix
from google.colab import drive
drive.mount("/gdrive")
WORKDIR="/gdrive/My Drive/projects/bambas/bertweet"

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [3]:
%pwd

'/content'

In [4]:
train = pd.read_csv(f'{WORKDIR}/dataset/ptc_preproc_train.csv', sep=";")
test = pd.read_csv(f'{WORKDIR}/dataset/ptc_preproc_test.csv', sep=";")
train

Unnamed: 0.1,Unnamed: 0,text,label,category
0,0,Stop Islamization of America.\t,Slogans,Call
1,1,We condemn all those whose behaviours and view...,Black-and-White_Fallacy,
2,2,Defeat Jihad`,Slogans,Call
3,3,the nation that gave the world the Magna Carta...,Loaded_Language,Manipulative_wording
4,4,The UK should never become a stage for inflamm...,Flag-Waving,Justification
...,...,...,...,...
4945,4945,sham,Loaded_Language,Manipulative_wording
4946,4946,evidently completely false,Name_Calling_Labeling,Attack_on_reputation
4947,4947,Russian collaborators,Name_Calling_Labeling,Attack_on_reputation
4948,4948,We have previously shown that the Guardian eve...,Reductio_ad_hitlerum,


In [5]:
train[train["text"].isnull()]

Unnamed: 0.1,Unnamed: 0,text,label,category
3348,3348,,Loaded_Language,Manipulative_wording


In [6]:
train.dropna(subset=["text", "label"], inplace=True)

In [7]:
train.duplicated(subset=["text"]).sum()

485

In [8]:
train.drop_duplicates(subset=["text"], inplace=True)

In [9]:
train.shape

(4464, 4)

In [10]:
test

Unnamed: 0.1,Unnamed: 0,text,label,category
0,0,The next transmission could be more pronounced...,Appeal_to_Authority,Justification
1,1,when (the plague) comes again it starts from m...,Appeal_to_Authority,Justification
2,2,appeared,Doubt,Attack_on_reputation
3,3,"a very, very different",Repetition,Manipulative_wording
4,4,He also pointed to the presence of the pneumon...,Appeal_to_fear-prejudice,Justification
...,...,...,...,...
1376,1376,Trump-hating Republican,Name_Calling_Labeling,Attack_on_reputation
1377,1377,grave hardship,Loaded_Language,Manipulative_wording
1378,1378,unbelievably rude,Name_Calling_Labeling,Attack_on_reputation
1379,1379,wonderful woman,Name_Calling_Labeling,Attack_on_reputation


In [11]:
test[test["text"].isnull()]

Unnamed: 0.1,Unnamed: 0,text,label,category


In [12]:
test.duplicated(subset=["text"]).sum()

171

In [13]:
test.drop_duplicates(subset=["text"], inplace=True)

In [14]:
train["label"].value_counts()

Loaded_Language                                               1595
Name_Calling_Labeling                                          824
Doubt                                                          408
Exaggeration_Minimisation                                      349
Repetition                                                     230
Causal_Oversimplification                                      162
Appeal_to_fear-prejudice                                       160
Flag-Waving                                                    144
Slogans                                                         95
Black-and-White_Fallacy                                         91
Appeal_to_Authority                                             86
Thought-terminating_Cliches                                     57
Whataboutism                                                    52
Reductio_ad_hitlerum                                            38
Red_Herring                                                   

In [15]:
train["label"].describe()

count                4464
unique                 60
top       Loaded_Language
freq                 1595
Name: label, dtype: object

In [16]:
test["label"].value_counts()

Loaded_Language                                                                      383
Name_Calling_Labeling                                                                177
Appeal_to_fear-prejudice                                                             110
Repetition                                                                            99
Exaggeration_Minimisation                                                             71
Flag-Waving                                                                           70
Doubt                                                                                 69
Appeal_to_Authority                                                                   36
Causal_Oversimplification                                                             30
Slogans                                                                               30
Black-and-White_Fallacy                                                               23
Whataboutism         

In [17]:
test["label"].describe()

count                1210
unique                 41
top       Loaded_Language
freq                  383
Name: label, dtype: object

In [18]:
train["text"].str.len().describe()

count    4464.000000
mean       52.327285
std        66.162997
min         3.000000
25%        16.000000
50%        27.000000
75%        60.000000
max       799.000000
Name: text, dtype: float64

In [19]:
train["tokens_per_text"] = train["text"].str.split().str.len()
train["tokens_per_text"].describe()

count    4464.000000
mean        8.622312
std        11.305836
min         1.000000
25%         2.000000
50%         4.000000
75%        10.000000
max       141.000000
Name: tokens_per_text, dtype: float64

In [20]:
test["tokens_per_text"] = test["text"].str.split().str.len()
test["tokens_per_text"].describe()

count    1210.000000
mean        7.338843
std         9.341914
min         1.000000
25%         2.000000
50%         4.000000
75%         9.000000
max        80.000000
Name: tokens_per_text, dtype: float64

In [21]:
train["tokens_per_text"] = train["text"].str.split().str.len()
train["tokens_per_text"].describe()

count    4464.000000
mean        8.622312
std        11.305836
min         1.000000
25%         2.000000
50%         4.000000
75%        10.000000
max       141.000000
Name: tokens_per_text, dtype: float64

In [22]:
# Seems to not be necessary as we use BERTweet default tokenizer with normalization enabled
# from https://github.com/VinAIResearch/BERTweet/blob/master/TweetNormalizer.py
# twitter_tokenizer = TweetTokenizer()

# def normalizeToken(token):
#     lowercased_token = token.lower()
#     if token.startswith("@"):
#         return "@USER"
#     elif lowercased_token.startswith("http") or lowercased_token.startswith("www"):
#         return "HTTPURL"
#     elif len(token) == 1:
#         return demojize(token)
#     else:
#         if token == "’":
#             return "'"
#         elif token == "…":
#             return "..."
#         else:
#             return token


# def normalizeTweet(tweet):
#     tokens = twitter_tokenizer.tokenize(tweet.replace("’", "'").replace("…", "..."))
#     normTweet = " ".join([normalizeToken(token) for token in tokens])

#     normTweet = (
#         normTweet.replace("cannot ", "can not ")
#         .replace("n't ", " n't ")
#         .replace("n 't ", " n't ")
#         .replace("ca n't", "can't")
#         .replace("ai n't", "ain't")
#     )
#     normTweet = (
#         normTweet.replace("'m ", " 'm ")
#         .replace("'re ", " 're ")
#         .replace("'s ", " 's ")
#         .replace("'ll ", " 'll ")
#         .replace("'d ", " 'd ")
#         .replace("'ve ", " 've ")
#     )
#     normTweet = (
#         normTweet.replace(" p . m .", "  p.m.")
#         .replace(" p . m ", " p.m ")
#         .replace(" a . m .", " a.m.")
#         .replace(" a . m ", " a.m ")
#     )

#     return " ".join(normTweet.split())

In [23]:
model_name = "vinai/bertweet-base"

################################################################################
# bitsandbytes parameters
################################################################################
# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"
################################################################################
# SFT parameters
################################################################################
# Load the entire model on the GPU 0
device_map = {"": 0}

# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
bnb_config = BitsAndBytesConfig(
    # Activate 4-bit precision base model loading
    load_in_4bit=True,
    # Quantization type (fp4 or nf4)
    bnb_4bit_quant_type = "nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    # Activate nested quantization for 4-bit base models (double quantization)
    bnb_4bit_use_double_quant=False,
)

# Max length of a sequence for the model https://github.com/VinAIResearch/BERTweet/tree/master#models2
max_seq_len=128
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, normalization=True)
model = AutoModel.from_pretrained(model_name, quantization_config=bnb_config, device_map=device_map)

In [24]:
data = pd.concat((train["text"], test["text"]))
tokens = data.apply((lambda x: tokenizer.encode(
    x,
    truncation=True,
    max_length=max_seq_len
)))
tokens

0                   [0, 1267, 32789, 6680, 15, 885, 4, 2]
1       [0, 134, 19715, 48, 268, 3430, 45973, 5094, 13...
2                              [0, 40212, 41645, 1654, 2]
3       [0, 6, 2580, 25, 867, 6, 220, 6, 4658, 880, 15...
4       [0, 47, 922, 151, 143, 870, 11, 1719, 19, 4982...
                              ...                        
1375                    [0, 11, 200, 847, 14270, 4312, 2]
1376                            [0, 46004, 4542, 4265, 2]
1377                                  [0, 9055, 43899, 2]
1378                                  [0, 26730, 2915, 2]
1379                                    [0, 1576, 750, 2]
Name: text, Length: 5674, dtype: object

In [25]:
padded = np.array([token+[0]*(max_seq_len-len(token)) for token in tokens.values])
np.array(padded).shape

(5674, 128)

In [26]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(5674, 128)

In [27]:
input_ids = torch.tensor(padded).to(model.device)
attention_mask = torch.tensor(attention_mask).to(model.device)

ds = Dataset.from_dict({"input_ids": input_ids, "attention_mask": attention_mask}).with_format("torch")
dataloader = DataLoader(ds, batch_size=64)

last_hidden_states = torch.tensor([]).to(model.device)
with torch.no_grad():
    for idx, batch in enumerate(dataloader):
      print(f'Running inference with batch no. {idx+1}')
      last_hidden_states_for_batch = model(batch["input_ids"], attention_mask=batch["attention_mask"])
      print(last_hidden_states_for_batch[0].shape)
      last_hidden_states = torch.cat((last_hidden_states, last_hidden_states_for_batch[0].to(model.device)))

# extract [CLS] token hidden representation from output layer
features = last_hidden_states[:,0,:].cpu().numpy()
print(f'features shape: {features.shape}')

# Empty VRAM
del model
import gc
gc.collect()
torch.cuda.empty_cache()

Running inference with batch no. 1
torch.Size([64, 128, 768])
Running inference with batch no. 2
torch.Size([64, 128, 768])
Running inference with batch no. 3
torch.Size([64, 128, 768])
Running inference with batch no. 4
torch.Size([64, 128, 768])
Running inference with batch no. 5
torch.Size([64, 128, 768])
Running inference with batch no. 6
torch.Size([64, 128, 768])
Running inference with batch no. 7
torch.Size([64, 128, 768])
Running inference with batch no. 8
torch.Size([64, 128, 768])
Running inference with batch no. 9
torch.Size([64, 128, 768])
Running inference with batch no. 10
torch.Size([64, 128, 768])
Running inference with batch no. 11
torch.Size([64, 128, 768])
Running inference with batch no. 12
torch.Size([64, 128, 768])
Running inference with batch no. 13
torch.Size([64, 128, 768])
Running inference with batch no. 14
torch.Size([64, 128, 768])
Running inference with batch no. 15
torch.Size([64, 128, 768])
Running inference with batch no. 16
torch.Size([64, 128, 768])
R

In [56]:
train_features, test_features = features[:train.shape[0]], features[train.shape[0]:]
print(f'train features: {len(train_features)}')
print(f'test features: {len(test_features)}')

train features: 4464
test features: 1210


In [60]:
train_labels, test_labels = train["label"].str.split(",").to_numpy(), test["label"].str.split(",").to_numpy()
print(f'train labels: {len(train_labels)}')
print(f'test labels: {len(test_labels)}')
print(list(train_labels))

labels_with_duplicates = np.hstack(np.concatenate((train_labels, test_labels.flatten()), axis=None))
labels = [list(set(labels_with_duplicates))]
print(f'qty of labels: {len(labels[0])}')

train labels: 4464
test labels: 1210
[['Slogans'], ['Black-and-White_Fallacy'], ['Slogans'], ['Loaded_Language'], ['Flag-Waving'], ['Name_Calling_Labeling'], ['Loaded_Language'], ['Loaded_Language'], ['Name_Calling_Labeling'], ['Loaded_Language'], ['Name_Calling_Labeling'], ['Whataboutism'], ['Causal_Oversimplification'], ['Causal_Oversimplification'], ['Causal_Oversimplification'], ['Loaded_Language'], ['Slogans'], ['Loaded_Language'], ['Loaded_Language'], ['Name_Calling_Labeling'], ['Exaggeration_Minimisation'], ['Exaggeration_Minimisation'], ['Name_Calling_Labeling'], ['Loaded_Language'], ['Loaded_Language'], ['Name_Calling_Labeling'], ['Loaded_Language'], ['Doubt'], ['Doubt'], ['Doubt'], ['Doubt'], ['Doubt'], ['Appeal_to_Authority'], ['Loaded_Language'], ['Causal_Oversimplification'], ['Name_Calling_Labeling'], ['Loaded_Language'], ['Loaded_Language'], ['Loaded_Language'], ['Loaded_Language'], ['Loaded_Language'], ['Loaded_Language'], ['Name_Calling_Labeling'], ['Exaggeration_Minim

In [61]:
mlb = MultiLabelBinarizer()
train_labels_binarized = mlb.fit(labels).transform(train_labels)
test_labels_binarized = mlb.transform(test_labels)
print(f'qty labels train: {len(train_labels_binarized[0])}')
print(f'qty labels test: {len(test_labels_binarized[0])}')

qty labels train: 18
qty labels test: 18


In [62]:
ff = MLPClassifier(
    random_state=1,
    max_iter=400,
    alpha=0.001,
    shuffle=True,
    early_stopping=True,
    verbose=True
).fit(train_features, train_labels_binarized)

Iteration 1, loss = 5.56607108
Validation score: 0.000000
Iteration 2, loss = 3.08054631
Validation score: 0.156600
Iteration 3, loss = 2.95058886
Validation score: 0.029083
Iteration 4, loss = 2.89283231
Validation score: 0.230425
Iteration 5, loss = 2.85608365
Validation score: 0.210291
Iteration 6, loss = 2.82725663
Validation score: 0.194631
Iteration 7, loss = 2.79752638
Validation score: 0.098434
Iteration 8, loss = 2.78026029
Validation score: 0.051454
Iteration 9, loss = 2.76176941
Validation score: 0.192394
Iteration 10, loss = 2.73816757
Validation score: 0.223714
Iteration 11, loss = 2.72037510
Validation score: 0.217002
Iteration 12, loss = 2.70746359
Validation score: 0.071588
Iteration 13, loss = 2.69523360
Validation score: 0.136465
Iteration 14, loss = 2.67749754
Validation score: 0.190157
Iteration 15, loss = 2.66685894
Validation score: 0.230425
Validation score did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.


In [63]:
test_predicted_labels_binarized = ff.predict(test_features)
micro_f1 = f1_score(test_labels_binarized, test_predicted_labels_binarized, average="micro")
acc = accuracy_score(test_labels_binarized, test_predicted_labels_binarized)
prec = precision_score(test_labels_binarized, test_predicted_labels_binarized, average="micro")
rec = recall_score(test_labels_binarized, test_predicted_labels_binarized, average="micro")
print(f'micro-f1: {micro_f1}')
print(f'accuracy: {acc}')
print(f'micro-precision: {prec}')
print(f'micro-recall: {rec}')

micro-f1: 0.2779321845469705
accuracy: 0.20165289256198346
micro-precision: 0.46296296296296297
micro-recall: 0.19857029388403494


In [64]:
cf_mtx = multilabel_confusion_matrix(test_labels_binarized, test_predicted_labels_binarized)
cf_mtx.shape

(18, 2, 2)

In [65]:
cf_mtx

array([[[1161,    0],
        [  49,    0]],

       [[1082,    0],
        [ 128,    0]],

       [[1206,    0],
        [   4,    0]],

       [[1185,    0],
        [  25,    0]],

       [[1179,    0],
        [  31,    0]],

       [[1139,    0],
        [  71,    0]],

       [[1125,    0],
        [  85,    0]],

       [[1132,    0],
        [  78,    0]],

       [[ 525,  290],
        [ 146,  249]],

       [[1031,    0],
        [ 178,    1]],

       [[1204,    0],
        [   6,    0]],

       [[1198,    0],
        [  12,    0]],

       [[1198,    0],
        [  12,    0]],

       [[1097,    0],
        [ 113,    0]],

       [[1176,    0],
        [  34,    0]],

       [[1208,    0],
        [   2,    0]],

       [[1194,    0],
        [  16,    0]],

       [[1191,    0],
        [  19,    0]]])