In [124]:
import ast

import numpy as np
import pandas as pd
import torch
from emoji import demojize
from nltk.tokenize import TweetTokenizer
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, multilabel_confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import AutoModel, AutoTokenizer

In [125]:
# Show the kernel's working directory; the dataset paths used below are relative to it.
# NOTE(review): the saved output exposes an absolute local path - consider clearing it before sharing.
%pwd

'/home/lfmatosmelo/Projetos/bambas/bertweet'

In [126]:
# Load the pre-processed train and test splits (paths are relative to the working directory).
train = pd.read_csv("./dataset/ptc_preproc_train.csv")
test = pd.read_csv("./dataset/ptc_preproc_test.csv")
# Display the training frame to inspect its columns and a few rows.
train

Unnamed: 0.1,Unnamed: 0,text,label,category
0,0,Stop Islamization of America.\t,['Slogans'],Call
1,1,We condemn all those whose behaviours and view...,['Black-and-White_Fallacy'],
2,2,Defeat Jihad`,['Slogans'],Call
3,3,the nation that gave the world the Magna Carta...,['Loaded_Language'],Manipulative_wording
4,4,The UK should never become a stage for inflamm...,['Flag-Waving'],Justification
...,...,...,...,...
4945,4945,sham,['Loaded_Language'],Manipulative_wording
4946,4946,evidently completely false,['Name_Calling_Labeling'],Attack_on_reputation
4947,4947,Russian collaborators,['Name_Calling_Labeling'],Attack_on_reputation
4948,4948,We have previously shown that the Guardian eve...,['Reductio_ad_hitlerum'],


In [127]:
# Rows of the training split whose text is missing.
train.loc[train["text"].isna()]

Unnamed: 0.1,Unnamed: 0,text,label,category
3348,3348,,['Loaded_Language'],Manipulative_wording


In [128]:
# Drop, in place, training rows that lack a text or a label.
train.dropna(subset=["text", "label"], inplace=True)

In [129]:
# Number of training rows whose text already appeared in an earlier row.
train["text"].duplicated().sum()

485

In [130]:
# Keep only the first occurrence of each distinct text (in place); later repeats are dropped.
train.drop_duplicates(subset=["text"], inplace=True)

In [131]:
# Display the test frame for a visual check of its columns and rows.
test

Unnamed: 0.1,Unnamed: 0,text,label,category
0,0,The next transmission could be more pronounced...,['Appeal_to_Authority'],Justification
1,1,when (the plague) comes again it starts from m...,['Appeal_to_Authority'],Justification
2,2,appeared,['Doubt'],Attack_on_reputation
3,3,"a very, very different",['Repetition'],Manipulative_wording
4,4,He also pointed to the presence of the pneumon...,['Appeal_to_fear-prejudice'],Justification
...,...,...,...,...
1376,1376,Trump-hating Republican,['Name_Calling_Labeling'],Attack_on_reputation
1377,1377,grave hardship,['Loaded_Language'],Manipulative_wording
1378,1378,unbelievably rude,['Name_Calling_Labeling'],Attack_on_reputation
1379,1379,wonderful woman,['Name_Calling_Labeling'],Attack_on_reputation


In [132]:
# Rows of the test split whose text is missing.
test.loc[test["text"].isna()]

Unnamed: 0.1,Unnamed: 0,text,label,category


In [133]:
# Number of test rows whose text already appeared in an earlier row.
test["text"].duplicated().sum()

171

In [134]:
# Keep only the first occurrence of each distinct text in the test split (in place).
test.drop_duplicates(subset=["text"], inplace=True)

In [135]:
# Frequency of each distinct raw value in the training `label` column.
train["label"].value_counts()

['Loaded_Language']                                                     1595
['Name_Calling_Labeling']                                                824
['Doubt']                                                                408
['Exaggeration_Minimisation']                                            349
['Repetition']                                                           230
['Causal_Oversimplification']                                            162
['Appeal_to_fear-prejudice']                                             160
['Flag-Waving']                                                          144
['Slogans']                                                               95
['Black-and-White_Fallacy']                                               91
['Appeal_to_Authority']                                                   86
['Thought-terminating_Cliches']                                           57
['Whataboutism']                                                          52

In [136]:
# Summary of the training labels: non-null count, number of distinct values, most frequent value and its frequency.
train["label"].describe()

count                    4464
unique                     60
top       ['Loaded_Language']
freq                     1595
Name: label, dtype: object

In [137]:
# Frequency of each distinct raw value in the test `label` column.
test["label"].value_counts()

['Loaded_Language']                                                                               383
['Name_Calling_Labeling']                                                                         177
['Appeal_to_fear-prejudice']                                                                      110
['Repetition']                                                                                     99
['Exaggeration_Minimisation']                                                                      71
['Flag-Waving']                                                                                    70
['Doubt']                                                                                          69
['Appeal_to_Authority']                                                                            36
['Causal_Oversimplification']                                                                      30
['Slogans']                                                                       

In [138]:
# Summary of the test labels: non-null count, number of distinct values, most frequent value and its frequency.
test["label"].describe()

count                    1210
unique                     41
top       ['Loaded_Language']
freq                      383
Name: label, dtype: object

In [139]:
# Distribution of the character length of the training texts.
train["text"].str.len().describe()

count    4464.000000
mean       52.327285
std        66.162997
min         3.000000
25%        16.000000
50%        27.000000
75%        60.000000
max       799.000000
Name: text, dtype: float64

In [140]:
# Word count per training text: split on whitespace, then take the length of each resulting list.
# These are whitespace-separated words, not the model tokenizer's subword ids.
train["tokens_per_text"] = train["text"].str.split().str.len()
train["tokens_per_text"].describe()

count    4464.000000
mean        8.622312
std        11.305836
min         1.000000
25%         2.000000
50%         4.000000
75%        10.000000
max       141.000000
Name: tokens_per_text, dtype: float64

In [141]:
# Word count per test text: split on whitespace, then take the length of each resulting list.
# These are whitespace-separated words, not the model tokenizer's subword ids.
test["tokens_per_text"] = test["text"].str.split().str.len()
test["tokens_per_text"].describe()

count    1210.000000
mean        7.338843
std         9.341914
min         1.000000
25%         2.000000
50%         4.000000
75%         9.000000
max        80.000000
Name: tokens_per_text, dtype: float64

In [142]:
# NOTE(review): this cell repeats the training word-count computation from an earlier cell verbatim;
# it recomputes the same column and prints the same summary.
train["tokens_per_text"] = train["text"].str.split().str.len()
train["tokens_per_text"].describe()

count    4464.000000
mean        8.622312
std        11.305836
min         1.000000
25%         2.000000
50%         4.000000
75%        10.000000
max       141.000000
Name: tokens_per_text, dtype: float64

In [143]:
# Tweet normalization helpers, taken from
# https://github.com/VinAIResearch/BERTweet/blob/master/TweetNormalizer.py
# Shared NLTK tweet tokenizer instance used by normalizeTweet.
twitter_tokenizer = TweetTokenizer()

def normalizeToken(token):
    """Map a single tweet token to its normalized form.

    Rules, applied in order:
      * tokens starting with "@" become "@USER";
      * tokens starting with "http" or "www" (case-insensitive) become "HTTPURL";
      * the typographic apostrophe "’" becomes "'" and the ellipsis "…" becomes "...";
      * any other single-character token is passed through ``demojize``;
      * everything else is returned unchanged.

    Args:
        token: one token string.

    Returns:
        The normalized token string.
    """
    if token.startswith("@"):
        return "@USER"
    if token.lower().startswith(("http", "www")):
        return "HTTPURL"
    # These two checks must run before the single-character branch: both
    # tokens are exactly one character long, so testing them afterwards
    # made them unreachable.
    if token == "’":
        return "'"
    if token == "…":
        return "..."
    if len(token) == 1:
        return demojize(token)
    return token


def normalizeTweet(tweet):
    """Normalize a raw tweet into a single-spaced string of normalized tokens.

    The tweet has its typographic apostrophes and ellipses replaced, is split
    with ``twitter_tokenizer``, each token is passed through
    ``normalizeToken``, and a fixed sequence of contraction and a.m./p.m.
    rewrites is applied to the re-joined text.

    Args:
        tweet: the raw tweet text.

    Returns:
        The normalized text with runs of whitespace collapsed to one space.
    """
    cleaned = tweet.replace("’", "'").replace("…", "...")
    text = " ".join(normalizeToken(tok) for tok in twitter_tokenizer.tokenize(cleaned))

    # Ordered (old, new) rewrites. They are applied one after another because
    # later rules operate on the output of earlier ones.
    rewrites = (
        # negations / contractions
        ("cannot ", "can not "),
        ("n't ", " n't "),
        ("n 't ", " n't "),
        ("ca n't", "can't"),
        ("ai n't", "ain't"),
        # clitics
        ("'m ", " 'm "),
        ("'re ", " 're "),
        ("'s ", " 's "),
        ("'ll ", " 'll "),
        ("'d ", " 'd "),
        ("'ve ", " 've "),
        # time-of-day abbreviations split apart by the tokenizer
        (" p . m .", "  p.m."),
        (" p . m ", " p.m "),
        (" a . m .", " a.m."),
        (" a . m ", " a.m "),
    )
    for old, new in rewrites:
        text = text.replace(old, new)

    # Collapse any repeated whitespace introduced by the rewrites.
    return " ".join(text.split())

In [144]:
# Checkpoint name shared by the tokenizer and the model.
model_name = "vinai/bertweet-base"
# use_fast=False requests the non-fast tokenizer implementation.
# NOTE(review): normalization=True is presumably forwarded to the BERTweet tokenizer to enable its
# built-in tweet normalization - confirm against the installed transformers version.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, normalization=True)
# Load the pretrained weights for the same checkpoint.
model = AutoModel.from_pretrained(model_name)

In [151]:
# Stack the train texts followed by the test texts so both are encoded in one pass.
data = pd.concat([train["text"], test["text"]])


def _encode(text):
    """Encode one text into a list of input ids, truncated to at most 128 ids."""
    return tokenizer.encode(text, truncation=True, max_length=128)


tokens = data.apply(_encode)
tokens

0                   [0, 1267, 32789, 6680, 15, 885, 4, 2]
1       [0, 134, 19715, 48, 268, 3430, 45973, 5094, 13...
2                              [0, 40212, 41645, 1654, 2]
3       [0, 6, 2580, 25, 867, 6, 220, 6, 4658, 880, 15...
4       [0, 47, 922, 151, 143, 870, 11, 1719, 19, 4982...
                              ...                        
1375                    [0, 11, 200, 847, 14270, 4312, 2]
1376                            [0, 46004, 4542, 4265, 2]
1377                                  [0, 9055, 43899, 2]
1378                                  [0, 26730, 2915, 2]
1379                                    [0, 1576, 750, 2]
Name: text, Length: 5674, dtype: object

In [154]:
# Right-pad every id list with zeros up to a fixed width of 128.
# NOTE(review): 0 is also the first id of every encoded sequence shown above,
# so the pad value cannot be told apart from that leading token by value alone.
rows = []
for ids in tokens.values:
    rows.append(ids + [0] * (128 - len(ids)))
padded = np.array(rows)
padded.shape

(5674, 128)

In [155]:
# Build the attention mask from each sequence's true length rather than from
# `padded != 0`: every encoded sequence starts with id 0, so a value-based
# mask also hides that leading token, which is the position later used as the
# sentence representation.
seq_lengths = np.array([len(ids) for ids in tokens.values])
# 1 for real tokens (columns before the sequence length), 0 for padding.
attention_mask = (np.arange(padded.shape[1]) < seq_lengths[:, None]).astype(int)
attention_mask.shape

(5674, 128)

In [None]:
# Encode all texts with the pretrained model and keep, for each text, the
# hidden vector at position 0 (the [CLS] token) of the output layer.
# as_tensor keeps the cell re-runnable once attention_mask is already a tensor.
input_ids = torch.as_tensor(padded)
attention_mask = torch.as_tensor(attention_mask)

# Run in mini-batches: one forward pass over every row at once needs several
# GB of activations, while batching keeps peak memory bounded.
batch_size = 64
model.eval()  # inference only: make sure dropout is disabled
cls_batches = []
with torch.no_grad():
    for start in range(0, input_ids.shape[0], batch_size):
        batch = slice(start, start + batch_size)
        outputs = model(input_ids[batch], attention_mask=attention_mask[batch])
        # outputs[0] is the last hidden state; index 0 on the sequence axis is [CLS]
        cls_batches.append(outputs[0][:, 0, :])

features = torch.cat(cls_batches).numpy()
print(f'features shape: {features.shape}')

# The texts were stacked as train followed by test, so the first len(train)
# rows belong to the training split. `train.ndim` (the number of DataFrame
# axes, i.e. 2) must not be used here: it would put 2 rows in train.
n_train = len(train)
train_features, test_features = features[:n_train], features[n_train:]
assert len(test_features) == len(test), "feature rows do not line up with the train/test sizes"
print(f'train features: {len(train_features)}')
print(f'test features: {len(test_features)}')

In [None]:
def _parse_labels(raw):
    """Turn a label cell into a list of label names.

    The CSV stores each label list as its string repr (e.g. "['Slogans']").
    Values that are not strings are returned unchanged.
    """
    return ast.literal_eval(raw) if isinstance(raw, str) else raw


# Parse the string reprs into real lists first: MultiLabelBinarizer iterates
# over each sample, so a raw string would be split into single characters and
# every character would become a "class".
train_labels = train["label"].map(_parse_labels).to_list()
test_labels = test["label"].map(_parse_labels).to_list()

# Fit the label vocabulary on the training split only and reuse it for test.
mlb = MultiLabelBinarizer()
train_labels_binarized = mlb.fit_transform(train_labels)
test_labels_binarized = mlb.transform(test_labels)

# Feed-forward classifier trained on the extracted sentence features.
ff = MLPClassifier(
    random_state=1,
    max_iter=400,
    alpha=0.001,
    shuffle=True,
    early_stopping=True,
    verbose=True
).fit(train_features, train_labels_binarized)

In [None]:
# Predict the binarized label matrix for the test features.
test_predicted_labels_binarized = ff.predict(test_features)

# Short aliases for the reference and predicted label matrices.
y_true, y_pred = test_labels_binarized, test_predicted_labels_binarized

# Micro-averaged scores pool the decisions of all labels; accuracy is computed
# with its default settings on the full label matrices.
micro_f1 = f1_score(y_true, y_pred, average="micro")
print(f'micro-f1: {micro_f1}')

acc = accuracy_score(y_true, y_pred)
print(f'accuracy: {acc}')

prec = precision_score(y_true, y_pred, average="micro")
print(f'micro-precision: {prec}')

rec = recall_score(y_true, y_pred, average="micro")
print(f'micro-recall: {rec}')

In [None]:
# Confusion matrices for the test predictions, computed from the binarized label matrices.
cf_mtx = multilabel_confusion_matrix(
    y_true=test_labels_binarized,
    y_pred=test_predicted_labels_binarized,
)
cf_mtx