In [67]:
!pip install sentence-transformers



In [68]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, multilabel_confusion_matrix


In [69]:
# Load the train/test splits: semicolon-separated CSV files, paths relative to
# the current working directory. (Plain string literals: the previous f-string
# prefixes had no placeholders.)
train = pd.read_csv('ptc_preproc_train.csv', sep=";")
test = pd.read_csv('ptc_preproc_test.csv', sep=";")


In [70]:
# Preview the first rows of the training split (columns: text, label, category
# plus two leftover "Unnamed" index columns from the CSV).
train.head()

Unnamed: 0.1,Unnamed: 0,text,label,category
0,0,Stop Islamization of America.\t,Slogans,Call
1,1,We condemn all those whose behaviours and view...,Black-and-White_Fallacy,
2,2,Defeat Jihad`,Slogans,Call
3,3,the nation that gave the world the Magna Carta...,Loaded_Language,Manipulative_wording
4,4,The UK should never become a stage for inflamm...,Flag-Waving,Justification


In [71]:
# Remove, in place, training rows whose text or label is missing; other columns
# (e.g. category) are allowed to stay NaN.
train.dropna(subset=["text", "label"], inplace=True)


In [72]:
# Keep only the first occurrence of each distinct text in the training split
# (in place), so a text appearing with several labels keeps just one row.
train.drop_duplicates(subset=["text"], inplace=True)


In [73]:
# Same de-duplication by text for the test split (in place).
# NOTE(review): this only removes duplicates within each split; texts shared
# between train and test are not checked here.
test.drop_duplicates(subset=["text"], inplace=True)


In [74]:
# Class distribution of the raw label strings in the training split.
train["label"].value_counts()


Loaded_Language                                               1595
Name_Calling_Labeling                                          824
Doubt                                                          408
Exaggeration_Minimisation                                      349
Repetition                                                     230
Causal_Oversimplification                                      162
Appeal_to_fear-prejudice                                       160
Flag-Waving                                                    144
Slogans                                                         95
Black-and-White_Fallacy                                         91
Appeal_to_Authority                                             86
Thought-terminating_Cliches                                     57
Whataboutism                                                    52
Reductio_ad_hitlerum                                            38
Red_Herring                                                   

In [76]:
# Identifier of the pretrained sentence-transformers checkpoint used to embed the texts.
model_name = "sentence-transformers/stsb-xlm-r-multilingual"


In [77]:
import torch


In [78]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'


In [79]:
from sentence_transformers import SentenceTransformer
# Pass the target device to the constructor (its documented `device` argument)
# instead of moving the module afterwards with `.to()`, so the model is created
# directly on the selected device.
model = SentenceTransformer(model_name, device=device)


.gitattributes:   0%|          | 0.00/574 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.70k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/709 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/505 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [80]:
# Show which device was selected.
device

'cuda'

In [81]:
# `DataFrame.dropna()` returns a new frame, so a bare call whose result is not
# stored changes nothing. `train` already had rows with missing text/label
# removed in place earlier; apply the same filter to `test`, which had only been
# de-duplicated.
test.dropna(subset=["text", "label"], inplace=True)
# Preview only (result is displayed, not stored): test rows with no missing
# value in any column.
test.dropna()

Unnamed: 0.1,Unnamed: 0,text,label,category
0,0,The next transmission could be more pronounced...,Appeal_to_Authority,Justification
1,1,when (the plague) comes again it starts from m...,Appeal_to_Authority,Justification
2,2,appeared,Doubt,Attack_on_reputation
3,3,"a very, very different",Repetition,Manipulative_wording
4,4,He also pointed to the presence of the pneumon...,Appeal_to_fear-prejudice,Justification
...,...,...,...,...
1375,1375,a great First Amendment victory,Exaggeration_Minimisation,Manipulative_wording
1376,1376,Trump-hating Republican,Name_Calling_Labeling,Attack_on_reputation
1377,1377,grave hardship,Loaded_Language,Manipulative_wording
1378,1378,unbelievably rude,Name_Calling_Labeling,Attack_on_reputation


In [82]:
def encode(data):
  """Embed `data` with the notebook-level sentence-transformer `model`.

  `data` is forwarded unchanged to `model.encode`, and whatever that call
  returns is handed back to the caller.
  """
  embedding = model.encode(data)
  return embedding

In [83]:
# Encode each split with a single batched call instead of `Series.apply`, which
# ran one forward pass per row. `model.encode` accepts a list of sentences and
# returns one embedding row per sentence in input order; `list(...)` splits that
# 2-D result into per-row vectors so each cell still holds one 1-D array.
# The Series is built on the frame's own index because rows were dropped above
# and the index is no longer a contiguous range.
# NOTE(review): batching pads sentences together, so values may differ from the
# per-row run at floating-point noise level.
train['embeddings'] = pd.Series(list(encode(train['text'].tolist())), index=train.index)
test['embeddings'] = pd.Series(list(encode(test['text'].tolist())), index=test.index)

train.head()

Unnamed: 0.1,Unnamed: 0,text,label,category,embeddings
0,0,Stop Islamization of America.\t,Slogans,Call,"[-0.6336293, 0.26229325, -0.058358945, -0.4584..."
1,1,We condemn all those whose behaviours and view...,Black-and-White_Fallacy,,"[0.26669663, 0.4394471, 0.49054775, -0.0066378..."
2,2,Defeat Jihad`,Slogans,Call,"[-0.07159691, 0.3338, 0.7196488, 0.0920275, -0..."
3,3,the nation that gave the world the Magna Carta...,Loaded_Language,Manipulative_wording,"[0.22905141, 0.75942445, 0.71893483, 0.4899607..."
4,4,The UK should never become a stage for inflamm...,Flag-Waving,Justification,"[0.2081565, 0.6515301, 1.1221488, -0.8420495, ..."


In [84]:
# Plain Python list with one embedding vector per training row, in row order.
train_features = list(train['embeddings'])


In [85]:
# Plain Python list with one embedding vector per test row, in row order.
test_features = list(test['embeddings'])

In [86]:
from sklearn.linear_model import LogisticRegression
# Baseline classifier: logistic regression on the sentence embeddings, with the
# raw `label` string of each row as its single target class.
LR = LogisticRegression(max_iter=1000, solver='lbfgs')
LR.fit(X=train_features, y=train["label"].to_numpy())


KeyboardInterrupt: ignored

In [None]:
from sklearn import metrics
# Score the logistic-regression baseline on the test split: accuracy plus
# macro-averaged precision and recall against the raw label strings.
lr_expected = test["label"].to_numpy()
predicted = LR.predict(test_features)
print("Logistic Regression Accuracy:", metrics.accuracy_score(lr_expected, predicted))
print("Logistic Regression Precision:", metrics.precision_score(lr_expected, predicted, average='macro'))
print("Logistic Regression Recall:", metrics.recall_score(lr_expected, predicted, average='macro'))


In [88]:
import numpy as np

In [87]:
# Split every comma-separated label string into a list of label names, per row.
train_labels = train["label"].str.split(",").to_numpy()
test_labels = test["label"].str.split(",").to_numpy()
print(f'train labels: {len(train_labels)}')
print(f'test labels: {len(test_labels)}')

# Pool the per-row lists of both splits, flatten them into one array of label
# names, and reduce that to the distinct names. `labels` is a one-element list
# holding that collection, i.e. a single "sample" that carries every label.
pooled_label_lists = np.concatenate((train_labels, test_labels), axis=None)
labels_with_duplicates = np.hstack(pooled_label_lists)
distinct_labels = set(labels_with_duplicates)
labels = [list(distinct_labels)]
print(f'qty of labels: {len(labels[0])}')


train labels: 4464
test labels: 1210
qty of labels: 18


In [89]:
# Fit the binarizer once on the pooled label set so that both splits are encoded
# with the same columns, then turn each split's label lists into indicator rows.
mlb = MultiLabelBinarizer()
mlb.fit(labels)
train_labels_binarized = mlb.transform(train_labels)
test_labels_binarized = mlb.transform(test_labels)
# Width of the first indicator row of each split = number of label columns.
print(f'qty labels train: {len(train_labels_binarized[0])}')
print(f'qty labels test: {len(test_labels_binarized[0])}')


qty labels train: 18
qty labels test: 18


In [90]:
# Multi-label feed-forward classifier on the embeddings. Settings are collected
# in one mapping so they can be read (and tuned) in a single place.
mlp_settings = {
    "random_state": 1,       # fixed seed for repeatable runs
    "max_iter": 400,
    "alpha": 0.001,
    "shuffle": True,
    "early_stopping": True,  # the verbose log reports a validation score per iteration
    "verbose": True,
}
ff = MLPClassifier(**mlp_settings).fit(train_features, train_labels_binarized)


Iteration 1, loss = 6.34985831
Validation score: 0.279642
Iteration 2, loss = 2.71354466
Validation score: 0.393736
Iteration 3, loss = 2.33581964
Validation score: 0.416107
Iteration 4, loss = 2.15033950
Validation score: 0.440716
Iteration 5, loss = 2.02521039
Validation score: 0.436242
Iteration 6, loss = 1.93985134
Validation score: 0.442953
Iteration 7, loss = 1.87019901
Validation score: 0.458613
Iteration 8, loss = 1.80722490
Validation score: 0.458613
Iteration 9, loss = 1.75212706
Validation score: 0.467562
Iteration 10, loss = 1.70056188
Validation score: 0.476510
Iteration 11, loss = 1.65745259
Validation score: 0.474273
Iteration 12, loss = 1.61535963
Validation score: 0.474273
Iteration 13, loss = 1.57936581
Validation score: 0.467562
Iteration 14, loss = 1.54452065
Validation score: 0.476510
Iteration 15, loss = 1.50893121
Validation score: 0.496644
Iteration 16, loss = 1.47093313
Validation score: 0.472036
Iteration 17, loss = 1.44296792
Validation score: 0.472036
Iterat

In [91]:
# Predict indicator rows for the test split and score them against the
# binarized ground truth. F1, precision and recall are micro-averaged over all
# label columns; for multilabel indicator input scikit-learn's accuracy_score
# is exact-match (subset) accuracy.
test_predicted_labels_binarized = ff.predict(test_features)
micro_f1 = f1_score(test_labels_binarized, test_predicted_labels_binarized, average="micro")
acc = accuracy_score(test_labels_binarized, test_predicted_labels_binarized)
prec = precision_score(test_labels_binarized, test_predicted_labels_binarized, average="micro")
rec = recall_score(test_labels_binarized, test_predicted_labels_binarized, average="micro")
for caption, score in (
    ("micro-f1", micro_f1),
    ("accuracy", acc),
    ("micro-precision", prec),
    ("micro-recall", rec),
):
    print(f'{caption}: {score}')


micro-f1: 0.5077343039126478
accuracy: 0.4024793388429752
micro-precision: 0.5942492012779552
micro-recall: 0.443208895949166


In [94]:
# One 2x2 confusion matrix per label column, stacked along the first axis.
cf_mtx = multilabel_confusion_matrix(
    y_true=test_labels_binarized,
    y_pred=test_predicted_labels_binarized,
)
cf_mtx.shape


(18, 2, 2)

In [95]:
# Display the per-label confusion matrices; scikit-learn lays each one out as
# [[TN, FP], [FN, TP]].
# NOTE(review): matrices presumably follow the order of `mlb.classes_` — confirm.
cf_mtx


array([[[1160,    1],
        [  49,    0]],

       [[1044,   38],
        [  93,   35]],

       [[1206,    0],
        [   4,    0]],

       [[1173,   12],
        [  23,    2]],

       [[1158,   21],
        [  27,    4]],

       [[1118,   21],
        [  44,   27]],

       [[1081,   44],
        [  48,   37]],

       [[1113,   19],
        [  48,   30]],

       [[ 687,  128],
        [  99,  296]],

       [[ 960,   71],
        [  75,  104]],

       [[1204,    0],
        [   6,    0]],

       [[1197,    1],
        [  12,    0]],

       [[1196,    2],
        [  11,    1]],

       [[1083,   14],
        [ 104,    9]],

       [[1170,    6],
        [  22,   12]],

       [[1208,    0],
        [   2,    0]],

       [[1191,    3],
        [  16,    0]],

       [[1191,    0],
        [  18,    1]]])