In [14]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer
from emoji import demojize
from nltk.tokenize import TweetTokenizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, multilabel_confusion_matrix

In [15]:
train = pd.read_csv("./dataset/ptc_preproc_train.csv", sep=";").dropna(subset=["text", "label"])[["text", "label"]]
train = train.drop_duplicates(subset=["text"])
test = pd.read_csv("./dataset/ptc_preproc_test.csv", sep=";").dropna(subset=["text", "label"])[["text", "label"]]
test = test.drop_duplicates(subset=["text"])

# This pd.DataFrame should be empty
pd.concat([test[test.text.isnull()],train[train.text.isnull()]])

Unnamed: 0,text,label


In [16]:
len(train), len(test), len(train) + len(test)

(4464, 1210, 5674)

In [17]:
pd.concat([train, test])[4464:].count()

text     1210
label    1210
dtype: int64

In [18]:
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, normalization=True)
model = AutoModel.from_pretrained(model_name)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
data = pd.concat((train["text"], test["text"]))
tokens = data.apply((lambda x: tokenizer.encode(
    x,
    # add_special_tokens=True,
    truncation=True,
    # padding=True,
    max_length=128
)))
tokens

0               [0, 42284, 3726, 47691, 111, 21629, 5, 2]
1       [0, 1401, 158, 4545, 19, 756, 8382, 124901, 22...
2                    [0, 262, 88981, 3291, 8408, 1135, 2]
3       [0, 70, 64857, 450, 26038, 70, 8999, 70, 14069...
4       [0, 581, 17274, 5608, 8306, 24209, 10, 36541, ...
                              ...                        
1375    [0, 10, 6782, 23972, 62, 25965, 674, 91375, 53...
1376                   [0, 5879, 9, 2943, 214, 131161, 2]
1377                           [0, 17202, 7941, 16070, 2]
1378                [0, 51, 14473, 5874, 38526, 83024, 2]
1379                                 [0, 58867, 46667, 2]
Name: text, Length: 5674, dtype: object

In [20]:
padded = np.array([token+[0]*(128-len(token)) for token in tokens.values])
np.array(padded).shape

(5674, 128)

In [21]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(5674, 128)

In [22]:
input_ids = torch.tensor(padded)
attention_mask = torch.tensor(attention_mask)
with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

# extract [CLS] token hidden representation from output layer
features = last_hidden_states[0][:,0,:].numpy()
print(f'features shape: {features.shape}')


features shape: (5674, 768)


In [23]:
train_features, test_features = features[:-len(test)], features[len(train):]
print(f'train features: {len(train_features)}')
print(f'test features: {len(test_features)}')

train features: 4464
test features: 1210


In [24]:
train_labels, test_labels = train["label"].to_numpy(), test["label"].to_numpy()

mlb = MultiLabelBinarizer()
train_labels_binarized = mlb.fit_transform(train_labels)
test_labels_binarized = mlb.fit_transform(test_labels)

ff = MLPClassifier(
    random_state=1,
    max_iter=400,
    alpha=0.001,
    shuffle=True,
    early_stopping=True,
    verbose=True
).fit(train_features, train_labels_binarized)

Iteration 1, loss = 23.42109472
Validation score: 0.004474
Iteration 2, loss = 16.06559124
Validation score: 0.013423
Iteration 3, loss = 15.32512769
Validation score: 0.038031
Iteration 4, loss = 14.99810445
Validation score: 0.174497
Iteration 5, loss = 14.71179513
Validation score: 0.029083
Iteration 6, loss = 14.45220230
Validation score: 0.129754
Iteration 7, loss = 14.23660908
Validation score: 0.192394
Iteration 8, loss = 14.06472775
Validation score: 0.149888
Iteration 9, loss = 13.89430295
Validation score: 0.129754
Iteration 10, loss = 13.77924536
Validation score: 0.143177
Iteration 11, loss = 13.64704660
Validation score: 0.174497
Iteration 12, loss = 13.53689579
Validation score: 0.161074
Iteration 13, loss = 13.43610544
Validation score: 0.158837
Iteration 14, loss = 13.33087999
Validation score: 0.250559
Iteration 15, loss = 13.27238318
Validation score: 0.210291
Iteration 16, loss = 13.18447943
Validation score: 0.205817
Iteration 17, loss = 13.10518257
Validation score

In [25]:
test_predicted_labels_binarized = ff.predict(test_features)
micro_f1 = f1_score(test_labels_binarized, test_predicted_labels_binarized, average="micro")
acc = accuracy_score(test_labels_binarized, test_predicted_labels_binarized)
prec = precision_score(test_labels_binarized, test_predicted_labels_binarized, average="micro")
rec = recall_score(test_labels_binarized, test_predicted_labels_binarized, average="micro")
print(f'micro-f1: {micro_f1}')
print(f'accuracy: {acc}')
print(f'micro-precision: {prec}')
print(f'micro-recall: {rec}')

micro-f1: 0.6718671783747706
accuracy: 0.24049586776859505
micro-precision: 0.753885040254634
micro-recall: 0.6059443190368698


In [26]:
cf_mtx = multilabel_confusion_matrix(test_labels_binarized, test_predicted_labels_binarized)
cf_mtx

array([[[1165,    0],
        [  45,    0]],

       [[ 971,    0],
        [ 239,    0]],

       [[1037,    0],
        [ 173,    0]],

       [[1181,    0],
        [  29,    0]],

       [[ 960,   18],
        [ 198,   34]],

       [[1139,    0],
        [  71,    0]],

       [[1125,    0],
        [  85,    0]],

       [[1107,    0],
        [ 103,    0]],

       [[1198,    0],
        [  12,    0]],

       [[1204,    0],
        [   6,    0]],

       [[ 382,  254],
        [  55,  519]],

       [[1123,    0],
        [  87,    0]],

       [[1002,   29],
        [ 134,   45]],

       [[1173,    0],
        [  37,    0]],

       [[1073,    0],
        [ 137,    0]],

       [[1174,    0],
        [  36,    0]],

       [[1194,    0],
        [  16,    0]],

       [[1204,    0],
        [   6,    0]],

       [[1088,    0],
        [ 122,    0]],

       [[  11,  282],
        [   5,  912]],

       [[   0,  180],
        [   0, 1030]],

       [[ 923,   12],
        [ 25