# Przetwarzanie języka naturalnego – lab6
## Mateusz Kocot

In [140]:
import os

import fasttext
import numpy as np
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef
from sklearn.naive_bayes import GaussianNB

In [6]:
# Load the two configurations ('task01' and 'task02') of the
# 'poleval2019_cyberbullying' dataset; each exposes 'train' and 'test' splits
# that the following cells read.
dataset_1 = load_dataset('poleval2019_cyberbullying', 'task01')
dataset_2 = load_dataset('poleval2019_cyberbullying', 'task02')

Found cached dataset poleval2019_cyberbullying (/home/matix/.cache/huggingface/datasets/poleval2019_cyberbullying/task01/1.0.0/ce6060c56dae43c469bab309a7573b86299b0bcc2484e85cfe0ae70b5f770450)
100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 553.12it/s]
Found cached dataset poleval2019_cyberbullying (/home/matix/.cache/huggingface/datasets/poleval2019_cyberbullying/task02/1.0.0/ce6060c56dae43c469bab309a7573b86299b0bcc2484e85cfe0ae70b5f770450)
100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 492.17it/s]


In [7]:
# Task 01: pull the raw texts and their labels out of the train and test splits.
x_train_1, y_train_1 = dataset_1['train']['text'], dataset_1['train']['label']
x_test_1, y_test_1 = dataset_1['test']['text'], dataset_1['test']['label']
# Train and test texts concatenated; later handed to test_naive_bayes as the
# corpus the TF-IDF vectorizer is fitted on.
x_1 = x_train_1 + x_test_1

# Sanity check: (total, train, test) sizes.
len(x_1), len(x_train_1), len(x_test_1)

(11041, 10041, 1000)

In [8]:
# Task 02: pull the raw texts and their labels out of the train and test splits.
x_train_2, y_train_2 = dataset_2['train']['text'], dataset_2['train']['label']
x_test_2, y_test_2 = dataset_2['test']['text'], dataset_2['test']['label']
# Train and test texts concatenated; later handed to test_naive_bayes as the
# corpus the TF-IDF vectorizer is fitted on.
x_2 = x_train_2 + x_test_2

# Sanity check: (total, train, test) sizes.
len(x_2), len(x_train_2), len(x_test_2)

(11041, 10041, 1000)

In [104]:
def save_fasttext_file(name, X, Y):
    """Write a fastText supervised-training file to ``dataset/<name>.txt``.

    Each sample becomes exactly one line of the form ``__label__<y> <x>``.

    Args:
        name: Base file name (without extension) inside the ``dataset`` directory.
        X: Iterable of texts.
        Y: Iterable of labels, paired with ``X`` by position (``zip`` semantics,
            so the shorter iterable decides how many lines are written).
    """
    # The output directory may not exist on a fresh checkout.
    os.makedirs('dataset', exist_ok=True)

    with open('dataset/' + name + '.txt', 'w', encoding='utf8') as file:
        # fastText treats every line as a separate sample, so a newline inside
        # a text would split it into a labelled fragment plus unlabelled
        # garbage lines. Flatten such texts onto one line.
        file.writelines(
            f"__label__{y} {x.replace(chr(10), ' ')}\n" for x, y in zip(X, Y)
        )
        
# Export the training splits of both tasks as dataset/train_01.txt and
# dataset/train_02.txt; test_fasttext reads these files by task number.
save_fasttext_file('train_01', x_train_1, y_train_1)
save_fasttext_file('train_02', x_train_2, y_train_2)

In [141]:
def print_scores(y_test, y_pred):
    """Print accuracy, three F1 variants and MCC for a set of predictions.

    Every value is printed with two decimal places. The line labelled ``f1``
    is the *weighted* F1: scikit-learn's default ``average='binary'`` only
    supports two-class targets, so it cannot be used for task 02.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_test``.
    """
    def f1(averaging):
        # Same targets every time; only the averaging strategy varies.
        return f1_score(y_test, y_pred, average=averaging)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:0.2f}")

    weighted_f1 = f1('weighted')
    print(f"      f1: {weighted_f1:0.2f}")

    macro_f1 = f1('macro')
    print(f"Macro f1: {macro_f1:0.2f}")

    micro_f1 = f1('micro')
    print(f"Micro f1: {micro_f1:0.2f}")

    mcc = matthews_corrcoef(y_test, y_pred)
    print(f"     MCC: {mcc:0.2f}")

# Task 2

## Naive Bayes

In [142]:
def test_naive_bayes(x, x_train, x_test, y_train):
    vectorizer = TfidfVectorizer()
    vectorizer.fit(x)
    X_train, X_test = vectorizer.transform(x_train).toarray(), vectorizer.transform(x_test).toarray()
    
    gnb = GaussianNB()
    gnb.fit(X_train, y_train)
    
    y_pred = gnb.predict(X_test)
    return y_pred

### Task 01

In [143]:
# Naive Bayes on task 01: vectorizer fitted on x_1 (train + test texts),
# classifier trained on the train split, scored on the test split.
y_pred = test_naive_bayes(x_1, x_train_1, x_test_1, y_train_1)
print_scores(y_test_1, y_pred)

Accuracy: 0.78
      f1: 0.79
Macro f1: 0.57
Micro f1: 0.78
     MCC: 0.14


### Task 02

In [144]:
# Naive Bayes on task 02: vectorizer fitted on x_2 (train + test texts),
# classifier trained on the train split, scored on the test split.
y_pred = test_naive_bayes(x_2, x_train_2, x_test_2, y_train_2)
print_scores(y_test_2, y_pred)

Accuracy: 0.79
      f1: 0.79
Macro f1: 0.40
Micro f1: 0.79
     MCC: 0.13


## Fasttext

In [151]:
def test_fasttext(task_no, x_test):
    train_file = f'dataset/train_{task_no}.txt'
    classifier = fasttext.train_supervised(input=train_file)
    
    y_pred = np.array([int(classifier.predict(x)[0][0][-1]) for x in x_test])
    return y_pred

### Task 01

In [152]:
# fastText on task 01: trains on dataset/train_01.txt, scored on the test split.
y_pred = test_fasttext('01', x_test_1)
print_scores(y_test_1, y_pred)

Accuracy: 0.87
      f1: 0.83
Macro f1: 0.54
Micro f1: 0.87
     MCC: 0.23


Read 0M words
Number of words:  31486
Number of labels: 2
Progress: 100.0% words/sec/thread: 1027024 lr:  0.000000 avg.loss:  0.189032 ETA:   0h 0m 0s


### Task 02

In [153]:
# fastText on task 02: trains on dataset/train_02.txt, scored on the test split.
y_pred = test_fasttext('02', x_test_2)
print_scores(y_test_2, y_pred)

Accuracy: 0.86
      f1: 0.80
Macro f1: 0.31
Micro f1: 0.86
     MCC: 0.05


Read 0M words
Number of words:  31486
Number of labels: 3
Progress: 100.0% words/sec/thread: 1025891 lr:  0.000000 avg.loss:  0.245931 ETA:   0h 0m 0s


## Transformer