# BERT

In [20]:
import pandas as pd
import numpy as np

import ktrain
from ktrain import text

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, multilabel_confusion_matrix, classification_report

import time

import warnings
warnings.filterwarnings('ignore')

## Učitavanje podataka

In [21]:
# Read the complete emotion dataset from its CSV file (relative path).
dataset_path = 'full_dataset/full_dataset.csv'
data = pd.read_csv(filepath_or_buffer=dataset_path)

# Bare last expression: the notebook renders the frame as a table.
data

Unnamed: 0,emotion,original_text,text_tokens,preprocessed_text
0,happiness,"During the period of falling in love, each tim...","['period', 'falling', 'love', 'time', 'met', '...",period falling love time met especially met lo...
1,fear,When I was involved in a traffic accident.,"['involved', 'traffic', 'accident']",involved traffic accident
2,anger,When I was driving home after several days of...,"['driving', 'home', 'several', 'day', 'hard', ...",driving home several day hard work motorist ah...
3,sadness,When I lost the person who meant the most to me.,"['lost', 'person', 'meant']",lost person meant
4,disgust,The time I knocked a deer down - the sight of ...,"['time', 'knocked', 'deer', 'sight', 'animal',...",time knocked deer sight animal injury helpless...
...,...,...,...,...
9882,shame,He gets real humiliated and has to leave .,"['get', 'real', 'humiliated', 'leave']",get real humiliated leave
9883,shame,They aimed for higher status jobs and felt hum...,"['aimed', 'higher', 'status', 'job', 'felt', '...",aimed higher status job felt humiliated unempl...
9884,shame,He cursed his lack of self-control ; he knew t...,"['cursed', 'lack', 'selfcontrol', 'knew', 'old...",cursed lack selfcontrol knew old biddy seen fe...
9885,shame,Sometimes I've thought I 'll never forget wha...,"['sometimes', 'ive', 'thought', 'never', 'forg...",sometimes ive thought never forget happened co...


In [22]:
# Distinct emotion labels found in the dataset, as a NumPy array.
class_names = data.loc[:, 'emotion'].unique()
class_names

array(['happiness', 'fear', 'anger', 'sadness', 'disgust', 'shame',
       'guilt', 'surprise'], dtype=object)

In [23]:
# Number of distinct emotion classes (class_names is a 1-D array).
num_classes = class_names.size
num_classes

8

## Priprema podataka

In [24]:
# Model input: the preprocessed text (or use original_text instead).
X = data['preprocessed_text']
# Target: the emotion label of every row.
Y = data['emotion']

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split repeatable between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=357,
)

print("Training set size = ", len(X_train))
print("Test set size = ", len(X_test))

Training set size =  7909
Test set size =  1978


In [25]:
# Map every emotion label to an integer id.  The ids are derived from
# class_names so that id i always refers to class_names[i]: the same
# class_names array is later handed to ktrain to name the classes, and a
# hand-written table could silently drift out of sync with it.
# For the current dataset this yields exactly:
#   happiness=0, fear=1, anger=2, sadness=3, disgust=4, shame=5, guilt=6, surprise=7
encoding = {label: index for index, label in enumerate(class_names)}

# Integer-encoded targets for both splits (a label missing from the mapping
# raises KeyError).
y_train = [encoding[label] for label in Y_train]
y_test = [encoding[label] for label in Y_test]

Podaci se moraju posebno pretprocesirati da bi se mogli koristiti u BERT-u.

In [28]:
# Turn the text splits into plain Python lists before handing them to ktrain.
# list() is used instead of Series.tolist() so the cell can be re-run safely:
# after the first run X_train / X_test are already lists, and a list has no
# .tolist() method, so the original form raised AttributeError on a re-run.
X_train = list(X_train)
X_test = list(X_test)

In [29]:
# Convert the raw texts and integer labels into the representation selected
# by preprocess_mode='bert'.  preproc is the preprocessor object that is later
# passed to the classifier and to the predictor.
(x_train, y_train), (x_test, y_test), preproc = text.texts_from_array(
    x_train=X_train,
    y_train=y_train,
    x_test=X_test,
    y_test=y_test,
    class_names=class_names,
    preprocess_mode='bert',
    maxlen=350,
    max_features=35000,
)
# NOTE(review): y_train / y_test are overwritten by this call, so the cell
# presumably cannot be re-run without re-running the label-encoding cell
# first -- confirm before relying on re-execution.

preprocessing train...
language: en


Is Multi-Label? False
preprocessing test...
language: en


task: text classification


## Treniranje

In [30]:
# Build the 'bert' text classifier from the preprocessed training data and
# the matching preprocessor.
model = text.text_classifier(
    'bert',
    train_data=(x_train, y_train),
    preproc=preproc,
)

Is Multi-Label? False
maxlen is 350
done.


In [31]:
# Wrap the model and both data splits in a ktrain learner; the held-out test
# split is supplied as the validation data.
learner = ktrain.get_learner(
    model,
    train_data=(x_train, y_train),
    val_data=(x_test, y_test),
    batch_size=6,
)

In [32]:
# Train with the one-cycle policy.
learning_rate = 2e-5  # maximum learning rate reached during the cycle
epochs = 3            # number of passes over the training data
learner.fit_onecycle(lr=learning_rate, epochs=epochs)



begin training using onecycle policy with max lr of 2e-05...
Epoch 1/3


KeyboardInterrupt: 

In [None]:
# Evaluate the trained model on the held-out split, labelling the classes by
# name.  class_names is a NumPy array (it comes from Series.unique()); it is
# converted to a plain list because ktrain documents this argument as a list
# of str, and a multi-element ndarray cannot be truth-tested like a list
# (doing so raises "truth value of an array ... is ambiguous").
learner.validate(val_data=(x_test, y_test), class_names=list(class_names))

## Testiranje

In [None]:
# Pair the trained model with its preprocessor so that raw text can be
# passed straight to predict().
predictor = ktrain.get_predictor(
    learner.model,
    preproc,
)

# Show the class labels known to the predictor.
predictor.get_classes()

In [None]:
# Classify one sample sentence and report how long the call took.
message = 'I like apples'

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() is wall-clock time and can jump if the system
# clock is adjusted.
start_time = time.perf_counter()
prediction = predictor.predict(message)
# Capture the elapsed time right after the call so that string formatting and
# printing are not included in the measurement.
elapsed = time.perf_counter() - start_time

# Output: predicted label followed by the elapsed seconds in parentheses.
print('predicted: {} ({:.2f})'.format(prediction, elapsed))