In [1]:
# Install the Hugging Face `transformers` package quietly (-q).
# NOTE(review): the version is not pinned, so a re-run may pull a different release.
!pip install transformers -q

[K     |████████████████████████████████| 3.4 MB 10.6 MB/s 
[K     |████████████████████████████████| 61 kB 266 kB/s 
[K     |████████████████████████████████| 895 kB 41.3 MB/s 
[K     |████████████████████████████████| 3.3 MB 11.2 MB/s 
[K     |████████████████████████████████| 596 kB 36.7 MB/s 
[?25h

In [2]:
import tensorflow as tf
import pandas as pd
import os
import shutil
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
from sklearn.model_selection import train_test_split

In [3]:
# Load the tokenizer and the sequence-classification model from the same
# pretrained checkpoint so that vocabulary and weights match.
# NOTE(review): the tweets loaded further down look Polish, while this
# checkpoint is presumably English-only — confirm the checkpoint choice.
MODEL_NAME = "bert-base-uncased"

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = TFBertForSequenceClassification.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/511M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [4]:
# Print the layer / parameter overview of the loaded classifier.
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________


## Attach Drive and load the data

In [5]:
# Colab-only step: mount Google Drive so the CSV and the saved weights
# under /content/drive/ are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [6]:
# Read the labelled tweets from Drive into a DataFrame and preview the first rows.
path = '/content/drive/MyDrive/tweets_labeled_bi.csv'
tweets_txt = pd.read_csv(path, sep=",", encoding='utf-8')
tweets_txt.head()

Unnamed: 0.1,Unnamed: 0,text,clean_text,cleaned_text,sentiment,sentiments_val
0,0,"kiedy #coronavirus dotrze do polski, napotka o...","['coronavirus', 'dotrze', 'polski', 'napotka',...",coronavirus dotrze polski napotka opór znakomi...,0,negative
1,1,kolejni polacy chcą wrócić z wuhan https://t.c...,"['kolejni', 'polacy', 'chcą', 'wrócić', 'wuhan']",kolejni polacy chcą wrócić wuhan,0,negative
2,2,a tymczasem w wuhan odcięto od reszty świata o...,"['tymczasem', 'wuhan', 'odcięto', 'reszty', 'ś...",tymczasem wuhan odcięto reszty świata obszar p...,1,positive
3,3,@patrykwachowiec @kancelariasejmu nie zdziwię ...,"['zdziwię', 'powiedzą', 'listy', 'pojechały', ...",zdziwię powiedzą listy pojechały chin wuhan od...,0,negative
4,4,".@msz_rp: 19 polaków chce wrócić z wuhan, w am...","['rp', 'polaków', 'wrócić', 'wuhan', 'ambasadz...",rp polaków wrócić wuhan ambasadzie rp pekinie ...,0,negative


## Split data into train and test

In [7]:
# Hold out 5% of the rows for validation; the fixed seed keeps the split reproducible.
TEST_FRACTION = 0.05
SPLIT_SEED = 42

train, test = train_test_split(tweets_txt, test_size=TEST_FRACTION, random_state=SPLIT_SEED)
print(train.shape[0], test.shape[0])

460832 24255


In [8]:
# Keep only the model input text and its label.
# The explicit .copy() detaches each frame from the split result, so the
# column assignments in the following cell operate on an independent frame
# instead of a slice (which would raise pandas' SettingWithCopyWarning).
train = train[['cleaned_text', 'sentiment']].copy()
test = test[['cleaned_text', 'sentiment']].copy()


In [9]:
# Make sure the text column holds plain strings before it is handed to the tokenizer.
# Missing values are replaced with an empty string first: astype(str) alone
# would turn NaN into the literal text "nan", which would then be tokenized
# as if it were a real word.
train['cleaned_text'] = train['cleaned_text'].fillna('').astype(str)
test['cleaned_text'] = test['cleaned_text'].fillna('').astype(str)


## Creating input sequences for BERT

In [10]:
# Convert the train and test frames into InputExample objects
def convert_data_to_examples(train, test, DATA_COLUMN, LABEL_COLUMN): 
  """Wrap every row of both frames in an ``InputExample``.

  Args:
    train: DataFrame with the training rows.
    test: DataFrame with the held-out rows.
    DATA_COLUMN: name of the column that holds the text.
    LABEL_COLUMN: name of the column that holds the label.

  Returns:
    A pair ``(train_examples, validation_examples)``; each is the result of
    a row-wise ``DataFrame.apply`` and holds one ``InputExample`` per row.
  """
  def to_example(row):
    # Single-sentence task: only text_a is filled, no guid and no second segment.
    return InputExample(
        guid=None,
        text_a=row[DATA_COLUMN],
        text_b=None,
        label=row[LABEL_COLUMN],
    )

  train_InputExamples = train.apply(to_example, axis=1)
  validation_InputExamples = test.apply(to_example, axis=1)

  return train_InputExamples, validation_InputExamples

In [11]:
# Tokenize the InputExample objects and turn them into objects that are ready to be fed to the model

def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    """Tokenize ``examples`` and expose them as a ``tf.data.Dataset``.

    Args:
        examples: iterable of objects with ``text_a`` and ``label`` attributes.
        tokenizer: tokenizer providing ``encode_plus``.
        max_length: maximum token length passed to the tokenizer (with truncation on).

    Returns:
        A dataset of ``(inputs, label)`` pairs, where ``inputs`` is a dict with
        the keys ``input_ids``, ``attention_mask`` and ``token_type_ids``
        (variable-length int32 vectors) and ``label`` is an int64 scalar.
    """
    feature_keys = ("input_ids", "attention_mask", "token_type_ids")

    def encode(example):
        # Each example is encoded on its own, so sequences keep their individual
        # lengths here; the callers batch the dataset with padded_batch afterwards.
        encoded = tokenizer.encode_plus(
            example.text_a,
            add_special_tokens=True,
            max_length=max_length,
            return_token_type_ids=True,
            return_attention_mask=True,
            padding=True,
            truncation=True,
        )
        # Pull the fields we need out of the tokenizer's dictionary.
        return InputFeatures(
            input_ids=encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
            token_type_ids=encoded["token_type_ids"],
            label=example.label,
        )

    # Everything is tokenized eagerly and kept in this list.
    features = [encode(example) for example in examples]

    # Generator that hands the stored features to tf.data one at a time.
    def gen():
        for feature in features:
            inputs = {key: getattr(feature, key) for key in feature_keys}
            yield inputs, feature.label

    output_types = ({key: tf.int32 for key in feature_keys}, tf.int64)
    output_shapes = (
        {key: tf.TensorShape([None]) for key in feature_keys},
        tf.TensorShape([]),
    )

    # Build the dataset from the generator's elements.
    return tf.data.Dataset.from_generator(gen, output_types, output_shapes)


# Column names passed to convert_data_to_examples: the text column and the label column.
DATA_COLUMN = 'cleaned_text'
LABEL_COLUMN = 'sentiment'

In [None]:
# Wrap every row of the train / test frames in an InputExample.
train_InputExamples, validation_InputExamples = convert_data_to_examples(train, test, DATA_COLUMN, LABEL_COLUMN)

# Training pipeline: light shuffling, then batches of 32 padded to the longest
# sequence in each batch.
# The dataset repeats indefinitely because the training cell asks for
# epochs=2 * steps_per_epoch=15000 = 30000 batches, while one pass over the
# 460832 training rows is only 14401 batches. With .repeat(2) (28802 batches)
# the second epoch would run out of data; steps_per_epoch bounds each epoch instead.
train_data = convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)
train_data = train_data.shuffle(100).padded_batch(32).repeat()

# Validation pipeline: a single ordered pass, padded per batch.
validation_data = convert_examples_to_tf_dataset(list(validation_InputExamples), tokenizer)
validation_data = validation_data.padded_batch(32)

## BERT Training

In [None]:
# Training configuration: Adam with gradient-norm clipping, a sparse
# cross-entropy loss computed on raw logits, and accuracy as the metric.
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = [tf.keras.metrics.SparseCategoricalAccuracy('accuracy')]

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

# Two epochs of 15000 steps each: train_data has to be able to supply
# 30000 batches in total.
model.fit(
    train_data,
    epochs=2,
    validation_data=validation_data,
    steps_per_epoch=15000,
)

# Persist the fine-tuned weights on Drive.
model.save_weights('/content/drive/MyDrive/bert_weights.h5')

Epoch 1/2
Epoch 2/2


In [13]:
# Restore the fine-tuned weights saved by the training cell, so inference can
# run without repeating the training.
model.load_weights('/content/drive/MyDrive/bert_weights.h5')


## Test the model

In [30]:
# Hand-written example sentences used to sanity-check the trained model.
pred_sentences = ['Pandemia to najgorsze co mnie w życiu spotkało, nie wychodzę z domu, jestem w izolacji cały czas',
                  'Mam pozytywny wynik testu :(']


In [31]:
# Tokenize all sentences as one padded batch and run a forward pass.
tf_batch = tokenizer(pred_sentences, max_length=128, padding=True, truncation=True, return_tensors='tf')
tf_outputs = model(tf_batch)

# The first element of the model output (presumably the logits) is turned
# into per-class probabilities.
tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)

# Index 0 / 1 of the prediction maps to these display names.
labels = ['Negative','Positive']

# Assign each sentence the label of the class with the larger value.
label = tf.argmax(tf_predictions, axis=1).numpy()
for sentence, class_idx in zip(pred_sentences, label):
  print(sentence, ": \n", labels[class_idx])

Pandemia to najgorsze co mnie w życiu spotkało, nie wychodzę z domu, jestem w izolacji cały czas : 
 Negative
Mam pozytywny wynik testu :( : 
 Positive
