# Imports

In [1]:
import nltk                             
from nltk.corpus import twitter_samples
import pandas as pd

In [2]:
import tensorflow as tf
from tensorflow import keras

In [3]:
from transformers import InputExample, InputFeatures
from transformers import BertTokenizer, TFBertForSequenceClassification


In [4]:
from sklearn.model_selection import train_test_split

In [6]:
# Load the movie-review sample; the relative path is resolved against the kernel's working directory.
data = pd.read_feather(path="../data/movie_reviews_4k.feather")

In [7]:
# Quick sanity check of the loaded frame: (rows, columns).
data.shape

(4000, 2)

## Train/test split

In [8]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
data_train, data_test = train_test_split(
    data,
    test_size=0.2,
    random_state=0,
)

In [9]:
# Rebind instead of mutating in place: the splits come straight out of
# train_test_split, and `inplace=True` on such derived frames is the usual
# source of copy/view warnings. A fresh 0..n-1 index also lets later cells
# address rows by position-like labels.
data_train = data_train.reset_index(drop=True)
data_test = data_test.reset_index(drop=True)

## Load BERT model and tokenizer

In [10]:

# Single source of truth for the checkpoint so the model and tokenizer cannot drift apart.
MODEL_NAME = "bert-base-uncased"

# num_labels=2 spells out the two-class head that the 'Negative'/'Positive'
# mapping used for prediction further down relies on.
model = TFBertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Prepare inputs

In [11]:
def convert_data_to_examples(data, data_col, label_col):
    """Turn every row of ``data`` into an ``InputExample``.

    Args:
        data: DataFrame holding the texts and their labels.
        data_col: Name of the column whose value becomes ``text_a``.
        label_col: Name of the column whose value becomes ``label``.

    Returns:
        The result of a row-wise ``DataFrame.apply``: one ``InputExample``
        per row, with ``guid`` and ``text_b`` left as ``None``.
    """

    def _row_to_example(row):
        # Single-sentence task: only text_a is populated.
        return InputExample(
            guid=None,
            text_a=row[data_col],
            text_b=None,
            label=row[label_col],
        )

    return data.apply(_row_to_example, axis=1)

In [12]:
# Build InputExample collections for both splits from the "text" / "label" columns.
examples_train, examples_test = (
    convert_data_to_examples(split, "text", "label")
    for split in (data_train, data_test)
)

In [13]:
  
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    """Tokenize ``examples`` and expose them as a ``tf.data.Dataset``.

    Each example's ``text_a`` is encoded with ``tokenizer.encode_plus``,
    truncated and padded to exactly ``max_length`` tokens.

    Args:
        examples: Iterable of objects exposing ``text_a`` and ``label``.
        tokenizer: Tokenizer providing ``encode_plus``.
        max_length: Sequence length every encoded text is truncated/padded to.

    Returns:
        An unbatched dataset built with ``tf.data.Dataset.from_generator``
        that yields ``(inputs, label)`` pairs, where ``inputs`` is a dict with
        the keys ``"input_ids"``, ``"attention_mask"`` and ``"token_type_ids"``
        (declared as ``tf.int32``) and ``label`` is declared as ``tf.int64``.
    """
    features = []  # InputFeatures, one per example, consumed by the generator below

    for example in examples:
        encoded = tokenizer.encode_plus(
            example.text_a,
            add_special_tokens=True,
            max_length=max_length,
            truncation=True,  # cut texts longer than max_length
            # `padding="max_length"` replaces the deprecated
            # `pad_to_max_length=True` flag used previously.
            padding="max_length",
            return_token_type_ids=True,
            return_attention_mask=True,
        )

        features.append(
            InputFeatures(
                input_ids=encoded["input_ids"],
                attention_mask=encoded["attention_mask"],
                token_type_ids=encoded["token_type_ids"],
                label=example.label,
            )
        )

    def gen():
        # Yield (model inputs, label) in the structure declared to from_generator.
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )



In [14]:
# Number of training examples produced from the training split.
len(examples_train)

3200

In [15]:
# Tokenize both splits to fixed-length (128-token) inputs wrapped in tf.data datasets.
dataset_train, dataset_test = (
    convert_examples_to_tf_dataset(split_examples, tokenizer, max_length=128)
    for split_examples in (examples_train, examples_test)
)



In [16]:
# Shuffle with a buffer covering the whole training set (a 100-element buffer
# only mixes neighbouring examples), then batch.
dataset_train = dataset_train.shuffle(len(examples_train)).batch(32)

In [17]:
# Validation metrics do not depend on example order, so the evaluation data
# is only batched, not shuffled.
dataset_test = dataset_test.batch(32)

In [None]:
%%time

# Fine-tune the classifier: Adam with gradient-norm clipping, a loss applied
# to raw logits (from_logits=True), and accuracy tracked on both splits.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')],
)
history = model.fit(
    dataset_train,
    validation_data=dataset_test,
    epochs=5,
    verbose=1,
)

Epoch 1/5
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
     46/Unknown - 342s 7s/step - loss: 0.5596 - accuracy: 0.7357

In [None]:
# Per-epoch values recorded by model.fit for the compiled loss and metrics.
history.history

In [None]:
# `df_p` / `df_n` are not defined anywhere in this notebook, so the original
# cell fails on a fresh kernel; sample two held-out reviews from the test
# split instead.
pred_sentences = [data_test.text[50], data_test.text[51]]

In [None]:
# Hand-written probe sentences for a quick qualitative check of the model.
pred_sentences = [
    "The most possitive tweet ever",
    "Worst day ever :)",
    "I am quiting twitter :(:())",
]

In [None]:
def classify_sentences(sentences):
    """Predict the sentiment of each sentence with the notebook's model.

    Uses the notebook-level ``tokenizer`` and ``model``.

    Args:
        sentences: List of strings to classify.

    Returns:
        A tuple ``(sentiment_proba, sentiments)``: the softmax over the last
        axis of the model's first output, and a list holding ``'Negative'``
        or ``'Positive'`` for each input sentence.
    """
    # Tokenize the *argument*; the previous version read the global
    # `pred_sentences` and silently ignored what the caller passed in.
    sentences_tokenized = tokenizer(
        sentences, max_length=128, padding=True, truncation=True, return_tensors='tf'
    )
    model_outputs = model(sentences_tokenized)
    sentiment_proba = tf.nn.softmax(model_outputs[0], axis=-1)
    sentiment_class = tf.argmax(sentiment_proba, axis=1)
    # Assumes class index 0 = negative and 1 = positive — confirm against the dataset's labels.
    sentiment_labels = ['Negative', 'Positive']
    sentiments = [sentiment_labels[int(i)] for i in sentiment_class]

    return sentiment_proba, sentiments

In [None]:
# Show the class probabilities and predicted labels for the probe sentences.
classify_sentences(pred_sentences )

In [None]:
# Mean of the training labels; for 0/1 labels this is the positive-class share (presumably — confirm encoding).
data_train["label"].mean()