In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from transformers import BertTokenizer, BertConfig, TFBertForSequenceClassification
import tensorflow as tf

from sklearn.model_selection import train_test_split
from codecarbon import EmissionsTracker


# Do not truncate long text columns when a DataFrame is displayed.
pd.options.display.max_colwidth = None
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

  from .autonotebook import tqdm as notebook_tqdm


Metal device set to: Apple M1 Pro


2022-07-21 11:48:25.102128: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-07-21 11:48:25.102230: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [2]:
# Importing datasets: headerless, ';'-separated files with a text column and a label column.
def _read_split(path):
    """Read one split file into a DataFrame with the columns 'Input' and 'Sentiment'."""
    return pd.read_csv(
        path,
        sep=';',
        header=None,
        names=['Input', 'Sentiment'],
        encoding='utf-8',
    )


df_train = _read_split('data/sentiment/train.txt')
df_test = _read_split('data/sentiment/test.txt')

In [3]:
# Print the (rows, columns) shape, then preview the first rows.
# Only the last expression of a cell is rendered, so a head() call placed
# before the print would be evaluated and discarded.
print(df_train.shape)
df_train.head()

(16000, 2)


Unnamed: 0,Input,Sentiment
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplace i will know that it is still on the property,love
4,i am feeling grouchy,anger


In [4]:
# Print the (rows, columns) shape, then preview the first rows.
# Only the last expression of a cell is rendered, so a head() call placed
# before the print would be evaluated and discarded.
print(df_test.shape)
df_test.head()

(2000, 2)


Unnamed: 0,Input,Sentiment
0,im feeling rather rotten so im not very ambitious right now,sadness
1,im updating my blog because i feel shitty,sadness
2,i never make her separate from me because i don t ever want her to feel like i m ashamed with her,sadness
3,i left with my bouquet of red and yellow tulips under my arm feeling slightly more optimistic than when i arrived,joy
4,i was feeling a little vain when i did this one,sadness


In [5]:
# Class balance of the training labels (number of rows per sentiment).
df_train['Sentiment'].value_counts()

joy         5362
sadness     4666
anger       2159
fear        1937
love        1304
surprise     572
Name: Sentiment, dtype: int64

In [6]:
# Encoding sentiments: map each label string to an integer class id.
encoded_dict = {'anger':0,'fear':1, 'joy':2, 'love':3, 'sadness':4, 'surprise':5}


def _encode_sentiment(labels):
    """Return `labels` converted to the integer ids defined in `encoded_dict`.

    A column that already holds only ids from `encoded_dict` is returned
    unchanged, so re-running this cell does not turn the labels into NaN.

    Raises:
        ValueError: if a label is missing or is not a key of `encoded_dict`
            (a plain `.map` would silently produce NaN for it).
    """
    if labels.isin(list(encoded_dict.values())).all():
        return labels  # already encoded by a previous run of this cell

    codes = labels.map(encoded_dict)
    unknown = labels[codes.isna()].unique()
    if len(unknown) > 0:
        raise ValueError(f"Unknown sentiment labels: {sorted(map(str, unknown))}")
    # Without NaN the mapped column is integral; make the dtype explicit.
    return codes.astype('int64')


df_train['Sentiment'] = _encode_sentiment(df_train.Sentiment)
df_test['Sentiment'] = _encode_sentiment(df_test.Sentiment)

In [7]:
# Preview the first five training rows.
df_train.head(n=5)

Unnamed: 0,Input,Sentiment
0,i didnt feel humiliated,4
1,i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake,4
2,im grabbing a minute to post i feel greedy wrong,0
3,i am ever feeling nostalgic about the fireplace i will know that it is still on the property,3
4,i am feeling grouchy,0


In [8]:
# Preview the first five test rows.
df_test.head(n=5)

Unnamed: 0,Input,Sentiment
0,im feeling rather rotten so im not very ambitious right now,4
1,im updating my blog because i feel shitty,4
2,i never make her separate from me because i don t ever want her to feel like i m ashamed with her,4
3,i left with my bouquet of red and yellow tulips under my arm feeling slightly more optimistic than when i arrived,2
4,i was feeling a little vain when i did this one,4


In [9]:
# Split each frame into model inputs (the text column) and targets (the sentiment column).
X_train, y_train = df_train['Input'], df_train['Sentiment']
X_test, y_test = df_test['Input'], df_test['Sentiment']

## Preparing the data

In [10]:
# Tokenize the training sentences with the tokenizer of the pretrained checkpoint.

# Name of the pretrained checkpoint ("nombre_modelo" is Spanish for "model name").
nombre_modelo = 'bert-base-multilingual-uncased'
tokenizer = BertTokenizer.from_pretrained(nombre_modelo)

# padding=True pads every sentence to the longest one in this call, so the
# second dimension of the returned tensors is the longest training sequence.
train_encodings = tokenizer(
    X_train.to_list(),
    padding=True,
    truncation=True,
    return_tensors="tf",
)

In [11]:
# Width of the padded training tensors; the test set is padded/truncated to the same length.
MAX_SEQUENCE_LENGTH = train_encodings['input_ids'].shape[1]

test_encodings = tokenizer(
    X_test.to_list(),
    truncation=True,
    padding='max_length',
    max_length=MAX_SEQUENCE_LENGTH,
    return_tensors="tf",
)
# The unlabeled "final" encodings use exactly the same texts and arguments,
# so reuse the result instead of tokenizing the whole test set a second time.
test_final_encondings = test_encodings

In [12]:
# Labeled datasets: each element is a (features dict, label) pair.
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train))
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test))

# Unlabeled copy of the test features: each element is a 1-tuple holding only the features dict.
final_test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_final_encondings),))


## Fine-Tuning with BERT

In [13]:
# One output class per sentiment label.
N_SENTIMENT = len(encoded_dict)

# Define the classification model: pretrained BERT plus a classification head.
config = BertConfig.from_pretrained(
    nombre_modelo,
    num_labels=N_SENTIMENT,
    hidden_dropout_prob=0.1,
)
model = TFBertForSequenceClassification.from_pretrained(nombre_modelo, config=config)

# Freeze the BERT layers so that only the layers on top of them are trained.
model.bert.trainable = False

# Recommended learning rates for Adam: 5e-5, 3e-5, 2e-5.
# NOTE(review): these values are usually quoted for fine-tuning the whole
# network; with the encoder frozen, a larger rate may be needed for the newly
# initialized head to learn in a few epochs - confirm experimentally.
learning_rate = 2e-5
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08)

# Targets are integer class ids; from_logits=True makes the loss treat the
# model outputs as unnormalized scores.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')

model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

model.summary()

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  167356416 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  4614      
                                                                 
Total params: 167,361,030
Trainable params: 4,614
Non-trainable params: 167,356,416
_________________________________________________________________


In [None]:
# Main training loop.
batch_size = 32

# Both datasets are batched explicitly here. Keras documents that `batch_size`
# must not be passed to fit() when the input is a tf.data.Dataset (the dataset
# already yields batches), so the argument is omitted.
# NOTE(review): the training dataset is not shuffled and the test split doubles
# as validation data - confirm that both are intended.
history = model.fit(
    train_dataset.batch(batch_size),
    validation_data=test_dataset.batch(batch_size),
    epochs=2,
)

Epoch 1/2


2022-07-21 11:48:52.628966: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-07-21 11:48:52.629823: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


