In [None]:
!pip install -q spacy
!pip install -q tensorflow
!pip install -q transformers

In [None]:
# Set TF_CPP_MIN_LOG_LEVEL to 3 ahead of the TensorFlow import in the next cell
# (presumably to quiet TensorFlow's native log output).
%env TF_CPP_MIN_LOG_LEVEL = 3

env: TF_CPP_MIN_LOG_LEVEL=3


In [None]:
import math
import nltk
import spacy
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import transformers
from tokenizers import BertWordPieceTokenizer
from tqdm import tqdm
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from tensorflow import keras
from keras.utils import to_categorical
from tensorflow.keras import Sequential
from keras.preprocessing.text import Tokenizer
from keras.metrics import Precision, Recall, AUC
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from keras.callbacks import EarlyStopping, LearningRateScheduler, CallbackList, ReduceLROnPlateau
from tensorflow.keras.optimizers.experimental import Adam

# **Loading Data**

In [None]:
import os

# Mount Google Drive only when the notebook is actually running on Google Colab;
# everywhere else the data is read from the local './datas' folder.
try:
  from google.colab import drive
except ImportError:
  # google.colab can only be imported inside the Colab runtime
  drive = None

if drive is not None:
  drive.mount('/content/drive')
  path = '/content/drive/MyDrive/Colab Notebooks/Cap7'
else:
  path = './datas'

Mounted at /content/drive


In [None]:
# Both files are ';'-separated and have no header row
train_data = pd.read_csv(os.path.join(path, 'dados_treino.txt'), sep = ';', header = None)

test_data = pd.read_csv(os.path.join(path, 'dados_teste.txt'), sep = ';', header = None)

In [None]:
# Give the two positional columns (0 and 1) readable names
train_data = train_data.rename({0: 'text', 1: 'sentiment'}, axis = 'columns')
test_data = test_data.rename({0: 'text', 1: 'sentiment'}, axis = 'columns')

In [None]:
# (rows, columns) of the training frame
train_data.shape

(16000, 2)

In [None]:
# (rows, columns) of the test frame
test_data.shape

(2000, 2)

In [None]:
# Preview the first rows of the training data
train_data.head()

Unnamed: 0,text,sentiment
0,i am feeling completely overwhelmed i have two...,fear
1,i have the feeling she was amused and delighted,joy
2,i was able to help chai lifeline with your sup...,joy
3,i already feel like i fucked up though because...,anger
4,i still love my so and wish the best for him i...,sadness


In [None]:
# Number of training rows per sentiment label
train_data['sentiment'].value_counts()

sentiment
joy         5362
sadness     4666
anger       2159
fear        1937
love        1304
surprise     572
Name: count, dtype: int64

In [None]:
# Number of test rows per sentiment label
test_data['sentiment'].value_counts()

sentiment
joy         695
sadness     581
anger       275
fear        224
love        159
surprise     66
Name: count, dtype: int64

# **Preprocessing with spaCy**

In [None]:
# Download the spaCy 'en_core_web_md' package that is loaded in the next cell
!python -m spacy download en_core_web_md -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# Load the downloaded 'en_core_web_md' package; preprocessing_text() calls this object on each text
nlp_dict = spacy.load('en_core_web_md')

In [None]:
def preprocessing_text(text):
  """Lemmatize `text` and drop its stop words.

  The text is run through the module-level `nlp_dict` pipeline. Every token that
  is not flagged as a stop word contributes its lower-cased, stripped lemma.

  Args:
    text: The raw text to process.

  Returns:
    The kept lemmas joined by single spaces ('' when nothing is kept).
  """
  # Processing text with Dict
  doc = nlp_dict(text)

  # Lower-cased, stripped lemma of every non-stop-word token
  lemmas = (token.lemma_.lower().strip() for token in doc if not token.is_stop)

  # Tokens made only of whitespace strip down to '' and would otherwise leave
  # doubled spaces in the joined result, so they are skipped.
  return ' '.join(lemma for lemma in lemmas if lemma)

In [None]:
# Add the lemmatized, stop-word-free version of every text as a new column
train_data['Processed_text'] = train_data['text'].map(preprocessing_text)

test_data['Processed_text'] = test_data['text'].map(preprocessing_text)

In [None]:
# Preview the training data again, now including the 'Processed_text' column
train_data.head()

Unnamed: 0,text,sentiment,Processed_text
0,i am feeling completely overwhelmed i have two...,fear,feel completely overwhelmed strategy help feel...
1,i have the feeling she was amused and delighted,joy,feeling amuse delight
2,i was able to help chai lifeline with your sup...,joy,able help chai lifeline support encouragement ...
3,i already feel like i fucked up though because...,anger,feel like fuck not usually eat morning
4,i still love my so and wish the best for him i...,sadness,love wish good long tolerate effect bm life fa...


In [None]:
# Function to encode text into int sequences for Bert model input
def encode(texts, tokenizer, chunk_size = 256, maxlen = 512):
  """Encode `texts` into a 2-D array of token ids, one row per text.

  Args:
    texts: Sliceable collection of strings (NumPy array, pandas Series or list).
    tokenizer: Object exposing `enable_truncation`, `enable_padding` and
      `encode_batch`; its truncation/padding settings are changed by this call.
    chunk_size: Number of texts passed to `encode_batch` at a time.
    maxlen: Length every sequence is truncated and padded to.

  Returns:
    `np.ndarray` built from the `ids` of every encoding, in input order.
  """
  # Enable truncation in tokenizer to max length
  tokenizer.enable_truncation(max_length = maxlen)

  # Enable padding in tokenizer
  tokenizer.enable_padding(length = maxlen)

  all_ids = []

  # Iterate over all texts in chunks of `chunk_size`
  for i in tqdm(range(0, len(texts), chunk_size)):
    chunk = texts[i:i+chunk_size]

    # Arrays and Series expose tolist(); plain lists are copied as they are
    text_chunk = chunk.tolist() if hasattr(chunk, 'tolist') else list(chunk)

    encs = tokenizer.encode_batch(text_chunk)

    # Extend the list 'all_ids' with the encoded ids of this chunk
    all_ids.extend(enc.ids for enc in encs)

  return np.array(all_ids)

In [None]:
# Load the tokenizer of the pre-trained 'distilbert-base-multilingual-cased' checkpoint
bert_tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')

In [None]:
# Save the tokenizer files into the current directory; the next cell reads 'vocab.txt' from there
bert_tokenizer.save_pretrained('.')

In [None]:
# Build a faster tokenizer from the local 'vocab.txt' file
# (presumably the vocabulary written by save_pretrained('.') in the previous cell);
# lowercase = False keeps the casing of the input text.
fast_tokenizer = BertWordPieceTokenizer('vocab.txt', lowercase = False)

In [None]:
# Hold out 20% of the training rows for validation, stratified on the sentiment label
X_train, X_valid, y_train, y_valid = train_test_split(
  train_data['Processed_text'].values,
  train_data['sentiment'].values,
  test_size = 0.2,
  random_state = 42,
  stratify = train_data['sentiment'],
)

In [None]:
# Sequence length passed as `maxlen` to encode() and as `max_len` to create_model()
max_length = 100

In [None]:
# Apply tokenization
X_train_encoded = encode(X_train, fast_tokenizer, maxlen = max_length)
X_valid_encoded = encode(X_valid, fast_tokenizer, maxlen = max_length)
# The train/validation split never creates an X_test; the test texts come from test_data
X_test_encoded = encode(test_data['Processed_text'].values, fast_tokenizer, maxlen = max_length)

In [None]:
# Fit the label -> integer mapping on the training labels only, then reuse it
label_encoder = LabelEncoder()

y_train_le = label_encoder.fit_transform(y_train)
y_valid_le = label_encoder.transform(y_valid)
y_test_le = label_encoder.transform(test_data['sentiment'])

# Pin the one-hot width to the number of fitted classes so every split gets the
# same number of columns even if one of them lacks the highest class index.
num_classes = len(label_encoder.classes_)

y_train_encoded = to_categorical(y_train_le, num_classes = num_classes)
y_valid_encoded = to_categorical(y_valid_le, num_classes = num_classes)
y_test_encoded = to_categorical(y_test_le, num_classes = num_classes)

In [None]:
# Batch size shared by the train/validation/test datasets and the steps-per-epoch computation
BATCH_SIZE = 16

In [None]:
# Training pipeline: repeated, shuffled (buffer of 2048) and batched;
# the fit cell bounds each epoch with steps_per_epoch.
train_dataset = tf.data.Dataset.from_tensor_slices((X_train_encoded, y_train_encoded))
train_dataset = train_dataset.repeat().shuffle(2048).batch(BATCH_SIZE)

In [None]:
# Validation pipeline: batched, then cached
valid_dataset = tf.data.Dataset.from_tensor_slices((X_valid_encoded, y_valid_encoded))
valid_dataset = valid_dataset.batch(BATCH_SIZE).cache()

In [None]:
# Test pipeline: batched only
test_dataset = tf.data.Dataset.from_tensor_slices((X_test_encoded, y_test_encoded))
test_dataset = test_dataset.batch(BATCH_SIZE)

# **Create Model**

In [None]:
# Function to apply fine-tune and train a transformer model (e.g. BERT)
def create_model(transformer, max_len = 512, num_classes = 6):
  """Build and compile a classifier on top of `transformer`.

  Args:
    transformer: Callable model/layer; called with the token-id input, and the
      first element of its result is used as the sequence output.
    max_len: Length of every input id sequence.
    num_classes: Units of the softmax output layer. Defaults to 6, the number
      of sentiment labels in this notebook's data.

  Returns:
    A compiled `tf.keras.Model` mapping token ids to class probabilities.
  """
  # Define input layer; `shape` must be a tuple, hence the trailing comma
  input_word_ids = tf.keras.layers.Input(shape = (max_len,),
                                         dtype = tf.int32,
                                         name = 'input_word_ids')

  # Define sequence output of model
  sequence_output = transformer(input_word_ids)[0]

  # Select the first token of each sequence (token CLS in bert) for classification
  cls_token = sequence_output[:, 0, :]

  # Add a dense layer for classification
  out = Dense(num_classes, activation = 'softmax')(cls_token)

  # Create keras model with input and output selected
  model = tf.keras.Model(inputs = input_word_ids, outputs = out)

  model.compile(tf.keras.optimizers.legacy.Adam(learning_rate = 1e-5),
                loss = 'categorical_crossentropy',
                metrics = ['accuracy', Precision(), Recall(), AUC()])

  return model


In [None]:
# Create an instance of pretrained DistilBERT multilingual
# (same checkpoint id as the tokenizer loaded earlier in the notebook)
transformer_layer = transformers.TFDistilBertModel.from_pretrained('distilbert-base-multilingual-cased')

In [None]:
# Build the classifier with the same sequence length that was used when encoding the texts
model = create_model(transformer_layer, max_len = max_length)

In [None]:
# Freezing pretrained layers: mark the first three layers of the model as non-trainable
for layer_idx in range(3):
  model.layers[layer_idx].trainable = False

In [None]:
# Number of full batches in the training set; passed to fit() as steps_per_epoch
# because train_dataset is built with .repeat()
n_steps = X_train_encoded.shape[0] // BATCH_SIZE
num_epochs = 3

In [None]:
%%time
# Train for num_epochs epochs of n_steps batches each, evaluating on valid_dataset;
# the returned History object is plotted in the next cell.
history = model.fit(train_dataset,
                    steps_per_epoch = n_steps,
                    validation_data = valid_dataset,
                    epochs = num_epochs)

In [None]:
# Training vs. validation loss per epoch
loss = history.history['loss']
val_loss = history.history['val_loss']

plt.plot(loss, label = 'Loss')
plt.plot(val_loss, label = 'Validation Loss')
plt.legend()
plt.show()

In [None]:
# Run the trained model on the encoded test texts
predict = model.predict(X_test_encoded)

In [None]:
# Index of the highest-scoring output column for every test row
predicted_labels = predict.argmax(axis = 1)

In [None]:
# Per-class report comparing the integer-encoded test labels with the predictions
print(classification_report(y_test_le, predicted_labels))

In [None]:
# Confusion matrix of integer-encoded test labels vs. predictions
print(confusion_matrix(y_test_le, predicted_labels))

In [None]:
# Overall accuracy on the test set
print(accuracy_score(y_test_le, predicted_labels))

In [None]:
# Persist the trained model; the Deploy section loads it back from this same path.
# NOTE(review): the path ends in '.keras' while save_format='tf' is requested — confirm which format is intended.
model.save('./models/model_v3.keras', save_format='tf')

# **Deploy**

In [None]:
# Loading Model
# Imports
from transformers import TFDistilBertModel
from tensorflow.keras.utils import custom_object_scope

# Make the TFDistilBertModel class resolvable under the name 'TFDistilBertModel'
# while the model saved at './models/model_v3.keras' is loaded
with custom_object_scope({'TFDistilBertModel': TFDistilBertModel}):
  loaded_model = tf.keras.models.load_model('./models/model_v3.keras')

In [None]:
# Example sentence to classify with the loaded model
phrase = 'i even feel a little shaky'

In [None]:
# Wrap the sentence in a one-row DataFrame
df = pd.DataFrame({'Phrase': [phrase]})

In [None]:
# Apply the same spaCy preprocessing that was applied to the training texts
df['Processed_phrase'] = df['Phrase'].apply(preprocessing_text)

In [None]:
# Encode with the same tokenizer and sequence length as the training data
# (encode() names its length parameter `maxlen`)
new_data = encode(df['Processed_phrase'], fast_tokenizer, maxlen = max_length)

In [None]:
# Run the reloaded model on the encoded phrase
result = loaded_model.predict(new_data)

In [None]:
# Despite its name, `prob` holds the index of the largest value in each row of `result`
prob = np.argmax(result, axis = 1)

In [None]:
# Map the predicted index back to its sentiment name with the LabelEncoder fitted on the training labels
name_cls = label_encoder.inverse_transform(prob)
name_cls

In [None]:
# Sets TOKENIZERS_PARALLELISM to 'false'.
# NOTE(review): this is the last cell shown, after all tokenization above has already run — confirm the intended placement.
%env TOKENIZERS_PARALLELISM = false