In [1]:
! pip install transformers



In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import spacy
import tensorflow as tf

In [3]:
dataset = pd.read_csv('https://raw.githubusercontent.com/JuliaFC/PLN/74febd023863dae3724d700df0698a2874cef420/intents.csv?token=AHYYKOOXHE3LBDXT65A3JODA7HIKE')
dataset = dataset.replace(r'\s+|\\n', ' ', regex=True) #Removing \n from the dataset

In [4]:
dataset['intents'] = dataset.intents.astype('category').cat.codes
dataset

Unnamed: 0,texts,intents
0,what movies did Temuera Morrison act in?,0
1,what movies did Evelyn Venable act in?,0
2,what does Tom Cullen act in?,0
3,what movies was Shareeka Epps an actor in?,0
4,what does Peter FranzÃ©n appear in?,0
...,...,...
8871,which movie did Bob Brunner write?,11
8872,which film did Mark Rosenthal write the story ...,11
8873,what films was Andrew Deutschman a writer on?,11
8874,what movies was Mark Bowden the writer of?,11


In [5]:
def get_array(x):
  array = [0,0,0,0,0,0,0,0,0,0,0,0]
  array[x] = 1
  return array

def transform_into_binary_array(dataset):
  transformed_dataset = []
  for element in dataset:
    transformed_dataset.append(get_array(element))
  return transformed_dataset

In [6]:
intents = dataset['intents']
texts = dataset['texts']
X_train, X_test, y_train, y_test = train_test_split(texts, intents, test_size=0.5, random_state=1) #Separando primeiramente em 50% treinamento
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=1) # Separando agora em 25% teste e 25% validacao

y_train = transform_into_binary_array(y_train) # Transformando em uma lista que contem elementos de forma [0,0,0,0,0,0,0,0,0,0,0,0], onde o elemento da posição x tem valor 1, ou seja, é a intent daquele elemento
y_val = transform_into_binary_array(y_val)
y_test = transform_into_binary_array(y_test)

In [14]:
X_train

6133                          which words describe Frisk?
2804               the director of The Blue Umbrella was?
1922                      who starred in American Sniper?
4051          what genre does My Prairie Home fall under?
2924             who is listed as director for Fast Five?
                              ...                        
2895              who is the director for The Fatal Hour?
7813               which movies are about beauty pageant?
905              what movies was Debra Paget an actor in?
5192    what is the language spoken in Sailor of the K...
235                        what does Kevin Bacon star in?
Name: texts, Length: 4438, dtype: object

In [15]:
y_train

[[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
 [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
 [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
 [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
 [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
 [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
 [0, 0, 1, 0, 0, 0, 0, 0,

In [16]:
#Criando o encoder e usando todos os textos para adaptá-lo
VOCAB_SIZE = 1000
encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens=VOCAB_SIZE)
encoder.adapt(np.array(texts))

In [17]:
vocab = np.array(encoder.get_vocabulary())
vocab[:20]

array(['', '[UNK]', 'the', 'what', 'of', 'was', 'in', 'who', 'is',
       'movie', 'film', 'which', 'did', 'movies', 'director', 'a',
       'directed', 'released', 'genre', 'films'], dtype='<U12')

In [18]:
# Criando a rede neural
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(12)
])

In [19]:
# Compilando a rede
model.compile(loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

In [20]:
#Fazendo o fit com a rede
history = model.fit(np.array(X_train), np.array(y_train),validation_data=(np.array(X_val), np.array(y_val)), validation_steps=30, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [26]:
predicted = model.predict(np.array(X_test)) # Fazendo a predição para o caso de teste
labels = [] # Array responsável por guardar o valor predito pela rede (ou seja, o elemento da posição da intent com maior chance de ser escolhida)
true_value = [] # Array responsável por guardar os verdadeiros valores da intent (do conjunto de teste)
for element in range(len(predicted)):
  max_element_index = np.argmax(predicted[element])
  true_value_index = np.argmax(y_test[element])
  labels.append(max_element_index)
  true_value.append(true_value_index)

Test Loss: 0.03280423581600189
Test Accuracy: 0.9981973767280579


In [27]:
print(labels) #Print dos valores preditos
print(true_value) # Print dos reais valores

[2, 1, 9, 4, 4, 2, 4, 3, 0, 9, 7, 9, 2, 8, 2, 2, 0, 10, 9, 11, 9, 8, 2, 0, 0, 0, 9, 8, 0, 0, 10, 4, 10, 3, 4, 9, 3, 4, 9, 3, 2, 3, 9, 3, 4, 9, 0, 1, 4, 9, 0, 2, 11, 8, 9, 3, 8, 0, 10, 1, 0, 4, 3, 8, 4, 11, 4, 4, 3, 11, 3, 3, 8, 11, 7, 8, 9, 9, 2, 4, 0, 1, 9, 8, 3, 3, 3, 2, 1, 7, 7, 3, 9, 11, 4, 9, 7, 4, 10, 1, 4, 10, 0, 4, 3, 3, 9, 11, 1, 1, 2, 1, 4, 9, 1, 11, 9, 3, 9, 3, 3, 2, 3, 3, 3, 11, 4, 3, 2, 0, 8, 7, 0, 8, 2, 9, 8, 7, 10, 11, 11, 7, 11, 10, 3, 0, 9, 2, 2, 11, 3, 9, 0, 7, 2, 11, 2, 9, 4, 2, 11, 9, 11, 2, 2, 2, 3, 8, 9, 0, 0, 4, 0, 1, 0, 3, 9, 3, 2, 8, 3, 3, 7, 11, 11, 3, 1, 11, 0, 9, 11, 9, 9, 2, 8, 4, 9, 11, 3, 7, 11, 4, 3, 10, 9, 0, 0, 11, 2, 11, 4, 0, 8, 9, 7, 4, 0, 4, 11, 3, 9, 1, 3, 4, 3, 11, 0, 8, 9, 4, 9, 2, 2, 11, 0, 0, 9, 11, 8, 3, 4, 9, 0, 8, 11, 9, 9, 0, 2, 3, 3, 9, 9, 3, 11, 8, 9, 11, 0, 8, 4, 3, 0, 11, 1, 4, 1, 9, 10, 8, 9, 11, 4, 8, 9, 2, 3, 1, 8, 9, 4, 3, 10, 0, 8, 9, 3, 8, 2, 11, 10, 0, 9, 3, 3, 2, 11, 2, 9, 0, 4, 9, 3, 0, 1, 3, 9, 9, 9, 8, 9, 4, 1, 0, 1, 9, 3, 8

In [29]:
from sklearn.metrics import classification_report
#Imprimindo a precisão, recall e f1
print(classification_report(true_value, labels, zero_division=0))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       236
           1       0.99      1.00      0.99       137
           2       1.00      1.00      1.00       269
           3       1.00      1.00      1.00       350
           4       1.00      1.00      1.00       277
           5       0.00      0.00      0.00         3
           7       1.00      1.00      1.00        71
           8       1.00      1.00      1.00       210
           9       1.00      1.00      1.00       366
          10       1.00      1.00      1.00       101
          11       1.00      1.00      1.00       199

    accuracy                           1.00      2219
   macro avg       0.91      0.91      0.91      2219
weighted avg       1.00      1.00      1.00      2219

