In [28]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from google.colab import drive
# Mount Google Drive so the dataset file is reachable from the Colab runtime.
drive.mount('/content/drive')
path = '/content/drive/MyDrive/sarcasm.json'

# Tiny toy corpus used to demonstrate how the tokenizer assigns indices.
sentences = ['Eu amo meu cachorro', 'Eu amo meu gato', 'Este é o Card vinte e dois']

# num_words caps the vocabulary at the most frequent words when texts are later
# converted to sequences; oov_token is the placeholder entry for unknown words.
tokenizer = Tokenizer(num_words=100, oov_token='(OOV)')
tokenizer.fit_on_texts(sentences)

# word -> integer index mapping learned from the corpus.
word_index = tokenizer.word_index
print(word_index)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
{'(OOV)': 1, 'eu': 2, 'amo': 3, 'meu': 4, 'cachorro': 5, 'gato': 6, 'este': 7, 'é': 8, 'o': 9, 'card': 10, 'vinte': 11, 'e': 12, 'dois': 13}


In [10]:
# Turn every sentence into a list of the integer tokens of its words
# (one inner list per sentence).
sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)

[[2, 3, 4, 5], [2, 3, 4, 6], [7, 8, 9, 10, 11, 12, 13]]


In [11]:
# Normalize the sequences to a common length; padding='post' appends the
# filler zeros after the real tokens of the shorter sequences.
padded = pad_sequences(sequences, padding='post')
print(padded)

[[ 2  3  4  5  0  0  0]
 [ 2  3  4  6  0  0  0]
 [ 7  8  9 10 11 12 13]]


In [12]:
import json



In [29]:
# Load the sarcasm dataset from the JSON file.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale (JSON text is expected to be UTF-8).
# No placeholder value is assigned beforehand: if the file cannot be read the
# cell fails loudly instead of leaving an empty list for the cells below.
with open(path, encoding='utf-8') as f:
    datastore = json.load(f)

In [30]:
# Show the first five records to check the structure of the loaded data.
datastore[:5]

[{'article_link': 'https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5',
  'headline': "former versace store clerk sues over secret 'black code' for minority shoppers",
  'is_sarcastic': 0},
 {'article_link': 'https://www.huffingtonpost.com/entry/roseanne-revival-review_us_5ab3a497e4b054d118e04365',
  'headline': "the 'roseanne' revival catches up to our thorny political mood, for better and worse",
  'is_sarcastic': 0},
 {'article_link': 'https://local.theonion.com/mom-starting-to-fear-son-s-web-series-closest-thing-she-1819576697',
  'headline': "mom starting to fear son's web series closest thing she will have to grandchild",
  'is_sarcastic': 1},
 {'article_link': 'https://politics.theonion.com/boehner-just-wants-wife-to-listen-not-come-up-with-alt-1819574302',
  'headline': 'boehner just wants wife to listen, not come up with alternative debt-reduction ideas',
  'is_sarcastic': 1},
 {'article_link': 'https://www.huffingtonpost.com/entry/jk-rowling-w

In [34]:
# Split every record of the dataset into three parallel lists:
# the headline text, its sarcasm label, and the source article link.
sentences = [record['headline'] for record in datastore]
labels = [record['is_sarcastic'] for record in datastore]
urls = [record['article_link'] for record in datastore]

In [35]:
# Show the first five extracted headlines.
sentences[:5]

["former versace store clerk sues over secret 'black code' for minority shoppers",
 "the 'roseanne' revival catches up to our thorny political mood, for better and worse",
 "mom starting to fear son's web series closest thing she will have to grandchild",
 'boehner just wants wife to listen, not come up with alternative debt-reduction ideas',
 'j.k. rowling wishes snape happy birthday in the most magical way']

In [36]:
# Settings shared by the preprocessing, model and training cells below.

# Tokenizer / embedding
vocab_size = 10_000      # num_words for the tokenizer and input size of the embedding
embedding_dim = 16       # size of each embedding vector
oov_tok = "<OOV>"        # placeholder token for out-of-vocabulary words

# Sequence shaping: every sequence is padded/truncated to max_length,
# with both operations applied at the end of the sequence.
max_length = 100
trunc_type = padding_type = 'post'

# Number of leading examples used for training; the rest is used for testing.
training_size = 20_000

In [37]:
# The first `training_size` examples form the training split;
# everything after that index forms the testing split.
training_sentences, testing_sentences = sentences[:training_size], sentences[training_size:]
training_labels, testing_labels = labels[:training_size], labels[training_size:]

In [38]:
# Repeat the tokenize -> sequence -> pad steps on the dataset loaded from the
# JSON file, keeping the training and testing splits separate.
# The vocabulary is fitted on the training sentences only.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

# Both splits are padded/truncated with exactly the same options.
_pad_options = dict(maxlen=max_length, padding=padding_type, truncating=trunc_type)

training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences, **_pad_options)

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, **_pad_options)

In [39]:
import numpy as np
# Convert features and labels of both splits to NumPy arrays before training.
training_padded, training_labels = np.array(training_padded), np.array(training_labels)
testing_padded, testing_labels = np.array(testing_padded), np.array(testing_labels)

In [40]:
# Binary sarcasm classifier:
#   Input                  -> fixed-length sequences of max_length token ids
#   Embedding              -> one embedding_dim-sized vector per token id
#   GlobalAveragePooling1D -> pools the token vectors over the sequence axis
#   Dense(24, relu)        -> hidden layer with 24 units
#   Dense(1, sigmoid)      -> single output between 0 and 1, read as the
#                             probability that the sequence is sarcastic
# The input shape is declared with an explicit Input layer instead of the
# Embedding `input_length` argument, which newer Keras releases deprecate.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(max_length,)),
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])



In [41]:
# Train for a fixed number of epochs, reporting metrics on the testing split
# after each epoch; verbose=2 prints one summary line per epoch.
num_epochs = 30
history = model.fit(
    training_padded,
    training_labels,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_labels),
    verbose=2,
)

Epoch 1/30
625/625 - 5s - 8ms/step - accuracy: 0.5644 - loss: 0.6805 - val_accuracy: 0.7442 - val_loss: 0.6633
Epoch 2/30
625/625 - 3s - 5ms/step - accuracy: 0.7269 - loss: 0.5538 - val_accuracy: 0.8043 - val_loss: 0.4562
Epoch 3/30
625/625 - 2s - 3ms/step - accuracy: 0.8188 - loss: 0.4098 - val_accuracy: 0.8296 - val_loss: 0.3989
Epoch 4/30
625/625 - 2s - 3ms/step - accuracy: 0.8481 - loss: 0.3554 - val_accuracy: 0.8213 - val_loss: 0.3902
Epoch 5/30
625/625 - 3s - 4ms/step - accuracy: 0.8694 - loss: 0.3154 - val_accuracy: 0.8202 - val_loss: 0.3875
Epoch 6/30
625/625 - 3s - 4ms/step - accuracy: 0.8820 - loss: 0.2875 - val_accuracy: 0.8249 - val_loss: 0.3824
Epoch 7/30
625/625 - 4s - 7ms/step - accuracy: 0.8923 - loss: 0.2615 - val_accuracy: 0.8091 - val_loss: 0.4158
Epoch 8/30
625/625 - 3s - 4ms/step - accuracy: 0.9006 - loss: 0.2446 - val_accuracy: 0.8538 - val_loss: 0.3426
Epoch 9/30
625/625 - 3s - 4ms/step - accuracy: 0.9118 - loss: 0.2202 - val_accuracy: 0.8429 - val_loss: 0.3590
E

In [52]:
# Two new headlines to score with the trained model.
sentence = [
    'granny starting to fear spiders in the garden might be real',
    'the weather today is bright and sunny',
]

# Encode them with the already-fitted tokenizer and pad them with the same
# settings used for the training data, then print one score per headline.
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(
    sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)
print(model.predict(padded))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[[9.9273854e-01]
 [9.6404913e-04]]
