This example comes from this text:

Moroney, L. (2020). AI and Machine Learning for Coders. O'Reilly Media.

It is a great resource for additioin practice etc.

## Data

There are 3 types of data we're going to run through.

1. Data from tensorflow (nice because it is easy to work with)
2. Data from a CSV (lot's of data comes to us this way)
3. Data from JSON files (also lot's of data in this format)

### Tensorflow data example

We're going to start with an example from tensorflow datasets, because it is nicely formatted. 

You can follow along in this workbook or on tensorflow using the links below.

https://colab.research.google.com/github/lmoroney/tfbook/blob/master/chapter5/imdb.ipynb

I will keep this workbook as close to the colab example as I can. 

If you need to install tensorflow_datasets:

pip install tensorflow-datasets

(you don't need to run this example... this is just for those of you that want to try it out)

In [1]:
import tensorflow as tf
from tensorflow import keras
import tensorflow_datasets as tfds
import numpy as np

ModuleNotFoundError: No module named 'tensorflow'

In [2]:
imdb_sentences = []
train_data = tfds.as_numpy(tfds.load('imdb_reviews', split="train"))
for item in train_data:
    imdb_sentences.append(str(item['text']))
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=5000)
tokenizer.fit_on_texts(imdb_sentences)
sequences = tokenizer.texts_to_sequences(imdb_sentences)
print(tokenizer.word_index)
print(sequences[123])

[753, 2, 30, 144, 1, 313, 6, 3, 319, 393, 23, 66, 86, 9, 20, 37, 1, 88, 817, 18, 11, 393, 13, 1, 88, 29, 10, 215, 10, 385, 14, 3, 184, 128, 113, 21, 3058, 16, 62, 838, 12, 1510, 8, 8, 261, 1450, 675, 8, 1241, 21, 4214, 10, 215, 11, 393, 2, 10, 13, 3598, 5, 1, 204, 94, 2, 1448, 10, 70, 207, 77, 3, 340, 4, 188, 843, 197, 2, 431, 945, 100, 2, 16, 11, 197, 651, 32, 4214, 10, 171, 70, 1764, 11, 393, 113, 1, 88, 29, 10, 215, 6, 917, 15, 72, 2, 29, 4, 62, 2652, 9, 20, 42, 36, 745, 16, 3, 332, 1019, 2, 30, 144, 1, 313, 47, 3, 332, 961, 21, 3, 990, 3841, 64, 6, 3, 410, 95, 5, 78, 22, 15, 3196, 73, 565, 22, 27, 1084, 3769, 35, 32, 1, 95, 299, 8, 457, 2881, 1076, 504, 642, 397, 534, 14, 3, 4728, 320, 35, 304, 467, 4, 39, 138, 1085, 21, 49, 276, 2, 3, 158, 889, 66, 38, 91, 188, 197, 182, 191, 3, 478, 15, 1, 247, 2, 79, 81, 78, 50, 33, 1862, 1, 4036, 24, 3111, 54, 3, 1835, 14, 1984, 2968, 37, 3, 722, 313, 228, 15, 3, 531, 3196, 21, 990, 3841, 14, 37, 1, 153, 965, 1048, 704, 3112, 22, 858, 79, 81, 7

In [3]:
from bs4 import BeautifulSoup
import string

stopwords = ["a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at",
             "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do",
             "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having",
             "he", "hed", "hes", "her", "here", "heres", "hers", "herself", "him", "himself", "his", "how",
             "hows", "i", "id", "ill", "im", "ive", "if", "in", "into", "is", "it", "its", "itself",
             "lets", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought",
             "our", "ours", "ourselves", "out", "over", "own", "same", "she", "shed", "shell", "shes", "should",
             "so", "some", "such", "than", "that", "thats", "the", "their", "theirs", "them", "themselves", "then",
             "there", "theres", "these", "they", "theyd", "theyll", "theyre", "theyve", "this", "those", "through",
             "to", "too", "under", "until", "up", "very", "was", "we", "wed", "well", "were", "weve", "were",
             "what", "whats", "when", "whens", "where", "wheres", "which", "while", "who", "whos", "whom", "why",
             "whys", "with", "would", "you", "youd", "youll", "youre", "youve", "your", "yours", "yourself",
             "yourselves"]

table = str.maketrans('', '', string.punctuation)

imdb_sentences = []
train_data = tfds.as_numpy(tfds.load('imdb_reviews', split="train"))
for item in train_data:
    sentence = str(item['text'].decode('UTF-8').lower())
    sentence = sentence.replace(",", " , ")
    sentence = sentence.replace(".", " . ")
    sentence = sentence.replace("-", " - ")
    sentence = sentence.replace("/", " / ")
    soup = BeautifulSoup(sentence)
    sentence = soup.get_text()
    words = sentence.split()
    filtered_sentence = ""
    for word in words:
        word = word.translate(table)
        if word not in stopwords:
            filtered_sentence = filtered_sentence + word + " "
    imdb_sentences.append(filtered_sentence)

tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=25000)
tokenizer.fit_on_texts(imdb_sentences)
sequences = tokenizer.texts_to_sequences(imdb_sentences)
print(tokenizer.word_index)



### CSV Example



In [2]:

import tensorflow as tf


from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

ModuleNotFoundError: No module named 'tensorflow'

In [5]:
from bs4 import BeautifulSoup
import string

stopwords = ["a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at",
             "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do",
             "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having",
             "he", "hed", "hes", "her", "here", "heres", "hers", "herself", "him", "himself", "his", "how",
             "hows", "i", "id", "ill", "im", "ive", "if", "in", "into", "is", "it", "its", "itself",
             "lets", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought",
             "our", "ours", "ourselves", "out", "over", "own", "same", "she", "shed", "shell", "shes", "should",
             "so", "some", "such", "than", "that", "thats", "the", "their", "theirs", "them", "themselves", "then",
             "there", "theres", "these", "they", "theyd", "theyll", "theyre", "theyve", "this", "those", "through",
             "to", "too", "under", "until", "up", "very", "was", "we", "wed", "well", "were", "weve", "were",
             "what", "whats", "when", "whens", "where", "wheres", "which", "while", "who", "whos", "whom", "why",
             "whys", "with", "would", "you", "youd", "youll", "youre", "youve", "your", "yours", "yourself",
             "yourselves"]

table = str.maketrans('', '', string.punctuation)

In [6]:
import csv
sentences=[]
labels=[]
with open('binary-emotion.csv', encoding='UTF-8') as csvfile:
  reader = csv.reader(csvfile, delimiter=",")
  for row in reader:
    labels.append(int(row[0]))
    sentence = row[1].lower()
    sentence = sentence.replace(",", " , ")
    sentence = sentence.replace(".", " . ")
    sentence = sentence.replace("-", " - ")
    sentence = sentence.replace("/", " / ")
    soup = BeautifulSoup(sentence)
    sentence = soup.get_text()
    words = sentence.split()
    filtered_sentence = ""
    for word in words:
        word = word.translate(table)
        if word not in stopwords:
            filtered_sentence = filtered_sentence + word + " "
    sentences.append(filtered_sentence)
    
print(len(labels))
print(len(sentences))

35327
35327


In [7]:
# fair warning, people on twitter use bad words
sentences[0:5]

['know listenin bad habit earlier started freakin part  ',
 'layin n bed headache ughhhh waitin call ',
 'wants hang friends soon ',
 'want trade someone houston tickets no one will ',
 're  pinging didnt go prom bc bf didnt like friends ']

In [8]:
labels[0:5]

[0, 0, 1, 1, 0]

In [9]:
training_size = 28000

training_sentences = sentences[0:training_size]
testing_sentences = sentences[training_size:]
training_labels = labels[0:training_size]
testing_labels = labels[training_size:]

In [10]:
vocab_size = 20000
embedding_dim = 32
max_length = 10
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)

word_index = tokenizer.word_index

training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [11]:
print(training_sequences[0])
print(training_padded[0])

[18, 3257, 47, 4770, 613, 508, 951, 423]
[  18 3257   47 4770  613  508  951  423    0    0]


In [12]:
print(word_index)



### JSON (JavaScript Object Notation)

Some of you might know this already, for those who don't and for a super high level overview of this:

In what should be an underwhelming statement given its origin: The JSON format is very similar to JavaScript objects. JSON values can be numbers, strings, objects, arrays Booleans, or null. Javascript values can be any valid javascript type. valid JavaScript Structure.

https://www.w3schools.com/js/js_json_datatypes.asp that website explains it in some additional detail. 

A JSON pair might look something like:

[
{"name" : "Joel",

 "from" : "Canada",
 
 "likes" : "Disney Movies"}
]

In [13]:
import tensorflow as tf


from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [14]:
from bs4 import BeautifulSoup
import string

stopwords = ["a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at",
             "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do",
             "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having",
             "he", "hed", "hes", "her", "here", "heres", "hers", "herself", "him", "himself", "his", "how",
             "hows", "i", "id", "ill", "im", "ive", "if", "in", "into", "is", "it", "its", "itself",
             "lets", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought",
             "our", "ours", "ourselves", "out", "over", "own", "same", "she", "shed", "shell", "shes", "should",
             "so", "some", "such", "than", "that", "thats", "the", "their", "theirs", "them", "themselves", "then",
             "there", "theres", "these", "they", "theyd", "theyll", "theyre", "theyve", "this", "those", "through",
             "to", "too", "under", "until", "up", "very", "was", "we", "wed", "well", "were", "weve", "were",
             "what", "whats", "when", "whens", "where", "wheres", "which", "while", "who", "whos", "whom", "why",
             "whys", "with", "would", "you", "youd", "youll", "youre", "youve", "your", "yours", "yourself",
             "yourselves"]

table = str.maketrans('', '', string.punctuation)

In [15]:
import json
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

with open("sarcasm.json", 'r') as f:
    datastore = json.load(f)


sentences = [] 
labels = []
urls = []
for item in datastore:
    sentence = item['headline'].lower()
    sentence = sentence.replace(",", " , ")
    sentence = sentence.replace(".", " . ")
    sentence = sentence.replace("-", " - ")
    sentence = sentence.replace("/", " / ")
    soup = BeautifulSoup(sentence)
    sentence = soup.get_text()
    words = sentence.split()
    filtered_sentence = ""
    for word in words:
        word = word.translate(table)
        if word not in stopwords:
            filtered_sentence = filtered_sentence + word + " "
    sentences.append(filtered_sentence)
    labels.append(item['is_sarcastic'])
    urls.append(item['article_link'])

In [16]:
print(len(sentences))

26709


In [17]:
sentences[120:127]

['paul newman dies consuming 51 hard  boiled eggs ',
 'yak chews thoughtfully ',
 'man worried drug dealer not picking phone ',
 'dad recommends hotel 10 miles away city visiting ',
 'explosion fells building outside paris  killing least 2 ',
 'source donald trumps military expertise finally revealed ',
 'union claims sanders campaign staffers posed members influence workers ']

In [18]:
labels[120:127]

[1, 1, 1, 1, 0, 0, 0]

In [19]:
training_size = 23000

training_sentences = sentences[0:training_size]
testing_sentences = sentences[training_size:]
training_labels = labels[0:training_size]
testing_labels = labels[training_size:]

In [20]:
vocab_size = 20000
max_length = 10
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"

tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(training_sentences)

word_index = tokenizer.word_index

training_sequences = tokenizer.texts_to_sequences(training_sentences)
padded = pad_sequences(training_sequences, padding='post')
print(word_index)

