## Sentiment analysis


In [29]:
import os
import pickle
import numpy as np
import pandas as pd
from keras.models import load_model

DATA_PATH = "../data/raw"


In [14]:
with open(os.path.join(DATA_PATH, "sentences.txt")) as sentences:
    sentences = sentences.readlines()

model = load_model(os.path.join(DATA_PATH, "sent_analysis.h5"))
X_test = np.load(os.path.join(DATA_PATH, "sent_analysis/x_test.npy"))
y_test = np.load(os.path.join(DATA_PATH, "sent_analysis/y_test.npy"))


In [17]:
# Inspect the first sentence on `X_test`
print(X_test[0])

# Get the predicion for all the sentences
pred = model.predict(X_test)

# Transform the predition into positive (> 0.5) or negative (<= 0.5)
pred_sentiment = ["positive" if x > 0.5 else "negative" for x in pred]

# Create a data frame with sentences, predictions and true values
result = pd.DataFrame(
    {"sentence": sentences, "y_pred": pred_sentiment, "y_true": y_test}
)

# Print the first lines of the data frame
print(result.head())


[[   2]
 [   5]
 [  68]
 [   2]
 [1315]
 [   9]
 [  12]
 [  32]
 [  43]
 [  44]
 [ 397]
 [2128]
 [  13]
 [ 963]
 [4637]
 [  39]
 [  68]
 [   2]
 [ 332]
 [   2]
 [  39]
 [  68]
 [   2]
 [   2]
 [  14]
 [ 418]
 [   7]
 [ 595]
 [   2]
 [   4]
 [ 130]
 [   7]
 [   6]
 [3592]
 [   7]
 [  52]
 [   5]
 [  87]
 [ 102]
 [   2]
 [  93]
 [  11]
 [   4]
 [ 402]
 [   5]
 [1696]
 [4773]
 [ 141]
 [  17]
 [   2]
 [ 251]
 [1605]
 [ 653]
 [1168]
 [ 912]
 [3295]
 [   2]
 [  51]
 [  44]
 [2046]
 [   5]
 [1659]
 [2137]
 [1033]
 [2002]
 [  69]
 [   4]
 [   2]
 [1742]
 [   7]
 [ 319]
 [  90]
 [  11]
 [1244]
 [   2]
 [ 141]
 [  17]
 [3420]
 [ 416]
 [  11]
 [3853]
 [  25]
 [  43]
 [ 191]
 [  79]
 [ 245]
 [  39]
 [ 134]
 [3282]
 [1020]
 [   5]
 [1912]
 [   2]
 [1661]
 [ 148]
 [ 107]
 [  10]
 [  10]
 [  31]
 [ 232]
 [2209]
 [ 163]
 [  17]
 [2304]
 [ 150]
 [ 198]
 [   4]
 [ 243]
 [   7]
 [ 311]
 [  14]
 [  20]
 [  16]
 [1383]
 [  18]
 [   2]
 [   2]
 [   5]
 [1472]
 [   2]]
                                       

## Language models


In [18]:
sheldon_quotes = [
    "You're afraid of insects and women, Ladybugs must render you catatonic.",
    "Scissors cuts paper, paper covers rock, rock crushes lizard, lizard poisons Spock, Spock smashes scissors, scissors decapitates lizard, lizard eats paper, paper disproves Spock, Spock vaporizes rock, and as it always has, rock crushes scissors.",
    "For example, I cry because others are stupid, and that makes me sad.",
    "I'm not insane, my mother had me tested.",
    "Two days later, Penny moved in and so much blood rushed to your genitals, your brain became a ghost town.",
    "Amy's birthday present will be my genitals.",
    "(3 knocks) Penny! (3 knocks) Penny! (3 knocks) Penny!",
    "Thankfully all the things my girlfriend used to do can be taken care of with my right hand.",
    "I would have been here sooner but the bus kept stopping for other people to get on it.",
    "Oh gravity, thou art a heartless bitch.",
    "I am aware of the way humans usually reproduce which is messy, unsanitary and based on living next to you for three years, involves loud and unnecessary appeals to a deity.",
    "Well, today we tried masturbating for money.",
    "I think that you have as much of a chance of having a sexual relationship with Penny as the Hubble telescope does of discovering at the center of every black hole is a little man with a flashlight searching for a circuit breaker.",
    "Well, well, well, if it isn't Wil Wheaton! The Green Goblin to my Spider-Man, the Pope Paul V to my Galileo, the Internet Explorer to my Firefox.",
    "What computer do you have? And please don't say a white one.",
    "She calls me moon-pie because I'm nummy-nummy and she could just eat me up.",
    "Ah, memory impairment; the free prize at the bottom of every vodka bottle.",
]


In [19]:
# Transform the list of sentences into a list of words
all_words = " ".join(sheldon_quotes).split(" ")

# Get number of unique words
unique_words = list(set(all_words))

# Dictionary of indexes as keys and words as values
index_to_word = {i: wd for i, wd in enumerate(sorted(unique_words))}

print(index_to_word)

# Dictionary of words as keys and indexes as values
word_to_index = {wd: i for i, wd in enumerate(sorted(unique_words))}

print(word_to_index)


{0: '(3', 1: 'Ah,', 2: "Amy's", 3: 'And', 4: 'Explorer', 5: 'Firefox.', 6: 'For', 7: 'Galileo,', 8: 'Goblin', 9: 'Green', 10: 'Hubble', 11: 'I', 12: "I'm", 13: 'Internet', 14: 'Ladybugs', 15: 'Oh', 16: 'Paul', 17: 'Penny', 18: 'Penny!', 19: 'Pope', 20: 'Scissors', 21: 'She', 22: 'Spider-Man,', 23: 'Spock', 24: 'Spock,', 25: 'Thankfully', 26: 'The', 27: 'Two', 28: 'V', 29: 'Well,', 30: 'What', 31: 'Wheaton!', 32: 'Wil', 33: "You're", 34: 'a', 35: 'afraid', 36: 'all', 37: 'always', 38: 'am', 39: 'and', 40: 'appeals', 41: 'are', 42: 'art', 43: 'as', 44: 'at', 45: 'aware', 46: 'based', 47: 'be', 48: 'became', 49: 'because', 50: 'been', 51: 'birthday', 52: 'bitch.', 53: 'black', 54: 'blood', 55: 'bottle.', 56: 'bottom', 57: 'brain', 58: 'breaker.', 59: 'bus', 60: 'but', 61: 'calls', 62: 'can', 63: 'care', 64: 'catatonic.', 65: 'center', 66: 'chance', 67: 'circuit', 68: 'computer', 69: 'could', 70: 'covers', 71: 'crushes', 72: 'cry', 73: 'cuts', 74: 'days', 75: 'decapitates', 76: 'deity.', 7

In [20]:
def print_examples(sentences, next_chars, n=10):
    """Function to print examples of (data,label)

    This function loops over the sentences and prints the pair of data and label,
    corresponding to the sentence and next char.
    This way, the student can check how the data was transformed.

    Args:
        sentences (list): the prepared data
        next_chars (string): the label containing the next char of the sentences
        n (int): the number of examples to print

    Returns:
        nothing

    """
    result = "Sentence\tNext char\n"
    n_i = 1
    for sent, char in zip(sentences, next_chars):
        if n_i >= n:
            break
        result = result + sent + "\t" + char + "\n"
        n_i += 1
    print(result)


In [25]:
sheldon_quotes = "".join(sheldon_quotes)


In [26]:
# Create lists to keep the sentences and the next character
sentences = []  # ~ Training data
next_chars = []  # ~ Training labels

# Define hyperparameters
step = 2  # ~ Step to take when reading the texts in characters
chars_window = 10  # ~ Number of characters to use to predict the next one

# Loop over the text: length `chars_window` per time with step equal to `step`
for i in range(0, len(sheldon_quotes) - chars_window, step):
    sentences.append(sheldon_quotes[i : i + chars_window])
    next_chars.append(sheldon_quotes[i + chars_window])

# Print 10 pairs
print_examples(sentences, next_chars, 10)


Sentence	Next char
You're afr	a
u're afrai	d
re afraid 	o
 afraid of	 
fraid of i	n
aid of ins	e
d of insec	t
of insects	 
 insects a	n



In [42]:
new_text = [
    "A man either lives life as it happens to him meets it head-on and licks it or he turns his back on it and starts to wither away",
    "To the brave crew and passengers of the Kobayshi Maru sucks to be you",
    "Beware of more powerful weapons They often inflict as much damage to your soul as they do to you enemies",
    "They are merely scars not mortal wounds and you must use them to propel you forward",
    "You cannot explain away a wantonly immoral act because you think that it is connected to some higher purpose",
]

index_to_word = pickle.load(open(os.path.join(DATA_PATH, "index_to_word.pkl"), "rb"))
word_to_index = pickle.load(open(os.path.join(DATA_PATH, "word_to_index.pkl"), "rb"))
vocabulary = pd.read_csv(
    os.path.join(DATA_PATH, "vocabulary.csv"), header=None
).values.tolist()


In [43]:
# Loop through the sentences and get indexes
new_text_split = []
for sentence in new_text:
    sent_split = []
    for wd in sentence.split(' '):
        index = word_to_index.get(wd, 0)
        sent_split.append(index)
    new_text_split.append(sent_split)

# Print the first sentence's indexes
print(new_text_split[0])

# Print the sentence converted using the dictionary
print(' '.join([index_to_word[index] for index in new_text_split[0]]))

[276, 15070, 10160, 14750, 14590, 5715, 13813, 12418, 22564, 12797, 15443, 13813, 0, 5368, 14578, 13813, 16947, 12507, 23031, 12859, 5975, 16795, 13813, 5368, 21189, 22564, 0, 5910]
A man either lives life as it happens to him meets it <UKN/> and licks it or he turns his back on it and starts to <UKN/> away


## RNNs in Keras


In [46]:
from keras.models import Model, Sequential
from keras.layers import Dense, LSTM, Input


In [47]:
# Instantiate the class
model = Sequential(name="sequential_model")

# One LSTM layer (defining the input shape because it is the
# initial layer)
model.add(LSTM(128, input_shape=(None, 10), name="LSTM"))

# Add a dense layer with one unit
model.add(Dense(1, activation="sigmoid", name="output"))

# The summary shows the layers and the number of parameters
# that will be trained
model.summary()


Model: "sequential_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 LSTM (LSTM)                 (None, 128)               71168     
                                                                 
 output (Dense)              (None, 1)                 129       
                                                                 
Total params: 71,297
Trainable params: 71,297
Non-trainable params: 0
_________________________________________________________________


In [48]:
# Define the input layer
main_input = Input(shape=(None, 10), name="input")

# One LSTM layer (input shape is already defined)
lstm_layer = LSTM(128, name="LSTM")(main_input)

# Add a dense layer with one unit
main_output = Dense(1, activation="sigmoid", name="output")(lstm_layer)

# Instantiate the class at the end
model = Model(inputs=main_input, outputs=main_output, name="modelclass_model")

# Same amount of parameters to train as before (71,297)
model.summary()


Model: "modelclass_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input (InputLayer)          [(None, None, 10)]        0         
                                                                 
 LSTM (LSTM)                 (None, 128)               71168     
                                                                 
 output (Dense)              (None, 1)                 129       
                                                                 
Total params: 71,297
Trainable params: 71,297
Non-trainable params: 0
_________________________________________________________________


In [53]:
texts = np.load(os.path.join(DATA_PATH, "texts.npy"))


In [54]:
# Import relevant classes/functions
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Build the dictionary of indexes
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

# Change texts into sequence of indexes
texts_numeric = tokenizer.texts_to_sequences(texts)
print(
    "Number of words in the sample texts: ({0}, {1})".format(
        len(texts_numeric[0]), len(texts_numeric[1])
    )
)

# Pad the sequences
texts_pad = pad_sequences(texts_numeric, 60)
print(
    "Now the texts have fixed length: 60. Let's see the first one: \n{0}".format(
        texts_pad[0]
    )
)


Number of words in the sample texts: (54, 78)
Now the texts have fixed length: 60. Let's see the first one: 
[ 0  0  0  0  0  0 24  4  1 25 13 26  5  1 14  3 27  6 28  2  7 29 30 13
 15  2  8 16 17  5 18  6  4  9 31  2  8 32  4  9 15 33  9 34 35 14 36 37
  2 38 39 40  2  8 16 41 42  5 18  6]


In [61]:
from keras.layers import SimpleRNN

x_test = np.load(os.path.join(DATA_PATH, "rnn/x_test.npy"))
y_test = np.load(os.path.join(DATA_PATH, "rnn/y_test.npy"))


In [63]:
# Build model
model = Sequential()
model.add(SimpleRNN(units=128, input_shape=(None, 1)))
model.add(Dense(1, activation="sigmoid"))
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Load pre-trained weights
model.load_weights(os.path.join(DATA_PATH, "model_weights.h5"))

# Method '.evaluate()' shows the loss and accuracy
loss, acc = model.evaluate(x_test, y_test, verbose=0)
print("Loss: {0} \nAccuracy: {1}".format(loss, acc))


Loss: 0.6991181373596191 
Accuracy: 0.4950000047683716
