In [0]:
# Written by Michael Naples 4/10/18

In [0]:
#Trained on a Tesla K80 (2496 CUDA cores @ 3.7GHz - 12GB) - Thanks Google
!nvidia-smi

Wed Apr 10 20:36:49 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.56       Driver Version: 410.79       CUDA Version: 10.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P8    30W / 149W |      0MiB / 11441MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [0]:
import matplotlib.pyplot as plt
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, CuDNNLSTM, Embedding, Bidirectional, Dropout
from keras.optimizers import RMSprop, Adam
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
import pickle

In [0]:
import sys
import keras
import tensorflow as tf
print(sys.version)
print()
print(tf.VERSION)
print()
print(keras.__version__)

3.6.7 (default, Oct 22 2018, 11:32:17) 
[GCC 8.2.0]

1.13.1

2.2.4


In [0]:
!unzip /content/twitter_embs.zip

Archive:  /content/twitter_embs.zip
  inflating: twitter_embs.txt        


In [0]:
with open('training.pkl', 'rb') as handle:
    training = pickle.load(handle)
with open('labels.pkl', 'rb') as handle:
    labels = pickle.load(handle)
with open('twitter_embs.txt', 'r') as handle:
    embeddings = handle.readlines()

In [0]:
training = np.asarray(training)
labels = np.asarray(labels)

In [0]:
words = {}  # Dictionary of all unique words from every tweet
for sent in training:
  for word in sent:
    if word not in words:
      words[word] = 0
    else:
      words[word] += 1

In [0]:
embs = {}  # Mapping from each word to its embedding
for i, line in enumerate(embeddings):
    split = line.split()
    word = split[0]
    if word in words or word == '<unknown>':
        embedding = np.array([float(val) for val in split[1:]])
        embs[word] = embedding
del embeddings

In [0]:
missing_words = {}  # Dictionary of every word without an embedding, use the <unknown> token fo these.
words['<unknown>'] = 0
for word in words.keys():
  if word not in embs:
    missing_words[word] = 0
print(len(missing_words))
print(len(words))
print(len(embs))

409763
503776
94013


In [0]:
embedding_matrix = []  # Embedding matrix the network uses
int_to_word = []       # Index to word
word_to_int = {}       # word to index
i = 0
for word, emb in embs.items():
    embedding_matrix.append(emb)
    int_to_word.append(word)
    word_to_int[word] = i
    i += 1
    
#embedding_matrix.append(np.zeros(200))
embedding_matrix = np.asarray(embedding_matrix)

In [0]:
ndx = word_to_int['cool']  # Sanity check
print(ndx)
print(int_to_word[ndx])
print(np.array_equal(embs['cool'], embedding_matrix[ndx]))
print(embedding_matrix.shape) # (number of words, embedding size) 

488
cool
True
(94013, 200)


In [0]:
for i in range(training.shape[0]):  # Im pretty sure this is pointless
    string = ''
    for j in range(len(training[i])):
        string += training[i][j] + ' '
    training[i] = string

In [0]:
test_data = training[training.shape[0]-50000:]  # Chop off 50,000 for testing
test_labels = labels[labels.shape[0]-50000:]
train_data = training[:training.shape[0]-50000]
train_labels = labels[:labels.shape[0]-50000]

num_words = len(embs)

In [0]:
def tokenize_strings(data, words_found=None, words_missing=None):  # Convert each word to its index into the embedding matrix (tokens)
  data_tokens = []
  for i in range(data.shape[0]):  
      data_tokens.append([])
      for word in data[i].split():
          if word in embs:
              data_tokens[i].append(word_to_int[word])
              if words_found is not None:
                words_found += 1
          else:
              data_tokens[i].append(word_to_int['<unknown>'])  # Use the <unknown> token for words without an embedding
              if words_missing is not None:
                words_missing += 1
  if words_found is not None and words_missing is not None:
    return data_tokens, words_found, words_missing
  return data_tokens

In [0]:
num_words_found = 0
num_words_missing = 0

train_data_tokens, num_words_found, num_words_missing = tokenize_strings(train_data, num_words_found, num_words_missing)
test_data_tokens, num_words_found, num_words_missing = tokenize_strings(test_data, num_words_found, num_words_missing)

In [0]:
print("Number of words embedding found: %d" % num_words_found)
print("Number of words embedding missing: %d" % num_words_missing)
print('Percent of unknown words: {:.2%}'.format(num_words_missing / num_words_found))

Number of words embedding found: 16453626
Number of words embedding missing: 154764
Percent of unknown words: 0.94%


In [0]:
print(train_data_tokens[0])
print(test_data_tokens[0])

[0, 7, 23, 1275, 45, 11587, 26, 50, 286, 13, 13, 1387, 771, 8]
[26, 1258, 69052, 603, 1, 1, 201, 928, 1171, 3570, 7]


In [0]:
def tokens_to_string(tokens):  # Convert tokens back into their sting value
    words = [int_to_word[token] for token in tokens if token != 0]
    text = " ".join(words)
    return text
print(train_data_tokens[0])
print(tokens_to_string(train_data_tokens[0]))

[0, 7, 23, 1275, 45, 11587, 26, 50, 286, 13, 13, 1387, 771, 8]
<url> - aww so toro is your baby ? ? soo sweet !


In [0]:
num_tokens = [len(tokens) for tokens in train_data_tokens + test_data_tokens]
num_tokens = np.asarray(num_tokens)
max_tokens = np.max(num_tokens)  # Find the longest tweet

In [0]:
max_tokens += 25

In [0]:
# Pad each example with 0's so they all match the length of the longest tweet
# In TensorFlow all sequences must be the same length
pad = 'pre'
train_data_pad = pad_sequences(train_data_tokens, maxlen=max_tokens,
                              padding=pad, truncating=pad)
test_data_pad = pad_sequences(test_data_tokens, maxlen=max_tokens,
                             padding=pad, truncating=pad)

In [0]:
print(train_data_pad.shape)

(949985, 144)


In [0]:
model = Sequential()  # Create the network's computation graph, Bidirectional-LSTM-RNN
model.add(Embedding(input_dim=embedding_matrix.shape[0],
                   output_dim=embedding_matrix.shape[1],
                   input_length=max_tokens,
                   weights=[embedding_matrix],
                   trainable=False,
                   name='embedding_layer'))
model.add(Bidirectional(CuDNNLSTM(64, return_sequences=True)))
model.add(Dropout(0.2))
model.add(Bidirectional(CuDNNLSTM(32, return_sequences=True)))
model.add(Dropout(0.15))
model.add(Bidirectional(CuDNNLSTM(16)))
model.add(Dropout(0.1))
#model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

optimizer = Adam(lr=1e-3)

model.compile(loss='binary_crossentropy',   # Compile the network
             optimizer=optimizer,
             metrics=['accuracy'])

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [0]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_layer (Embedding)  (None, 144, 200)          18802600  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 144, 128)          136192    
_________________________________________________________________
dropout_1 (Dropout)          (None, 144, 128)          0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 144, 64)           41472     
_________________________________________________________________
dropout_2 (Dropout)          (None, 144, 64)           0         
_________________________________________________________________
bidirectional_3 (Bidirection (None, 32)                10496     
_________________________________________________________________
dropout_3 (Dropout)          (None, 32)                0         
__________

In [0]:
# Train the model, about 50,000 examples are set aside for validation
#%%time   
model.fit(train_data_pad, train_labels,
         validation_split=0.05, epochs=10, batch_size=1024)

Instructions for updating:
Use tf.cast instead.
Train on 902485 samples, validate on 47500 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f59f2e941d0>

In [0]:
# Evaluate model accuracy on test data
result = model.evaluate(test_data_pad, test_labels)
print("accuracy: {0:.2%}".format(result[1]))

accuracy: 84.02%


In [0]:
# Test on individual tweets
tweet = "so bored at work <number> and a half days til wichita falls"
token_tweet = tokenize_strings(np.asarray([tweet]))
tweet_pad = pad_sequences(token_tweet, maxlen=max_tokens,
                              padding=pad, truncating=pad)
result = model.evaluate(tweet_pad, [0])
if result[1] == 1:
  print('negative')
else:
  print('positive')

negative


In [0]:
model_json = model.to_json()  # Save model to disk
with open("model8402.json", "w") as json_file:
    json_file.write(model_json)
    
model.save_weights("model8402.h5")
print("Saved model to disk")

Saved model to disk


In [0]:
from keras.models import model_from_json  # Load model from files
json_file = open('model8402.json', 'r')
model_json = json_file.read()
json_file.close()

loaded_model = model_from_json(model_json)
loaded_model.load_weights("model8402.h5")
print("Loaded model from disk")

optimizer = Adam(lr=1e-3)
loaded_model.compile(loss='binary_crossentropy',
             optimizer=optimizer,
             metrics=['accuracy'])

Loaded model from disk


In [0]:
with open('test1.pkl', 'rb') as handle:
    test1 = pickle.load(handle)
with open('labels1.pkl', 'rb') as handle:
    labels1 = pickle.load(handle)

In [0]:
test1 = np.asarray(test1)
for i in range(test1.shape[0]):  # Im pretty sure this is pointless
    string = ''
    for j in range(len(test1[i])):
        string += test1[i][j] + ' '
    test1[i] = string

In [0]:
test1_tokens = tokenize_strings(test1)

In [0]:
test1_data_pad = pad_sequences(test1_tokens, maxlen=max_tokens,
                              padding=pad, truncating=pad)

In [0]:
results = loaded_model.predict(test1_data_pad)

In [0]:
with open('test1_out.txt', 'w+') as f:
  for i in range(len(results)):
    if results[i] > .5:
      f.write(labels1[i] + '\t\t' + '+' + '\n')
    else:
      f.write(labels1[i] + '\t\t' + '-' + '\n')