In [1]:
import numpy as np
import sys
import pickle
from tqdm import tqdm
from tensorflow.keras import layers
from keras.preprocessing.sequence import pad_sequences
import tensorflow as tf

Using TensorFlow backend.


In [2]:
# define some helper functions

def get_feature_vector(tweet):
    """
    Map a tweet onto the list of vocabulary indices of its known words.

    The tweet is split on whitespace and every token is looked up in the
    module-level ``vocab`` mapping. Tokens without an entry are dropped;
    the order of the remaining tokens is preserved.
    """
    looked_up = (vocab.get(token) for token in tweet.split())
    return [index for index in looked_up if index is not None]

def process_tweets(csv_file):
    """
    Read the processed tweet CSV and turn every row into a feature vector.

    Each non-blank line is expected to look like ``tweet_id,label,tweet``.
    Only the first two commas are treated as separators, so a tweet whose
    text itself contains commas is kept whole instead of failing to unpack.
    Blank lines are skipped. The label value 4 is remapped to 1; any other
    label is kept as parsed.

    Args:
        csv_file: path of the CSV file to read.

    Returns:
        (tweets, labels): ``tweets`` is a list holding one feature vector
        (as produced by ``get_feature_vector``) per row, and ``labels`` is
        a numpy array of the integer labels in the same order.

    Raises:
        ValueError: if a non-blank line has fewer than three fields or its
            label field is not an integer.
    """
    tweets = []
    labels = []
    print('Generating feature vectors...')
    with open(csv_file, 'r') as tweet_file:
        lines = tweet_file.readlines()
    # tqdm can read the length of the list, so the progress bar keeps its total.
    for line in tqdm(lines):
        if not line.strip():
            continue
        # maxsplit=2: everything after the second comma belongs to the tweet text.
        _tweet_id, label, tweet = line.split(',', 2)
        tweets.append(get_feature_vector(tweet))
        label = int(label)
        labels.append(1 if label == 4 else label)
    return tweets, np.array(labels)

def top_n_words(pkl_file_name, N, shift=0):
    """
    Build a word -> rank mapping for the N most frequent unigrams.

    The pickle file must hold a frequency distribution that offers
    ``most_common``. The most frequent word gets rank ``shift`` and every
    following word gets the next rank.
    """
    # NOTE: pickle.load can execute arbitrary code; only open trusted files.
    with open(pkl_file_name, 'rb') as handle:
        distribution = pickle.load(handle)
    ranking = {}
    for position, entry in enumerate(distribution.most_common(N)):
        ranking[entry[0]] = position + shift
    return ranking

def get_glove_vectors(vocab, glove_path=None):
    """
    Collect the GloVe vector of every vocabulary word found in the GloVe file.

    Args:
        vocab: mapping whose keys are the words to look up. A word counts
            as present when its value is not None, so a word mapped to
            index 0 is picked up as well.
        glove_path: optional path of the GloVe text file. When omitted, the
            module-level ``glove_twitter_vectors`` path is used.

    Returns:
        dict mapping each vocabulary word present in the file to a numpy
        array built from the numbers that follow the word on its line.
    """
    if glove_path is None:
        glove_path = glove_twitter_vectors
    print('\nLooking for GLOVE vectors...')
    glove_vectors = {}
    found = 0
    with open(glove_path, 'r', encoding="utf8") as glove_file:
        for line in glove_file:
            tokens = line.split()
            if not tokens:
                # Blank line: there is no word to look up.
                continue
            word = tokens[0]
            # Compare against None rather than relying on truthiness:
            # index 0 is a valid vocabulary entry when shift=0.
            if vocab.get(word) is not None:
                vector = [float(e) for e in tokens[1:]]
                glove_vectors[word] = np.array(vector)
                found += 1
    print('Found %d words in GLOVE' % found)
    return glove_vectors

In [3]:
# Prepare training & testing data, as well as some parameters.

# Assign some paths (absolute paths on the original machine; adjust when running elsewhere)
processed_csv = '/home/mingj/project/tweet_training_data/large/sentiment140-processed.csv'
unigrams_file = '/home/mingj/project/tweet_training_data/large/sentiment140-freqdist.pkl'
bigrams_file = '/home/mingj/project/tweet_training_data/large/sentiment140-freqdist-bi.pkl'
glove_twitter_vectors = '/home/mingj/project/tweet_training_data/glove.twitter.27B.200d.txt'

# Define some hyperparameters
vocab_size = 90000
max_length = 60 # 40->60
batch_size = 128
epochs = 20 # 10->20
learning_rate = 0.0001 # 0.001->0.0001
filters = 600
kernel_size = 3
dim = 200
seed = 42

# Seed NumPy's global generator so the shuffle, the random embedding
# initialisation and the train/validation/test masks below are identical
# on every run of this cell.
np.random.seed(seed)

# Prepare integerized tweets and labels.
# shift=1 makes vocabulary indices start at 1, leaving index 0 free for the
# value pad_sequences fills in (0 by default).
vocab = top_n_words(unigrams_file, vocab_size, shift=1)
tweets, labels = process_tweets(processed_csv)
tweets = pad_sequences(tweets, maxlen=max_length, padding='post')
shuffled_indices = np.random.permutation(tweets.shape[0])
tweets = tweets[shuffled_indices]
labels = labels[shuffled_indices]

# Prepare embedding matrix: small random values everywhere, overwritten with
# the GloVe vector for each vocabulary word that has one.
embedding_matrix = np.random.randn(vocab_size + 1, dim) * 0.01
glove_vectors = get_glove_vectors(vocab)
for word, i in vocab.items():
    glove_vector = glove_vectors.get(word)
    if glove_vector is not None:
        embedding_matrix[i] = glove_vector

# Split the data to training, validation, and testing sets.
# About 80% of the rows go to training; a quarter of the remainder (~5% of
# all rows) becomes the validation set and the rest (~15%) the testing set.
rnd_indices = np.random.rand(len(labels)) < 0.8
train_x = tweets[rnd_indices]
train_y = labels[rnd_indices]
remain_x = tweets[~rnd_indices]
remain_y = labels[~rnd_indices]
rnd_indices2 = np.random.rand(len(remain_y)) < 0.25
val_x = remain_x[rnd_indices2]
val_y = remain_y[rnd_indices2]
test_x = remain_x[~rnd_indices2]
test_y = remain_y[~rnd_indices2]
# Every row must land in exactly one of the three sets.
assert len(train_x) + len(val_x) + len(test_x) == len(labels)
print('\n==> Training set:', train_x.shape, ' Validation set:', val_x.shape, ' Testing set:', test_x.shape, '\n')

  0%|          | 0/1048956 [00:00<?, ?it/s]

Generating feature vectors...


100%|██████████| 1048956/1048956 [00:06<00:00, 162544.26it/s]



Looking for GLOVE vectors...
Found 70388 words in GLOVE

==> Training set: (839297, 60)  Validation set: (52689, 60)  Testing set: (156970, 60) 



In [4]:
# Define the network: embedding (initialised from embedding_matrix) -> LSTM -> dense head with a sigmoid output

model = tf.keras.Sequential([
    layers.Embedding(vocab_size + 1, dim, weights=[embedding_matrix], input_length=max_length),
    layers.Dropout(0.4),
    layers.LSTM(128),
    layers.Dense(64),
    layers.Dropout(0.4),
    layers.Activation('relu'),
    layers.Dense(1),
    layers.Activation('sigmoid'),
])

# Binary classification: single sigmoid unit trained with binary cross-entropy.
opt = tf.keras.optimizers.Adam(lr=learning_rate)
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

# TensorBoard callback that writes its logs to ./logs; it is passed to fit() in the training cell.
tensorboard = tf.keras.callbacks.TensorBoard(
    log_dir='./logs',
    histogram_freq=0,
    write_graph=True,
    write_grads=False,
    write_images=False,
)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 60, 200)           18000200  
_________________________________________________________________
dropout (Dropout)            (None, 60, 200)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 128)               168448    
_________________________________________________________________
dense (Dense)                (None, 64)                8256      
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
activation (Activation)      (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
__________

In [5]:
# Train the network (validating on the validation set) and save the trained model.

model.fit(
    train_x,
    train_y,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    callbacks=[tensorboard],
    validation_data=(val_x, val_y),
)
model.save('sentiment_lstm.h5')

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 839297 samples, validate on 52689 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [6]:
# Performance on the testing set: print the metric names, then display the
# values evaluate() returns for them.

print(model.metrics_names)
model.evaluate(x=test_x, y=test_y, verbose=1)

['loss', 'acc']


[0.3103561493066714, 0.8721475441167102]