In [22]:
import numpy as np
import sys
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding, Flatten
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
import utils
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
from sklearn.metrics import classification_report
from keras.layers import LSTM

# Performs classification using CNN.

FREQ_DIST_FILE = '../twitter_data/3-sentiment-processed-train-freqdist.pkl'
BI_FREQ_DIST_FILE = '../twitter_data/3-sentiment-processed-train-freqdist-bi.pkl'
TRAIN_PROCESSED_FILE = '../twitter_data/3-sentiment-processed-train.csv'
TEST_PROCESSED_FILE = '../twitter_data/3-sentiment-processed-X-test.csv'
TEST_LABEL_FILE = '../twitter_data/3-sentiment-processed-y-test.csv'
GLOVE_FILE = '../dataset/glove-seeds.txt'
REPORT_FILE = './reports/3-sentiments.csv'
train = False
dim = 200


def get_glove_vectors(vocab):
    """
    Extracts glove vectors from seed file only for words present in vocab.
    """
    print('Looking for GLOVE seeds')
    glove_vectors = {}
    found = 0
    with open(GLOVE_FILE, 'r', encoding='utf-8') as glove_file:
        for i, line in enumerate(glove_file):
            utils.write_status(i + 1, 0)
            tokens = line.strip().split()
            word = tokens[0]
            if vocab.get(word):
                vector = [float(e) for e in tokens[1:]]
                glove_vectors[word] = np.array(vector)
                found += 1
    print('\n')
    return glove_vectors


def get_feature_vector(tweet):
    """
    Generates a feature vector for each tweet where each word is
    represented by integer index based on rank in vocabulary.
    """
    words = tweet.split()
    feature_vector = []
    for i in range(len(words) - 1):
        word = words[i]
        if vocab.get(word) is not None:
            feature_vector.append(vocab.get(word))
    if len(words) >= 1:
        if vocab.get(words[-1]) is not None:
            feature_vector.append(vocab.get(words[-1]))
    return feature_vector


def process_tweets(csv_file, test_file=True):
    """
    Generates training X, y pairs.
    """
    tweets = []
    labels = []
    print('Generating feature vectors')
    with open(csv_file, 'r', encoding="utf-8") as csv:
        lines = csv.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
            if test_file:
                tweet_id, tweet = line.split(',')
            else:
                tweet_id, sentiment, tweet = line.split(',')
                # Convert sentiment labels to one-hot encoding
                sentiment_onehot = [0, 0, 0]
                if sentiment == "1":
                    sentiment_onehot[2] = 1
                elif sentiment == "0":
                    sentiment_onehot[1] = 1
                elif sentiment == "-1":
                    sentiment_onehot[0] = 1
                labels.append(sentiment_onehot)
            feature_vector = get_feature_vector(tweet)
            if test_file:
                tweets.append(feature_vector)
            else:
                tweets.append(feature_vector)
            utils.write_status(i + 1, total)
    print('\n')
    return tweets, np.array(labels)

 

In [2]:
np.random.seed(1337)
vocab_size = 90000
batch_size = 500
max_length = 30
filters = 600
kernel_size = 3
vocab = utils.top_n_words(FREQ_DIST_FILE, vocab_size, shift=1)


tweets, labels = process_tweets(TRAIN_PROCESSED_FILE, test_file=False)
print(len(tweets), len(labels))
glove_vectors = get_glove_vectors(vocab)
# Create and embedding matrix
embedding_matrix = np.random.randn(vocab_size + 1, dim) * 0.01
# Seed it with GloVe vectors
for word, i in vocab.items():
    glove_vector = glove_vectors.get(word)
    if glove_vector is not None:
        embedding_matrix[i] = glove_vector
tweets = pad_sequences(tweets, maxlen=max_length, padding='post')
shuffled_indices = np.random.permutation(tweets.shape[0])
tweets = tweets[shuffled_indices]
labels = labels[shuffled_indices]

Generating feature vectors
Processing 85224/85224

85224 85224
Looking for GLOVE seeds
Processing 1193517/0



In [25]:
model = Sequential()
model.add(Embedding(vocab_size + 1, dim, weights=[embedding_matrix], input_length=max_length))
model.add(Dropout(0.4))
model.add(LSTM(128))
model.add(Dense(64))
model.add(Dropout(0.5))
model.add(Activation('relu'))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
filepath = "./models/lstm-{epoch:02d}-{loss:0.3f}-{val_loss:0.3f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor="loss", verbose=1, save_best_only=True, mode='min')
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=0.000001)
print(model.summary())
model.fit(tweets, labels, batch_size=128, epochs=5, validation_split=0.1, shuffle=True, callbacks=[checkpoint, reduce_lr])

[[    1  2134     2 ...     0     0     0]
 [   25   103   308 ...     0     0     0]
 [   99    10  6838 ...    60     1    47]
 ...
 [67025     0     0 ...     0     0     0]
 [    1    52    14 ...     0     0     0]
 [   26     1    13 ...     0     0     0]] [[0 0 1]
 [0 0 1]
 [0 0 1]
 ...
 [0 1 0]
 [0 0 1]
 [0 1 0]]
Epoch 1/8


ValueError: in user code:

    File "c:\Users\ljh\miniconda3\envs\tensorflow\lib\site-packages\keras\engine\training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\ljh\miniconda3\envs\tensorflow\lib\site-packages\keras\engine\training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\ljh\miniconda3\envs\tensorflow\lib\site-packages\keras\engine\training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "c:\Users\ljh\miniconda3\envs\tensorflow\lib\site-packages\keras\engine\training.py", line 994, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "c:\Users\ljh\miniconda3\envs\tensorflow\lib\site-packages\keras\engine\training.py", line 1052, in compute_loss
        return self.compiled_loss(
    File "c:\Users\ljh\miniconda3\envs\tensorflow\lib\site-packages\keras\engine\compile_utils.py", line 265, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "c:\Users\ljh\miniconda3\envs\tensorflow\lib\site-packages\keras\losses.py", line 152, in __call__
        losses = call_fn(y_true, y_pred)
    File "c:\Users\ljh\miniconda3\envs\tensorflow\lib\site-packages\keras\losses.py", line 272, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "c:\Users\ljh\miniconda3\envs\tensorflow\lib\site-packages\keras\losses.py", line 1990, in categorical_crossentropy
        return backend.categorical_crossentropy(
    File "c:\Users\ljh\miniconda3\envs\tensorflow\lib\site-packages\keras\backend.py", line 5529, in categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)

    ValueError: Shapes (None, 3) and (None, 1) are incompatible


In [15]:
MODEL_FILE = './models/4cnn-08-0.080-0.200.hdf5'
model = load_model(MODEL_FILE)
print(model.summary())
test_tweets, _ = process_tweets(TEST_PROCESSED_FILE, test_file=True)
test_tweets = pad_sequences(test_tweets, maxlen=max_length, padding='post')
predictions = model.predict(test_tweets, batch_size=128, verbose=1)
results = np.argmax(predictions, axis=1) - 1 # Convert back to original labels (-1, 0, 1)
id_results = zip(map(str, range(len(test_tweets))), results)
# utils.save_results_to_csv(id_results, 'cnn.csv')
test_label = utils.file_number_to_list(TEST_LABEL_FILE)
report = classification_report(test_label, results, target_names=['negative', 'neutral', 'positive'], output_dict=True)
print(classification_report(test_label, results, target_names=['negative', 'neutral', 'positive'], output_dict=False))
df_report = pd.DataFrame(report).transpose()
# df_report.to_csv(REPORT_FILE)

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 30, 200)           18000200  
                                                                 
 dropout_8 (Dropout)         (None, 30, 200)           0         
                                                                 
 conv1d_20 (Conv1D)          (None, 28, 600)           360600    
                                                                 
 conv1d_21 (Conv1D)          (None, 26, 300)           540300    
                                                                 
 conv1d_22 (Conv1D)          (None, 24, 150)           135150    
                                                                 
 conv1d_23 (Conv1D)          (None, 22, 75)            33825     
                                                                 
 flatten_5 (Flatten)         (None, 1650)             