In [1]:
# DataFrame
import pandas as pd

# nltk
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer

# Utility
import re
import numpy as np
import os
from collections import Counter
import logging
import time
import pickle
import itertools

# Make sure the NLTK stop-word corpus is available before it is read below.
nltk.download('stopwords')

# DATASET
# The column names are supplied explicitly to read_csv via `names=`.
DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]
DATASET_ENCODING = "ISO-8859-1"

# TEXT CLEANING
# Raw string: `\S` is not a valid Python string escape, so the non-raw form
# triggers an "invalid escape sequence" warning. The pattern value is unchanged.
# Alternatives, left to right: @mentions, http/https links, a short
# "htt:"/"http:" + one-character fragment, and any run of characters that are
# not ASCII letters or digits.
# NOTE(review): the third alternative (`http?:\S`) looks like a leftover typo;
# it is kept as-is to preserve the cleaning behaviour -- confirm before removing.
TEXT_CLEANING_RE = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

dataset_path = os.path.join("input","training.1600000.processed.noemoticon.csv")
print("Open file:", dataset_path)
df = pd.read_csv(dataset_path, encoding =DATASET_ENCODING , names=DATASET_COLUMNS)

# English stop-word list and stemmer consumed by preprocess().
stop_words = stopwords.words("english")
stemmer = SnowballStemmer("english")

def preprocess(text, stem=False):
    """Normalise one piece of text into a space-separated string of kept tokens.

    The input is coerced with ``str()`` and lower-cased, every match of
    ``TEXT_CLEANING_RE`` is replaced by a single space, and the result is
    split on whitespace. Tokens present in ``stop_words`` are discarded; the
    remaining ones are optionally passed through ``stemmer.stem``.

    Args:
        text: Value to clean; anything ``str()`` accepts.
        stem: When true, each kept token is stemmed before joining.

    Returns:
        The surviving tokens joined by single spaces (may be an empty string).
    """
    cleaned = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    kept = (word for word in cleaned.split() if word not in stop_words)
    if stem:
        kept = (stemmer.stem(word) for word in kept)
    return " ".join(kept)

# Clean every text once up front; preprocess is passed directly instead of
# being wrapped in a redundant lambda.
df.text = df.text.apply(preprocess)

# Row layout of the subsets taken from the frame. Rows starting at
# NEG_BLOCK_START are labelled 0 and rows starting at POS_BLOCK_START are
# labelled 1; within each block the first N_TEST_PER_CLASS rows form the test
# set and the next N_TRAIN_PER_CLASS rows form the training set.
NEG_BLOCK_START = 0
POS_BLOCK_START = 800000
N_TEST_PER_CLASS = 5000
N_TRAIN_PER_CLASS = 45000


def _take_rows(start, count):
    """Return `count` cleaned texts starting at positional row `start`.

    Positional slicing matches the original label lookups because `df` carries
    the default 0..n-1 index produced by read_csv.
    """
    return df.text.iloc[start:start + count].tolist()


texts = (_take_rows(NEG_BLOCK_START + N_TEST_PER_CLASS, N_TRAIN_PER_CLASS)
         + _take_rows(POS_BLOCK_START + N_TEST_PER_CLASS, N_TRAIN_PER_CLASS))
labels = [0] * N_TRAIN_PER_CLASS + [1] * N_TRAIN_PER_CLASS

texts_test = (_take_rows(NEG_BLOCK_START, N_TEST_PER_CLASS)
              + _take_rows(POS_BLOCK_START, N_TEST_PER_CLASS))
labels_test = [0] * N_TEST_PER_CLASS + [1] * N_TEST_PER_CLASS

# Slices silently shrink on a short frame (per-row indexing would have raised),
# so fail loudly if the expected rows are not all there.
assert len(texts) == len(labels), "training texts/labels are misaligned"
assert len(texts_test) == len(labels_test), "test texts/labels are misaligned"
    
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

maxlen = 35  # Every sequence is padded/truncated to 35 tokens
max_words = 10000  # We will only consider the top 10,000 words in the dataset
SHUFFLE_SEED = 42  # Fixed seed so the shuffled order below is reproducible

tokenizer = Tokenizer(num_words=max_words)  # https://keras.io/preprocessing/text/
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)  # This turns strings into lists of integer indices.

# Drop samples whose sequence came back empty, keeping labels aligned with the
# sequences. The labels are filtered first because that pass needs the
# unfiltered `sequences` list.
assert len(sequences) == len(labels), "sequences/labels are misaligned"
labels = [label for seq, label in zip(sequences, labels) if len(seq) > 0]
sequences = [seq for seq in sequences if len(seq) > 0]

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=maxlen)

labels = np.asarray(labels)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# Shuffle the samples, since we started from data where samples are ordered
# (all label-0 rows first, then all label-1 rows). A seeded local generator is
# used so that any later fixed-position slice of `data`/`labels` selects the
# same rows on every run, without touching NumPy's global random state.
indices = np.arange(data.shape[0])
np.random.RandomState(SHUFFLE_SEED).shuffle(indices)
data = data[indices]
labels = labels[indices]

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

# Embedding -> LSTM -> single sigmoid unit, built in one constructor call.
# NOTE(review): the summary saved with this notebook lists the LSTM layer with
# output shape (None, 32) and 20,608 parameters, which does not match the
# LSTM(128) configured here -- re-run the cell to refresh the stored output.
model = Sequential([
    Embedding(max_words, 128, input_length=maxlen),
    LSTM(128, dropout=0.5, recurrent_dropout=0.5),
    Dense(1, activation='sigmoid'),
])
model.summary()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/liyuantan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Open file: input/training.1600000.processed.noemoticon.csv


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Found 51459 unique tokens.
Shape of data tensor: (89235, 35)
Shape of label tensor: (89235,)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 35, 128)           1280000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 32)                20608     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 1,300,641
Trainable params: 1,300,641
Non-trainable params: 0
_________________________________________________________________


In [2]:
%%time
# Fit on the first 69405 shuffled samples; the rows after that are not used
# by this cell.
# NOTE(review): the saved log below shows "Epoch n/3" while epochs is 10 here,
# so the stored output appears to come from an earlier run -- re-run to refresh.
fit_inputs = data[0:69405]
fit_targets = labels[0:69405]

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])
history = model.fit(fit_inputs, fit_targets,
                    batch_size=128,
                    epochs=10)

# Persist only the trained weights (not the architecture) next to the notebook.
model.save_weights('lstm_twitter_test.h5')

Epoch 1/3
Epoch 2/3
Epoch 3/3
CPU times: user 3min 34s, sys: 41.5 s, total: 4min 16s
Wall time: 1min 39s


In [3]:
# Encode the held-out texts with the tokenizer fitted on the training texts.
sequences = tokenizer.texts_to_sequences(texts_test)

# Drop test samples whose sequence came back empty, together with their labels.
# The kept labels go into a new list instead of deleting from `labels_test` in
# place: mutating it made this cell non-idempotent, because a second run would
# pair freshly built sequences with an already-shortened label list.
assert len(sequences) == len(labels_test), "test sequences/labels are misaligned"
kept_labels = [label for seq, label in zip(sequences, labels_test) if len(seq) > 0]
sequences = [seq for seq in sequences if len(seq) > 0]

x_test = pad_sequences(sequences, maxlen=maxlen)
y_test = np.asarray(kept_labels)

In [4]:
%%time
# Score the trained model on the test arrays; the returned value is the cell's
# displayed output. Presumably [loss, accuracy], given that compile() was
# called with a binary_crossentropy loss and metrics=['acc'] -- confirm.
model.evaluate(x_test, y_test)

CPU times: user 5.01 s, sys: 942 ms, total: 5.95 s
Wall time: 2.27 s


[0.507052836718554, 0.7576033141416192]