In [None]:
from gensim.models import Doc2Vec
from gensim.models import Word2Vec
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from gensim.corpora.dictionary import Dictionary
import multiprocessing

from random import shuffle

import numpy as np


from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB,BernoulliNB
from sklearn.ensemble import RandomForestClassifier

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.layers.core import Dense, Dropout

np.random.seed(1500)  # For Reproducibility

import logging
import sys


from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pandas as pd


In [None]:
def tokenizer(text):
    text = [clean_tweet(document) for document in text]
    return text

def clean_tweet(tweet):
    lemmatizer=WordNetLemmatizer()
    word_list = tweet.split()
    filtered_words=[word for word in word_list if word not in stopwords.words('english')]
    #repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
    #repl = r'\1\2\3'
    mod_tweet = []
    for i in filtered_words:
        #i = unicode(i,errors='ignore')
        i = i.encode('utf-8',errors='ignore')
        i = i.decode('utf-8')
        i.lower()
        i.strip('#\'"?,.!')
        if '@'  in i or 'http:' in i:
            continue
        j = re.sub(r'(.)\1+',r'\1\1',i)
        mod_tweet.append(lemmatizer.lemmatize(j))
    return mod_tweet


def sentences_perm(sentences):
    shuffle(sentences)
    return sentences



In [26]:
data_dir = "/Users/Kristin/Desktop/tensorflow/Sentiment-analysis-with-word-embedding/code/"
train_data = pd.read_csv(data_dir+"train.csv",names = ['polarity','id','date','query','user','tweet'],encoding="ISO-8859-1")
test_data = pd.read_csv(data_dir+"test.csv",names = ['polarity','id','date','query','user','tweet'],encoding="ISO-8859-1")


In [None]:
log = logging.getLogger()
#log.setLevel(logging.DEBUG)

ch = logging.StreamHandler(sys.stdout)
#ch.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)
log.addHandler(ch)


maxlen = 50
n_iterations = 10  # ideally more..
n_exposures = 30
batch_size = 32
n_epoch = 2
cpu_count = multiprocessing.cpu_count()

log.info('source load')
sources = {'negative_test.txt':'TEST_NEG', 'positive_test.txt':'TEST_POS', 'negative_train.txt':'TRAIN_NEG', 'positive_train.txt':'TRAIN_POS'}


In [5]:
def create_dictionaries(train=None,test=None,model=None):
    if (train is not None) and (model is not None) and (test is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(),
                            allow_update=True)
        w2indx = {v: k+1 for k, v in gensim_dict.items()}
        w2vec = {word: model[word] for word in w2indx.keys()}

        def parse_dataset(data):
            for key in data.keys():
                txt = data[key].lower().replace('\n', '').split()
                new_txt = []
                for word in txt:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)
                data[key] = new_txt
            return data
        train = parse_dataset(train)
        test = parse_dataset(test)
        return w2indx, w2vec, train, test
    else:
        print('No data provided...')


In [6]:
from random import sample

idx = sample(range(len(train_data)),100000)
train_partial = train_data.iloc[idx,:]
all_data = pd.concat([train_partial['tweet'],test_data['tweet']])

In [None]:
combined = tokenizer(all_data.tolist())

In [7]:
import pickle
pickle_off = open("combined_words.pickle","rb")
combined = pickle.load(pickle_off)

In [None]:
# save combined as pickle file
import pickle
pickling_on = open("combined_words.pickle","wb")
pickle.dump(combined,pickling_on)
pickling_on.close()

In [None]:
cpu_count

In [8]:
### trian Word2Vec model

EMBEDDING_DIM = 100
window_size = 5
w2v_model = Word2Vec(size = EMBEDDING_DIM,
                     window = window_size,
                     workers = cpu_count)

In [9]:
w2v_model.build_vocab(combined)

In [10]:
for epoch in range(10):
    w2v_model.train(sentences_perm(combined),total_examples=w2v_model.corpus_count,epochs =epoch)



In [11]:
train = train_partial['tweet'].reset_index(drop=True).to_dict()
test = test_data['tweet'].to_dict()

In [12]:
index_dict,word_vectors,train,test = create_dictionaries(train=train,
                                                         test=test,
                                                         model = w2v_model)

  import sys


{'Cool!': array([-1.89140886e-01,  2.95128077e-01,  7.80254483e-01,  6.20356910e-02,
        -1.80759206e-01, -4.66767281e-01,  4.93199259e-01,  1.66976213e-01,
        -4.54562634e-01,  2.09394902e-01, -1.80864766e-01, -3.07890266e-01,
        -8.28458220e-02, -1.22710504e-01, -9.77916270e-02, -5.58980942e-01,
         9.99607891e-02, -2.70957857e-01, -3.89284462e-01,  9.41608012e-01,
        -8.49842310e-01,  4.94480699e-01,  1.83034483e-02,  6.03861809e-01,
         1.45531639e-01,  7.28703380e-01,  5.40580489e-02,  1.37469798e-01,
        -1.83981508e-01, -1.15724474e-01,  1.45880222e-01,  4.33371872e-01,
         1.70015499e-01, -7.48666301e-02, -2.39446148e-01,  1.38304457e-01,
         7.05662668e-01,  2.03798637e-01, -4.97653365e-01, -1.06971242e-01,
        -9.50889364e-02, -3.90051723e-01,  9.70449522e-02,  2.64756858e-01,
        -9.34098810e-02, -1.11111417e-01,  2.53047228e-01, -7.71583855e-01,
         4.08322155e-01,  2.84785032e-02, -4.17498112e-01, -9.89434898e-01,
   

In [27]:
padding_maxlen = 1000

print('Setting up Arrays for Keras Embedding Layer...')
n_words = len(index_dict) + 1  # adding 1 to account for 0th index
embedding_weights = np.zeros((n_words, EMBEDDING_DIM))
for word, index in index_dict.items():
    embedding_weights[index, :] = word_vectors[word]

print('Creating Datesets...')
X_train = train.values()
y_train = train_partial['polarity'].apply(lambda x: 1 if x==4 else 0 ).tolist()
X_test = test.values()
y_test = test_data['polarity'].apply(lambda x: 1 if x==4 else 0 ).tolist()

print("Pad sequences (samples x time)")
X_train = sequence.pad_sequences(X_train, maxlen=padding_maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=padding_maxlen)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

print('Convert labels to Numpy Sets...')
y_train = np.array(y_train)
y_test = np.array(y_test)



Setting up Arrays for Keras Embedding Layer...
Creating Datesets...
Pad sequences (samples x time)
X_train shape: (100000, 1000)
X_test shape: (498, 1000)
Convert labels to Numpy Sets...


In [15]:
embedding_weights.shape

(14356, 100)

In [29]:

print('Defining a Simple Keras Model...')
lstm_model = Sequential()  # or Graph 
lstm_model.add(Embedding(output_dim=EMBEDDING_DIM,
                    input_dim=n_words,
                    mask_zero=True,
                    weights=[embedding_weights],
                    input_length=padding_maxlen))  # Adding Input Length

lstm_model.add(LSTM(EMBEDDING_DIM))
lstm_model.add(Dropout(0.3))
lstm_model.add(Dense(1, activation='sigmoid'))

print('Compiling the Model...')
lstm_model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'],
          class_mode='binary')

print("Train...")
lstm_model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=5,
          #validation_data=(X_test, y_test))
               validation_split=0.2)

print("Evaluate...")
score, acc = lstm_model.evaluate(X_test, y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)



Defining a Simple Keras Model...
Compiling the Model...
Train...




ValueError: ('Some keys in session_kwargs are not supported at this time: %s', dict_keys(['class_mode']))