In [10]:
from numpy.random import random, permutation, randn, normal, uniform, choice
from sklearn.model_selection import StratifiedShuffleSplit
from collections import Counter   #Replace this with an efficient version
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import nltk.data
import sklearn
import pickle
import bcolz
import re
import os

glove_path = 'C:\\Users\\Karthik\\Desktop\\sentiment_analysis\\imdb\\glove\\'

In [3]:
import tensorflow as tf
from tensorflow.python.keras.models import save_model
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras._impl.keras.optimizers import Adam
from tensorflow.python.keras._impl.keras.preprocessing import sequence
from tensorflow.python.keras.layers import  Convolution1D, Dense, Dropout, Embedding, Flatten, MaxPooling1D
tf.__version__

'1.4.0'

In [None]:
#One-time run.
with open(glove_path+ 'glove.6B.50d.txt', 'r', encoding="utf8") as f:
    lines = [line.split() for line in f]
    words = [d[0] for d in lines]
    vecs = np.stack(np.array(d[1:], dtype=np.float32) for d in lines)
    wordidx = {o:i for i,o in enumerate(words)}
    c=bcolz.carray(vecs, rootdir=glove_path+ 'glove.6B.50d.dat', mode='w')
    c.flush()
    pickle.dump(words, open(glove_path+'glove.6B.50d_words.pkl','wb'))
    pickle.dump(wordidx, open(glove_path+'glove.6B.50d_idx.pkl','wb'))

In [5]:
#Load the vectors from GloVe
vecs = bcolz.open(glove_path+ 'glove.6B.50d.dat')[:]
words = pickle.load(open(glove_path+'glove.6B.50d_words.pkl','rb'))
wordidx = pickle.load(open(glove_path+'glove.6B.50d_idx.pkl','rb'))

In [11]:
#User Defined function to retrieve Word Vector
def w2v(w): return vecs[wordidx[w]]

In [None]:
vecs[wordidx['awesome']]

In [12]:
def review_to_wordlist(review):
    review_text = BeautifulSoup(review, "lxml").get_text()
    review_text = re.sub("[^a-zA-Z\-]"," ", review_text)
    words = review_text.lower().split()
    words += '.'
    return(words)

In [13]:
#punkt tokenizer for sentence splitting
#nltk.download()   
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

def review_to_sentences( review, tokenizer):
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            sentences.append(review_to_wordlist(raw_sentence))
    return sentences

In [14]:
app_path = r"C:\Users\Karthik\Desktop\sentiment_analysis\imdb"

In [15]:
corpus_train = pd.read_csv(os.path.join(app_path,"labeledTrainData.tsv"), header=0, \
                    delimiter="\t", quoting=3)

In [16]:
corpus_test = pd.read_csv(os.path.join(app_path,"testData.tsv"), header=0, \
                    delimiter="\t", quoting=3)

In [17]:
unlabeled_corpus_train = pd.read_csv(os.path.join(app_path,"unlabeledTrainData.tsv"), header=0, \
                    delimiter="\t", quoting=3)

In [18]:
sentences = []  # Initialize an empty list of sentences

print ("Parsing sentences from training set")
for review in corpus_train["review"]:
    sentences += review_to_sentences(review, tokenizer)
train_sentences = list(sentences)

Parsing sentences from training set


  'Beautiful Soup.' % markup)
  'Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


In [19]:
print ("Parsing sentences from unlabeled set")
for review in unlabeled_corpus_train["review"]:
    sentences += review_to_sentences(review, tokenizer)

Parsing sentences from unlabeled set


  'Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  'Beautiful Soup.' % markup)
  'Beautiful Soup.' % markup)
  'Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  'Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  'Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


In [20]:
print ("Parsing sentences from test set")
for review in corpus_test["review"]:
    sentences += review_to_sentences(review, tokenizer)

Parsing sentences from test set


  'Beautiful Soup.' % markup)
  'Beautiful Soup.' % markup)
  'Beautiful Soup.' % markup)
  'Beautiful Soup.' % markup)


In [None]:
print(len(sentences))
print(sentences[0])

In [21]:
#All words used in training and unlabeled datasets. ! Should test words be included here?
def accum_words(data):
    words = []
    for i in data:
        for d in i:
            words.append(d)
    return words

In [22]:
words_union = accum_words(sentences)
print('Total words in training and unlabeled dataset: ', len(words))

Total words in training and unlabeled dataset:  400000


In [23]:
cnt = Counter(words_union)
word_freq_inv = cnt.most_common()
idx = {word_freq_inv[i][0] : i for i in range(len(cnt))}
idx2word = {v: k for k, v in idx.items()}

In [24]:
import json
with open('idx.json', 'w') as f:
    json.dump(idx, f)

In [25]:
print('Length of word index {train + unlabeled}: ', idx.__len__())

Length of word index {train + unlabeled}:  201863


In [26]:
print(idx['great'])
print(idx2word[80])

80
great


In [None]:
print(vecs.shape)

In [27]:
vocab_size = 15000

In [28]:
n_fact = vecs.shape[1]
emb = np.zeros((vocab_size, n_fact))

In [8]:
#emb = np.zeros((vocab_size, 50))

In [None]:
emb.shape

In [None]:
type(vecs[wordidx['great']])

In [29]:
for i in range(1,len(emb)):
    word = idx2word[i]
    if word and re.match(r"^[a-zA-Z0-9\-]*$", word):
        try:
            src_idx = wordidx[word] #GloVe
            emb[i] = vecs[src_idx]
        except KeyError:
            emb[i] = normal(scale=0.6, size=(n_fact,))
    else:
        #random initialization for missing words
        emb[i] = normal(scale=0.6, size=(n_fact,))

#random initialization for rare words
emb[-1] = normal(scale=0.6, size=(n_fact,))
emb/=3

In [None]:
len(emb)

In [30]:
seq_len = 1500

In [31]:
X = corpus_train[["id", "review"]]
y = corpus_train["sentiment"]

In [None]:
X.head()

In [None]:
#---****----redundant code - don't run!----****-----!
idx = {word_freq_inv[i][0] : i for i in range(len(cnt))}
idx2word = {v: k for k, v in idx.items()}

In [None]:
idx['with']

In [None]:
idx2word[0]

In [32]:
#Should EoS-'period' be handled here or within the rev2sentnc func?
def reformat_dataset_list_of_words(dataset):
    X_revw_indx = []
    X_reviews = []
    for record in dataset:
        review=[]
        indices = []
        sentences = []
        sentences += review_to_sentences(record, tokenizer)
        for sentence in sentences:
            for word in sentence:
                review.append(word)
                indices.append(idx[word])
        X_reviews.append(review)
        X_revw_indx.append(indices)
    return X_reviews, X_revw_indx


In [33]:
X_reviews, X_revw_indx = reformat_dataset_list_of_words(X['review'])

  'Beautiful Soup.' % markup)
  'Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


In [None]:
y[:5]

In [None]:
X_revw_indx[0:5]

In [34]:
#split the dataset:   #Replace this with native TF for efficient splitting
sss = StratifiedShuffleSplit(n_splits=3, test_size=0.1, random_state=0)

In [35]:
#Splitting the corpus into train and test
for train_index, test_index in sss.split(X, y):
    #print(train_index)
    X_train, X_test = [X_revw_indx[i] for i in train_index], [X_revw_indx[i] for i in test_index]
    y_train, y_test = [y[i] for i in train_index], [y[i] for i in test_index]

In [None]:
len(y_test)

In [None]:
y_train

In [None]:
X_reviews[0].__len__()

In [None]:
rev_lengths = [X_reviews[i].__len__() for i in range(len(X_reviews))]

In [None]:
print("Average sequence length: ", np.mean(np.array(rev_lengths)))
print("Maximum sequence length: ", np.max(np.array(rev_lengths)))
print("Minimum sequence length: ", np.min(np.array(rev_lengths)))

In [None]:
#tf.placeholders for generating training  records, dimensions with embedding - trim sentences.

In [36]:
#Replace words with rank > vocab_size with a constant
X_train = [np.array([i if i<vocab_size-1 else vocab_size-1 for i in s]) for s in X_train]
X_test = [np.array([i if i<vocab_size-1 else vocab_size-1 for i in s]) for s in X_test]

In [37]:
df2 = pd.DataFrame({"review": X_test, "sentiment": y_test})
df3 = pd.DataFrame({"review": X_train, "sentiment": y_train})

In [39]:
df2.to_csv("ValidationData.csv")

In [40]:
df3.to_csv("TrainData.csv")

In [41]:
#seq_len = 500
X_train = sequence.pad_sequences(X_train, maxlen=seq_len, value=0)
X_test = sequence.pad_sequences(X_test, maxlen=seq_len, value=0)

In [42]:
y_train=np.array(y_train)
y_test = np.array(y_test)

In [44]:
#seq_len = 500, emb = [5000,50], vocab_size=5000
model = Sequential([
    Embedding(vocab_size, 50, input_length=seq_len, #dropout=0.2, 
              weights=[emb], trainable=False),
    Dropout(0.25),
    Convolution1D(64, 5, padding='same', activation='relu'),
    Dropout(0.25),
    MaxPooling1D(),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.7),
    Dense(1, activation='sigmoid')])

In [45]:
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

In [50]:
model_path = "C:\\Users\\Karthik\\Desktop\\sentiment_analysis\\imdb\\"

In [47]:
from tensorflow.python.keras._impl.keras import callbacks

In [54]:
callbacks = [callbacks.ModelCheckpoint(filepath=model_path+"weights.{epoch:02d}-{val_loss:.2f}.hdf5", monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=False, mode='auto', period=1)]

In [55]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), callbacks=callbacks, epochs=2, batch_size=64)

Train on 22500 samples, validate on 2500 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras._impl.keras.callbacks.History at 0x20035ce2b00>

In [56]:
model.layers[0].trainable=True

In [57]:
model.optimizer.lr=1e-4

In [58]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), callbacks=callbacks, epochs=1, batch_size=64)

Train on 22500 samples, validate on 2500 samples
Epoch 1/1

AttributeError: 'float' object has no attribute 'eval'

In [59]:
model.save_weights(model_path+'glove50.h5')

In [60]:
save_model(
    model,
    filepath = model_path+'CNN_glove_model_with_weights_08_12_2017_2310.h5',
    overwrite=True,
    include_optimizer=False
)

In [None]:
#Not needed! ?
def tokenize_sent_words(dataset):
    sentence = []
    document = []
    for review in dataset["review"]:
        sentence += review_to_sentences(review, tokenizer)
    sentence += '.'
train_sentences = list(sentences)   

In [None]:
def accum_words(data):
    words = []
    for i in data:
        for d in i:
            words.append(d)
    return words
    #unq = set(unique_words)
    #unq_len = unq.__len__()
    #print("printing unq_len: ", unq_len)
    #return word_vectors(unq), unq_len
     

In [None]:
with tf.Session() as sess:
    print(tf.nn.embedding_lookup(wordVectors,firstSentence).eval().shape)

In [None]:
from os import listdir
from os.path import isfile, join
positiveFiles = ['positiveReviews/' + f for f in listdir('positiveReviews/') if isfile(join('positiveReviews/', f))]
negativeFiles = ['negativeReviews/' + f for f in listdir('negativeReviews/') if isfile(join('negativeReviews/', f))]
numWords = []
for pf in positiveFiles:
    with open(pf, "r", encoding='utf-8') as f:
        line=f.readline()
        counter = len(line.split())
        numWords.append(counter)       
print('Positive files finished')

for nf in negativeFiles:
    with open(nf, "r", encoding='utf-8') as f:
        line=f.readline()
        counter = len(line.split())
        numWords.append(counter)  
print('Negative files finished')

numFiles = len(numWords)
print('The total number of files is', numFiles)
print('The total number of words in the files is', sum(numWords))
print('The average number of words in the files is', sum(numWords)/len(numWords))