In [1]:
from numpy.random import random, permutation, randn, normal, uniform, choice
from sklearn.model_selection import StratifiedShuffleSplit
from collections import Counter   #Replace this with an efficient version
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import nltk.data
import sklearn
import pickle
import bcolz
import re

glove_path = 'C:\\Users\\Karthik\\Desktop\\sentiment_analysis\\imdb\\glove\\'

In [1]:
import tensorflow as tf
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras._impl.keras.optimizers import Adam
from tensorflow.python.keras._impl.keras.preprocessing import sequence
from tensorflow.python.keras.layers import  Convolution1D, Dense, Dropout, Embedding, Flatten, MaxPooling1D
tf.__version__

'1.4.0'

In [None]:
#One-time run.
with open(glove_path+ 'glove.6B.50d.txt', 'r', encoding="utf8") as f:
    lines = [line.split() for line in f]
    words = [d[0] for d in lines]
    vecs = np.stack(np.array(d[1:], dtype=np.float32) for d in lines)
    wordidx = {o:i for i,o in enumerate(words)}
    c=bcolz.carray(vecs, rootdir=glove_path+ 'glove.6B.50d.dat', mode='w')
    c.flush()
    pickle.dump(words, open(glove_path+'glove.6B.50d_words.pkl','wb'))
    pickle.dump(wordidx, open(glove_path+'glove.6B.50d_idx.pkl','wb'))

In [3]:
#Load the vectors from GloVe
vecs = bcolz.open(glove_path+ 'glove.6B.50d.dat')[:]
words = pickle.load(open(glove_path+'glove.6B.50d_words.pkl','rb'))
wordidx = pickle.load(open(glove_path+'glove.6B.50d_idx.pkl','rb'))

In [4]:
#User Defined function to retrieve Word Vector
def w2v(w): return vecs[wordidx[w]]

In [None]:
vecs[wordidx['awesome']]

In [5]:
def review_to_wordlist(review):
    review_text = BeautifulSoup(review, "lxml").get_text()
    review_text = re.sub("[^a-zA-Z0-9\-]"," ", review_text)
    words = review_text.lower().split()
    words += '.'
    return(words)

In [6]:
#punkt tokenizer for sentence splitting
#nltk.download()   
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

def review_to_sentences( review, tokenizer):
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            sentences.append(review_to_wordlist(raw_sentence))
    return sentences

In [7]:
corpus_train = pd.read_csv("labeledTrainData.tsv", header=0, \
                    delimiter="\t", quoting=3)

In [8]:
unlabeled_corpus_train = pd.read_csv("unlabeledTrainData.tsv", header=0, \
                    delimiter="\t", quoting=3)

In [9]:
sentences = []  # Initialize an empty list of sentences

print ("Parsing sentences from training set")
for review in corpus_train["review"]:
    sentences += review_to_sentences(review, tokenizer)
train_sentences = list(sentences)

Parsing sentences from training set


  'Beautiful Soup.' % markup)
  'Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


In [10]:
print ("Parsing sentences from unlabeled set")
for review in unlabeled_corpus_train["review"]:
    sentences += review_to_sentences(review, tokenizer)

Parsing sentences from unlabeled set


  'Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  'Beautiful Soup.' % markup)
  'Beautiful Soup.' % markup)
  'Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  'Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  'Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


In [11]:
print(len(sentences))
print(sentences[0])

795538
['with', 'all', 'this', 'stuff', 'going', 'down', 'at', 'the', 'moment', 'with', 'mj', 'i', 've', 'started', 'listening', 'to', 'his', 'music', 'watching', 'the', 'odd', 'documentary', 'here', 'and', 'there', 'watched', 'the', 'wiz', 'and', 'watched', 'moonwalker', 'again', '.']


In [12]:
#All words used in training and unlabeled datasets. ! Should test words be included here?
def accum_words(data):
    words = []
    for i in data:
        for d in i:
            words.append(d)
    return words

In [13]:
words_union = accum_words(sentences)
print('Total words in training and unlabeled dataset: ', len(words))

Total words in training and unlabeled dataset:  400000


In [14]:
cnt = Counter(words_union)
word_freq_inv = cnt.most_common()
idx = {word_freq_inv[i][0] : i for i in range(len(cnt))}
idx2word = {v: k for k, v in idx.items()}

In [15]:
print('Length of word index {train + unlabeled}: ', idx.__len__())

Length of word index {train + unlabeled}:  176376


In [18]:
print(idx['great'])
print(idx2word[81])

81
great


In [19]:
print(vecs.shape)

(400000, 50)


In [20]:
vocab_size = 5000

In [21]:
n_fact = vecs.shape[1]
emb = np.zeros((vocab_size, n_fact))

In [22]:
emb.shape

(5000, 50)

In [23]:
type(vecs[wordidx['great']])

numpy.ndarray

In [25]:
for i in range(1,len(emb)):
    word = idx2word[i]
    if word and re.match(r"^[a-zA-Z0-9\-]*$", word):
        try:
            src_idx = wordidx[word] #GloVe
            emb[i] = vecs[src_idx]
        except KeyError:
            emb[i] = normal(scale=0.6, size=(n_fact,))
    else:
        #random initialization for missing words
        emb[i] = normal(scale=0.6, size=(n_fact,))

#random initialization for rare words
emb[-1] = normal(scale=0.6, size=(n_fact,))
emb/=3

In [None]:
len(emb)

In [26]:
seq_len = 500

In [27]:
X = corpus_train[["id", "review"]]
y = corpus_train["sentiment"]

In [28]:
X.head()

Unnamed: 0,id,review
0,"""5814_8""","""With all this stuff going down at the moment ..."
1,"""2381_9""","""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""","""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""","""It must be assumed that those who praised thi..."
4,"""9495_8""","""Superbly trashy and wondrously unpretentious ..."


In [None]:
#---****----redundant code - don't run!----****-----!
idx = {word_freq_inv[i][0] : i for i in range(len(cnt))}
idx2word = {v: k for k, v in idx.items()}

In [29]:
idx['with']

15

In [30]:
idx2word[0]

'the'

In [31]:
#Should EoS-'period' be handled here or within the rev2sentnc func?
def reformat_dataset_list_of_words(dataset):
    X_revw_indx = []
    X_reviews = []
    for record in dataset:
        review=[]
        indices = []
        sentences = []
        sentences += review_to_sentences(record, tokenizer)
        for sentence in sentences:
            for word in sentence:
                review.append(word)
                indices.append(idx[word])
        X_reviews.append(review)
        X_revw_indx.append(indices)
    return X_reviews, X_revw_indx


In [32]:
X_reviews, X_revw_indx = reformat_dataset_list_of_words(X['review'])

  'Beautiful Soup.' % markup)
  'Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


In [None]:
y[:5]

In [None]:
X_revw_indx[0:5]

In [35]:
#split the dataset:   #Replace this with native TF for efficient splitting
sss = StratifiedShuffleSplit(n_splits=3, test_size=0.1, random_state=0)

In [None]:
X_train = [X_revw_indx[i] for i in train_index]

In [None]:
y_train = [y[i] for i in train_index]

In [None]:
#y_train

In [None]:
len(X_train)

In [36]:
#Splitting the corpus into train and test
for train_index, test_index in sss.split(X, y):
    #print(train_index)
    X_train, X_test = [X_revw_indx[i] for i in train_index], [X_revw_indx[i] for i in test_index]
    y_train, y_test = [y[i] for i in train_index], [y[i] for i in test_index]

In [None]:
len(y_test)

In [None]:
y_train

In [None]:
X_reviews[0].__len__()

In [None]:
rev_lengths = [X_reviews[i].__len__() for i in range(len(X_reviews))]

In [None]:
print("Average sequence length: ", np.mean(np.array(rev_lengths)))
print("Maximum sequence length: ", np.max(np.array(rev_lengths)))
print("Minimum sequence length: ", np.min(np.array(rev_lengths)))

In [None]:
#tf.placeholders for generating training  records, dimensions with embedding - trim sentences.

In [37]:
#Replace words with rank > vocab_size with a constant
X_train = [np.array([i if i<vocab_size-1 else vocab_size-1 for i in s]) for s in X_train]
X_test = [np.array([i if i<vocab_size-1 else vocab_size-1 for i in s]) for s in X_test]

In [None]:
X_train[0]

In [38]:
#seq_len = 500
X_train = sequence.pad_sequences(X_train, maxlen=seq_len, value=0)
X_test = sequence.pad_sequences(X_test, maxlen=seq_len, value=0)

In [39]:
y_train=np.array(y_train)
y_test = np.array(y_test)

In [40]:
#seq_len = 500, emb = [5000,50], vocab_size=5000
model = Sequential([
    Embedding(vocab_size, 50, input_length=seq_len, #dropout=0.2, 
              weights=[emb], trainable=False),
    Dropout(0.25),
    Convolution1D(64, 5, padding='same', activation='relu'),
    Dropout(0.25),
    MaxPooling1D(),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.7),
    Dense(1, activation='sigmoid')])

In [41]:
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

In [None]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=2, batch_size=64)

Train on 22500 samples, validate on 2500 samples
Epoch 1/2
 2240/22500 [=>............................] - ETA: 94s - loss: 0.7632 - acc: 0.5112 

In [None]:
model.layers[0].trainable=True

In [None]:
model.optimizer.lr=1e-4

In [None]:
model.fit(trn, labels_train, validation_data=(test, labels_test), nb_epoch=1, batch_size=64)

In [None]:
model_path = "C:\\Users\\Karthik\\Desktop\\sentiment_analysis\\imdb\\'"

In [None]:
model.save_weights(model_path+'glove50.h5')

In [None]:
#Not needed! ?
def tokenize_sent_words(dataset):
    sentence = []
    document = []
    for review in dataset["review"]:
        sentence += review_to_sentences(review, tokenizer)
    sentence += '.'
train_sentences = list(sentences)   

In [None]:
def accum_words(data):
    words = []
    for i in data:
        for d in i:
            words.append(d)
    return words
    #unq = set(unique_words)
    #unq_len = unq.__len__()
    #print("printing unq_len: ", unq_len)
    #return word_vectors(unq), unq_len
     

In [None]:
with tf.Session() as sess:
    print(tf.nn.embedding_lookup(wordVectors,firstSentence).eval().shape)

In [None]:
from os import listdir
from os.path import isfile, join
positiveFiles = ['positiveReviews/' + f for f in listdir('positiveReviews/') if isfile(join('positiveReviews/', f))]
negativeFiles = ['negativeReviews/' + f for f in listdir('negativeReviews/') if isfile(join('negativeReviews/', f))]
numWords = []
for pf in positiveFiles:
    with open(pf, "r", encoding='utf-8') as f:
        line=f.readline()
        counter = len(line.split())
        numWords.append(counter)       
print('Positive files finished')

for nf in negativeFiles:
    with open(nf, "r", encoding='utf-8') as f:
        line=f.readline()
        counter = len(line.split())
        numWords.append(counter)  
print('Negative files finished')

numFiles = len(numWords)
print('The total number of files is', numFiles)
print('The total number of words in the files is', sum(numWords))
print('The average number of words in the files is', sum(numWords)/len(numWords))