# Word preprocessing for sentiment analysis

In [1]:
#************************************************************************
#      __   __  _    _  _____   _____
#     /  | /  || |  | ||     \ /  ___|
#    /   |/   || |__| ||    _||  |  _
#   / /|   /| ||  __  || |\ \ |  |_| |
#  /_/ |_ / |_||_|  |_||_| \_\|______|
#    
# 
#   Written by < Daniel L. Marino (marinodl@vcu.edu) > (2016)
#
#   Copyright (2016) Modern Heuristics Research Group (MHRG)
#   Virginia Commonwealth University (VCU), Richmond, VA
#   http://www.people.vcu.edu/~mmanic/
#   
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#  
#   Any opinions, findings, and conclusions or recommendations expressed 
#   in this material are those of the author's(s') and do not necessarily 
#   reflect the views of any other entity.
#  
#   ***********************************************************************
#
#   Description: downloading and formatting of the Large Movie Review dataset
#
#   ***********************************************************************

In [2]:
from vocabulary_coding_simple import *
from twodlearn.tf_lib.datasets import LargeMovieReview
import pickle
import re


def preprocessing(text_in):
    """Clean a piece of text so that only lowercase letters and spaces remain.

    Every run of characters other than ASCII letters (A-Z, a-z) and the
    space character is collapsed into a single space; the result is then
    converted to lowercase.

    Args:
        text_in (str): text to clean.

    Returns:
        str: the cleaned, lowercased text.
    """
    # Digits, punctuation, newlines and non-ASCII characters are all outside
    # the allowed set, so each consecutive run of them becomes one space.
    letters_only = re.sub('[^A-Za-z ]+', ' ', text_in)
    return letters_only.lower()


# Build the dataset object (`imdb`) and the vocabulary coder (`vc`) from the
# files under "imdb_data/". According to the log recorded below this cell, the
# call downloads and decompresses the archive, reads the training and testing
# text files, applies `preprocessing`, builds the vocabulary and encodes the
# texts.
# NOTE(review): keyword meanings are inferred from their names -- confirm
# against LargeMovieReview.read_data_sets:
#   validation_p -- presumably the fraction of data held out for validation
#   vc_size      -- presumably the vocabulary size (encoded vectors printed
#                   further down are 4001 wide)
imdb, vc = LargeMovieReview.read_data_sets(
    "imdb_data/",
    batch_size=64,
    num_unrollings=64,
    validation_p=0.1,
    custom_preproc=preprocessing,
    vc_size=4000,
    shuffle=False
)


Successfully downloaded aclImdb_v1.tar.gz 84125825 bytes.
Decompressing data file...
Reading training text files...
Reading testing text files...
Parsing text files...
Applying custom preprocessing...
Building vocabulary...
Size of string to build the dictionary: 32377684
Encoding texts...
Creating datasets...


In [3]:
def _decode_sample(batch_x, sample_idx):
    """Decode one sample of an encoded batch back into text.

    For every element of `batch_x`, the row `sample_idx` is reduced to the
    index of its largest entry (argmax), and the resulting list of indices is
    handed to `vc.keys2text`.

    NOTE(review): `np` is not imported explicitly in this notebook; it is
    presumably brought in by `from vocabulary_coding_simple import *`.
    """
    keys = [np.argmax(step[sample_idx, :], 0) for step in batch_x]
    return vc.keys2text(keys)


# Number of classes reported by the training split.
print(imdb.train._n_classes)

# Fetch one training batch and report its dimensions: number of elements in
# x, shape of the labels, and shape of a single element of x.
x, y = imdb.train.next_batch()
print(len(x))
print(y.shape)
print(x[0].shape)

# Decode two samples of the batch back to text and show their labels.
for sample_idx in (0, 32):
    print(_decode_sample(x, sample_idx))
    print(y[sample_idx])

2
65
(64, 1)
(64, 4001)
of the late s when the had just finish and the profession wa still be produc there s someth lack about the race game one trailer featur a car over take anoth on a way if it d been a trailer for the you d see jack over take a car and beat a confess out of the who d done a while the profession 
[ 0.]
it wa veri origin whoever thought up thi movi get a stand from me the act wa great luke perri did an excel job onc again i give thi movi the highest rate kenneth branagh s hamlet hit all the mark the act is magnific the cinematographi is gorgeou the oscar nomin costum and set are stun and patrick s score also oscar nomin is 
[ 1.]


In [4]:
# Print the first field of each of the first 1000 entries in the vocabulary's
# key list. Judging by the recorded output these are the vocabulary tokens
# themselves -- TODO confirm the entry layout against the vocabulary class.
vocab_head = vc.key_list[:1000]
print([entry[0] for entry in vocab_head])

['N/A', '', 'the', 'and', 'a', 'of', 'to', 'is', 'it', 'in', 'i', 'thi', 'that', 's', 'movi', 'wa', 'film', 'as', 'for', 'with', 'but', 't', 'you', 'on', 'be', 'not', 'have', 'he', 'hi', 'are', 'one', 'all', 'at', 'they', 'like', 'by', 'an', 'who', 'so', 'from', 'there', 'her', 'or', 'just', 'about', 'out', 'ha', 'if', 'what', 'time', 'some', 'good', 'make', 'can', 'more', 'she', 'charact', 'when', 'get', 'see', 'veri', 'watch', 'up', 'stori', 'even', 'no', 'my', 'would', 'which', 'onli', 'realli', 'their', 'had', 'well', 'we', 'do', 'me', 'were', 'other', 'scene', 'look', 'than', 'show', 'much', 'end', 'will', 'peopl', 'bad', 'go', 'been', 'great', 'also', 'into', 'first', 'becaus', 'love', 'think', 'how', 'him', 'don', 'way', 'act', 'most', 'play', 'made', 'thing', 'then', 'them', 'could', 'too', 'ani', 'after', 'know', 'say', 'seem', 'work', 'plot', 'two', 'year', 'actor', 'come', 'mani', 'seen', 'take', 'life', 'want', 'never', 'littl', 'best', 'where', 'over', 'tri', 'did', 'off',

In [5]:
# Persist the vocabulary coder and the encoded dataset.
# Each file is opened in a `with` block so that the handle is flushed and
# closed as soon as the dump finishes, instead of being left open until the
# garbage collector (or the kernel shutdown) reclaims it.
with open("imdb_vc.pkl", "wb") as vc_file:
    pickle.dump(vc, vc_file)

with open("imdb_dataset.pkl", "wb") as dataset_file:
    pickle.dump(imdb, dataset_file)