In [4]:
import urllib.request
import os
import tarfile

In [5]:
# Source archive for the ACL IMDb dataset and where to store it locally.
url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
filePath = "data/aclImdb_v1.tar.gz"
# Create the target directory up front; urlretrieve does not create missing
# parent directories and would otherwise fail on a fresh checkout.
os.makedirs(os.path.dirname(filePath), exist_ok=True)
# Only download when the archive is not already on disk, so re-running the
# cell is cheap.
if not os.path.isfile(filePath):
    result = urllib.request.urlretrieve(url, filePath)
    print('downloaded: ', result)

In [6]:
# Unpack the downloaded archive into data/ (skipped when data/aclImdb already exists).
if not os.path.exists("data/aclImdb"):
    # The context manager closes the archive handle even if extraction fails.
    # NOTE(review): extractall trusts the member paths stored in the archive;
    # on Python versions that support it, consider passing filter='data'.
    with tarfile.open("data/aclImdb_v1.tar.gz", 'r:gz') as tempTarFile:
        tempTarFile.extractall('data/')

In [7]:
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [8]:
# using regular expression (to remove the HTML tag)
import re

In [9]:
def remove_tags(text):
    """Return ``text`` with every ``<...>`` tag-like span deleted.

    A span is a ``<``, followed by one or more characters that are not ``>``,
    followed by ``>``. Each match is replaced by the empty string; everything
    else is returned unchanged.
    """
    tag_pattern = r'<[^>]+>'
    stripped = re.sub(tag_pattern, '', text)
    return stripped

In [10]:
import os

# read files (positive or negative)
def read_files(file_type):
    """Load every review of one dataset split together with its label.

    Args:
        file_type: Sub-directory of ``data/aclImdb/`` to read, e.g. ``"train"``
            or ``"test"``. It must contain ``pos/`` and ``neg/`` folders.

    Returns:
        A tuple ``(all_labels, all_texts)`` of equal length. Entries for the
        ``pos/`` files come first (label 1), followed by the ``neg/`` files
        (label 0). Each text is the file content with HTML tags removed.

    Raises:
        OSError: If one of the folders or files cannot be read.
    """
    path = "data/aclImdb/"

    positive_path = path + file_type + "/pos/"
    positive_files = [positive_path + f for f in os.listdir(positive_path)]

    negative_path = path + file_type + "/neg/"
    negative_files = [negative_path + f for f in os.listdir(negative_path)]

    file_list = positive_files + negative_files
    print('read', file_type, 'files: ', len(file_list) )

    # Derive the labels from the actual file counts so labels and texts stay
    # aligned even if a folder does not hold exactly 12,500 reviews.
    all_labels = [1] * len(positive_files) + [0] * len(negative_files)

    all_texts = []
    for f in file_list:
        with open(f, encoding='utf8') as file_input:
            # Lines are joined with a single space, then HTML tags are stripped.
            all_texts.append(remove_tags(" ".join(file_input.readlines())))

    return all_labels, all_texts

In [11]:
# Load the training split: labels (1 = pos, 0 = neg) and the tag-stripped review texts.
y_train, x_train_text = read_files("train")

read train files:  25000


In [12]:
# Load the test split the same way as the training split.
y_test, x_test_text = read_files("test")

read test files:  25000


In [13]:
# Sanity check: the first entry comes from the pos/ folder, which read_files lists first.
x_train_text[0]

'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!'

In [14]:
# Label of the first review; expected to be 1 (positive).
y_train[0]

1

In [15]:
# Index 12500 is where the neg/ reviews start, assuming pos/ holds 12,500 files.
x_train_text[12500]

"Story of a man who has unnatural feelings for a pig. Starts out with a opening scene that is a terrific example of absurd comedy. A formal orchestra audience is turned into an insane, violent mob by the crazy chantings of it's singers. Unfortunately it stays absurd the WHOLE time with no general narrative eventually making it just too off putting. Even those from the era should be turned off. The cryptic dialogue would make Shakespeare seem easy to a third grader. On a technical level it's better than you might think with some good cinematography by future great Vilmos Zsigmond. Future stars Sally Kirkland and Frederic Forrest can be seen briefly."

In [16]:
# Label at the same index; expected to be 0 (negative).
y_train[12500]

0

In [17]:
# Build the word index from the training texts only, so the test split does
# not influence the vocabulary. num_words=2000 restricts later text-to-sequence
# conversion to the most frequent words (see the Keras Tokenizer docs).
token = Tokenizer(num_words=2000)
token.fit_on_texts(x_train_text)

In [18]:
# Number of texts the tokenizer was fitted on; should equal the size of the training split.
print(token.document_count)

25000


In [19]:
# print(token.word_index)

In [20]:
# Convert each review into a list of integer word indices, using the
# tokenizer fitted on the training texts for both splits.
x_train_seq = token.texts_to_sequences(x_train_text)
x_test_seq = token.texts_to_sequences(x_test_text)

In [21]:
# Show the raw text of the first review for comparison with its index sequence.
print( x_train_text[0] )

Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High's satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I'm here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn't!


In [22]:
# The same review as a sequence of word indices.
print( x_train_seq[0] )

[308, 6, 3, 1068, 208, 8, 29, 1, 168, 54, 13, 45, 81, 40, 391, 109, 137, 13, 57, 149, 7, 1, 481, 68, 5, 260, 11, 6, 72, 5, 631, 70, 6, 1, 5, 1, 1530, 33, 66, 63, 204, 139, 64, 1229, 1, 4, 1, 222, 899, 28, 68, 4, 1, 9, 693, 2, 64, 1530, 50, 9, 215, 1, 386, 7, 59, 3, 1470, 798, 5, 176, 1, 391, 9, 1235, 29, 308, 3, 352, 343, 142, 129, 5, 27, 4, 125, 1470, 5, 308, 9, 532, 11, 107, 1466, 4, 57, 554, 100, 11, 308, 6, 226, 47, 3, 11, 8, 214]


In [23]:
# Force every sequence to length 100. As the before/after prints in the
# following cells show, longer sequences lose their leading tokens and
# shorter ones are padded with zeros at the front.
x_train_final = sequence.pad_sequences(x_train_seq, maxlen=100 )
x_test_final = sequence.pad_sequences(x_test_seq, maxlen=100 )

In [24]:
# Example of a sequence longer than maxlen, before padding/truncation.
print('before', len(x_train_seq[0]) )
print( x_train_seq[0] )

before 106
[308, 6, 3, 1068, 208, 8, 29, 1, 168, 54, 13, 45, 81, 40, 391, 109, 137, 13, 57, 149, 7, 1, 481, 68, 5, 260, 11, 6, 72, 5, 631, 70, 6, 1, 5, 1, 1530, 33, 66, 63, 204, 139, 64, 1229, 1, 4, 1, 222, 899, 28, 68, 4, 1, 9, 693, 2, 64, 1530, 50, 9, 215, 1, 386, 7, 59, 3, 1470, 798, 5, 176, 1, 391, 9, 1235, 29, 308, 3, 352, 343, 142, 129, 5, 27, 4, 125, 1470, 5, 308, 9, 532, 11, 107, 1466, 4, 57, 554, 100, 11, 308, 6, 226, 47, 3, 11, 8, 214]


In [25]:
# The same example after pad_sequences: truncated to 100 entries.
print('after', len(x_train_final[0]) )
print( x_train_final[0] )

after 100
[  29    1  168   54   13   45   81   40  391  109  137   13   57  149
    7    1  481   68    5  260   11    6   72    5  631   70    6    1
    5    1 1530   33   66   63  204  139   64 1229    1    4    1  222
  899   28   68    4    1    9  693    2   64 1530   50    9  215    1
  386    7   59    3 1470  798    5  176    1  391    9 1235   29  308
    3  352  343  142  129    5   27    4  125 1470    5  308    9  532
   11  107 1466    4   57  554  100   11  308    6  226   47    3   11
    8  214]


In [26]:
# Example of a sequence shorter than maxlen, before padding.
print('before', len(x_train_seq[12500]) )
print( x_train_seq[12500] )

before 93
[61, 4, 3, 128, 33, 43, 1412, 14, 3, 513, 42, 15, 3, 632, 132, 11, 6, 3, 1299, 456, 4, 1753, 208, 3, 307, 6, 675, 79, 31, 1111, 30, 1, 928, 4, 41, 468, 8, 1753, 1, 222, 54, 15, 53, 825, 1317, 846, 227, 8, 39, 95, 121, 1485, 56, 144, 35, 1, 996, 140, 25, 675, 121, 1, 411, 58, 93, 302, 770, 5, 3, 836, 19, 3, 1756, 646, 41, 124, 70, 21, 234, 100, 15, 45, 48, 626, 30, 701, 84, 701, 379, 2, 66, 25, 106]


In [27]:
# The same example after pad_sequences: zero-padded at the front up to 100 entries.
print('after', len(x_train_final[12500]) )
print( x_train_final[12500] )

after 100
[   0    0    0    0    0    0    0   61    4    3  128   33   43 1412
   14    3  513   42   15    3  632  132   11    6    3 1299  456    4
 1753  208    3  307    6  675   79   31 1111   30    1  928    4   41
  468    8 1753    1  222   54   15   53  825 1317  846  227    8   39
   95  121 1485   56  144   35    1  996  140   25  675  121    1  411
   58   93  302  770    5    3  836   19    3 1756  646   41  124   70
   21  234  100   15   45   48  626   30  701   84  701  379    2   66
   25  106]
