In [1]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow import keras
from tensorflow.keras.optimizers import SGD
import numpy as np
import pathlib
import matplotlib.pyplot as plt
from os.path import dirname, join as pjoin
import scipy.io as sio
import h5py
import hdf5storage
import random
from tensorflow.keras.models import model_from_json
from tensorflow.keras.models import load_model
from PIL import Image
from matplotlib.image import imread
import os
from tensorflow.keras import applications
from sklearn.datasets import load_svmlight_file
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize import RegexpTokenizer

In [2]:
DATASET_URL = 'https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'

In [3]:
data_root_orig = tf.keras.utils.get_file(origin=DATASET_URL, fname='aclImdb', untar=True)
data_root = pathlib.Path(data_root_orig)
print(data_root)

nltk.download('punkt')

/home/maxim/.keras/datasets/aclImdb


[nltk_data] Downloading package punkt to /home/maxim/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
PATH_TO_TRAIN = str(data_root) + '/train'
PATH_TO_TEST =str(data_root) + '/test'

In [5]:
import re

NUM_TOKEN = '<num>'
UNKNOWN_TOKEN = '<unk>'

def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', raw_html)
    return cleantext

def review_to_indices(text, vocab):
    indecies = []
    for text_word in text:
        word = text_word.lower()
        index = -1
        # try find word in vocab
        try:
            index = vocab.index(word)
        except:
            # if we can't find it - try to parse it to int
            try:
                i = int(word)
                index = vocab.index(NUM_TOKEN)
            except:
                # else it unknown
                index = vocab.index(UNKNOWN_TOKEN)
        indecies.append(index)
    return indecies
                
   

In [34]:
# load data

# load vocab

vocab = []

with open(str(data_root) + '/imdb.vocab', 'r') as file:
    vocab = file.read().splitlines()
    vocab.append(NUM_TOKEN)
    vocab.append(UNKNOWN_TOKEN)
    print('Added <NUM> and <UNK> tokens')
    print('Vocab len =', len(vocab))
    
tokenizer = RegexpTokenizer(r'\w+')

total_review_len = 0
total_review_count = 0

reviews = []

for review_name in os.listdir(PATH_TO_TRAIN + '/pos'):
    mark = int(review_name.split('_')[1].split('.')[0])
    path = PATH_TO_TRAIN + '/pos/' + review_name
    with open(path, 'r') as file:
        data = file.read()
        clean_review = cleanhtml(data)
        indecies = review_to_indices(tokenizer.tokenize(clean_review), vocab)
        total_review_len += len(indecies)
        total_review_count += 1
        reviews.append((indecies, mark))

        
for review_name in os.listdir(PATH_TO_TRAIN + '/neg'):
    mark = int(review_name.split('_')[1].split('.')[0])
    path = PATH_TO_TRAIN + '/neg/' + review_name
    with open(path, 'r') as file:
        data = file.read()
        clean_review = cleanhtml(data)
        indecies = review_to_indices(tokenizer.tokenize(clean_review), vocab)
        total_review_len += len(indecies)
        total_review_count += 1
        reviews.append((indecies, mark))
        

print(len(reviews))
print(reviews[10])

print(len(reviews))
print(reviews[13000])

Added <NUM> and <UNK> tokens
Vocab len = 89529
25000
([9, 5, 2, 49, 214, 70, 621, 1527, 15, 82, 6, 5, 2, 931, 1127, 0, 356, 1182, 22, 321, 1, 0, 109, 5, 255, 6, 39, 19, 178, 48, 252, 6515, 9118, 6, 528, 79, 7, 5495, 59, 1469, 6, 388, 2, 526, 3, 2, 145, 1425, 418, 1, 19, 2, 1527, 15, 16, 3, 254, 6, 5, 2, 1527, 1, 2, 62, 48, 26, 14, 314, 1281, 461, 809, 1, 1357, 1, 30, 0, 93, 0, 381, 5, 902, 103, 44, 13, 9, 26], 8)
25000
([2, 175, 621, 4300, 89528, 989, 363, 739, 374, 3, 1761, 1, 562, 3, 2045, 47, 1468, 151, 1605, 2693, 743, 3, 2914, 1825, 1, 75, 1342, 353, 18, 2, 2482, 2559, 9, 43, 156, 4, 81, 14, 17075, 29, 216, 19, 7, 54, 1123, 5360, 301, 9, 1123, 278, 763, 5, 477, 38215, 37593, 1, 43, 5847, 41659, 981, 419, 6376, 4817, 1, 31241, 82, 63, 1334, 538, 23, 26, 21, 528, 0, 60, 4835, 91, 20, 24, 6376, 4817, 1, 31241, 981, 245, 78, 82, 33, 32151, 1, 0, 1334, 3309, 1, 20, 24, 31241, 981, 419, 286, 82, 21, 528, 40, 0, 1082, 1, 87, 4495, 19282, 535, 1190, 10, 528, 6, 55, 49, 11494, 7, 0, 739, 2

In [52]:
average_review_len = int(total_review_len / total_review_count)

print(total_review_len)
print(total_review_count)
print(average_review_len)

5954856
25000
238
25000


In [94]:
x = []
y = []

np_vocab = np.asarray(vocab)

random.shuffle(reviews)

print(reviews[0][0][:average_review_len])

for review in reviews:
    review_rep = np.asarray(review[0])
#     one_hot_rep = np.zeros((vocab.size, review_rep.max() + 1))
#     one_hot_rep[np.arange(review_rep.size), review_rep] = 1
    y.append(review[1])
    
y = np.asarray(y)
print(y)
# print(review_rep)
# one_hot_rep = np.zeros((np_vocab.size, 1))
# one_hot_rep[review_rep[0], 0] = 1.
# one_hot_rep[0, 0]
# len(one_hot_rep)
# one_hot_rep[np.arange(review_rep.size), review_rep] = 1
# print(review_rep[0])
# print(one_hot_rep[0][review_rep[0]])

[0, 15018, 5, 26, 3, 26776, 9238, 528, 87, 13444, 1112, 16, 7, 9, 1121, 1203, 0, 1188, 40, 17353, 854, 37, 9, 301, 1230, 178, 2703, 4, 9238, 528, 193, 61, 35, 26, 66, 854, 4535, 0, 1767, 18, 290, 40498, 528, 221, 42, 224, 0, 2074, 1771, 14, 0, 20387, 721, 0, 12514, 1023, 1, 0, 1178, 15052, 575, 13588, 26, 58, 25, 201, 4, 514, 9, 61, 4, 94, 13, 2, 653, 5605, 242, 1109, 15, 294, 0, 17, 5, 2, 5333, 734, 1534, 10, 444, 175, 2256, 4, 103, 408, 32, 22, 19, 1066, 14, 0, 61, 139, 387, 318, 0, 263, 84, 28, 0, 17, 76, 2647, 5, 4, 265, 92, 129, 318, 45, 5, 159, 2, 408, 528, 343, 179, 7663, 6128, 575, 3, 4145, 3580, 442, 7, 2, 582, 96, 2797, 9499, 478, 0, 2646, 19695, 3, 1996, 39406, 26, 2066, 51, 5, 22356, 140, 0, 1990, 50, 51, 3151, 2, 2482, 800, 1253, 31517, 183, 0, 2019, 876, 0, 800, 7433, 6, 43, 73, 107, 35, 6, 4127, 7, 140, 0, 1990, 1, 33507, 6128, 34, 38, 1411, 8903, 38, 80, 89, 4100, 6700, 155, 8737, 80, 0, 304, 6128, 506, 2078, 10, 51, 43, 73, 3410, 30, 2, 1424, 34, 1424, 661, 1, 3447, 10

array([[1.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]])

In [7]:
from __future__ import print_function

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from keras.datasets import imdb

max_features = 20000
# cut texts after this number of words (among top max_features most common words)
maxlen = 80
batch_size = 32

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

y_train

Loading data...


Using TensorFlow backend.


array([1, 0, 0, ..., 0, 1, 0])

In [19]:
'0_10.txt'

'10'