# Note
* 1. Load the data
* 2. Tokenize the data into lists (sentences, then words)
* 3. Build the vocabulary (word-to-integer mapping)
* 4. Pad the documents

In [1]:
import locale
import glob
import os.path
import requests
import tarfile
import sys
import codecs
from smart_open import smart_open
import re
import nltk
import numpy as np
import pickle
import itertools
from collections import Counter, OrderedDict
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from keras.preprocessing.sequence import pad_sequences
import re
from os import listdir
import string
from nltk.corpus import stopwords

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Directory holding the extracted IMDB corpus, and the archive it is unpacked from.
dirname = 'data/aclImdb'
filename = 'aclImdb_v1.tar.gz'
# Switch every locale category to the plain "C" locale.
locale.setlocale(locale.LC_ALL, 'C')
# Normalized review texts; filled while the dataset is cleaned further down.
all_lines = []

# U+0085 is replaced by a space during cleaning.
# Compare the numeric version tuple: comparing the `sys.version` *string*
# against '3' is a lexicographic test that only works by accident.
if sys.version_info[0] >= 3:
    control_chars = [chr(0x85)]
else:
    control_chars = [unichr(0x85)]  # noqa: F821 - Python 2 only

# Convert text to lower-case and strip punctuation/symbols from words
def normalize_text(text):
    """Return `text` lower-cased, with HTML breaks removed and punctuation spaced out.

    Every literal '<br />' becomes a single space, and each of the characters
    . " , ( ) ! ? ; : is surrounded by one space on both sides so that it can
    later be split off as its own token.
    """
    without_breaks = text.lower().replace('<br />', ' ')
    return re.sub(r"([\.\",\(\)!\?;:])", " \\1 ", without_breaks)

# Build the cleaned corpus files only when the combined file is not cached yet.
# The cache check uses the same path the file is written to (inside `dirname`),
# so a finished run is actually detected on the next execution.
alldata_path = os.path.join(dirname, 'alldata-id.txt')

if not os.path.isfile(alldata_path):
    if not os.path.isdir(dirname):
        if not os.path.isfile(filename):
            # Download IMDB archive
            print("Downloading IMDB archive...")
            url = u'http://ai.stanford.edu/~amaas/data/sentiment/' + filename
            r = requests.get(url)
            # Fail loudly on an HTTP error instead of saving an error page as the archive.
            r.raise_for_status()
            with smart_open(filename, 'wb') as f:
                f.write(r.content)
        # if error here, try `tar xfz aclImdb_v1.tar.gz` outside notebook, then re-run this cell
        # NOTE(review): the archive is expected to unpack into a top-level `aclImdb/`
        # folder, so it is extracted into the parent of `dirname` - confirm.
        with tarfile.open(filename, mode='r') as tar:
            tar.extractall(path=os.path.dirname(dirname) or '.')
    else:
        print("IMDB archive directory already available without download.")

    # Collect & normalize test/train data
    print("Cleaning up dataset...")
    folders = ['train/pos', 'train/neg', 'test/pos', 'test/neg']
    newline = "\n".encode("utf-8")
    for fol in folders:
        # One output file per folder, e.g. 'train/pos' -> 'train-pos.txt'
        output = fol.replace('/', '-') + '.txt'
        # Sorted so the line order of the generated files is the same on every run.
        txt_files = sorted(glob.glob(os.path.join(dirname, fol, '*.txt')))
        print(" %s: %i files" % (fol, len(txt_files)))
        with smart_open(os.path.join(dirname, output), "wb") as n:
            for txt in txt_files:
                with smart_open(txt, "rb") as t:
                    one_text = t.read().decode("utf-8")
                for c in control_chars:
                    one_text = one_text.replace(c, ' ')
                one_text = normalize_text(one_text)
                all_lines.append(one_text)
                # One normalized review per line
                n.write(one_text.encode("utf-8"))
                n.write(newline)

    # Save to disk for instant re-use on any future runs;
    # each line is prefixed with its running index as "_*<idx> ".
    with smart_open(alldata_path, 'wb') as f:
        for idx, line in enumerate(all_lines):
            num_line = u"_*{0} {1}\n".format(idx, line)
            f.write(num_line.encode("utf-8"))

assert os.path.isfile(alldata_path), "alldata-id.txt unavailable"
print("Success, alldata-id.txt is available for next steps.")

IMDB archive directory already available without download.
Cleaning up dataset...
 train/pos: 12500 files
 train/neg: 12500 files
 test/pos: 12500 files
 test/neg: 12500 files
Success, alldata-id.txt is available for next steps.


In [26]:
def sent_tokenize(doc):
    """Split the document string `doc` into a list of sentences using NLTK."""
    return nltk.sent_tokenize(doc)

def word_tokenize(sent):
    """Split the sentence `sent` into a list of lower-cased tokens using NLTK."""
    return [tok.lower() for tok in nltk.word_tokenize(sent)]

def readfile(filename):
    """Read a UTF-8 text file and return its lines as a list.

    Leading and trailing whitespace (including the line break) is stripped
    from every line; empty lines are kept as empty strings.
    """
    with open(filename, mode='r', encoding='UTF-8') as handle:
        return [raw_line.strip() for raw_line in handle]

# Pad one review to a fixed shape.
# `sentences` is a single review, i.e. one element such as train_data_word[0]
# `max_words` is the maximum number of words per sentence
# `max_sents` is the maximum number of sentences per review
# This guarantees every review has the same sentence count and sentence length.
def pad_sent(sentences, max_words, max_sents):
    """
    Pad (or cut) one document to exactly `max_sents` rows of `max_words` entries.

    Input: sentences - List of lists, where each element is a sequence.
    - max_words: Int, length every sentence is brought to (padding is added
      at the end of a sentence, `padding='post'`).
    - max_sents: Int, number of sentences every document is brought to.
    Return: 2-D array with `max_sents` rows and `max_words` columns. Documents
    with fewer sentences get all-zero rows placed in front of their own
    sentences; documents with more keep only their first `max_sents` sentences.
    """
    padded = pad_sequences(sentences, maxlen=max_words, padding='post')
    n_sents = len(padded)

    # Enough sentences already: keep the leading ones only.
    if n_sents >= max_sents:
        return padded[:max_sents]

    # Too few sentences: put the zero rows before the real sentences.
    filler = np.zeros((max_sents - n_sents, max_words), dtype=int)
    return np.concatenate((filler, padded), axis=0)

# Build the word -> integer mapping used to encode the corpus.
def build_vocab(corpus):
    """
    Builds a vocabulary mapping from word to index based on the corpus.

    Input: corpus - list of samples; each sample is a list of sentences and
           each sentence is a list of words.
    Return: OrderedDict - vocabulary mapping from word to integer. Index 0 is
            '<PAD/>' (padding), index 1 is '<UKN/>' (unknown words), and the
            remaining words follow in order of decreasing frequency.
    """
    reserved = ['<PAD/>', '<UKN/>']   # 0 for padding, 1 for unknown words
    # Flatten documents -> sentences -> words lazily and count every word.
    all_words = itertools.chain.from_iterable(itertools.chain.from_iterable(corpus))
    word_counts = Counter(all_words)
    # A reserved token that also occurs in the corpus must not be listed twice:
    # the later duplicate would overwrite index 0/1 and leave a gap in the
    # index range.
    vocabulary = reserved + [
        word for word, _ in word_counts.most_common() if word not in reserved
    ]
    # Mapping from word to index
    return OrderedDict((word, idx) for idx, word in enumerate(vocabulary))

# `corpus` is a three-level nested list: samples -> sentences -> words.
def build_input_data(corpus, vocab2int, max_words, max_sents):
    """
    Encode every word of the corpus as an integer and pad to a fixed shape.

    Input: corpus - list of samples, each sample is a list of sentences, each sentence is a list of words
    - vocab2int: vocabulary handed to `getWordIdx` for every word
    - max_words, max_sents: target sizes passed on to `pad_sent`
    Return: numpy array built from the padded documents, one entry per sample.
    """
    padded_docs = []
    for sample in corpus:
        # Replace each word by its index (lookup is delegated to getWordIdx).
        encoded = [
            [getWordIdx(word, vocab2int) for word in sentence]
            for sentence in sample
        ]
        padded_docs.append(pad_sent(encoded, max_words, max_sents))
    return np.array(padded_docs)

def load_embedding_matrix(embed_path, vocab2int, EMBEDDING_DIM, embed_type='glove'):
    """
    Build the embedding matrix for a vocabulary from a pre-trained embedding file.

    Input:
    - embed_path: path of the pre-trained embedding file.
    - vocab2int: mapping from word to integer index.
    - EMBEDDING_DIM: Int, length of one embedding vector.
    - embed_type: 'word2vec' reads the binary word2vec layout (a header line
      "<vocab_size> <vector_size>", then for every word: the word, one space
      and <vector_size> float32 values). Any other value reads a text file
      with one "word v1 v2 ..." entry per line (GloVe / dependency-based).
    Return: array with len(vocab2int) rows and EMBEDDING_DIM columns;
    embedding_matrix[i] is the embedding for 'vocab2int' integer index i.
    Words missing from the file get small random values drawn uniformly
    from [-0.10, 0.10); '<PAD/>' is all zeros unless the file defines it.
    Raises: EOFError if a word2vec file ends in the middle of a word.
    """
    embeddings = {}
    embeddings['<PAD/>'] = np.zeros(EMBEDDING_DIM)  # Zero vector for '<PAD/>' word
    # Vector of small random numbers for unknown words
    embeddings['<UKN/>'] = np.random.uniform(-0.10, 0.10, EMBEDDING_DIM)

    if embed_type == 'word2vec':
        # Loads word vecs from Google (Mikolov) word2vec, e.g.
        # GoogleNews-vectors-negative300.bin
        with open(embed_path, "rb") as f:
            header = f.readline()
            vocab_size, layer1_size = map(int, header.split())
            binary_len = np.dtype('float32').itemsize * layer1_size
            for _ in range(vocab_size):
                # The file is opened in binary mode, so every comparison has to
                # be made against bytes; comparing with str never matches on
                # Python 3 and the loop would never terminate.
                word_bytes = []
                while True:
                    ch = f.read(1)
                    if ch == b'':
                        raise EOFError(
                            "unexpected end of word2vec file: %s" % embed_path)
                    if ch == b' ':
                        break
                    if ch != b'\n':   # skip the line break left over from the previous entry
                        word_bytes.append(ch)
                word = b''.join(word_bytes).decode('utf-8', 'ignore')
                embeddings[word] = np.frombuffer(f.read(binary_len), dtype='float32')
    else:
        # load Glove or Dependency-based word embeddings
        # Explicit UTF-8 so reading does not depend on the platform's default encoding.
        with open(embed_path, encoding='utf-8') as f:
            for line in f:
                values = line.split()
                if not values:   # tolerate blank lines
                    continue
                embeddings[values[0]] = np.asarray(values[1:], dtype='float32')

    embedding_matrix = np.zeros((len(vocab2int), EMBEDDING_DIM))
    for word, i in vocab2int.items():
        embedding_vector = embeddings.get(word)
        if embedding_vector is None:   # word is unknown
            # Vector of small random numbers for unknown words
            embedding_vector = np.random.uniform(-0.10, 0.10, EMBEDDING_DIM)
        embedding_matrix[i] = embedding_vector
    return embedding_matrix


In [4]:
# Load each split of the movie reviews as a list of strings (one review per
# line). The files are read from `dirname`, the same directory the cleaning
# step writes them to, instead of a machine-specific absolute path.
test_neg = readfile(os.path.join(dirname, 'test-neg.txt'))
test_pos = readfile(os.path.join(dirname, 'test-pos.txt'))
train_neg = readfile(os.path.join(dirname, 'train-neg.txt'))
train_pos = readfile(os.path.join(dirname, 'train-pos.txt'))

In [5]:
# Label every review: 0 for a negative review, 1 for a positive one.
test_neg_label = [0] * len(test_neg)
test_pos_label = [1] * len(test_pos)
train_neg_label = [0] * len(train_neg)
train_pos_label = [1] * len(train_pos)

# Merge labels and reviews in the same order (negative first, then positive)
# so that position i of a label list belongs to position i of the data list.
test_label = test_neg_label + test_pos_label
train_label = train_neg_label + train_pos_label
test_data = test_neg + test_pos
train_data = train_neg + train_pos

# Shuffle each data list together with its own labels so the pairs stay aligned.
from sklearn.utils import shuffle
train_data, train_label = shuffle(train_data, train_label, random_state=0)
# The test split must be shuffled with the *test* labels and assigned back to
# `test_data`; the former line stored the result under a misspelled name and
# overwrote `test_label` with shuffled training labels.
test_data, test_label = shuffle(test_data, test_label, random_state=0)

In [6]:
# Split every review into sentences: each list of review strings becomes a
# list of lists of sentence strings.
train_data_sent = list(map(sent_tokenize, train_data))
test_data_sent = list(map(sent_tokenize, test_data))

In [7]:
# Split every training sentence into words: reviews -> sentences -> words.
train_data_word = [
    [word_tokenize(sentence) for sentence in review]
    for review in train_data_sent
]

In [22]:
# Show the first tokenized training review between two separator lines.
print('**************************')
# '\n' (not '/n') is the newline escape, giving a blank line after the caption.
print('slicing the train_data_word list for obeservation.\n')
print(train_data_word[:1])
print('**************************')

**************************
slicing the train_data_word list for obeservation./n
[[['what', 'can', 'i', 'add', 'that', 'the', 'previous', 'comments', 'have', "n't", 'already', 'said', '.'], ['this', 'is', 'a', 'great', 'film', 'and', 'the', 'light', 'sabre', 'duel', 'star', 'wars', 'tribute', 'has', 'to', 'be', 'seen', 'to', 'be', 'believed', '!'], ['!'], ['there', 'are', 'moments', 'of', 'genius', 'throughout', 'this', 'movie', ',', 'if', 'you', 'can', ',', 'see', 'it', 'now', '!'], ['thanks', 'again', 'to', 'rick', 'baker', 'who', 'gave', 'me', 'this', 'movie', 'many', 'years', 'ago', '!']]]
**************************


In [9]:
# Show the three nesting levels of the tokenized data:
# one review, one sentence of it, one word of that sentence.
for label, value in (
    ('train_data_word[0] = ', train_data_word[0]),
    ('train_data_word[0][0] = ', train_data_word[0][0]),
    ('train_data_word[0][0][0] = ', train_data_word[0][0][0]),
):
    print('**************************')
    print(label, value)

**************************
train_data_word[0] =  [['what', 'can', 'i', 'add', 'that', 'the', 'previous', 'comments', 'have', "n't", 'already', 'said', '.'], ['this', 'is', 'a', 'great', 'film', 'and', 'the', 'light', 'sabre', 'duel', 'star', 'wars', 'tribute', 'has', 'to', 'be', 'seen', 'to', 'be', 'believed', '!'], ['!'], ['there', 'are', 'moments', 'of', 'genius', 'throughout', 'this', 'movie', ',', 'if', 'you', 'can', ',', 'see', 'it', 'now', '!'], ['thanks', 'again', 'to', 'rick', 'baker', 'who', 'gave', 'me', 'this', 'movie', 'many', 'years', 'ago', '!']]
**************************
train_data_word[0][0] =  ['what', 'can', 'i', 'add', 'that', 'the', 'previous', 'comments', 'have', "n't", 'already', 'said', '.']
**************************
train_data_word[0][0][0] =  what


In [27]:
# Build the vocabulary (word -> integer index) from the tokenized training reviews.
vocab_to_int = build_vocab(train_data_word)

[('the', 335437), ('.', 327192), (',', 276280), ('and', 163554), ('a', 162627), ('of', 145447), ('to', 135228), ('is', 110431), ('it', 96231), ('in', 93326)]


In [None]:
# Determine the padding sizes from the training set.
pad_train_set = []

# Number of sentences in every review.
list_maxlen_sent = [len(review) for review in train_data_sent]

# Number of words in every sentence. This must be measured on the tokenized
# data (`train_data_word`): the entries of `train_data_sent` are strings, so
# their length is a character count, not a word count.
list_maxlen_word = [len(sentence) for review in train_data_word for sentence in review]

# Longest sentence (in words) and longest review (in sentences);
# 0 when there is no data instead of a ValueError from max().
maxlen_word = max(list_maxlen_word, default=0)
maxlen_sent = max(list_maxlen_sent, default=0)