## Parsing 

In [2]:
import numpy as np 
import pandas as pd 

try:
    import pickle
except:
    import cPickle as pickle

    
""" investigating Amazon Reviews dataset (shoes)

example entry:

product/productId: B0009PK7KO
product/title: Caligarius Women's Acheta Pump,Black Calf,6 M
product/price: unknown
review/userId: A12O8IHB65BC1S
review/profileName: Fifi
review/helpfulness: 2/2
review/score: 4.0
review/time: 1169769600
review/summary: Beautiful basic pump...
review/text: ... but not enough sizes or colors. Fits true to size on my size 8-1/2 feet.
Bottom soles are completely slick... needs some kind of texturing or tread to help prevent slipping.

"""
    
def build_reviews_list(path):
    """ read the whole file at `path` and cut it into raw review entries,
        treating an empty line between two reviews as the separator;
        a final empty entry (file ending with an empty line) is dropped
    """
    with open(path, 'r') as handle:
        raw_entries = handle.read().split('\n\n')
    # only the last entry is inspected, and only for being exactly empty
    return raw_entries[:-1] if raw_entries[-1] == "" else raw_entries

def build_review_dict(review):
    """ build a review dict from a review entry,
        assuming each field starts on a line of the form 'feature_name: feature_content'

        review:  one raw review entry, fields separated by newlines
        returns: dict mapping feature_name -> feature_content

        Only the first ': ' on a line separates the name from the content, so
        content that itself contains ': ' is kept whole.
        A non-empty line without ': ' is treated as a continuation of the
        previous field (review text may span several lines) and is appended to
        it after a newline; such a line before any field is ignored.
        Empty lines are skipped.
        Limitation: a continuation line that happens to contain ': ' is still
        read as the start of a new field.
    """
    review_dict = {}
    last_name = None
    for line in review.split('\n'):
        if not line:
            continue
        name, separator, content = line.partition(': ')
        if separator:
            review_dict[name] = content
            last_name = name
        elif last_name is not None:
            # no separator: this line belongs to the field that came before it
            review_dict[last_name] += '\n' + line
    return review_dict

def build_list_of_review_dict(path):
    """ parse the file at `path` into one dict per review entry
    """
    review_dicts = []
    for raw_review in build_reviews_list(path):
        review_dicts.append(build_review_dict(raw_review))
    return review_dicts
    
def save_review_dict_list(path, dic_list):
    """ pickle `dic_list` into the file at `path` (opened in binary mode),
        then print where the data went
    """
    with open(path, 'wb') as out_file:
        pickle.dump(dic_list, out_file)
    message = 'data saved to ' + path
    print(message)

def load_review_dic_list(path):
    """ open the pickle file at `path` in binary mode and return the unpickled object
    """
    with open(path, 'rb') as in_file:
        return pickle.load(in_file)

## embedding

In [3]:
def get_sample_score_distribution(scores):
    """ tally how often each score occurs (the basis for a majority predictor)

        returns (scores_dict, count): a dict of score -> number of occurrences,
        and the total number of scores seen
    """
    frequencies = {}
    total = 0
    # enumerate from 1 so `total` ends up as the number of items iterated
    for total, score in enumerate(scores, 1):
        frequencies[score] = frequencies.get(score, 0) + 1
    return frequencies, total

In [4]:
def get_text_and_score_lists(reviews_list):
    """ pull the 'review/text' values and the 'review/score' values out of a
        list of review dicts; each score is parsed as a float, then truncated to int
    """
    texts = [entry['review/text'] for entry in reviews_list]
    scores = []
    for entry in reviews_list:
        raw_score = entry['review/score']
        scores.append(int(float(raw_score)))
    return texts, scores

def score_to_categorical(scores):
    """ convert the scores to an array and pass it through `to_categorical`
    """
    score_array = np.asarray(scores)
    return to_categorical(score_array)


# def filter_text_as_sentence_list(text, end="END"):
#     """ replace . with <END> for each sentence in a text entry,
#         return list of individual sentences 
#         @ not useful 
#     """
#     sentences_list = []
#     for sent in text.split('.'):
#         if sent.strip() != '':
#             sentences_list.append(sent.strip()+" "+end)
#     return sentences_list


def filter_text(text, end="<END>"):
    """ split a text entry on '.', drop the blank pieces, append the `end`
        marker to every remaining (stripped) piece and join them with spaces
    """
    stripped_pieces = (piece.strip() for piece in text.split('.'))
    return ' '.join(piece + " " + end for piece in stripped_pieces if piece != '')

def convert_review_as_sentence(review):
    """ convert one review entry into a list of sentences,
        only for hierarchical attention model

        review:  text whose sentences are terminated by an end marker,
                 either " <END>" (the default marker of filter_text) or " END"
        returns: list of stripped, non-empty sentences without the marker
    """
    # filter_text() emits "<END>" by default, which a plain split on " END"
    # never matches; normalise it first so both marker spellings are handled
    normalized = review.replace(" <END>", " END")
    stripped = (sent.strip() for sent in normalized.split(" END"))
    # filter after stripping so whitespace-only fragments are dropped too
    return [sent for sent in stripped if sent]

def convert_reviews_to_sentences(texts):
    """ map every text entry to its list of sentences and return the list of
        those lists, only for hierarchical attention model
    """
    sentence_lists = []
    for text in texts:
        sentence_lists.append(convert_review_as_sentence(text))
    return sentence_lists

In [None]:
import os 

# module-level defaults for tokenizing, padding and embedding
max_num_words = 20000      # max number of words to take in a corpus
max_sequence_length = 100    # used as maxlen / padding width by the padding helpers below (original note: corpus max sentence length 2043)
embedding_dim = 100        # embedding vector size; presumably matches the 100d GloVe file used by default below


# need to build an index mapping words to their embeddings:
# embeddings_index[word] = coefficient vector as np.array

def get_embedding_dict(file_name='glove.6B.100d.txt', glove_dir='glove.6B/', base_dir=''):
    """ construct an embedding dict from a whitespace-separated text file,
        each item is a word to its embedding array

        file_name: name of the embedding file
        glove_dir: directory holding the file; it is concatenated directly
                   onto base_dir, so base_dir needs its own trailing separator
        base_dir:  prefix placed in front of glove_dir
        returns:   dict of word -> float32 np.array of the numbers on its line

        Every non-blank line is expected to hold a word followed by its
        coefficients; blank lines are skipped.
    """
    embeddings_index = {}
    embedding_path = os.path.join(base_dir + glove_dir, file_name)
    # decode as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding
    with open(embedding_path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # blank line: nothing to index
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings_index


def get_tokenizer(filtered_texts, max_num_words):
    """ create a Tokenizer with num_words=`max_num_words`, fit it on
        `filtered_texts` and return it
    """
    fitted = Tokenizer(num_words=max_num_words)
    fitted.fit_on_texts(filtered_texts)
    return fitted

### extra data processing in hierarchical attention model 

In [None]:
def get_sequence_of_sentences(filtered_texts_sentences, tokenizer):
    """ run tokenizer.texts_to_sequences over the sentence list of every entry,
        giving one list of sequences per entry
        This part is only for hierarchical attention model
    """
    sequences_per_entry = []
    for sentences in filtered_texts_sentences:
        sequences_per_entry.append(tokenizer.texts_to_sequences(sentences))
    return sequences_per_entry

def get_sentences_max_num(sequences_sentences):
    """ return the largest number of sentences found in any single sample
        (0 when there are no samples),
        This part is only for hierarchical attention model
    """
    sentence_counts = [len(entry) for entry in sequences_sentences]
    # the leading 0 keeps the result at 0 for an empty corpus
    return max([0] + sentence_counts)
    
def pad_sequences_in_list_of_sentence_lists(sequences_sentences):
    """ apply pad_sequences (maxlen=max_sequence_length) to the sequences of
        each entry and collect the results in a list,
        This part is only for hierarchical attention model
    """
    padded_entries = []
    for sentence_sequences in sequences_sentences:
        padded_entries.append(pad_sequences(sentence_sequences, maxlen=max_sequence_length))
    return padded_entries

def pad_sentences(data_sentences, sentence_max_num, max_length=max_sequence_length):
    """ pad the data in a sentence level,
        This part is only for hierarchical attention model

        data_sentences:   one 2-D array-like per sample, rows being sentences
                          (assumed to be max_length wide -- TODO confirm with caller)
        sentence_max_num: number of sentence rows every sample should end up with
        max_length:       width of the all-zero rows used as padding
        returns:          list with one entry per input sample, in the same order;
                          samples with fewer than sentence_max_num rows get
                          all-zero rows prepended, all others are kept unchanged
    """
    data_sentences_padded = []
    for data_sentence in data_sentences:
        # number of sentence rows missing, not the number of missing words
        num_missing = sentence_max_num - len(data_sentence)
        if num_missing > 0:
            padding = [[0] * max_length] * num_missing
            data_sentence = np.concatenate((padding, data_sentence), axis=0)
        # always keep the sample so the output stays aligned with the labels
        data_sentences_padded.append(data_sentence)
    return data_sentences_padded

### end for extra data processing in hierarchical attention model 

In [None]:
import pickle 

def split_train_test(data, labels, test_split=0.1):
    """ split the data into a training (included validation) and a test set
        after shuffling both with the same random permutation

        data:       corpus in indices (array-like, first axis = samples)
        labels:     raw scores as labels (not one-hot encoded yet); a plain
                    list is accepted and converted to an array
        test_split: fraction of the samples that goes into the test set
        returns:    x_train, y_train, x_test, y_test as arrays
        raises:     ValueError if data and labels differ in number of samples
    """
    data = np.asarray(data)
    # a Python list cannot be indexed with an index array, so convert first
    labels = np.asarray(labels)
    num_samples = data.shape[0]
    if labels.shape[0] != num_samples:
        raise ValueError('data and labels must have the same number of samples')

    indices = np.arange(num_samples)
    np.random.shuffle(indices)
    data = data[indices]
    labels = labels[indices]

    num_test_samples = int(test_split * num_samples)
    # slice with an explicit boundary: negative slicing breaks when the test
    # set is empty, because [:-0] is empty and [-0:] is everything
    split_at = num_samples - num_test_samples

    x_train = data[:split_at]
    y_train = labels[:split_at]
    x_test = data[split_at:]
    y_test = labels[split_at:]
    return x_train, y_train, x_test, y_test

def get_embedding_matrix(word_index, embeddings_index, max_num_words=20000, embedding_dim=100):
    """ prepare embedding matrix to build the embedding layer

        word_index:       dict of word -> integer index (presumably 1-based,
                          leaving index 0 for padding -- verify against the tokenizer)
        embeddings_index: dict of word -> embedding vector of length embedding_dim
        max_num_words:    upper bound on the number of rows
        embedding_dim:    number of columns
        returns:          array of shape (num_words, embedding_dim) where row i
                          holds the vector of the word with index i; rows of
                          words without a known embedding stay all-zero
    """
    # +1 because indices 1..len(word_index) need rows up to len(word_index)
    num_words = min(max_num_words, len(word_index) + 1)
    embedding_matrix = np.zeros((num_words, embedding_dim))
    for word, i in word_index.items():
        if i >= num_words:
            # index falls outside the matrix
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

## main code

In [None]:
if __name__ == '__main__':
    # by default, Amazon Shoes Reviews data will be processed and saved
    # NOTE(review): Tokenizer and pad_sequences are not imported in the part of
    # the file visible here -- presumably an earlier cell provides them; confirm.

    max_num_words = 20000      # max number of words to take in a corpus
    max_sequence_length = 100    # corpus max sentence length 2043
    embedding_dim = 100

    load_path = 'Shoes.txt'
    save_path = 'shoes_list_of_review_dicts.pkl'

    reviews_list = build_list_of_review_dict(load_path)
    save_review_dict_list(save_path, reviews_list)

    # load the data
    # data = load_review_dic_list(save_path)

    texts, scores = get_text_and_score_lists(reviews_list)

    # get the majority predictor as baseline
    scores_dict, count = get_sample_score_distribution(scores)

    # load embedding index from glove file
    embeddings_index = get_embedding_dict(file_name='glove.6B.100d.txt', glove_dir='glove.6B/', base_dir='')

    filtered_texts = [filter_text(text) for text in texts]
    labels = scores
    # labels = score_to_categorical(scores)   # maybe not now???

    # this part of data construction is only for hierarchical attention model
    # filtered_texts_sentences = convert_reviews_to_sentences(filtered_texts)

    # indexing and embedding
    # get tokenizer for mapping words to indices based on given corpus
    tokenizer = get_tokenizer(filtered_texts, max_num_words)
    # convert words to indices in each sentence
    sequences = tokenizer.texts_to_sequences(filtered_texts)
    # get a dict of word: index
    word_index = tokenizer.word_index
    # pad index sequences with 0s
    data = pad_sequences(sequences, maxlen=max_sequence_length)

    # for hierarchical attention model only
    # sequences_sentences = get_sequence_of_sentences(filtered_texts_sentences, tokenizer)
    # sentence_max_num = get_sentences_max_num(sequences_sentences)
    # data_sentences = pad_sequences_in_list_of_sentence_lists(sequences_sentences)
    # data_sentences_padded = pad_sentences(data_sentences, sentence_max_num)

    # split the dataset -> train, test
    # labels is a plain list; the split indexes with an index array, so pass an array
    x_train, y_train, x_test, y_test = split_train_test(data, np.asarray(labels))
    embedding_matrix = get_embedding_matrix(word_index, embeddings_index)

    # save the processed data
    # raw processed data:
    processed_data = {
        'texts': filtered_texts,
        'labels': labels,
        'scores_dict': scores_dict,
        'count': count,
        'embeddings_index': embeddings_index,
    }
    with open('raw_processed_data.pkl', 'wb') as f:
        pickle.dump(processed_data, f)

    # indexed and embedded data
    # word_index is a dict, so it is stored as a pickled object array and
    # needs np.load(..., allow_pickle=True) to be read back
    np.savez("data_and_embedding100",
             max_num_words=max_num_words,
             embedding_dim=embedding_dim,
             max_sequence_length=max_sequence_length,
             x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test,
             word_index=word_index,
             embedding_matrix=embedding_matrix)

    # for hierarchical attention model only
    # np.savez("data_and_embedding100_hierarchical",
    #          max_num_words=max_num_words,
    #          embedding_dim=embedding_dim,
    #          max_sequence_length=max_sequence_length,
    #          x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test,
    #          word_index=word_index,
    #          embedding_matrix=embedding_matrix,
    #          sentence_max_num=sentence_max_num,
    #          data_sentences_padded=data_sentences_padded)