## 1. Packages & Arguments

In [3]:
# set root
# Make the project checkout both the working directory (so relative paths such
# as "config/config.ini" resolve) and an import root (so `src....` is importable).
# NOTE(review): the absolute path is machine-specific; it must be edited to run elsewhere.
import os
os.chdir("/scratch/kll482/cathay/")
import sys
sys.path.append("/scratch/kll482/cathay/")

In [4]:
import json
from configparser import ConfigParser
import numpy as np, pandas as pd
import time
from p_tqdm import p_map
from tqdm import tqdm, tqdm_notebook
from multiprocessing import Pool
import multiprocessing as mp
from transformers import BertTokenizer, BertModel
import nltk
from nltk import word_tokenize
import torch

''' customized modules '''
from src.preprocessing.text_cleaning.text_cleaning import full_step_preprocessing, simple_preprocessing

In [5]:
# import argparse
# parser = {
#     "data_path": "amazon_reviews",
# }
# args = argparse.Namespace(**parser)

In [6]:
# Load the project settings. ConfigParser.read() skips files it cannot open
# instead of raising, so an empty section list printed here means the path is wrong.
config = ConfigParser()
config.read("config/config.ini")
print(config.sections())
# Settings used by this notebook; later cells read the keys "amazon_file_path"
# and "modeling_feature_path" from this section.
args = config["feature_engineering"]

['text_cleaning', 'feature_engineering', 'graph_models']


## 2. Read Line-delimited JSON

[The Amazon product data](http://jmcauley.ucsd.edu/data/amazon/) is saved as multiple line-delimited json files.

I will read all the datasets at once and add a "category" column to each sample, since each file holds the reviews of a single product category.

In [7]:
def read_line_json(path, name_list):
    """Read line-delimited JSON files and tag every record with its category.

    @ param, path: folder that contains the files
    @ param, name_list: file names (relative to ``path``) to read, in order
    @ return: a list of dicts, one per non-blank line, each with an extra
              "category" key derived from the file name

    Raises json.JSONDecodeError if a non-blank line is not valid JSON.
    """
    json_contents = []
    for file_name in name_list:
        # The category is the file name without its first 8 and last 7 characters
        # (presumably "reviews_<Category>_5.json" -> "<Category>"; verify against the data folder).
        category = file_name[8:-7]
        with open(os.path.join(path, file_name), encoding="utf-8") as file:
            for line in file:
                if not line.strip():
                    # tolerate blank lines (e.g. a trailing newline at the end of the file)
                    continue
                json_dict = json.loads(line)
                json_dict["category"] = category  # add a column denoting the category
                json_contents.append(json_dict)
    return json_contents

In [8]:
folder_path = args["amazon_file_path"]
# os.listdir() returns entries in arbitrary order; sort them so that the
# "first file" picked below is the same on every run and every machine.
file_lists = sorted(name for name in os.listdir(folder_path) if name.endswith(".json"))  # e.g. ./amazon_reviews

# Delete the next line to read all files instead of only the first one.
file_lists = [file_lists[0]]
json_contents = read_line_json(folder_path, file_lists)

Let's convert data in JSON format to a DataFrame.

In [9]:
def json_to_df(selected_cols, json_data):
    """Convert a list of JSON records into a DataFrame with the selected columns.

    @ param, selected_cols: list of column names to keep, in the desired order
    @ param, json_data: list of dicts (one dict per record)
    @ return: a DataFrame holding only ``selected_cols``

    Raises KeyError if a selected column is absent from the records.
    """
    # Use the arguments rather than notebook globals, so the function works
    # for any record list / column selection passed by the caller.
    data = pd.DataFrame(json_data).loc[:, selected_cols]
    # Remove duplicated items if existing...
    # data.sort_values('asin').drop_duplicates(subset=['reviewerID','reviewText','unixReviewTime','summary','category'],keep='first',inplace=False)
    # Save the DataFrame into a csv file if needed...
    # data.to_csv()
    return data

In [14]:
# Fields retained from each raw review record (everything else is discarded).
cols = [
    "reviewerID",
    "asin",
    "reviewText",
    "overall",
    "summary",
    "unixReviewTime",
    "category",
]
df = json_to_df(selected_cols=cols, json_data=json_contents)

Then, we should do some text cleaning first.

In [12]:
def get_cleaned_tokens(df, review_col_name="reviewText", token_col_name="reviewTokens", cpu_number=4):
    """Clean every review with ``full_step_preprocessing`` using a process pool.

    @ param, df: DataFrame holding the raw reviews; it is modified in place
    @ param, review_col_name: column that holds the raw review text
    @ param, token_col_name: column that receives the preprocessing result
    @ param, cpu_number: number of worker processes
    @ return: the same DataFrame object, with ``token_col_name`` added/overwritten
    """
    print("cleaning the reviews...")
    # The context manager guarantees the workers are released even when the
    # preprocessing raises; map() only returns once every result has been collected.
    # NOTE(review): the progress bar wraps the *input* iterable, so it presumably
    # reflects items handed to the pool rather than items finished - confirm.
    with mp.Pool(cpu_number) as pool:
        df[token_col_name] = pool.map(full_step_preprocessing, tqdm_notebook(df[review_col_name]))
    return df

In [None]:
# Clean every review in parallel and store the result in the "reviewTokens" column.
# NOTE(review): cpu_number=20 is hard-coded - confirm the host actually has that many cores.
df = get_cleaned_tokens(df, "reviewText", "reviewTokens", cpu_number=20)

# with open(os.path.join(args.data_path, args.file_name), "r+") as file:
#     json_data = json.load(file)
# df = pd.DataFrame(json_data)

In [None]:
def remove_empty_tokens(df, token_col_name="reviewTokens"):
    """Drop the rows whose token list is empty and renumber the index.

    @ param, df: DataFrame with a column of token lists
    @ param, token_col_name: name of that column
    @ return: a new DataFrame without the empty-token rows, indexed 0..n-1
    """
    # Select by boolean mask (row position) instead of dropping by index label:
    # a label-based drop would also remove non-empty rows that happen to share
    # an index label with an empty one when the index is not unique.
    is_empty = df[token_col_name].apply(len) == 0
    return df.loc[~is_empty].reset_index(drop=True)


In [None]:
# Discard reviews whose "reviewTokens" list is empty.
df = remove_empty_tokens(df, "reviewTokens")

## 3. Adjacency & Edge Index

The function below converts the tokens of a review into an adjacency matrix based on n-gram neighborhoods.

In [None]:
# def get_adjacency_matrix(tokens, num_neighbor=2):
#     # initialize
#     unique_vocabulary = set(tokens) 
#     vocabulary_dict = {value: index for index, value in enumerate(unique_vocabulary)}
#     width = height = len(unique_vocabulary)
#     adjacency_matrix = [[0]*width for _ in range(height)]
    
#     edge_start = []
#     edge_end = []
#     # insert value into the adjacency matrix
#     for token_index, token in enumerate(tokens):
#         matrix_index = vocabulary_dict[token]
#         for p in range(1, num_neighbor+1):
#             if token_index-p >= 0: # if previous tokens exist
#                 prev_matrix_index = vocabulary_dict[tokens[token_index-p]]
#                 adjacency_matrix[matrix_index][prev_matrix_index] = 1 # future work: if duplicated edges exist...
#                 adjacency_matrix[prev_matrix_index][matrix_index] = 1
                
#                 edge_start += [matrix_index, prev_matrix_index]
#                 edge_end += [prev_matrix_index, matrix_index]
                
#             elif token_index+p < len(tokens): # if next tokes exist
#                 next_matrix_index = vocabulary_dict[tokens[token_index+p]] # get the token index in the adjacency matrix
#                 adjacency_matrix[matrix_index][next_matrix_index] = 1
#                 adjacency_matrix[next_matrix_index][matrix_index] = 1
                
#                 edge_start += [matrix_index, next_matrix_index]
#                 edge_end += [next_matrix_index, matrix_index]
    
#     unique_tokens = list(vocabulary_dict.keys())
#     edge_index = [edge_start, edge_end]
#     return adjacency_matrix, unique_tokens, edge_index



In [None]:
def get_edge_index(tokens, num_neighbor=2):
    """Build a word co-occurrence graph for one token sequence.

    Every distinct token becomes a node; two tokens are connected when they
    appear at most ``num_neighbor`` positions apart in ``tokens``.

    @ param, tokens: sequence of hashable tokens (e.g. a list of words)
    @ param, num_neighbor: window size on each side of a token
    @ return: (edge_index, unique_tokens) where
              edge_index = [edge_start, edge_end], two equally long lists of node ids, and
              unique_tokens[i] is the token represented by node id i

    Each neighboring pair is recorded in both directions, and it is visited once
    from each of its two endpoints, so every edge appears more than once.
    """
    # Node ids follow first-occurrence order. Iterating a set of strings is not
    # stable across interpreter runs, which made the ids (and therefore the
    # saved features) irreproducible.
    vocabulary_dict = {token: node_id for node_id, token in enumerate(dict.fromkeys(tokens))}
    edge_start = []
    edge_end = []
    num_tokens = len(tokens)

    # build edge index
    for token_index, token in enumerate(tokens):
        curr_index = vocabulary_dict[token]  # node id of the current token

        for p in range(1, num_neighbor + 1):  # look p positions to each side
            if token_index - p >= 0:  # the p-th previous token exists
                prev_index = vocabulary_dict[tokens[token_index - p]]
                edge_start += [curr_index, prev_index]  # undirected: store both directions
                edge_end += [prev_index, curr_index]

            if token_index + p < num_tokens:  # the p-th next token exists
                next_index = vocabulary_dict[tokens[token_index + p]]
                edge_start += [curr_index, next_index]
                edge_end += [next_index, curr_index]

    unique_tokens = list(vocabulary_dict.keys())
    edge_index = [edge_start, edge_end]
    return edge_index, unique_tokens


In [None]:
# Build (edge_index, unique_tokens) for every review.
num_neighbor = 2  # window size on each side of a token
edge_index_info = df["reviewTokens"].apply(get_edge_index, num_neighbor=num_neighbor)

In [None]:
# Split every (edge_index, unique_tokens) pair into two dataframe columns:
# 1. the edge index, 2. the unique tokens (node id -> token).
df["edgeIndex"] = [edges for edges, vocab in edge_index_info]
df["uniqueTokens"] = [vocab for edges, vocab in edge_index_info]

## 3. Embedding

### 3.0. Token Length Visualization

In [None]:
# pass

### 3.1. GloVe

In [None]:
# pass

### 3.2. BERT Pretrained Embeddings

In [None]:
class BertEmbedding:
    """Look up static BERT input embeddings for lists of tokens.

    The input (word-piece) embedding table of the pretrained
    'bert-base-uncased' model is extracted once; every token is then mapped
    to one row of that table. The tokens are not passed through the model.
    """

    def __init__(self, max_len=None):
        '''
        @ param, max_len: optional fixed length, counted INCLUDING the two special
                          tokens, so every returned list has max_len-2 vectors;
                          None keeps one vector per input token
        '''
        self.max_len = max_len
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.embedding_matrix = self.get_bert_embed_matrix()

    def get_bert_embed_matrix(self):
        '''
        @ return: the input embedding table of 'bert-base-uncased' as a numpy array
        '''
        bert = BertModel.from_pretrained('bert-base-uncased')
        # Ask the model for its input embedding layer instead of relying on
        # the registration order of its child modules.
        word_embeddings = bert.get_input_embeddings()
        return word_embeddings.weight.detach().cpu().numpy()

    def get_embeddings(self, row_data):
        '''
        @ param, row_data: a unique token list
        @ return: a list with one embedding vector (numpy array) per position;
                  an empty list for empty input
        '''
        if len(row_data) == 0:
            return []

        # The tokens are already split, so they are mapped to vocabulary ids
        # directly. No [CLS]/[SEP] ids are added, hence nothing has to be
        # stripped afterwards. (Stripping with [1:-1] removed a padding id
        # instead of [SEP] whenever the sequence was padded to max_len.)
        input_ids = self.tokenizer.convert_tokens_to_ids(list(row_data))

        if self.max_len is not None:
            # max_len counts the two special tokens, which are not embedded
            target_len = max(self.max_len - 2, 0)
            pad_id = self.tokenizer.pad_token_id
            # keep the leading tokens, then right-pad up to the target length
            input_ids = input_ids[:target_len] + [pad_id] * (target_len - len(input_ids))

        return [self.embedding_matrix[index] for index in input_ids]

First, I will create BERT embeddings for each review's unique-token list; these will be used by the graph models.

In [None]:
# One embedding vector per unique token of every review (node features of the graph).
bertembeddings = BertEmbedding()
unique_tokens = df["uniqueTokens"].tolist()
df["graphEmbeddings"] = [
    bertembeddings.get_embeddings(token_list)
    for token_list in tqdm_notebook(unique_tokens)
]

In [24]:
# # initial
# with Pool(8) as p:
#     unique_tokens = df["uniqueTokens"].values.tolist()
# #     MAX_LEN = int(np.median([len(row) for row in df["uniqueTokens"]])) # median_unique_word_length
# #     bertembeddings = BertEmbedding(MAX_LEN)
#     bertembeddings = BertEmbedding()
#     df["graphEmbeddings"] = p.map(bertembeddings.get_embeddings, tqdm_notebook(unique_tokens))

# # df["graphEmbeddings"] = graph_embeddings

Second, I will also create another embedding matrix for language models.

In [None]:
# Embed the full token sequence of every review (duplicates kept) with 8 worker processes.
with Pool(8) as p:
    review_tokens = df["reviewTokens"].values.tolist()
#     MAX_LEN = int(np.median([len(row) for row in df["reviewTokens"]])) # median_review_length
#     bertembeddings = BertEmbedding(MAX_LEN)
    # NOTE(review): the embedder is created after the workers were started, and a bound
    # method is mapped, so the instance (including its embedding matrix) is presumably
    # pickled and shipped to the workers - confirm that the pool is actually a speed-up.
    bertembeddings = BertEmbedding()
    language_embeddings = p.map(bertembeddings.get_embeddings, tqdm_notebook(review_tokens))
    # Leaving the `with` block already terminates the pool; this explicit call is redundant.
    p.terminate()
    
df["languageEmbeddings"] = language_embeddings

### 3.3. Random Embedding

PyTorch will automatically generate random embeddings for us if we do not insert embeddings into the model.

## 4. Save Data

In [None]:
# Write the "overall", "edgeIndex" and "graphEmbeddings" columns as column-oriented JSON
# to the path configured under "modeling_feature_path".
# NOTE(review): "languageEmbeddings" is computed above but not saved here - confirm this is intended.
# df.loc[:, ["overall", "edgeIndex", "graphEmbeddings"]].to_pickle("dataset/full_dataset/modeling_features.pkl")
df.loc[:, ["overall", "edgeIndex", "graphEmbeddings"]].to_json(args["modeling_feature_path"], orient="columns")
# df.to_pickle(
#     os.path.join(
#         args["data_path"],
#         "amazon_features.pkl"
#     )
# )