## 1. Packages & Arguments

In [1]:
# set root
import os
import sys

# Single definition of the project root (it used to be spelled out twice).
PROJECT_ROOT = "/scratch/kll482/cathay/"

# Relative paths used later in the notebook (config/, dataset/) resolve against this directory.
os.chdir(PROJECT_ROOT)

# Make the project's `src` package importable. The membership check keeps
# sys.path free of duplicate entries when this cell is executed more than once.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [12]:
import json
from configparser import ConfigParser
import numpy as np, pandas as pd
import time
from p_tqdm import p_map
from tqdm import tqdm_notebook as tqdm
# from tqdm import tqdm
from multiprocessing import Pool
import multiprocessing as mp
from transformers import BertTokenizer, BertModel
import nltk
from nltk import word_tokenize
import torch
from bert_embedding import BertEmbedding

''' customized modules '''
from src.preprocessing.text_cleaning.text_cleaning import *

In [3]:
# import argparse
# parser = {
#     "data_path": "amazon_reviews",
# }
# args = argparse.Namespace(**parser)

In [4]:
# Load the project configuration and keep the section used by this notebook.
config_path = "config/config.ini"
config = ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so fail here with a clear message instead of a
# KeyError on the section lookup below.
if not config.read(config_path):
    raise FileNotFoundError("Could not read configuration file: {}".format(config_path))
print(config.sections())
args = config["feature_engineering"]

['text_cleaning', 'feature_engineering', 'graph_models']


## 2-1. Read Line-delimited JSON

[The Amazon product data](http://jmcauley.ucsd.edu/data/amazon/) is saved as multiple line-delimited json files.

I will read all the datasets at once and then add a "category" column to each sample, since the files are organized by product category.

In [5]:
def _category_from_file_name(file_name):
    """Derive the category label from a review file name.

    The extension, an optional leading ``reviews_`` and a trailing ``_<digits>``
    are stripped, so both ``Video_Games_5.json`` and
    ``reviews_Video_Games_5.json`` map to ``Video_Games``.
    """
    stem = os.path.splitext(os.path.basename(file_name))[0]
    prefix = "reviews_"
    if stem.startswith(prefix):
        stem = stem[len(prefix):]
    head, sep, tail = stem.rpartition("_")
    if sep and tail.isdigit():
        stem = head
    return stem


def read_line_json(path, name_list):
    """Read line-delimited JSON files into one flat list of dicts.

    @ param, path: directory that contains the files
    @ param, name_list: file names (relative to `path`) to read, in order
    @ return: a list with one dict per non-blank line; every dict gets an extra
              "category" key derived from the name of the file it came from

    The category used to be taken with the fixed slice ``file_name[8:-7]``,
    which is only correct for names with an 8-character prefix and a
    7-character suffix (presumably ``reviews_<Category>_5.json``). For the
    names listed in this notebook (e.g. ``Video_Games_5.json``) it cut into
    the category itself.
    """
    json_contents = []
    for file_name in name_list:
        category = _category_from_file_name(file_name)
        with open(os.path.join(path, file_name)) as file:
            for line in file:
                if not line.strip():
                    # tolerate blank lines (e.g. a trailing newline at end of file)
                    continue
                json_dict = json.loads(line)
                json_dict["category"] = category # add a column denoting the category
                json_contents.append(json_dict)
    return json_contents

In [6]:
# Directory with the raw review files, taken from the loaded config section.
folder_path = args["amazon_file_path"]
# Keep only the directory entries that end in ".json" (./amazon_reviews).
file_lists = list(filter(lambda name: name.endswith(".json"), os.listdir(folder_path)))
print(file_lists)

['Video_Games_5.json', 'Musical_Instruments_5.json']


In [7]:
''' delete the next line if reading all files '''
# file_lists = [file_lists[1]]
# Parse every listed file into one flat list of review dicts.
json_contents = read_line_json(path=folder_path, name_list=file_lists)

Let's convert data in JSON format to a DataFrame.

In [8]:
def json_to_df(json_data, selected_cols=None):
    """Convert a list of parsed JSON records into a DataFrame.

    @ param, json_data: list of dicts, one per record
    @ param, selected_cols: optional list of column names to keep (in this
                            order); None keeps every column
    @ return: a new DataFrame built from `json_data`

    The function now uses its own arguments. It previously read the notebook
    globals `json_contents` and `cols`, so it ignored what the caller passed.
    """
    data = pd.DataFrame(json_data)
    if selected_cols is not None:
        data = data.loc[:, selected_cols]
    # Remove duplicated items if existing...
    # data.sort_values('asin').drop_duplicates(subset=['reviewerID','reviewText','unixReviewTime','summary','category'],keep='first',inplace=False)
    # Save the DataFrame into a csv file if needed...
    # data.to_csv()
    return data

In [9]:
# df = json_to_df(json_data=json_contents)

In [10]:
# Columns retained from the raw review records.
cols = [
    "reviewerID",
    "asin",
    "reviewText",
    "overall",
    "summary",
    "unixReviewTime",
    "category",
]
df = json_to_df(json_contents, selected_cols=cols)

In [11]:
# Number of reviews loaded (rows of the DataFrame).
print(df.shape[0])

728969


## Test

In [None]:
# Positions of the non-null reviews whose length changes when passed through remove_tag.
[
    position
    for position, text in enumerate(df["reviewText"])
    if pd.notnull(text) and len(remove_tag(text)) != len(text)
]



In [None]:
# Duplicate of the preceding test cell: lists the positions of non-null reviews
# whose length differs after remove_tag is applied.
[i for i, v in enumerate(df["reviewText"]) if not pd.isnull(v) and len(remove_tag(v)) != len(v)]

## 2-2. Preprocessing Texts

Then, we should do some text cleaning first.

In [11]:
# for idx, row in enumerate(df["reviewText"]):
#     result = []
#     try:
#         result.append(full_step_preprocessing(row))
#     except:
#         print(idx)
#         break

In [8]:
def get_cleaned_tokens(df, review_col_name="reviewText", token_col_name="reviewTokens", cpu_number=4):
    """Run `full_step_preprocessing` over one column using a process pool.

    @ param, df: DataFrame holding the raw texts; it is modified in place
    @ param, review_col_name: name of the column that is fed to the preprocessing
    @ param, token_col_name: name of the column that receives the results
    @ param, cpu_number: number of worker processes
    @ return: the same DataFrame, with `token_col_name` added or overwritten
    """
    pool = mp.Pool(cpu_number)
    try:
        # pool.map keeps the input order, so the results line up with the rows.
        tokens = pool.map(full_step_preprocessing, tqdm(df[review_col_name]))
        pool.close()
    except BaseException:
        # Do not leave worker processes behind when preprocessing fails or is interrupted.
        pool.terminate()
        raise
    finally:
        pool.join()
    df[token_col_name] = tokens
    return df

In [9]:
def remove_empty_tokens(df, token_col_name="reviewTokens"):
    """Drop the rows whose token list is empty and renumber the index from 0.

    @ param, df: DataFrame with a column of token lists
    @ param, token_col_name: name of that column
    @ return: a new DataFrame without the empty rows
    """
    def _has_no_tokens(tokens):
        return len(tokens) == 0

    token_column = df[token_col_name]
    empty_labels = token_column.loc[token_column.apply(_has_no_tokens)].index.tolist()
    df = df.drop(index=empty_labels, axis=0).reset_index(drop=True)

    # sanity check: nothing empty may survive
    leftover = df[token_col_name].apply(_has_no_tokens)
    assert sum(leftover) == 0

    return df

In [13]:
# Tokenize/clean every review text with 32 worker processes.
df = get_cleaned_tokens(
    df,
    review_col_name="reviewText",
    token_col_name="reviewTokens",
    cpu_number=32,
)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=728969.0), HTML(value='')))








In [14]:
# Discard reviews that ended up with no tokens after cleaning.
df = remove_empty_tokens(df, token_col_name="reviewTokens")

## 3. Edge Index & Unique Tokens

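The function below converts the tokens of a review into an edge index (a list of source nodes and a list of target nodes) that links each token to its n-gram neighbors; the commented-out version also built the full adjacency matrix.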

In [None]:
# def get_adjacency_matrix(tokens, num_neighbor=2):
#     # initialize
#     unique_vocabulary = set(tokens) 
#     vocabulary_dict = {value: index for index, value in enumerate(unique_vocabulary)}
#     width = height = len(unique_vocabulary)
#     adjacency_matrix = [[0]*width for _ in range(height)]
    
#     edge_start = []
#     edge_end = []
#     # insert value into the adjacency matrix
#     for token_index, token in enumerate(tokens):
#         matrix_index = vocabulary_dict[token]
#         for p in range(1, num_neighbor+1):
#             if token_index-p >= 0: # if previous tokens exist
#                 prev_matrix_index = vocabulary_dict[tokens[token_index-p]]
#                 adjacency_matrix[matrix_index][prev_matrix_index] = 1 # future work: if duplicated edges exist...
#                 adjacency_matrix[prev_matrix_index][matrix_index] = 1
                
#                 edge_start += [matrix_index, prev_matrix_index]
#                 edge_end += [prev_matrix_index, matrix_index]
                
#             elif token_index+p < len(tokens): # if next tokes exist
#                 next_matrix_index = vocabulary_dict[tokens[token_index+p]] # get the token index in the adjacency matrix
#                 adjacency_matrix[matrix_index][next_matrix_index] = 1
#                 adjacency_matrix[next_matrix_index][matrix_index] = 1
                
#                 edge_start += [matrix_index, next_matrix_index]
#                 edge_end += [next_matrix_index, matrix_index]
    
#     unique_tokens = list(vocabulary_dict.keys())
#     edge_index = [edge_start, edge_end]
#     return adjacency_matrix, unique_tokens, edge_index



In [15]:
def get_edge_index(tokens, unique_vocabulary, num_neighbor=2, bidirection=True):
    """Build the edge index of one token sequence.

    @ param, tokens: the token sequence of a review
    @ param, unique_vocabulary: the distinct tokens; a token's position in this
                                sequence is its node id
    @ param, num_neighbor: how many positions away a neighbor may be
    @ param, bidirection: when True, tokens *before* the current one are linked
                          in addition to the tokens after it
    @ return: [sources, targets], two parallel lists of node ids

    Every link is stored in both orientations (a->b and b->a), regardless of
    `bidirection`.
    """
    node_of = {word: node for node, word in enumerate(unique_vocabulary)} # token -> node id
    sources, targets = [], []
    n_tokens = len(tokens)
    look_back = bidirection == True

    for pos in range(n_tokens):
        here = node_of[tokens[pos]] # node id of the current token
        for offset in range(1, num_neighbor + 1):
            linked = []
            if look_back and pos - offset >= 0: # a token `offset` steps back exists
                linked.append(node_of[tokens[pos - offset]])
            if pos + offset < n_tokens: # a token `offset` steps ahead exists
                linked.append(node_of[tokens[pos + offset]])
            for other in linked:
                sources.extend((here, other))
                targets.extend((other, here))

    return [sources, targets]


In [16]:
# get unique tokens
# dict.fromkeys removes duplicates while keeping first-occurrence order, so the
# node ids derived from this list are the same on every run; list(set(...))
# orders strings by their randomized hashes and therefore varies between runs.
df["uniqueTokens"] = df["reviewTokens"].apply(lambda row: list(dict.fromkeys(row)))

In [17]:
# edge index
edge_index_names = []
for neighbor in tqdm([1]):
    column_name = "edgeIndex{}".format(neighbor)
    # one edge index per review, linking each token to the `neighbor` tokens after it
    edge_index = df.apply(
        lambda row: get_edge_index(
            row["reviewTokens"],
            row["uniqueTokens"],
            neighbor,
            False,
        ),
        axis=1,
    )
    df[column_name] = edge_index # insert edge indices to the dataframe
    edge_index_names.append(column_name)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [None]:
# Names of the edge-index columns for neighborhoods of size 2, 3 and 4.
edge_index_names = ["edgeIndex{}".format(size) for size in (2, 3, 4)]

In [None]:
# Persist the label, the tokens and the edge-index columns as column-oriented JSON.
(
    df
    .loc[:, ["overall", "reviewTokens", "uniqueTokens"] + edge_index_names]
    .to_json("dataset/full_dataset/partial_features.json", orient="columns")
)

In [24]:
# 2. handle and save each file respectively
CPU_COUNT = 28
for file in [file_lists[1]]:
    print("=== preprocessing file {} ===".format(file))

    # 2-1. json to dataframe
    print("json to dataframe...")
    json_contents = read_line_json(folder_path, [file]) # reading the json files
    cols = ["reviewerID", "asin", "reviewText", "overall", "summary", "unixReviewTime", "category"] # the columns I want to keep
    df = json_to_df(json_data=json_contents, selected_cols=cols)

    # 2-2. do text cleaning
    print("cleaning texts...")
    df = get_cleaned_tokens(df, "reviewText", "reviewTokens", cpu_number=CPU_COUNT)
    df = remove_empty_tokens(df, "reviewTokens")

    # 2-3. get unique tokens from review tokens
    print("extract unique tokens...")
    # dict.fromkeys keeps first-occurrence order, so node ids are reproducible
    # across runs (list(set(...)) depends on the randomized string hashes).
    df["uniqueTokens"] = df["reviewTokens"].apply(lambda row: list(dict.fromkeys(row)))

    # 2-4. get edge index from unique tokens
    print("getting the edge index for each graph...")
    neighbors = [1]
    bidirection_or_not = False
    edge_index_names = []
    for neighbor in tqdm(neighbors):
        column_name = "edgeIndex{}".format(neighbor)
        # get edge index with n neighbors
        edge_index = df.apply(
            lambda row: get_edge_index(
                tokens=row["reviewTokens"],
                unique_vocabulary=row["uniqueTokens"],
                num_neighbor=neighbor,
                bidirection=bidirection_or_not,
            ),
            axis=1,
        )
        df[column_name] = edge_index # insert edge indices to the dataframe
        edge_index_names.append(column_name)

    # 2-5. save df as a new json file
    print("saving json files...")
    # edge_index_names already holds one name per entry of `neighbors`
    # (collected in the loop above), so it is not rebuilt here.
    output_cols = ["overall", "reviewText", "reviewTokens", "uniqueTokens"] + edge_index_names
    df.loc[:, output_cols].to_json("dataset/processed_dataset/{}".format("straight_"+file), orient="columns")
    print("finish!")

=== preprocessing file Musical_Instruments_5.json ===
json to dataframe...
cleaning texts...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=231392.0), HTML(value='')))








extract unique tokens...
getting the edge index for each graph...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


saving jsoin files...
finish!


---

## 3. Embedding

### 3.0. Token Length Visualization

In [None]:
# pass

### 3.1. GloVe

In [None]:
# pass

### 3.2. BERT Pretrained Embeddings

In [None]:
class BertEmbedding:
    """Look up rows of BERT's input word-embedding matrix for lists of tokens.

    No forward pass through the model is made: each token is mapped to an id by
    the tokenizer and the matching row of the embedding matrix is returned.

    NOTE(review): this class has the same name as the `BertEmbedding` imported
    from `bert_embedding` at the top of the notebook and replaces it once this
    cell has been executed.
    """

    def __init__(self, max_len=None):
        '''
        @ param, max_len: fixed length (including the two special tokens) that
                          every encoded row is truncated/padded to; None means
                          "use the length of the row itself"
        '''
        self.max_len = max_len
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.embedding_matrix = self.get_bert_embed_matrix()

    def get_bert_embed_matrix(self):
        '''
        @ return: the word-embedding weights of 'bert-base-uncased' as a numpy array
        '''
        bert = BertModel.from_pretrained('bert-base-uncased')
        # Ask the model for its input embedding layer instead of relying on the
        # position of sub-modules in `children()`.
        word_embeddings = bert.get_input_embeddings()
        return word_embeddings.weight.detach().cpu().numpy()

    def get_embeddings(self, row_data):
        '''
        @ param, row_data: a unique token list
        @ return: a list with one embedding vector per encoded id (without the
                  first and last id); [] for an empty `row_data`
        '''
        if len(row_data) == 0:
            return []

        if self.max_len is None:
            max_length = len(row_data) + 2 # +2 leaves room for the two special tokens added by the tokenizer
        else:
            max_length = self.max_len

        input_ids = self.tokenizer.encode(row_data,
                                          max_length=max_length,
                                          truncation=True,
                                          padding="max_length" # replaces the deprecated pad_to_max_length=True
                                         )
        # The first and last ids are dropped so the special tokens are not embedded.
        # NOTE(review): when max_len is set and the row is shorter, the last id is
        # presumably a padding id, so the closing special token stays in — confirm
        # this is intended.
        input_ids = input_ids[1:-1]
        embeddings = [self.embedding_matrix[index] for index in input_ids]

        assert len(embeddings) == len(input_ids)
        # only check the vector width when there is a vector to check
        assert not embeddings or len(embeddings[0]) == self.embedding_matrix.shape[1]
        return embeddings

First, I will create BERT embeddings for each review's unique-token list, which will be used in the graph models.

In [None]:
# One list of embedding vectors per review, in the order of its unique tokens.
unique_tokens = df["uniqueTokens"].values.tolist()
bertembeddings = BertEmbedding()
df["graphEmbeddings"] = list(map(bertembeddings.get_embeddings, tqdm(unique_tokens)))

In [None]:
# # initial
# with Pool(8) as p:
#     unique_tokens = df["uniqueTokens"].values.tolist()
# #     MAX_LEN = int(np.median([len(row) for row in df["uniqueTokens"]])) # median_unique_word_length
# #     bertembeddings = BertEmbedding(MAX_LEN)
#     bertembeddings = BertEmbedding()
#     df["graphEmbeddings"] = p.map(bertembeddings.get_embeddings, tqdm_notebook(unique_tokens))

# # df["graphEmbeddings"] = graph_embeddings

Second, I will also create another embedding matrix for language models.

In [None]:
# Embed the full token sequence of every review (duplicates included) using a
# pool of 8 worker processes, then store the result in a new column.
with Pool(8) as p:
    review_tokens = df["reviewTokens"].values.tolist()
#     MAX_LEN = int(np.median([len(row) for row in df["reviewTokens"]])) # median_review_length
#     bertembeddings = BertEmbedding(MAX_LEN)
    # The embedder is created after the pool, i.e. in the parent process only.
    bertembeddings = BertEmbedding()
    # The bound method is what gets shipped to the workers.
    # NOTE(review): presumably this pickles the whole BertEmbedding instance,
    # including its embedding matrix — confirm the cost is acceptable.
    language_embeddings = p.map(bertembeddings.get_embeddings, tqdm(review_tokens))
    # Explicit shutdown; leaving the `with` block terminates the pool as well.
    p.terminate()
    
df["languageEmbeddings"] = language_embeddings

### 3.3. Random Embedding

PyTorch will automatically generate random embeddings for us if we do not insert embeddings into the model.

## 4. Save Data

In [None]:
# df.loc[:, ["overall", "edgeIndex", "graphEmbeddings"]].to_pickle("dataset/full_dataset/modeling_features.pkl")
# The edge-index columns are created as "edgeIndex<n>" (e.g. "edgeIndex1"); no
# column named exactly "edgeIndex" is created in this notebook, so selecting it
# raised a KeyError. Pick up every column whose name starts with "edgeIndex".
edge_index_cols = [col for col in df.columns if str(col).startswith("edgeIndex")]
df.loc[:, ["overall"] + edge_index_cols + ["graphEmbeddings"]].to_json(args["modeling_feature_path"], orient="columns")
# df.to_pickle(
#     os.path.join(
#         args["data_path"],
#         "amazon_features.pkl"
#     )
# )

## Knowledge Graph

In [18]:
!pip install spacy
!python3 -m spacy download en_core_web_sm

Collecting spacy
  Downloading spacy-2.3.2-cp37-cp37m-manylinux1_x86_64.whl (9.9 MB)
[K     |████████████████████████████████| 9.9 MB 15.5 MB/s eta 0:00:01
Collecting catalogue<1.1.0,>=0.0.7
  Downloading catalogue-1.0.0-py2.py3-none-any.whl (7.7 kB)
Collecting plac<1.2.0,>=0.9.6
  Downloading plac-1.1.3-py2.py3-none-any.whl (20 kB)
Collecting wasabi<1.1.0,>=0.4.0
  Downloading wasabi-0.7.1.tar.gz (22 kB)
Collecting thinc==7.4.1
  Downloading thinc-7.4.1-cp37-cp37m-manylinux1_x86_64.whl (2.1 MB)
[K     |████████████████████████████████| 2.1 MB 119.1 MB/s eta 0:00:01
[?25hCollecting preshed<3.1.0,>=3.0.2
  Downloading preshed-3.0.2-cp37-cp37m-manylinux1_x86_64.whl (118 kB)
[K     |████████████████████████████████| 118 kB 121.2 MB/s eta 0:00:01
[?25hCollecting srsly<1.1.0,>=1.0.2
  Downloading srsly-1.0.2-cp37-cp37m-manylinux1_x86_64.whl (185 kB)
[K     |████████████████████████████████| 185 kB 122.4 MB/s eta 0:00:01
Collecting cymem<2.1.0,>=2.0.2
  Downloading cymem-2.0.3-cp37-cp3

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
