## 1. Packages

In [33]:
import os, json
import numpy as np, pandas as pd
import argparse
import contractions
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
import re
from tqdm import tqdm
from p_tqdm import p_map
import multiprocessing as mp
from multiprocessing import Process, Pool
import string

# One-time setup: uncomment to download the NLTK tagger model
# (presumably the model pos_tag relies on -- confirm for your NLTK version).
# nltk.download('averaged_perceptron_tagger')
# English stop words from NLTK's stopwords corpus, stored as a set for fast membership tests.
en_stop_words = set(stopwords.words('english'))

## 2. Arguments

In [34]:
# Notebook-friendly stand-in for command-line arguments: the options live in a
# plain dict and are exposed through an argparse.Namespace as `args.<option>`.
parser = dict(
    data_path="amazon_reviews",
)
args = argparse.Namespace(**parser)

In [35]:
# parser = argparse.ArgumentParser()
# parser.add_argument("--data_path", help="path to where you save the amazon files.")
# args, unknown = parser.parse_known_args()

## 3. Read Line-delimited JSON

[The Amazon product data](http://jmcauley.ucsd.edu/data/amazon/) is saved as multiple line-delimited json files.

I will read all the datasets at once and add a "category" column to each sample, since the files are organized by product category.

In [36]:
def read_line_json(path, name_list):
    """Read line-delimited JSON files and tag every record with its category.

    Args:
        path: Directory that contains the files.
        name_list: Names of the files (relative to ``path``) to read.

    Returns:
        A list of dicts, one per non-blank line, ordered by file and then by
        line. Each dict gains a "category" key set to ``file_name[8:-7]``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    json_contents = []
    for file_name in name_list:
        # Drops the first 8 and last 7 characters of the file name
        # (presumably a "reviews_" prefix and a "_5.json" suffix -- confirm
        # against the actual data files).
        category = file_name[8:-7]
        # JSON is UTF-8 by specification; do not depend on the platform default.
        with open(os.path.join(path, file_name), encoding="utf-8") as file:
            for line in file:
                if not line.strip():
                    continue  # tolerate blank / trailing empty lines
                json_dict = json.loads(line)
                json_dict["category"] = category  # add a column denoting the category
                json_contents.append(json_dict)
    return json_contents

In [37]:
# Input files: every *.json file in args.data_path (./amazon_reviews).
# - Sorted, because os.listdir returns names in arbitrary order and the
#   single-file selection below should pick the same file on every run.
# - "amazon_reviews.json" is skipped: it is the combined output this notebook
#   writes into the same folder, not a line-delimited input file.
amazon_lists = sorted(
    name
    for name in os.listdir(args.data_path)
    if name.endswith(".json") and name != "amazon_reviews.json"
)

# Delete the following line if reading all files.
amazon_lists = [amazon_lists[0]]

json_contents = read_line_json(args.data_path, amazon_lists)

## 4. JSON to DataFrame

Let's convert the JSON data into a DataFrame.

In [38]:
def json_to_df(selected_cols, json_data):
    """Build a DataFrame from parsed JSON records, keeping only chosen columns.

    Args:
        selected_cols: Column labels to keep, in the order they should appear.
        json_data: Records to load (anything ``pd.DataFrame`` accepts, e.g. a
            list of dicts).

    Returns:
        A DataFrame containing exactly ``selected_cols``.

    Raises:
        KeyError: If a requested column is missing from the data.
    """
    # Use the arguments rather than notebook globals, so the function works
    # regardless of what `json_contents` / `cols` currently hold.
    data = pd.DataFrame(json_data).loc[:, selected_cols]
    # Remove duplicated items if existing...
    # data.sort_values('asin').drop_duplicates(subset=['reviewerID','reviewText','unixReviewTime','summary','category'],keep='first',inplace=False)
    # Save the DataFrame into a csv file if needed...
    # data.to_csv()
    return data

In [39]:
# The columns I want to keep:
cols = [
    "reviewerID",
    "asin",
    "reviewText",
    "overall",
    "summary",
    "unixReviewTime",
    "category",
]

# Build the working DataFrame from the parsed records, restricted to `cols`.
df = json_to_df(selected_cols=cols, json_data=json_contents)

## 5. Preprocess reviewText and summary

Follow the preprocessing steps below, in order, to clean the reviews.

1. Remove HTML tags
2. Remove url
3. Remove emoji
4. Expand contractions
5. Remove punctuation and special characters
6. Apply stemming or lemmatization
7. Remove stop words and convert to lower case

### 5.0 NA value

In [None]:
def to_empty(texts):
    """Return "" for a missing (null/NaN) or empty value, otherwise the value unchanged."""
    is_blank = pd.isnull(texts) or texts == ""
    return "" if is_blank else texts

### 5.1. HTML tag

In [42]:
def remove_tag(texts):
    """Parse ``texts`` as markup with the lxml parser and return only its text content."""
    soup = BeautifulSoup(texts, 'lxml')
    return soup.get_text()

### 5.2. URL

In [43]:
def remove_url(texts):
    """Replace every run of non-space characters that starts with "http" by a single space."""
    url_pattern = re.compile(r"http\S+")  # \S non-space characters
    return url_pattern.sub(" ", texts)

### 5.3. Emoji

Remove Emoji if any exists...

Note: Emoji could be used to recognize the sentiment, potentially...

In [44]:
def remove_emoji(texts):
    """Delete every character of ``texts`` that falls in the emoji ranges listed below."""
    emoji_ranges = (
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # flags (iOS)
    )
    # One character class covering all ranges; "+" removes a whole run at once.
    emoji_pattern = re.compile("[" + "".join(emoji_ranges) + "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', texts)

In [45]:
def find_emoji(texts):
    """Report whether ``texts`` contains at least one emoji.

    Args:
        texts: The string to inspect.

    Returns:
        True if any character of ``texts`` falls in the emoji ranges below,
        False otherwise.
    """
    emoji_pattern = re.compile("["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               "]+", flags=re.UNICODE)
    # search() scans the whole string; match() would only test its first
    # character and miss emoji that appear after ordinary text.
    return bool(emoji_pattern.search(texts))

### 5.4. Contraction

Example: I've -> I have.

In [46]:
def decontracted(texts):
    """Run ``texts`` through ``contractions.fix`` (e.g. "I've" -> "I have") and return the result."""
    expanded = contractions.fix(texts)
    return expanded

### 5.5. Punctuation; Special Characters

In [47]:
punctuations = string.punctuation
def remove_punc(texts):
    """Keep only alphabetic characters and collapse everything else into single spaces.

    Every character that is punctuation or is not alphabetic (digits,
    whitespace, symbols) becomes a space; the result is then split on
    whitespace and re-joined, so words are separated by exactly one space and
    there is no leading or trailing space.
    """
    kept = []
    for ch in texts:
        if ch.isalpha() and ch not in punctuations:
            kept.append(ch)
        else:
            kept.append(" ")
    words = "".join(kept).split()
    return " ".join(words)

### 5.6. Stemming or Lemmatization

POS tagging is helpful for text lemmatization: the part of speech of each word is passed to the lemmatizer.

In [48]:
# POS_tag
def get_wordnet_pos(tag):
    """Map a POS tag string to a WordNet part-of-speech constant by its leading letter.

    'J' -> wordnet.ADJ, 'V' -> wordnet.VERB, 'N' -> wordnet.NOUN,
    'R' -> wordnet.ADV. Returns None for any other tag.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    return None

In [49]:
def to_lemma(texts):
    """Tokenize ``texts``, POS-tag the tokens, and return the list of their lemmas.

    Each token is lemmatized with the WordNet part of speech derived from its
    tag; tokens whose tag has no mapping are lemmatized as nouns.
    """
    # tokenized and pos tag
    tagged_tokens = pos_tag(word_tokenize(texts))

    # lemma
    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(word, pos=get_wordnet_pos(word_tag) or wordnet.NOUN)
        for word, word_tag in tagged_tokens
    ]


### 5.7. Lower Case; Stop Words

Some negation words could affect the sentiment analysis, so I remove them from the stop word list (which keeps them in the text).

In [58]:
# Negation words are kept out of the stop-word set so they survive stop-word removal.
neg_set = {"no", "nor", "not"}
my_en_stop_words = en_stop_words - neg_set

In [77]:
def clean_tokens(tokens):
    """Lower-case the tokens and drop those that appear in ``my_en_stop_words``."""
    cleaned = []
    for token in tokens:
        lowered = token.lower()
        if lowered in my_en_stop_words:
            continue  # stop word: skip it
        cleaned.append(lowered)
    return cleaned

### 5.8 Preprocessing Function

Let's integrate all the functions above to extract cleaned tokens.

In [78]:
def full_step_preprocessing(texts):
    """Run the complete cleaning pipeline on one string and return a list of tokens.

    Steps, in order: to_empty -> remove_tag -> remove_url -> remove_emoji ->
    decontracted -> remove_punc -> to_lemma -> clean_tokens.
    """
    cleaned = to_empty(texts)
    cleaned = remove_tag(cleaned)
    cleaned = remove_url(cleaned)
    cleaned = remove_emoji(cleaned)
    cleaned = decontracted(cleaned)
    cleaned = remove_punc(cleaned)
    lemmas = to_lemma(cleaned)
    token_result = clean_tokens(lemmas)
    return token_result

In [None]:
def simple_preprocessing(texts):
    """Run the light cleaning pipeline on one string.

    Steps, in order: to_empty -> remove_tag -> remove_url -> remove_emoji ->
    decontracted. No punctuation removal, lemmatization, or stop-word
    filtering is applied; the value returned is whatever ``decontracted``
    returns (the cleaned text, not a token list).
    """
    cleaned = to_empty(texts)
    cleaned = remove_tag(cleaned)
    cleaned = remove_url(cleaned)
    cleaned = remove_emoji(cleaned)
    return decontracted(cleaned)

## 6. Implement Preprocessing with multi-processing

In [79]:
# `preprocess_func` selects the pipeline that p_map applies to each review. It has to
# be bound before use; otherwise this cell raises NameError on a fresh kernel.
# The full pipeline returns a list of tokens, which matches the "reviewTokens" column;
# switch to simple_preprocessing for lightly cleaned text instead.
preprocess_func = full_step_preprocessing
df["reviewTokens"] = p_map(preprocess_func, df["reviewText"])

HBox(children=(FloatProgress(value=0.0, max=13272.0), HTML(value='')))




In [80]:
# Persist the processed DataFrame as column-oriented JSON inside the data folder.
output_path = os.path.join(args.data_path, "amazon_reviews.json")
df.to_json(output_path, orient="columns")