In [1]:
import os
import time
from IPython.display import display

import pandas as pd
import nltk
import re
import contractions

from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Vincent\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Vincent\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Vincent\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Vincent\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Vincent\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [3]:
def text_preprocessing(serie, stop_words=True, lemmatization=True):
    # lowercase
    serie = serie.map(lambda x: x.lower())   

    # remove extra newlines
    serie = serie.map(lambda x: re.sub(r'[\r|\n|\r\n]+', ' ', x))

    # remove @tag
    serie = serie.map(lambda x: re.sub(r'@[\S]+', '', x))

    # remove URL
    serie = serie.map(lambda x: re.sub('https?://[\S]+', '', x))

    # remove contractions
    serie = serie.map(lambda x: contractions.fix(x).lower())

    # remove hashtag and numbers
    serie = serie.map(lambda x: re.sub("[^a-zA-Z]", " ", x))

    # tokenization
    serie = serie.map(word_tokenize)

    if stop_words:        
        # remove stop words
        stop_words = set(stopwords.words('english'))
        serie = serie.map(lambda x: [word for word in x if word not in stop_words])
    
    if lemmatization:
        # lemmatization    
        serie = serie.map(nltk.tag.pos_tag)
        serie = serie.map(lambda x: [(word, get_wordnet_pos(pos_tag)) for (word, pos_tag) in x])
        wordnet_lemmatizer = WordNetLemmatizer()
        serie = serie.map(lambda x: [wordnet_lemmatizer.lemmatize(word, tag) for (word, tag) in x])
    
    serie = serie.map(lambda x: ' '.join(word for word in x))

    return serie

In [4]:
pd.set_option('max_colwidth', 400)

text = "i agreed. this was useful. democrats should definitely not vote for joe biden in the primary."
for stop_words in [False, True]:
    for lemmatization in [False, True]:
        print(text_preprocessing(pd.Series([text]), stop_words=stop_words, lemmatization=lemmatization))

0    i agreed this was useful democrats should definitely not vote for joe biden in the primary
dtype: object
0    i agree this be useful democrat should definitely not vote for joe biden in the primary
dtype: object
0    agreed useful democrats definitely vote joe biden primary
dtype: object
0    agree useful democrat definitely vote joe biden primary
dtype: object


In [5]:
data = pd.read_json('data/AMAZON_FASHION.json', lines=True)
data = data[['overall', 'reviewerID', 'asin', 'reviewText', 'summary']]
data

Unnamed: 0,overall,reviewerID,asin,reviewText,summary
0,5,A1D4G1SNUZWQOT,7106116521,Exactly what I needed.,perfect replacements!!
1,2,A3DDWDH9PX2YX2,7106116521,"I agree with the other review, the opening is too small. I almost bent the hook on some very expensive earrings trying to get these up higher than just the end so they're not seen. Would not buy again but for the price, not sending back.","I agree with the other review, the opening is ..."
2,4,A2MWC41EW7XL15,7106116521,Love these... I am going to order another pack to keep in work; someone (including myself) is always losing the back to an earring. I don't understand why all fish hook earrings don't have them. Just wish that they were a tiny bit longer. :),My New 'Friends' !!
3,2,A2UH2QQ275NV45,7106116521,too tiny an opening,Two Stars
4,3,A89F3LQADZBS5,7106116521,Okay,Three Stars
...,...,...,...,...,...
883631,5,A1ZSB2Q144UTEY,B01HJHTH5U,"I absolutely love this dress!! It's sexy and comfortable. The split up the back was too much for me, so I had to sew it about 5 inches, but other than that it's perfect!! I'm about 175 pounds, 5'5, DD and the Large fit great!",I absolutely love this dress
883632,5,A2CCDV0J5VB6F2,B01HJHTH5U,I'm 5'6 175lbs. I'm on the tall side. I wear a large and ordered a large and it still has a comfortable amount of room. Not to snug or too loose. Very true to size. Love it,I wear a large and ordered a large and it still has a comfortable amount of room
883633,3,A3O90PACS7B61K,B01HJHTH5U,Too big in the chest area!,Three Stars
883634,3,A2HO94I89U3LNH,B01HJHF97K,"Too clear in the back, needs lining",Three Stars


In [6]:
data.dtypes

overall        int64
reviewerID    object
asin          object
reviewText    object
summary       object
dtype: object

In [7]:
for stop_words in [False, True]:
    for lemmatization in [False, True]:
        unpro_review = data.reviewText.copy()
        unpro_summary = data.summary.copy()
        
        a = time.time()
        pro_review = text_preprocessing(unpro_review.astype(str), stop_words=stop_words, lemmatization=lemmatization)
        print(f"stop_words: {stop_words}, lemmatization: {lemmatization}, time: {time.time() - a}")
        display(pro_review[1])
        name_file = 'review'
        if stop_words:
            name_file += '_stop'
        if lemmatization:
            name_file +='_lem'
        pro_review.to_pickle(os.path.join('data', name_file + '.pickle'))
        del pro_review
        
        a = time.time()
        pro_summary = text_preprocessing(unpro_summary.astype(str), stop_words=stop_words, lemmatization=lemmatization)
        print(f"stop_words: {stop_words}, lemmatization: {lemmatization}, time: {time.time() - a}")
        display(pro_summary[1])
        name_file = 'summary'
        if stop_words:
            name_file += '_stop'
        if lemmatization:
            name_file +='_lem'
        pro_summary.to_pickle(os.path.join('data', name_file + '.pickle'))
        del pro_summary

stop_words: False, lemmatization: False, time: 153.56838274002075


'i agree with the other review the opening is too small i almost bent the hook on some very expensive earrings trying to get these up higher than just the end so they are not seen would not buy again but for the price not sending back'

stop_words: False, lemmatization: False, time: 73.04949116706848


'i agree with the other review the opening is'

stop_words: False, lemmatization: True, time: 1867.2276928424835


'i agree with the other review the opening be too small i almost bend the hook on some very expensive earring try to get these up high than just the end so they be not see would not buy again but for the price not send back'

stop_words: False, lemmatization: True, time: 818.2632191181183


'i agree with the other review the opening be'

stop_words: True, lemmatization: False, time: 147.93516373634338


'agree review opening small almost bent hook expensive earrings trying get higher end seen would buy price sending back'

stop_words: True, lemmatization: False, time: 74.64093971252441


'agree review opening'

stop_words: True, lemmatization: True, time: 1538.4630012512207


'agree review open small almost bent hook expensive earring try get high end see would buy price send back'

stop_words: True, lemmatization: True, time: 797.4107813835144


'agree review opening'