In [1]:
import os
import time
from IPython.display import display

import pandas as pd
import nltk
import re
import contractions
from tqdm import tqdm

from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch (or verify the presence of) every NLTK data package this notebook needs.
for nltk_package in ('punkt', 'averaged_perceptron_tagger', 'wordnet', 'stopwords', 'omw-1.4'):
    nltk.download(nltk_package)

# Start of the notebook-wide wall-clock timer; the final cell reports the elapsed time.
t = time.time()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Vincent\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Vincent\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Vincent\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Vincent\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Vincent\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
def get_wordnet_pos(tag):
    """Translate a POS tag string into the matching WordNet POS constant.

    Only the first character of ``tag`` is inspected:
    ``J`` -> adjective, ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb.
    Any other tag (including an empty string) falls back to noun.
    """
    pos_by_initial = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return pos_by_initial.get(tag[:1], wordnet.NOUN)

In [3]:
def text_preprocessing(serie, stop_words=True, lemmatization=True):
    """Normalise a Series of raw text into strings of space-separated tokens.

    Steps, in order: lowercase, collapse line breaks into one space, drop
    ``@mentions``, drop http(s) URLs, expand contractions, replace every
    non-letter character with a space, tokenise, optionally remove English
    stop words, optionally lemmatise (POS-aware), and join the tokens back
    with single spaces.

    Parameters
    ----------
    serie : pandas.Series
        Series whose values are all ``str`` (``.lower()`` is called on each
        value, so missing values must be filled/converted by the caller).
    stop_words : bool, default True
        When true, tokens found in NLTK's English stop-word list are removed.
    lemmatization : bool, default True
        When true, each token is POS-tagged and lemmatised with WordNet.

    Returns
    -------
    pandas.Series
        Series with the same index as ``serie``; each value is the processed
        text as a single string.
    """

    def clean(text):
        # lowercase
        text = text.lower()
        # collapse runs of line breaks; the previous class `[\r|\n|\r\n]`
        # also matched a literal '|', which was not intended
        text = re.sub(r'[\r\n]+', ' ', text)
        # remove @tag
        text = re.sub(r'@\S+', '', text)
        # remove URL (raw string: '\S' in a plain literal is an invalid escape)
        text = re.sub(r'https?://\S+', '', text)
        # expand contractions; lowercase again because the expansion can
        # reintroduce capitals
        text = contractions.fix(text).lower()
        # remove hashtag signs, digits, punctuation: anything but ASCII letters
        return re.sub(r'[^a-zA-Z]', ' ', text)

    # cleaning + tokenization
    tokens = serie.map(clean).map(word_tokenize)

    if stop_words:
        # A separate name keeps the boolean flag `stop_words` intact.
        english_stop_words = set(stopwords.words('english'))
        tokens = tokens.map(
            lambda words: [word for word in words if word not in english_stop_words]
        )

    if lemmatization:
        # NOTE(review): when stop words were removed above, the POS tagger
        # sees the sentence without them, which may affect the tags it
        # assigns. The original step order is kept on purpose.
        wordnet_lemmatizer = WordNetLemmatizer()
        tokens = tokens.map(
            lambda words: [
                wordnet_lemmatizer.lemmatize(word, get_wordnet_pos(pos))
                for word, pos in nltk.tag.pos_tag(words)
            ]
        )

    return tokens.map(' '.join)

In [4]:
# Read the JSON-lines file in chunks of `chunksize` rows (tqdm shows one tick
# per chunk), then stitch the chunks into a single frame.
chunksize = 500000
df_list = [
    df_chunk
    for df_chunk in tqdm(pd.read_json('data/CDs_and_Vinyl.json', lines=True, chunksize=chunksize))
]

data = pd.concat(df_list)

# Drop the list of chunks; only the concatenated frame is used from here on.
del df_list

# Keep only the columns used by the rest of the notebook.
data = data[['overall', 'reviewerID', 'asin', 'reviewText', 'summary']]
data

10it [01:04,  6.47s/it]


Unnamed: 0,overall,reviewerID,asin,reviewText,summary
0,5,A171I27YBM4FL6,0001393774,I love this CD. So inspiring!,Five Stars
1,5,A1H1DL4K669VQ9,0001393774,Love it!! Great seller!,Five Stars
2,5,A23WIHT5886G36,0001393774,I bought this on cassette tape in the 80's. So...,I bought this on cassette tape in the 80's. ...
3,5,A3SZNOJP8OL26X,0001393774,as good as i remember back when i bought the o...,Five Stars
4,5,A3V5XBBT7OZG5G,0001393774,One of my very favourite albums from one of my...,One of my very favourite albums from one of my...
...,...,...,...,...,...
4543364,5,A1KCWOF28NPM3M,B01HJG3VZI,"Great 5 LP by The Guess Who, love all these al...",Guess Who 5 Pack!
4543365,5,ASOJC2B605GDG,B01HJG3UQI,Brings back many MEMORIES of a time gone bye.,Five Stars
4543366,4,A13EWHP1W5X77J,B01HJG3UQI,Her string of albums were really enjoyable to ...,Enjoyable collection of albums
4543367,5,A1QN6Z2E6M0S2D,B01HJG3UQI,"If you decide to buy this, be sure to hold on ...",A great way to add to your collection


In [5]:
# Minimum number of rows a product (asin) / a reviewer must appear in to be kept.
min_amount_product_mentions = 20
min_amount_user_mentions = 20

# Reorder the working columns and remove exact duplicate rows.
filtered_df = data[['asin', 'reviewerID', 'overall', 'reviewText', 'summary']].drop_duplicates()

# Keep rows whose product appears often enough.
product_counts = filtered_df['asin'].value_counts()
enough_product_mentions = filtered_df['asin'].map(product_counts) >= min_amount_product_mentions
filtered_df = filtered_df[enough_product_mentions]

# Then keep rows whose reviewer appears often enough among the remaining rows.
# This is a single pass: the product threshold is not re-checked afterwards.
user_counts = filtered_df['reviewerID'].value_counts()
enough_user_mentions = filtered_df['reviewerID'].map(user_counts) >= min_amount_user_mentions
filtered_df = filtered_df[enough_user_mentions]

In [6]:
# Number of rows left after the frequency filters.
filtered_df.shape[0]

453961

In [7]:
# Keep only the rows that actually have a review body.
has_review_text = filtered_df['reviewText'].notna()
df = filtered_df[has_review_text]
df

Unnamed: 0,asin,reviewerID,overall,reviewText,summary
6,0001393774,A3478QRKQDOPQ2,5,I recall loving his other albums and maybe thi...,forgot but I figured on some of these artists ...
32,0001393774,A12R54MKO17TW0,5,Keith Green / Songs for the Shepherd: His pre...,His last album is focused on Praise
37,0001393774,AEKGGV851HY3K,5,Keith Green had a passionate love for Jesus. ...,Passionate Faith Is Contagious
117,0005164885,A30M3WWF54M74L,5,Bought to replace original I purchased many ye...,Had to have for Traditional Christmas music co...
150,0005164885,A3LEN0P07MGJE2,5,I love this CD! It is always part of my Christ...,I Love this CD!
...,...,...,...,...,...
4543140,B01HHGAJJ6,A77SH4285YSLC,4,Good job,Four Stars
4543226,B01HIDSULM,A1AGFKZFJKFNFB,5,Luv my Heart,Five Stars
4543228,B01HIDSULM,A37NSW719W1HVV,5,Ann's still got it ... and I doubt very seriou...,Still got it ...
4543263,B01HIE1OYQ,A1LRS1JGPULB5G,5,I first heard of these guys was when I picked ...,ABSOLUTELY FANTASTIC


In [13]:
# Save the `overall` rating column on its own, indexed like `df`.
ratings_path = os.path.join('data', 'CD_ratings.pickle')
ratings = df['overall'].copy()
ratings.to_pickle(ratings_path)

In [8]:
# Widen column display so the processed text is not cut off when printed.
pd.set_option('max_colwidth', 400)

# Run one review (row label 6) through every combination of the two switches
# to compare the effect of stop-word removal and lemmatisation.
text = df.loc[6, 'reviewText']
sample = pd.Series([text])
option_grid = [(use_stop, use_lem) for use_stop in (False, True) for use_lem in (False, True)]
for stop_words, lemmatization in option_grid:
    processed_sample = text_preprocessing(sample, stop_words=stop_words, lemmatization=lemmatization)
    print(processed_sample)

0    i recall loving his other albums and maybe this one too forgot but i figured on some of these artists seems like one good album and all good albums especially in christian music seemed when they got into it they stayed into it and so good to double check though if want too but it is a possible very good album because i usually recall if too bad of one and i do not on this one
dtype: object
0    i recall love his other album and maybe this one too forgot but i figure on some of these artist seem like one good album and all good album especially in christian music seem when they get into it they stay into it and so good to double check though if want too but it be a possible very good album because i usually recall if too bad of one and i do not on this one
dtype: object
0    recall loving albums maybe one forgot figured artists seems like one good album good albums especially christian music seemed got stayed good double check though want possible good album usually recall bad one 

In [9]:
def _preprocess_and_save(column, name_file, stop_words, lemmatization):
    """Preprocess one text column, show the row labelled 6, and pickle the result.

    Parameters
    ----------
    column : pandas.Series
        Raw text column; missing values are allowed.
    name_file : str
        Base name of the output file (``'review'`` or ``'summary'``).
        ``'_stop'`` and/or ``'_lem'`` are appended according to the flags,
        and the file is written to ``data/CD_<name>.pickle``.
    stop_words, lemmatization : bool
        Forwarded unchanged to ``text_preprocessing``.
    """
    # Fill missing values before the string cast: `astype(str)` alone would
    # turn NaN into the literal word "nan", which would then survive
    # preprocessing as a token.
    raw_text = column.fillna('').astype(str)

    a = time.time()
    processed = text_preprocessing(raw_text, stop_words=stop_words, lemmatization=lemmatization)
    print(f"stop_words: {stop_words}, lemmatization: {lemmatization}, time: {time.time() - a}")

    # Spot check on a known row label.
    display(processed[6])

    if stop_words:
        name_file += '_stop'
    if lemmatization:
        name_file += '_lem'
    processed.to_pickle(os.path.join('data', 'CD_' + name_file + '.pickle'))


# Produce one pickle per (column, stop_words, lemmatization) combination.
for stop_words in [False, True]:
    for lemmatization in [False, True]:
        _preprocess_and_save(df.reviewText, 'review', stop_words, lemmatization)
        _preprocess_and_save(df.summary, 'summary', stop_words, lemmatization)

stop_words: False, lemmatization: False, time: 293.25139474868774


'i recall loving his other albums and maybe this one too forgot but i figured on some of these artists seems like one good album and all good albums especially in christian music seemed when they got into it they stayed into it and so good to double check though if want too but it is a possible very good album because i usually recall if too bad of one and i do not on this one'

stop_words: False, lemmatization: False, time: 43.14775776863098


'forgot but i figured on some of these artists seems like one good album and all good albums'

stop_words: False, lemmatization: True, time: 4191.941154956818


'i recall love his other album and maybe this one too forgot but i figure on some of these artist seem like one good album and all good album especially in christian music seem when they get into it they stay into it and so good to double check though if want too but it be a possible very good album because i usually recall if too bad of one and i do not on this one'

stop_words: False, lemmatization: True, time: 468.08725214004517


'forgot but i figure on some of these artist seem like one good album and all good album'

stop_words: True, lemmatization: False, time: 293.5798110961914


'recall loving albums maybe one forgot figured artists seems like one good album good albums especially christian music seemed got stayed good double check though want possible good album usually recall bad one one'

stop_words: True, lemmatization: False, time: 38.40134644508362


'forgot figured artists seems like one good album good albums'

stop_words: True, lemmatization: True, time: 3023.571521282196


'recall love album maybe one forgot figure artist seem like one good album good album especially christian music seem get stayed good double check though want possible good album usually recall bad one one'

stop_words: True, lemmatization: True, time: 404.49207043647766


'forgot figured artist seem like one good album good album'

In [10]:
# Report the wall-clock time elapsed since the timer `t` was started.
# Hours are computed with divmod instead of time.gmtime/strftime('%H'),
# because '%H' wraps back to 00 once the run exceeds 24 hours.
elapsed_seconds = int(time.time() - t)
elapsed_hours, leftover_seconds = divmod(elapsed_seconds, 3600)
elapsed_minutes, leftover_seconds = divmod(leftover_seconds, 60)
print(f"Execution time : {elapsed_hours:02d}:{elapsed_minutes:02d}:{leftover_seconds:02d}")

Execution time : 02:27:31
