Import all necessary packages

In [44]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import re

### Load in data

In [2]:
# The file holds one JSON record per line, so it is read as line-delimited JSON
# (lines=True) straight into a DataFrame instead of calling json.loads per line.
data = pd.read_json('Data/reviews_Musical_Instruments_5.json', lines=True)

In [3]:
# Display the loaded reviews to inspect the available columns
data

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A2IBPI20UZIR0U,1384719342,"cassandra tu ""Yeah, well, that's just like, u...","[0, 0]","Not much to write about here, but it does exac...",5,good,1393545600,"02 28, 2014"
1,A14VAT5EAX3D9S,1384719342,Jake,"[13, 14]",The product does exactly as it should and is q...,5,Jake,1363392000,"03 16, 2013"
2,A195EZSQDW3E21,1384719342,"Rick Bennette ""Rick Bennette""","[1, 1]",The primary job of this device is to block the...,5,It Does The Job Well,1377648000,"08 28, 2013"
3,A2C00NNG1ZQQG2,1384719342,"RustyBill ""Sunday Rocker""","[0, 0]",Nice windscreen protects my MXL mic and preven...,5,GOOD WINDSCREEN FOR THE MONEY,1392336000,"02 14, 2014"
4,A94QU4C90B1AX,1384719342,SEAN MASLANKA,"[0, 0]",This pop filter is great. It looks and perform...,5,No more pops when I record my vocals.,1392940800,"02 21, 2014"
...,...,...,...,...,...,...,...,...,...
10256,A14B2YH83ZXMPP,B00JBIVXGC,Lonnie M. Adams,"[0, 0]","Great, just as expected. Thank to all.",5,Five Stars,1405814400,"07 20, 2014"
10257,A1RPTVW5VEOSI,B00JBIVXGC,Michael J. Edelman,"[0, 0]",I've been thinking about trying the Nanoweb st...,5,"Long life, and for some players, a good econom...",1404259200,"07 2, 2014"
10258,AWCJ12KBO5VII,B00JBIVXGC,Michael L. Knapp,"[0, 0]",I have tried coated strings in the past ( incl...,4,Good for coated.,1405987200,"07 22, 2014"
10259,A2Z7S8B5U4PAKJ,B00JBIVXGC,"Rick Langdon ""Scriptor""","[0, 0]","Well, MADE by Elixir and DEVELOPED with Taylor...",4,Taylor Made,1404172800,"07 1, 2014"


### Preprocessing of data

In [2]:
## If the spacy model isn't downloaded yet, do that first:
#   python -m spacy download en_core_web_sm

# spacy is not imported anywhere else in this notebook, so import it here;
# without it this cell raises NameError on a fresh kernel.
import spacy

# Load the small English pipeline
spac = spacy.load("en_core_web_sm")

In [5]:
# Get the entities in the text
def get_entities(spacy_txt):
    """Print a parsed document followed by every named entity found in it.

    Parameters
    ----------
    spacy_txt : spacy object produced by parsing the text; it must expose the
        entities through its ``ents`` attribute.
        Entity format: https://spacy.io/usage/linguistic-features#named-entities

    Returns
    -------
    None -- the function only prints.
    """
    divider = "__" * 50

    print(spacy_txt)
    for entity in spacy_txt.ents:
        # entity text on one line, then start offset, end offset and label
        fields = (
            entity.text, "\n",
            entity.start_char, "\t",
            entity.end_char, "\t",
            entity.label_,
        )
        print(*fields)
        print(divider)

def get_stop_words(stop_file_path):
    """Read a stop-word file (one word per line) into an immutable set.

    Parameters
    ----------
    stop_file_path : path of a UTF-8 encoded text file.

    Returns
    -------
    frozenset of the lines with surrounding whitespace stripped; duplicate
    lines collapse into a single entry.
    """
    with open(stop_file_path, 'r', encoding="utf-8") as handle:
        return frozenset(line.strip() for line in handle)
        
def pre_process(text):
    """Normalise a raw review string for vectorization.

    The text is lowercased, anything that looks like a markup tag is swapped
    for a placeholder, and every run of digits / non-word characters
    (including that placeholder) is collapsed into a single space.

    Parameters
    ----------
    text : str

    Returns
    -------
    str -- the cleaned text; it may start or end with a single space.
    """
    lowered = text.lower()

    # tags become " <> ", which the next substitution then reduces to a space
    without_tags = re.sub("</?.*?>", " <> ", lowered)

    # digits and special characters -> one space per run
    return re.sub("(\\d|\\W)+", " ", without_tags)

In [12]:
# load a set of stop words
stopwords = get_stop_words("Data/stopwords.txt")

# preprocess the reviewText (pre_process is passed directly, no lambda wrapper needed)
data['reviewText'] = data['reviewText'].apply(pre_process)

# get the text column
docs = data['reviewText'].tolist()

# create a vocabulary of words,
# ignore words that appear in more than 85% of documents,
# eliminate stop words and cap the vocabulary at 10000 terms.
# stop_words is handed over as a sorted list: scikit-learn >= 1.2 validates this
# parameter and only accepts 'english', a list or None, so the frozenset
# returned by get_stop_words would be rejected there.
cv = CountVectorizer(max_df=0.85, stop_words=sorted(stopwords), max_features=10000)
word_count_vector = cv.fit_transform(docs)

  'stop_words.' % sorted(inconsistent))


### Building TF-IDF model

In [16]:
# Fit the TF-IDF transformer on the term counts produced by the CountVectorizer;
# get_keywords later calls transform() on this fitted object for each review.
tfidf_transformer = TfidfTransformer(smooth_idf = True, use_idf = True)
tfidf_transformer.fit(word_count_vector)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [17]:
def sort_coo(coo_matrix):
    """Return (column, value) pairs of a COO matrix, highest value first.

    Parameters
    ----------
    coo_matrix : object exposing parallel ``col`` and ``data`` sequences.

    Returns
    -------
    list of (col, value) tuples ordered by value descending; ties are broken
    by the larger column index first.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the first ``topn`` sorted items to their feature names and scores.

    Parameters
    ----------
    feature_names : sequence indexed by the column indices in ``sorted_items``.
    sorted_items : sequence of (column index, score) pairs, already ordered.
    topn : int, default 10 -- number of leading items to keep.

    Returns
    -------
    dict mapping feature name -> score rounded to 3 decimals, in the order
    the items were given.
    """
    return {
        feature_names[col]: round(score, 3)
        for col, score in sorted_items[:topn]
    }

In [33]:
def get_keywords(text, topn=10):
    """Return the highest-scoring TF-IDF terms of a single document.

    Parameters
    ----------
    text : str
        Document to analyse. It is vectorized with the notebook-level ``cv``
        and weighted with the notebook-level ``tfidf_transformer``; both are
        expected to be fitted, and ``feature_names`` to be defined, before
        this function is called.
    topn : int, default 10
        Maximum number of keywords to return. The default keeps the previous
        hard-coded behaviour.

    Returns
    -------
    dict mapping keyword -> rounded TF-IDF score, as built by
    ``extract_topn_from_vector``.
    """
    # generate tf-idf for the given document
    tf_idf_vector = tfidf_transformer.transform(cv.transform([text]))

    # sort the tf-idf vector by descending order of scores
    sorted_items = sort_coo(tf_idf_vector.tocoo())

    # keep only the top n entries
    return extract_topn_from_vector(feature_names, sorted_items, topn)

def print_results(text, keywords):
    """Print a review followed by its keywords, one "keyword score" per line.

    Parameters
    ----------
    text : the review text to show.
    keywords : dict mapping keyword -> score.
    """
    print("\n=====Review=====")
    print(text)
    print("\n===Keywords===")
    for term, score in keywords.items():
        print(term, score)

In [30]:
# you only need to do this once
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back for older installations.
if hasattr(cv, "get_feature_names_out"):
    # .tolist() turns the returned array into a list of plain Python strings
    feature_names = cv.get_feature_names_out().tolist()
else:
    feature_names = cv.get_feature_names()

In [62]:
# Extract the keywords of every review and store them in a new column.
# Only the reviewText column is needed, so iterate over it directly instead of
# using iterrows(), which builds a full Series for every row.
keyword_list = [
    list(get_keywords(review_text).keys())
    for review_text in data['reviewText']
]

data['keywords'] = keyword_list

In [68]:
# Write the reviews, including the new 'keywords' column, as one JSON record per line
data.to_json('Data/processed_reviews.json', orient='records', lines=True)