In [None]:
## ---------------------------------------------------------------------------
## 
## Title:       Homework 06: NLP
## Author(s):   Danfeng Cao, Haonan Wang, Rajasa Kakkera, Ana Parra Vera
## Affiliation: UC Davis MSBA Program
## Description: BAX-422 – Data Design & Representation
## Date:        2020-02-23
## 
## ---------------------------------------------------------------------------


In [1]:
# In the shared zip file, there are 16 JSONs containing information pertaining to product reviews. 
# As the master file with more than 7.8 million lines and 5.1 GB in size could be too large 
# to handle on some personal computers, it was divided into chunks of 500k lines each. 
# Each group is assigned only one of the JSONs within the uploaded zip file.

# Group 1 is to work on the file with ‘.1’ subscript, Group 2 on the one with ‘.2’
# subscript and so on, till Group 15. Group 16 will go back to the start and 
# use the file with .1 subscript, Group 17 will use .2 and so on.

# File .16 is NOT assigned to any group.

# Please mention the file name of the file you used in your solution script as a comment 
# somewhere near the beginning.

# Please note, the files you have been assigned are also not too small in size. If you can easily 
# read and process the json in your machine, that is perfectly fine. But in case you face memory 
# issues, you can refer to the shared starter code notebook file for an alternative method to 
# handle the JSON.

# Read the assigned JSON file and extract the following information:
# • reviewerID - ID of the reviewer, e.g. A2SUAM1J3GNN3B
# • asin - ID of the product, e.g. 0000013714
# • reviewerName - name of the reviewer
# • helpful - helpfulness rating of the review, e.g. 2 out of 3 people found the review
#             to be helpful. The starter code’s dataframe would have 2 in the helpful_start
#             column and 3 in the helpful_end column
# • reviewText - text of the review • overall - rating of the product
# • summary - summary of the review
# • unixReviewTime - time of the review

# Solve the following questions:

In [2]:
import pandas as pd
import numpy as np
import json
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import nltk
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV

# Uncomment these two lines if you are running these for the first time
# as these sets of words need to be downloaded the first time
# nltk.download('stopwords')
# nltk.download('punkt')

## Using File #3

In [96]:
reviews_data = pd.read_json('reviews_electronics.json/reviews_electronics.3.json', lines=True)

In [97]:
reviews_data.head()

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,0132793040,"[1, 1]",5,Corey Barker does a great job of explaining Bl...,"04 13, 2013",AKM1MP6P0OYPR,"Vicki Gibson ""momo4""",Very thorough,1365811200
1,B000HDU0Q4,"[0, 0]",5,This adapter is perfect for machines without m...,"06 9, 2014",A2GGBTOAGKO0VE,Randy Rodriguez,Perfect for machines without memory stick readers,1402272000
2,B000HDU0Q4,"[0, 0]",5,Connect and immediately began to work. Pictur...,"05 30, 2012",A2OC5SYI7THCR6,"Ricardo A. Vasquez ""Rialvaca""",Easiest product to use,1338336000
3,B000HDU0Q4,"[1, 1]",5,This card reader has been a good thing because...,"10 12, 2008",ALWL0AJ94O8RJ,RJ,Portable,1223769600
4,B000HDU0Q4,"[0, 5]",4,They shipped me the wrong product and untill I...,"12 30, 2009",A1XTY9T4FYKT7S,Robert A. Heaston Jr.,Shipped wrong Product,1262131200


In [146]:
reviews = reviews_data.reviewText

# Prints the first few reviews
print("First few reviews:")
print(reviews[:4])

First few reviews:
0    Corey Barker does a great job of explaining Bl...
1    This adapter is perfect for machines without m...
2    Connect and immediately began to work.  Pictur...
3    This card reader has been a good thing because...
Name: reviewText, dtype: object


In [147]:
# Prints total reviews
print("Number of reviews:", len(reviews), "\n")

Number of reviews: 500001 



In [148]:
print("Prints full first few reviews:\n")

for review in reviews[:3]:
    print(review, "\n")

Prints full first few reviews:

Corey Barker does a great job of explaining Blend Modes in this DVD. All of the Kelby training videos are great but pricey to buy individually. If you really want bang for your buck just subscribe to Kelby Training online. 

This adapter is perfect for machines without memory stick readers. I bought it for my dad, who has an imac and a sony digital cam. This adapter works perfectly in his imac and reads the sony memory stick flawlessly. 

Connect and immediately began to work.  Picture transfers were never easier. i recommended buying this product right now.  Works with Mac and PC. 



In [149]:
# Converts all reviews to lower case
reviews = reviews.str.lower()

print("Prints full first few reviews after being converted to lowecase:\n")

for review in reviews[:5]:
    print(review, "\n")

Prints full first few reviews after being converted to lowecase:

corey barker does a great job of explaining blend modes in this dvd. all of the kelby training videos are great but pricey to buy individually. if you really want bang for your buck just subscribe to kelby training online. 

this adapter is perfect for machines without memory stick readers. i bought it for my dad, who has an imac and a sony digital cam. this adapter works perfectly in his imac and reads the sony memory stick flawlessly. 

connect and immediately began to work.  picture transfers were never easier. i recommended buying this product right now.  works with mac and pc. 

this card reader has been a good thing because it is small and easily portable.  this was the reason that i purchased it and it has works great.  recommend! 

they shipped me the wrong product and untill i shipped them the wrong one back they would not ship the right one 



# Question 1

In [150]:
def filter_review(review):
    
    # Removes punctuation marks except apostrophes
    review = re.sub("[^a-zA-Z' ]+", "", review)

    stop_words = set(stopwords.words('english')) 

    # Tokenizes sentences in review
    word_tokens = review.split()

    filtered_review = [] 

    # Filters document and keeps words that are not stopwords
    for w in word_tokens: 
        if w not in stop_words: 
            filtered_review.append(w) 

#     print(review, "\n")
#     print(filtered_review)
    
    return(filtered_review)


In [151]:
# Checks that the function above works
print(filter_review(reviews[9]))

["i've", 'used', 'product', 'several', 'times', 'transfer', 'pics', 'sony', 'handycam', 'uses', 'dvd', 'video', 'memory', 'stick', 'pro', 'duo', 'card', 'pics', 'easy', 'use', 'problems', 'far', 'wish', 'cover', 'ends', 'reader', 'comes', 'one', 'clear', 'cover', 'fits', 'either', 'usb', 'end', 'card', 'insert', 'end', 'also', 'noticed', 'prices', 'vary', 'product', 'come', 'memory', 'stick', 'card', 'already', 'reader']


In [152]:
# 1. Treat each review as a document. Ignore misspellings. Convert all reviews 
#    to lower case. Remove stop words (use package to do this).

documents = []

# Gets filtered review from each review and creates list of documents for results
for review in reviews:
    document = filter_review(review)
    documents.append(document)


In [153]:
print("Prints the first few documents:\n")

for document in documents[:5]:
    print(document, "\n")

Prints the first few documents:

['corey', 'barker', 'great', 'job', 'explaining', 'blend', 'modes', 'dvd', 'kelby', 'training', 'videos', 'great', 'pricey', 'buy', 'individually', 'really', 'want', 'bang', 'buck', 'subscribe', 'kelby', 'training', 'online'] 

['adapter', 'perfect', 'machines', 'without', 'memory', 'stick', 'readers', 'bought', 'dad', 'imac', 'sony', 'digital', 'cam', 'adapter', 'works', 'perfectly', 'imac', 'reads', 'sony', 'memory', 'stick', 'flawlessly'] 

['connect', 'immediately', 'began', 'work', 'picture', 'transfers', 'never', 'easier', 'recommended', 'buying', 'product', 'right', 'works', 'mac', 'pc'] 

['card', 'reader', 'good', 'thing', 'small', 'easily', 'portable', 'reason', 'purchased', 'works', 'great', 'recommend'] 

['shipped', 'wrong', 'product', 'untill', 'shipped', 'wrong', 'one', 'back', 'would', 'ship', 'right', 'one'] 



# Question 2

In [154]:
# There are no reviews with the word "not" in them
for document in documents:
    for word in document:
        if word == 'not':
            print(word)
            print(document, "\n")

In [163]:
# Takes an example for adding the changing "another" for "not"
# to be able to handle the "not" case later.
# Uses copy() so underlying documents does not get affected.
example_text = documents[5].copy()

print("Before:")
print(example_text)

# replacing word another for 'not'
for word in example_text:
    if word == 'another':
        print("\nanother is at index", example_text.index('another'))
        example_text[example_text.index('another')] = "not"

print("\nAfter:")
print(example_text)

Before:
['several', 'versions', 'sandisk', 'readers', 'one', 'got', 'memory', 'stick', 'pro', 'duo', 'looking', 'sandisk', 'micromate', 'sd', 'look', 'posts', 'simmilar', 'discription', 'one', 'says', 'sd', 'pro', 'duo', 'another', 'draw', 'back', 'item', 'sizeis', 'huge', 'inserted', 'covers', 'half', 'usb', 'slot', 'firewire', 'port', 'laptop']

another is at index 23

After:
['several', 'versions', 'sandisk', 'readers', 'one', 'got', 'memory', 'stick', 'pro', 'duo', 'looking', 'sandisk', 'micromate', 'sd', 'look', 'posts', 'simmilar', 'discription', 'one', 'says', 'sd', 'pro', 'duo', 'not', 'draw', 'back', 'item', 'sizeis', 'huge', 'inserted', 'covers', 'half', 'usb', 'slot', 'firewire', 'port', 'laptop']


In [164]:
# There are still no reviews with the word "not" in them
for document in documents:
    for word in document:
        if word == 'not':
            print(word)
            print(document, "\n")

## NOT Handler
Handling 'not' cases

In [178]:
test_str = ['well', 'its', 'kinda', 'ok', 'but', 'not', 'ok']

print("Before:")
print("Number of words", len(test_str))
print(test_str, "\n")

for word in test_str:
    if word == 'not':
        print("\'not' is at index", test_str.index('not'), "\n")
        
        # Checks to see if there is a following word after 'not'
        if (test_str.index('not') + 1) < len(test_str):
            test_str[(test_str.index('not') + 1)] = 'not_' + test_str[(test_str.index('not') + 1)]
            test_str.pop(test_str.index('not'))
        else:
            test_str.pop(test_str.index('not'))

print("After:")
print("Number of words", len(test_str))
print(test_str)     

Before:
Number of words 7
['well', 'its', 'kinda', 'ok', 'but', 'not', 'ok'] 

'not' is at index 5 

After:
Number of words 6
['well', 'its', 'kinda', 'ok', 'but', 'not_ok']


In [180]:
test_str = ['well', 'its', 'not', 'kinda', 'ok', 'but', 'not', 'ok']

print("Before:")
print("Number of words", len(test_str))
print(test_str, "\n")

for word in test_str:
    if word == 'not':
        print("\'not' is at index", test_str.index('not'), "\n")
        
        # Checks to see if there is a following word after 'not'
        if (test_str.index('not') + 1) < len(test_str):
            if test_str[(test_str.index('not') + 1)] != 'not':
                test_str[(test_str.index('not') + 1)] = 'not_' + test_str[(test_str.index('not') + 1)]
                test_str.pop(test_str.index('not'))
            else:
                test_str.pop(test_str.index('not'))
                test_str.pop(test_str.index('not'))
        else:
            test_str.pop(test_str.index('not'))


print("After:")
print("Number of words", len(test_str))
print(test_str)     

Before:
Number of words 8
['well', 'its', 'not', 'kinda', 'ok', 'but', 'not', 'ok'] 

'not' is at index 2 

'not' is at index 5 

After:
Number of words 6
['well', 'its', 'not_kinda', 'ok', 'but', 'not_ok']


In [265]:
test_str = ['well', 'its', 'not', 'kinda', 'ok', 'but', 'ok', 'not', 'not', 'not', 'not']

print("Before:")
print("Number of words", len(test_str))
print(test_str, "\n")

i = 0
while i < len(test_str):
    if test_str[i] == 'not':
#         print("\'not' is at index", test_str.index('not'), ', i=', i, "\n")
        
        # Checks to see if there is a following word after 'not'
        if (test_str.index('not') + 1) < len(test_str):
            if test_str[(test_str.index('not') + 1)] != 'not':
                test_str[(test_str.index('not') + 1)] = 'not_' + test_str[(test_str.index('not') + 1)]
                test_str.pop(test_str.index('not'))
            else:
                while (i+1 < len(test_str)) & (test_str[i]=='not'):
                    test_str.pop(test_str.index('not'))
                    i-=1
        else:
            test_str.pop(test_str.index('not'))
    i+=1

print("After:")
print("Number of words", len(test_str))
print(test_str)     

Before:
Number of words 11
['well', 'its', 'not', 'kinda', 'ok', 'but', 'ok', 'not', 'not', 'not', 'not'] 

After:
Number of words 6
['well', 'its', 'not_kinda', 'ok', 'but', 'ok']


In [307]:
def not_handler(document):

    i = 0
    while i < len(document):
        if document[i] == 'not':
            # Checks to see if there is a following word after 'not'
            if (document.index('not') + 1) < len(document):
                if document[(document.index('not') + 1)] != 'not':
                    document[(document.index('not') + 1)] = 'not_' + document[(document.index('not') + 1)]
                    document.pop(document.index('not'))
                else:
                    while (i+1 < len(document)) & (document[i]=='not'):
                        document.pop(document.index('not'))
                        i-=1
            else:
                document.pop(document.index('not'))
        i+=1

    return(document)

In [269]:
test_str = ['well', 'its', 'not', 'kinda', 'ok', 'but', 'ok', 'not', 'not', 'not', 'not']

not_handler(test_str)

['well', 'its', 'not_kinda', 'ok', 'but', 'ok']

## N'T Handler
Handling n't cases (e.g., don't, didn't etc.)

In [294]:
test_str = ['its', 'kinda', 'ok', 'i', "don't", "know", "if", "we", "won't", "be", 'ok', "weren't"]

print("Before:")
print("Number of words", len(test_str))
print(test_str, "\n")

i = 0
while i < len(test_str):
    if "'" in test_str[i]:        
        # Checks to see if there is a following word after word ending in "n't"
        if (i+1) < len(test_str):
            test_str[i+1] = 'not_' + test_str[i+1]
            test_str.pop(i)
        else:
            test_str.pop(i)
    i+=1

print("After:")
print("Number of words", len(test_str))
print(test_str)     

Before:
Number of words 12
['its', 'kinda', 'ok', 'i', "don't", 'know', 'if', 'we', "won't", 'be', 'ok', "weren't"] 

After:
Number of words 9
['its', 'kinda', 'ok', 'i', 'not_know', 'if', 'we', 'not_be', 'ok']


In [304]:
def n_apostrophe_t_handler(document):

    i = 0
    while i < len(document):
        if "'" in document[i]:        
            # Checks to see if there is a following word after word ending in "n't"
            if (i+1) < len(document):
                document[i+1] = 'not_' + document[i+1]
                document.pop(i)
            else:
                document.pop(i)
        i+=1

    return(document)

In [305]:
test_str = ['its', 'kinda', 'ok', 'i', "don't", "know", "if", "we", "won't", "be", 'ok', "weren't"]

n_apostrophe_t_handler(test_str)

['its', 'kinda', 'ok', 'i', 'not_know', 'if', 'we', 'not_be', 'ok']

## Combining the functions

In [320]:
# 2. Handle negation: Look for the following two negations: "not" and "xxxn't" 
#    (e.g., don't, didn't etc.). Split off the "n't" part from "xxxn't" into 
#    separate words "xxx" and "not". Attach all 'not's (any normal "not" 
#    appearing in the text as well as the "n't" converted to "not") to the
#    subsequent word, eg "not_nextword" (code this yourself).


def negation_handler(document):

    i = 0
    while i < len(document):

        # Checks for not case
        if document[i] == 'not':
            # Checks to see if there is a following word after 'not'
            if (document.index('not') + 1) < len(document):
                if document[(document.index('not') + 1)] != 'not':
                    document[(document.index('not') + 1)] = 'not_' + document[(document.index('not') + 1)]
                    document.pop(document.index('not'))
                else:
                    while (i+1 < len(document)) & (document[i]=='not'):
                        document.pop(document.index('not'))
                        i-=1
            else:
                document.pop(document.index('not'))
                i-=1

        # Checks for word ending in "n't" case
        if "'" in document[i]:   
            # Checks to see if there is a following word after word ending in "n't"
            if (i+1) < len(document):
                document[i+1] = 'not_' + document[i+1]
                document.pop(i)
            else:
                document.pop(i)
        i+=1

    return(document)

In [321]:
test_str1 = ['its', 'kinda', 'ok', 'i', "don't", "know", "if", "we", "won't", "be", 'ok', "weren't"]
negation_handler(test_str1)


['its', 'kinda', 'ok', 'i', 'not_know', 'if', 'we', 'not_be', 'ok']

In [322]:
test_str2 = ['well', 'its', 'not', 'kinda', 'ok', 'but', 'ok', 'not', 'not', 'not', 'not']
negation_handler(test_str2)


['well', 'its', 'not_kinda', 'ok', 'but', 'ok']

In [323]:
test_str3 = ['well', 'its', 'not', 'kinda', 'ok', 'but', 'ok', 'not', 'not', 'not', 'not',
             'i', "don't", "know", "if", "we", "won't", "be", 'ok', "weren't"]
negation_handler(test_str3)

['well',
 'its',
 'not_kinda',
 'ok',
 'but',
 'ok',
 'not_i',
 'not_know',
 'if',
 'we',
 'not_be',
 'ok']

In [347]:
negated_documents = []

# Handles negations
for document in documents:
    negated_document = negation_handler(document)
    negated_documents.append(negated_document)


# Question 3

In [327]:
porter = PorterStemmer()

# provide a word to be stemmed
print("Porter Stemmer")
print(porter.stem("cats"))
print(porter.stem("trouble"))
print(porter.stem("troubling"))

Porter Stemmer
cat
troubl
troubl


In [328]:
test_str

['its', 'kinda', 'ok', 'i', 'not_know', 'if', 'we', 'not_be', 'ok']

In [329]:
for word in test_str:
    print(word, "becomes", porter.stem(word))

its becomes it
kinda becomes kinda
ok becomes ok
i becomes i
not_know becomes not_know
if becomes if
we becomes we
not_be becomes not_b
ok becomes ok


In [332]:
print(example_text)

['several', 'versions', 'sandisk', 'readers', 'one', 'got', 'memory', 'stick', 'pro', 'duo', 'looking', 'sandisk', 'micromate', 'sd', 'look', 'posts', 'simmilar', 'discription', 'one', 'says', 'sd', 'pro', 'duo', 'not', 'draw', 'back', 'item', 'sizeis', 'huge', 'inserted', 'covers', 'half', 'usb', 'slot', 'firewire', 'port', 'laptop']


In [333]:
for word in example_text:
    print(word, "becomes", porter.stem(word))

several becomes sever
versions becomes version
sandisk becomes sandisk
readers becomes reader
one becomes one
got becomes got
memory becomes memori
stick becomes stick
pro becomes pro
duo becomes duo
looking becomes look
sandisk becomes sandisk
micromate becomes microm
sd becomes sd
look becomes look
posts becomes post
simmilar becomes simmilar
discription becomes discript
one becomes one
says becomes say
sd becomes sd
pro becomes pro
duo becomes duo
not becomes not
draw becomes draw
back becomes back
item becomes item
sizeis becomes sizei
huge becomes huge
inserted becomes insert
covers becomes cover
half becomes half
usb becomes usb
slot becomes slot
firewire becomes firewir
port becomes port
laptop becomes laptop


In [337]:
# 3. Stem all words using Porter 1979 (use package to do this).

def porter_stemmer(document):
    stemmed_documents = []
    for word in document:
        stemmed_documents.append(porter.stem(word))
    return(stemmed_documents)


In [346]:
# Testing the function

for document in documents[:4]:
    print(document, "\n\n", porter_stemmer(document), "\n\n")

['corey', 'barker', 'great', 'job', 'explaining', 'blend', 'modes', 'dvd', 'kelby', 'training', 'videos', 'great', 'pricey', 'buy', 'individually', 'really', 'want', 'bang', 'buck', 'subscribe', 'kelby', 'training', 'online'] 

 ['corey', 'barker', 'great', 'job', 'explain', 'blend', 'mode', 'dvd', 'kelbi', 'train', 'video', 'great', 'pricey', 'buy', 'individu', 'realli', 'want', 'bang', 'buck', 'subscrib', 'kelbi', 'train', 'onlin'] 


['adapter', 'perfect', 'machines', 'without', 'memory', 'stick', 'readers', 'bought', 'dad', 'imac', 'sony', 'digital', 'cam', 'adapter', 'works', 'perfectly', 'imac', 'reads', 'sony', 'memory', 'stick', 'flawlessly'] 

 ['adapt', 'perfect', 'machin', 'without', 'memori', 'stick', 'reader', 'bought', 'dad', 'imac', 'soni', 'digit', 'cam', 'adapt', 'work', 'perfectli', 'imac', 'read', 'soni', 'memori', 'stick', 'flawlessli'] 


['connect', 'immediately', 'began', 'work', 'picture', 'transfers', 'never', 'easier', 'recommended', 'buying', 'product', 'righ

In [358]:
stemmed_documents = []

# Stems negated documents
for document in negated_documents:
    stemmed_document = porter_stemmer(document)
    stemmed_documents.append(stemmed_document)


In [360]:
stemmed_documents[5]

['sever',
 'version',
 'sandisk',
 'reader',
 'one',
 'got',
 'memori',
 'stick',
 'pro',
 'duo',
 'look',
 'sandisk',
 'microm',
 'sd',
 'look',
 'post',
 'simmilar',
 'discript',
 'one',
 'say',
 'sd',
 'pro',
 'duo',
 'anoth',
 'draw',
 'back',
 'item',
 'sizei',
 'huge',
 'insert',
 'cover',
 'half',
 'usb',
 'slot',
 'firewir',
 'port',
 'laptop']

# Question 4

In [374]:
# Makes a single array from all the sub-arrays (aka documents)
# Uses only the first 5 documents
np.hstack(stemmed_documents[:5])

['corey' 'barker' 'great' 'job' 'explain' 'blend' 'mode' 'dvd' 'kelbi'
 'train' 'video' 'great' 'pricey' 'buy' 'individu' 'realli' 'want' 'bang'
 'buck' 'subscrib' 'kelbi' 'train' 'onlin' 'adapt' 'perfect' 'machin'
 'without' 'memori' 'stick' 'reader' 'bought' 'dad' 'imac' 'soni' 'digit'
 'cam' 'adapt' 'work' 'perfectli' 'imac' 'read' 'soni' 'memori' 'stick'
 'flawlessli' 'connect' 'immedi' 'began' 'work' 'pictur' 'transfer'
 'never' 'easier' 'recommend' 'buy' 'product' 'right' 'work' 'mac' 'pc'
 'card' 'reader' 'good' 'thing' 'small' 'easili' 'portabl' 'reason'
 'purchas' 'work' 'great' 'recommend' 'ship' 'wrong' 'product' 'until'
 'ship' 'wrong' 'one' 'back' 'would' 'ship' 'right' 'one']


In [391]:
# Casts to pandas Series and creates the frequency count for word stems in reviews
# Uses only the first 5 documents
pd.DataFrame(pd.Series(np.hstack(stemmed_documents[:5])).value_counts(), columns=["count"])

Unnamed: 0,count
work,4
ship,3
great,3
stick,2
reader,2
product,2
memori,2
soni,2
buy,2
right,2


In [392]:
# 4. Create frequency count table for all word stems in all reviews (this will be big). 
#    Use the most frequent 500 words to define the word vector. Create a bag-of-word 
#    vector representation for each review in electronics.

frequency_count = pd.DataFrame(pd.Series(np.hstack(stemmed_documents)).value_counts(), columns=["count"])
frequency_count = frequency_count.reset_index()
frequency_count = frequency_count.rename({"index": "word"}, axis=1)


In [401]:
frequency_count.head()

Unnamed: 0,word,count
0,use,293705
1,work,251152
2,one,195244
3,great,182476
4,get,149235


In [467]:
# Uses the most frequent 500 words to define the word vector
word_vector = frequency_count.head(500)

print("Word vector has", word_vector.shape[0], "words")

word_vector.head()

Word vector has 500 words


Unnamed: 0,word,count
0,use,293705
1,work,251152
2,one,195244
3,great,182476
4,get,149235


In [480]:
# Uses the most frequent 500 words to define the word vector (vocabulary)
# and previous the first 10 words in the vocabulary
vocabulary = word_vector.word.tolist()
print(vocabulary[:10])

['use', 'work', 'one', 'great', 'get', 'good', 'would', 'like', 'product', 'sound']


In [474]:
def get_bag_of_words(document, vocabulary):
    bag_of_words = {}

    for word in vocabulary:
        bag_of_words[word] = 0

        if word in document:
            bag_of_words[word] = 1

    return bag_of_words

- [cat, sat, hat]
    - the cats sat=[1,1,0]

In [477]:
# Creates a bag-of-word vector representation for the first 15 reviews in electronics
for document in stemmed_documents[:15]:
    print(get_bag_of_words(document, vocabulary), "\n")

{'use': 0, 'work': 0, 'one': 0, 'great': 1, 'get': 0, 'good': 0, 'would': 0, 'like': 0, 'product': 0, 'sound': 0, 'camera': 0, 'time': 0, 'well': 0, 'need': 0, 'price': 0, 'qualiti': 0, 'look': 0, 'bought': 0, 'buy': 1, 'cabl': 0, 'purchas': 0, 'also': 0, 'want': 1, 'problem': 0, 'even': 0, 'much': 0, 'realli': 1, 'tv': 0, 'make': 0, 'easi': 0, 'set': 0, 'batteri': 0, 'speaker': 0, 'pictur': 0, 'go': 0, 'unit': 0, 'littl': 0, 'better': 0, 'card': 0, 'drive': 0, 'thing': 0, 'take': 0, 'comput': 0, 'back': 0, 'case': 0, 'recommend': 0, 'got': 0, 'year': 0, 'player': 0, 'new': 0, 'tri': 0, 'could': 0, 'still': 0, 'review': 0, 'nice': 0, 'first': 0, 'fit': 0, 'power': 0, 'two': 0, 'love': 0, 'come': 0, 'plug': 0, 'connect': 0, 'ipod': 0, 'video': 1, 'instal': 0, 'mount': 0, 'way': 0, 'screen': 0, 'devic': 0, 'say': 0, 'day': 0, 'small': 0, 'replac': 0, 'seem': 0, 'right': 0, 'laptop': 0, 'best': 0, 'lot': 0, 'turn': 0, 'find': 0, 'light': 0, 'system': 0, 'without': 0, 'receiv': 0, 'put': 0

In [492]:
document_bags = [] # list of dictionaries

# Creates a bag-of-word vector representation for all reviews in electronics
for document in stemmed_documents:
    document_bags.append(get_bag_of_words(document, vocabulary))


In [496]:
print(stemmed_documents[400])

['problem', 'product', 'fit', 'mr', 'buddi', 'heater', 'work', 'like', 'suppos', 'want', 'use', 'fan', 'portion', 'big', 'buddi']


In [493]:
document_bags[400].keys()

dict_keys(['use', 'work', 'one', 'great', 'get', 'good', 'would', 'like', 'product', 'sound', 'camera', 'time', 'well', 'need', 'price', 'qualiti', 'look', 'bought', 'buy', 'cabl', 'purchas', 'also', 'want', 'problem', 'even', 'much', 'realli', 'tv', 'make', 'easi', 'set', 'batteri', 'speaker', 'pictur', 'go', 'unit', 'littl', 'better', 'card', 'drive', 'thing', 'take', 'comput', 'back', 'case', 'recommend', 'got', 'year', 'player', 'new', 'tri', 'could', 'still', 'review', 'nice', 'first', 'fit', 'power', 'two', 'love', 'come', 'plug', 'connect', 'ipod', 'video', 'instal', 'mount', 'way', 'screen', 'devic', 'say', 'day', 'small', 'replac', 'seem', 'right', 'laptop', 'best', 'lot', 'turn', 'find', 'light', 'system', 'without', 'receiv', 'put', 'think', 'usb', 'headphon', 'anoth', 'music', 'differ', 'play', 'month', 'amazon', 'sinc', 'around', 'know', 'featur', 'order', 'see', 'fine', 'old', 'long', 'enough', 'item', 'charg', 'hard', 'far', 'len', 'mous', 'run', 'mani', 'bit', 'expect',

In [494]:
document_bags[400].values()

dict_values([1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 

In [500]:
bag_of_words_df = pd.DataFrame(document_bags, index=reviews_data['reviewerID'], columns=vocabulary)
bag_of_words_df.head(10)

Unnamed: 0_level_0,use,work,one,great,get,good,would,like,product,sound,...,media,improv,disc,smaller,later,pack,caus,internet,fall,someon
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AKM1MP6P0OYPR,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A2GGBTOAGKO0VE,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A2OC5SYI7THCR6,0,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
ALWL0AJ94O8RJ,0,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A1XTY9T4FYKT7S,0,0,1,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
AC4IGGPO6HY0M,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AUPNX6P6DQCT1,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A5G11Q0FLWMCK,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A3V891TW1F3P9T,1,0,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AB7BV8O5IVU27,1,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


# Question 5

In [539]:
# 5. Establish a method for measuring the distance between different reviews. 
#    Print the mutual distance between the first 100 reviews (review IDs) to 
#    screen, sorted from closest to furthest.

rewiew_distances = pd.DataFrame(euclidean_distances(bag_of_words_df.head(100), bag_of_words_df.head(100)),
                                index=reviews_data.loc[:99, "reviewerID"], 
                                columns=reviews_data.loc[:99, "reviewerID"])
rewiew_distances.head()

reviewerID,AKM1MP6P0OYPR,A2GGBTOAGKO0VE,A2OC5SYI7THCR6,ALWL0AJ94O8RJ,A1XTY9T4FYKT7S,AC4IGGPO6HY0M,AUPNX6P6DQCT1,A5G11Q0FLWMCK,A3V891TW1F3P9T,AB7BV8O5IVU27,...,A2EZNCU82BWZ3G,A2TZFTHOSBDRRZ,A1FK81HNJI5F5W,A14Z6D3IRJ23F7,A1BKDM9SCYRU3J,A2FXBWR4T4OFQ,A23FWFLIALY2HO,A2MYCM10JM7URK,A23QB16B7HJFU1,AXJL280X6VMEM
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AKM1MP6P0OYPR,0.0,4.472136,4.123106,4.123106,3.872983,5.09902,4.582576,4.358899,6.164414,5.744563,...,5.567764,3.464102,4.582576,6.480741,5.385165,5.0,3.605551,6.708204,7.141428,6.0
A2GGBTOAGKO0VE,4.472136,0.0,4.582576,4.582576,4.358899,5.09902,5.0,3.872983,6.0,5.91608,...,6.244998,4.242641,5.385165,6.928203,5.567764,5.385165,4.358899,7.0,7.549834,6.324555
A2OC5SYI7THCR6,4.123106,4.582576,0.0,4.242641,3.741657,5.385165,4.898979,4.472136,6.403124,6.0,...,5.830952,4.123106,5.09902,6.557439,5.477226,5.09902,4.242641,6.63325,7.483315,6.082763
ALWL0AJ94O8RJ,4.123106,4.582576,4.242641,0.0,4.242641,5.385165,4.472136,4.242641,6.082763,6.164414,...,5.656854,4.123106,5.09902,6.403124,5.656854,5.09902,3.741657,6.928203,7.483315,6.082763
A1XTY9T4FYKT7S,3.872983,4.358899,3.741657,4.242641,0.0,4.582576,4.690416,4.242641,6.082763,5.656854,...,5.656854,3.605551,4.690416,6.244998,5.291503,4.690416,3.741657,6.78233,7.071068,5.91608


# Rajasa's Part

In [1]:
# 6. Run a PCA and graph the first two PCs for the first 100 reviews. Does your
#    graph reflect your findings from the previous exercise?
input_vector = pd.DataFrame(document_bags,columns=vocabulary)
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(input_vector[:100])
principalDf = pd.DataFrame(data = principalComponents
             , columns = ['principal component 1', 'principal component 2'])
print(pca.explained_variance_ratio_)
principalDf.head()


NameError: name 'pd' is not defined

In [None]:
# 7. Is it possible to use logistic regression to predict the rating (5 levels)
#    of a product? Why may it be advantageous to use a logistic regression to 
#    predict the 5-star rating over MNL with 5 categories?



In [None]:
# 8. Perform a lasso logistic regression and measure the out-of-sample accuracy
#    of your method of choice.

#creating the dataframe for logistics regression
log_vector = input_vector.copy()
log_vector['rating'] = reviews_data['overall']
log_vector.head()

In [None]:
#Splitting the data to training & test
X = log_vector.drop(['rating'],axis = 1)
y = log_vector['rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
#Performing lasso logistics regression
lassolog = LogisticRegressionCV(n_jobs=2, penalty='l1', solver='liblinear', cv=10, scoring = 'accuracy', random_state=0)
lassolog.fit(X_train, y_train)

In [None]:
#measuring the out-of-sample performance
lassolog.score(X_test, y_test)

In [None]:
# --- 0.5 bonus points if the following methods are run on all reviews
# (across all the files). Please just continue using the reviews in 
# your assigned file otherwise. ---

# 9. Implement a method to aggregate reviews by product. Can you use any
# of the other columns to help with aggregation? Explain why or why not. 
# Please clearly explain your method.
new_vector = input_vector.copy()
new_vector['product_category'] = reviews_data['asin']
new_vector['product_rating'] = reviews_data['overall']
product_vector = pd.DataFrame(new_vector.groupby(["product_category"]).mean())

In [None]:
# 10. Establish a method for measuring the distance between different products.
product_distances = pd.DataFrame(euclidean_distances(product_vector, product_vector),index = product_vector.index,columns = product_vector.index)
product_distances.head()