#### Get spreadsheet names

In [1]:
!ls

aspects_B002XTCAXG.csv	B01J8B219O.csv	B07TJ41TMC.csv
B000LJWV4S.csv		B06VTLLC13.csv	B07TZ3BMKW.csv
B002XTCAXG.csv		B07568DFCH.csv	B07X5TTTJT.csv
B00GZYNX8G.csv		B07DQKV38W.csv	requirements.txt
B00OYKXTDW.csv		B07G3SJPLZ.csv	Sentence_Partition.ipynb
B010ESCLHW.csv		B07JCFY3N4.csv	Untitled.ipynb


### Import Libraries

In [2]:
import pandas as pd
import time
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize


#### Requirement file

In [3]:
# !pip install -r requirements.txt
# !pip freeze > requirements.txt



### Read a spreadsheet

In [4]:
filename = "B002XTCAXG.csv"
df = pd.read_csv(filename)
print(f"Original shape: {df.shape}")

# Keep column reviews only
df = df[['reviews']]
print(f"New shape: {df.shape}")
df.head(3)


Original shape: (80, 19)
New shape: (80, 1)


Unnamed: 0,reviews
0,this thing lights up the night sky! bought it ...
1,"I own three Streamlight weapon-mounted lights,..."
2,Great flashlight that is bright and lasts! Fir...


#### Partition reviews into sentences

In [8]:
reviews_corpus = list(df['reviews'])
# Partition into sentences
reviews_in_sentences = [sent_tokenize(review.lower()) for review in reviews_corpus]
reviews_length = [len(review) for review in reviews_in_sentences]
print(f"Number of reviews: {len(reviews_in_sentences)}")
print(f"Number of sentences in each review: {reviews_length}")


Number of reviews: 80
Number of sentences in each review: [4, 10, 14, 7, 11, 7, 8, 6, 18, 4, 9, 4, 4, 6, 6, 4, 7, 1, 1, 1, 1, 4, 7, 2, 1, 1, 2, 6, 4, 2, 5, 11, 5, 4, 3, 7, 1, 1, 3, 3, 7, 1, 3, 3, 2, 1, 2, 5, 4, 2, 3, 1, 3, 4, 4, 1, 1, 1, 2, 6, 3, 3, 2, 2, 5, 4, 1, 1, 2, 2, 2, 1, 2, 2, 1, 1, 3, 2, 2, 2]


### !!! Need extra work on this!!!
#### Use Fliashlight corpus for matching sentences into aspects
https://docs.google.com/document/d/1ZiQVK4czqH0UGWZM1XaEhribQUT0xkI71Xh_aCGtw1E/edit


In [9]:
size_keywords_string = "Size, small, tiny, little, petite, slim, compact, large, big, giant,\
huge, enormous, gigantic, bulky, colossal, massive, sizable, weight, heavy, lightweight, cumbersome"

quality_keywords_string = "Build, built, quality, sturdy, sturdiness, durable, tough, scratch, scratch,\
coating, solid, feel cheap, cheaply, aluminum, steel, titanium, brass, copper,\
material, metal, rubber, plastic, nylon, bent, broke, faulty, shatter,\
waterproof, dustproof, corrosion, ingress, drop, shock, impact, resistance,\
screws, threads, knurling, anodized, flicker, housing, indestructible,\
wet, temperature, hot, heat, overheat, cold, bezel"

battery_keywords_string = "Battery, batteries, rechargeable, charge, charging,\
recharge, USB, solar, runtime, hours, lifetime, dies, died, dead"

design_keywords_string = "Features, design, setting, settings, mode, modes,\
interface, programmable, memory, dim, roll, design, roll, upright, stand, tailstanding, \
strobe, sos, float, warranty, grip, rotate, rotating, head, hang, lantern, eco, adjustable,\
zoom, clip, lanyard, holster, indicator, easy to use, versatile, switch, twisty twist,\
activation, clicky, click, magnetize, accessories"

beam_keywords_string = "Power, project, projects, far, illuminate, shine,\
focus, distance, range, feet, meters, beam, distance, visibility, throw, \
flood, lumens, bright, lens, optics, frosted, reflector, mule, LED, tint, \
colour, color, hotspot, spill, corona, lux, candelas, intensity, lights"

price_keywords_string = "Price, cost, costly, pricey, expensive, overpriced, reasonable,\
unreasonable, value, affordable, cheap, $, bargain, budget, cash, discount, money, sale"


size_keywords = size_keywords_string.lower().replace(" ","").split(",")
quality_keywords = quality_keywords_string.lower().replace(" ","").split(",")
battery_keywords = battery_keywords_string.lower().replace(" ","").split(",")
design_keywords = design_keywords_string.lower().replace(" ","").split(",")
beam_keywords = beam_keywords_string.lower().replace(" ","").split(",")
price_keywords = price_keywords_string.lower().replace(" ","").split(",")


### Helper functions

In [10]:
# checkPresence takes in:
# a sentence: represented by a string
# keywords: a list of keywords
# returns True if the sentence contains any of the keywords
def checkPresence(sentence, keywords):
    for keyword in keywords:
        if keyword in word_tokenize(sentence):
            return True
    return False

# checkPresence takes in:
# reviews_in_sentences: a list of list of sentences
#  (A review is represented by a list of sentences)
# keywords: a list of keywords
# returns: a list of filtered review which contains the keywords.
#         an empty string for a review that contains no keyword.
def filteredReview(reviews_in_sentences, keywords):
    ret = []
    for sentences in reviews_in_sentences:
        filtered = ''
        for sentence in sentences:
            if checkPresence(sentence, keywords):
                filtered += sentence 
        ret.append(filtered)
    return ret


### Match sentences into aspects

Took 4 seconds to process 80 reviews. (Need 250 seconds for 5000 reviews)

In [11]:
start = time.time()
df['size'] = filteredReview(reviews_in_sentences, size_keywords)
df['quality'] = filteredReview(reviews_in_sentences, quality_keywords)
df['battery'] = filteredReview(reviews_in_sentences, battery_keywords)
df['design'] = filteredReview(reviews_in_sentences, design_keywords)
df['beam'] = filteredReview(reviews_in_sentences, beam_keywords)
df['price'] = filteredReview(reviews_in_sentences, price_keywords)
end = time.time()
print(f"Took {end - start} seconds to match sentences into aspects.")


Took 3.7895147800445557 seconds to match sentences into aspects.


In [12]:
df.head()

Unnamed: 0,reviews,size,quality,battery,design,beam,price
0,this thing lights up the night sky! bought it ...,,,,,this thing lights up the night sky!,
1,"I own three Streamlight weapon-mounted lights,...",i consider this flashlight to be a good size a...,,i bought this because i needed a rechargeable ...,i like the three intensity settings and the st...,"i own three streamlight weapon-mounted lights,...",
2,Great flashlight that is bright and lasts! Fir...,"it's well balanced in the hand, and not to hea...","as far as quality and durability, it has a sol...","depending on which you use, will depend on you...",it has three settings of power.but this is my ...,great flashlight that is bright and lasts!it h...,
3,I bought mine back in 2012 with the piggyback ...,,the only thing that keeps me from giving 5 sta...,"piggyback charge is very nice, and of course t...",normally the highest setting is too bright to ...,it is so bright that while testing it out the ...,
4,I do home inspections and use my flashlight 5-...,,,,"for this particular model, i also love that it...",the bright white light from the led bulb is in...,


In [13]:
df.to_csv('aspects_'+filename)