#### Requirement file

In [1]:
# !pip install -r requirements.txt
!pip freeze > requirements.txt


#### Import Libraries

In [2]:
import pandas as pd
import numpy as np
import time
import nltk
import pdb
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from textblob import TextBlob
from textblob import Blobber
from textblob.sentiments import PatternAnalyzer
pa = Blobber(analyzer=PatternAnalyzer())

#### Get spreadsheet names

In [3]:
!ls main_product/ > lights.txt
with open('lights.txt','r') as l:
    csv_names = [f.strip() for f in l.read().split('\n') if f != '']


### !!! May Need extra work on this!!!
#### Use Fliashlight corpus for matching sentences into aspects
https://docs.google.com/document/d/1ZiQVK4czqH0UGWZM1XaEhribQUT0xkI71Xh_aCGtw1E/edit


In [4]:
size_keywords_string = "Size, small, tiny, petite, slim, compact, large, big, giant,\
huge, enormous, gigantic, bulky, colossal, massive, sizable, weight, heavy, lightweight, cumbersome"

quality_keywords_string = "Build, built, quality,durability, sturdy, sturdiness, durable, tough, scratch, scratch,\
coating, solid, cheaply, aluminum, steel, titanium, brass, copper,\
material, metal, rubber, plastic, nylon, bent, broke, faulty, shatter,\
waterproof, dustproof, corrosion, ingress, drop, shock, impact, resistance,\
screws, threads, knurling, anodized, flicker, housing, indestructible,\
wet, temperature, hot, heat, overheat, cold,  well"

battery_keywords_string = "Battery, batteries, rechargeable, charge, charging,\
recharge, USB, solar, runtime, hours, lifetime, dies, died, dead"

design_keywords_string = "Features, design, setting, settings, mode, modes,\
interface, programmable, memory, dim, roll, design, roll, upright, stand, tailstanding, \
strobe, sos, float, warranty, grip, rotate, rotating, head, hang, lantern, eco, adjustable,\
zoom, clip, lanyard, holster, indicator, easy to use, versatile, switch, twist,\
activation, clicky, click, magnetize, accessories, bezel"

beam_keywords_string = "Power, project, projects, far, illuminate, shine,\
focus, distance, range, feet, meters, beam, distance, visibility, throw, \
flood, lumens, bright, brightness, lens, optics, frosted, reflector, mule, LED, tint, \
colour, color, hotspot, spill, corona, lux, candelas, intensity, lights"

price_keywords_string = "Price, cost, costly, pricey, pricy, expensive, overpriced, reasonable,\
unreasonable, value, affordable, cheap, $, bargain, budget, cash, discount, money, sale"


size_keywords = size_keywords_string.lower().replace(" ","").split(",")
quality_keywords = quality_keywords_string.lower().replace(" ","").split(",")
battery_keywords = battery_keywords_string.lower().replace(" ","").split(",")
design_keywords = design_keywords_string.lower().replace(" ","").split(",")
beam_keywords = beam_keywords_string.lower().replace(" ","").split(",")
price_keywords = price_keywords_string.lower().replace(" ","").split(",")

#### Helper function

In [5]:
# checkPresence takes in:
# a sentence: represented by a string
# keywords: a list of keywords
# returns True if the sentence contains any of the keywords
def checkPresence(sentence, keywords):
    for keyword in keywords:
        if keyword in word_tokenize(sentence):
            return True
    return False

# checkPresence takes in:
# reviews_in_sentences: a list of list of sentences
#  (A review is represented by a list of sentences)
# keywords: a list of keywords
# returns: a list of filtered review which contains the keywords.
#         an empty string for a review that contains no keyword.
def filteredReview(reviews_in_sentences, keywords):
    ret = []
    for sentences in reviews_in_sentences:
        filtered = ''
        for sentence in sentences:
            if checkPresence(sentence, keywords):
                filtered += sentence 
        ret.append(filtered)
    return ret


def find_sentiment(sentence):
    if sentence == '':
        return float("nan")
    else:
        return (pa(sentence).sentiment[0] > 0)*1


def absa_one_product(csv_name):
    # load data
    filename = 'main_product/' + csv_name
    df = pd.read_csv(filename, dtype=str,na_filter = False)
    
    # get product info
    product_id = df['product_id'][0]
    product_name = df['product_name'][0]
    price = df['price'][0]
    review_count = df.shape[0]
    product_info = (product_id, product_name, price, review_count)
    
    # parse the sentence
    reviews_corpus = list(df['reviews'])
    #pdb.set_trace()
    reviews_in_sentences = [sent_tokenize(review.lower()) for review in reviews_corpus]

    # match sentences into aspects
    df = df[[]]
    df['size'] = filteredReview(reviews_in_sentences, size_keywords)
    df['quality'] = filteredReview(reviews_in_sentences, quality_keywords)
    df['battery'] = filteredReview(reviews_in_sentences, battery_keywords)
    df['design'] = filteredReview(reviews_in_sentences, design_keywords)
    df['beam'] = filteredReview(reviews_in_sentences, beam_keywords)
    df['price'] = filteredReview(reviews_in_sentences, price_keywords)
    
    # perform sentiment analysis
    df = df.applymap(find_sentiment)
    
    # take the averages
    df = df.mean(axis = 0).to_frame().T
    #print(df.shape)
    df['product_id'] = product_id
    df['product_name'] = product_name
    df['sale_price'] = price
    df['review count'] = review_count
    #print(df.shape)
    
    return df

#### Run for all products

In [6]:
df = pd.DataFrame()
# Run absa for all products
for i, csv_name in enumerate(csv_names):
    print(f'Running prouct {i}')
    start = time.time()
    ret = absa_one_product(csv_name)
    df= df.append(ret)
    end = time.time()
    print(f"Finished running prouct {i}, took {round(end - start,1)} seconds.")

Running prouct 0
Finished running prouct 0, took 9.1 seconds.
Running prouct 1
Finished running prouct 1, took 19.9 seconds.
Running prouct 2
Finished running prouct 2, took 21.3 seconds.
Running prouct 3
Finished running prouct 3, took 7.3 seconds.
Running prouct 4
Finished running prouct 4, took 9.6 seconds.
Running prouct 5
Finished running prouct 5, took 4.7 seconds.
Running prouct 6
Finished running prouct 6, took 6.1 seconds.
Running prouct 7
Finished running prouct 7, took 9.2 seconds.
Running prouct 8
Finished running prouct 8, took 4.3 seconds.
Running prouct 9
Finished running prouct 9, took 2.7 seconds.
Running prouct 10
Finished running prouct 10, took 26.1 seconds.
Running prouct 11
Finished running prouct 11, took 52.3 seconds.
Running prouct 12
Finished running prouct 12, took 15.2 seconds.
Running prouct 13
Finished running prouct 13, took 14.4 seconds.
Running prouct 14
Finished running prouct 14, took 12.2 seconds.
Running prouct 15
Finished running prouct 15, took 9.

In [7]:
df = df.set_index('product_id')
df

Unnamed: 0_level_0,size,quality,battery,design,beam,price,product_name,sale_price,review count
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
B001NZO85O,0.571429,0.735849,0.611111,0.750000,0.804878,0.833333,Fenix Flashlight Headband (Fits Lights with 18...,36.47,139
B005CWRB44,0.893204,0.777778,0.608108,0.763889,0.842105,0.735294,Fenix Compact 140 Lumen Flashlight,81.71,269
B0062PVSGW,0.804124,0.755102,0.698795,0.840336,0.927711,0.737500,Fenix Pd32-R5 Cree Xp-G Led Flashlight,129.65,275
B0091TRPVI,0.878049,0.743590,0.878788,0.647059,0.857143,0.800000,Fenix E25 Flashlight-187 Lumens,118.45,99
B00937X7G0,0.700000,0.833333,0.680000,0.514286,0.714286,0.777778,Nitecore MT2A CREE XP-G R5 LED 280 Lumen Multi...,59.94,165
...,...,...,...,...,...,...,...,...,...
B0841RSDCR,0.833333,0.866667,0.727273,0.666667,0.869565,0.400000,Nitecore E4K 4400 Lumen high powered Flashligh...,110.00,38
B086PW9TTP,1.000000,1.000000,1.000000,0.000000,0.666667,0.500000,"ACEBEAM E10 LED Flashlight, 760 Lumens, Long T...",73.55,14
B087CG1YW6,0.600000,0.833333,0.692308,0.684211,0.911765,0.777778,Fenix PD40R v2 3000 Lumen Mechanical Rotary Sw...,,52
B08BTQ2T4C,0.909091,0.666667,0.923077,1.000000,0.956522,1.000000,Fenix E03R 260 Lumen Rechargeable EDC Keychain...,,31


In [8]:
df.to_csv('product_level_ABSA')