In [None]:
import pandas as pd
import numpy as np
import time
import math
import nltk
nltk.download('punkt')
import pdb
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from textblob import TextBlob
from textblob import Blobber
from textblob.sentiments import PatternAnalyzer
# Shared TextBlob factory built with the PatternAnalyzer; find_sentiment calls
# pa(text) and reads the first element of the resulting blob's .sentiment.
pa = Blobber(analyzer=PatternAnalyzer())

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Comma-separated keyword vocabularies, one per product aspect.
# NOTE: the backslash continuations sit *inside* the string literals, so any
# leading whitespace on a continued line would become part of the string.
size_keywords_string = "Size, small, tiny, petite, slim, compact, large, big, giant,\
huge, enormous, gigantic, bulky, colossal, massive, sizable, weight, heavy, lightweight, cumbersome"

quality_keywords_string = "Build, built, quality,durability, sturdy, sturdiness, durable, tough, scratch, scratch,\
coating, solid, cheaply, aluminum, steel, titanium, brass, copper,\
material, metal, rubber, plastic, nylon, bent, broke, faulty, shatter,\
waterproof, dustproof, corrosion, ingress, drop, shock, impact, resistance,\
screws, threads, knurling, anodized, flicker, housing, indestructible,\
wet, temperature, hot, heat, overheat, cold,  well"

battery_keywords_string = "Battery, batteries, rechargeable, charge, charging,\
recharge, USB, solar, runtime, hours, lifetime, dies, died, dead"

design_keywords_string = "Features, design, setting, settings, mode, modes,\
interface, programmable, memory, dim, roll, design, roll, upright, stand, tailstanding, \
strobe, sos, float, warranty, grip, rotate, rotating, head, hang, lantern, eco, adjustable,\
zoom, clip, lanyard, holster, indicator, easy to use, versatile, switch, twist,\
activation, clicky, click, magnetize, accessories, bezel"

beam_keywords_string = "Power, project, projects, far, illuminate, shine,\
focus, distance, range, feet, meters, beam, distance, visibility, throw, \
flood, lumens, bright, brightness, lens, optics, frosted, reflector, mule, LED, tint, \
colour, color, hotspot, spill, corona, lux, candelas, intensity, lights"

price_keywords_string = "Price, cost, costly, pricey, pricy, expensive, overpriced, reasonable,\
unreasonable, value, affordable, cheap, $, bargain, budget, cash, discount, money, sale"


def _parse_keywords(raw):
    """Turn a comma-separated keyword string into a clean list of keywords.

    Each item is lower-cased, trimmed, and has runs of inner whitespace
    collapsed to a single space. Interior spaces are kept, so a phrase such as
    "easy to use" stays a phrase instead of being squashed into "easytouse".
    Empty items and repeated keywords are dropped; first-seen order is kept.
    """
    unique = {}
    for piece in raw.lower().split(","):
        keyword = " ".join(piece.split())
        if keyword:
            unique.setdefault(keyword, None)
    return list(unique)


size_keywords = _parse_keywords(size_keywords_string)
quality_keywords = _parse_keywords(quality_keywords_string)
battery_keywords = _parse_keywords(battery_keywords_string)
design_keywords = _parse_keywords(design_keywords_string)
beam_keywords = _parse_keywords(beam_keywords_string)
price_keywords = _parse_keywords(price_keywords_string)

In [None]:
def checkPresence(sentence, keywords):
    """Return True if `sentence` contains any of `keywords` as whole tokens.

    Args:
        sentence: the sentence to inspect, as a string.
        keywords: an iterable of keyword strings. Matching is exact and
            case-sensitive, so callers should normalise case beforehand.

    A single-word keyword matches when it equals one token of the sentence.
    A keyword made of several whitespace-separated words matches when those
    words appear as consecutive tokens.
    """
    if not keywords:
        return False

    # Tokenise once: the token list does not depend on the keyword, and
    # tokenising is by far the most expensive step here.
    tokens = word_tokenize(sentence)
    token_set = set(tokens)

    for keyword in keywords:
        if keyword in token_set:
            return True
        words = keyword.split()
        if len(words) > 1:
            span = len(words)
            last_start = len(tokens) - span
            if any(tokens[i:i + span] == words for i in range(last_start + 1)):
                return True
    return False

def filteredReview(reviews_in_sentences, keywords):
    """Keep, for every review, only the sentences that mention a keyword.

    Args:
        reviews_in_sentences: a list of reviews, each review being a list of
            sentence strings.
        keywords: a list of keywords passed on to checkPresence.

    Returns:
        A list with one string per review, in input order: the matching
        sentences joined by a single space, or an empty string when no
        sentence of that review matches.
    """
    # Join with a space so the last word of one sentence is not glued to the
    # first word of the next (e.g. "light.battery") before sentiment scoring.
    return [
        " ".join(
            sentence for sentence in sentences
            if checkPresence(sentence, keywords)
        )
        for sentences in reviews_in_sentences
    ]


def find_sentiment(sentence):
    """Classify a text snippet as positive (1) or not positive (0).

    An empty string means "aspect not mentioned" and yields NaN. Otherwise the
    text is run through the module-level `pa` blobber and the first element of
    its sentiment result is compared against zero: strictly positive gives 1,
    zero or negative gives 0.
    """
    if sentence != '':
        polarity = pa(sentence).sentiment[0]
        return 1 if polarity > 0 else 0
    return float("nan")


def scale(val, sent):
    """Turn a 0/1 sentiment flag into a signed score weighted by verification.

    Args:
        val: sentiment flag for one aspect: 1 for positive, anything else for
            not positive, NaN when the aspect was not mentioned.
        sent: the review's verified-purchase flag. May be a real boolean or
            its text form, since the CSV is loaded with every column as str.

    Returns:
        NaN when `val` is NaN; otherwise +1 / -1 for a verified purchase and
        +0.5 / -0.5 for an unverified one.
    """
    if math.isnan(float(val)):
        return float("nan")

    if isinstance(sent, str):
        # bool("False") is True, so text flags must be parsed explicitly.
        # An empty string (missing value) counts as unverified.
        # NOTE(review): confirm these spellings against the actual CSV values.
        not_verified_labels = {"", "0", "false", "f", "no", "n", "none", "nan"}
        verified = sent.strip().lower() not in not_verified_labels
    else:
        verified = bool(sent)

    weight = 1 if verified else 0.5
    return weight if float(val) == 1 else -weight


def absa_one_product(csv_name):
    """Run aspect-based sentiment analysis for one product's review file.

    Args:
        csv_name: file name of the product CSV inside the 'main_product/'
            directory. The file must provide the columns 'product_id',
            'product_name', 'price', 'reviews' and 'verified_purchase'.

    Returns:
        A one-row DataFrame holding, per aspect (size, quality, battery,
        design, beam, price), the mean 0/1 sentiment and, in the '<aspect>2'
        columns, the mean verification-weighted score from `scale`, followed
        by 'product_id', 'product_name', 'sale_price', 'review count' and
        'not verified'.

    Raises:
        ValueError: if the CSV contains no review rows.
    """
    # load data (everything as text; missing cells become empty strings)
    filename = 'main_product/' + csv_name
    reviews_df = pd.read_csv(filename, dtype=str, na_filter=False)
    if reviews_df.empty:
        raise ValueError(f"{filename} contains no reviews")

    # get product info (identical on every row, so read it from the first)
    first_row = reviews_df.iloc[0]
    product_id = first_row['product_id']
    product_name = first_row['product_name']
    price = first_row['price']
    review_count = len(reviews_df)

    # The flag is text here, so "False" must be parsed rather than bool()-ed
    # (bool("False") is True). Empty means unknown and counts as unverified.
    # NOTE(review): confirm these spellings against the actual CSV values.
    not_verified_labels = {'', '0', 'false', 'f', 'no', 'n', 'none', 'nan'}
    verified = [
        str(flag).strip().lower() not in not_verified_labels
        for flag in reviews_df['verified_purchase']
    ]
    not_verified_count = verified.count(False)

    # parse each review into lower-cased sentences
    reviews_in_sentences = [
        sent_tokenize(review.lower()) for review in reviews_df['reviews']
    ]

    aspect_keywords = {
        'size': size_keywords,
        'quality': quality_keywords,
        'battery': battery_keywords,
        'design': design_keywords,
        'beam': beam_keywords,
        'price': price_keywords,
    }

    # match sentences to aspects, then score each aspect's text (NaN = absent)
    sentiments = pd.DataFrame(
        {
            aspect: [
                find_sentiment(text)
                for text in filteredReview(reviews_in_sentences, keywords)
            ]
            for aspect, keywords in aspect_keywords.items()
        },
        index=reviews_df.index,
    )

    # weigh every score by whether the review is a verified purchase
    for aspect in aspect_keywords:
        sentiments[aspect + '2'] = [
            scale(value, is_verified)
            for value, is_verified in zip(sentiments[aspect], verified)
        ]

    # take the averages; the verification flag itself is never part of the
    # frame, so it cannot leak into the means
    summary = sentiments.mean(axis=0).to_frame().T
    summary['product_id'] = product_id
    summary['product_name'] = product_name
    summary['sale_price'] = price
    summary['review count'] = review_count
    summary['not verified'] = not_verified_count

    return summary

In [None]:
import glob

In [None]:
 df = pd.DataFrame()
# Run absa for all products and stack the one-row summaries into `df`.
PRODUCT_DIR_PREFIX = 'main_product/'

product_frames = []
# sorted() makes the processing order (and so the row order) reproducible;
# glob returns files in arbitrary order.
csv_paths = sorted(glob.glob(PRODUCT_DIR_PREFIX + '*.csv'))
for i, csv_path in enumerate(csv_paths, start=1):
    print(f'Running product {i}')
    # absa_one_product re-adds the directory prefix, so pass the bare name.
    csv_name = csv_path[len(PRODUCT_DIR_PREFIX):]
    start = time.time()
    product_frames.append(absa_one_product(csv_name))
    end = time.time()
    print(f"Finished running prouct {i}, took {round(end - start,1)} seconds.")

# Concatenate once at the end: DataFrame.append no longer exists in
# pandas >= 2.0, and appending inside the loop re-copies the frame each time.
df = pd.concat(product_frames) if product_frames else pd.DataFrame()

Running product 1
Finished running prouct 1, took 1.0 seconds.
Running product 2
Finished running prouct 2, took 1.9 seconds.
Running product 3
Finished running prouct 3, took 96.0 seconds.
Running product 4
Finished running prouct 4, took 1.0 seconds.
Running product 5
Finished running prouct 5, took 4.9 seconds.
Running product 6
Finished running prouct 6, took 2.9 seconds.
Running product 7
Finished running prouct 7, took 75.7 seconds.
Running product 8
Finished running prouct 8, took 28.2 seconds.
Running product 9
Finished running prouct 9, took 14.8 seconds.
Running product 10
Finished running prouct 10, took 17.5 seconds.
Running product 11
Finished running prouct 11, took 7.2 seconds.
Running product 12
Finished running prouct 12, took 3.9 seconds.
Running product 13
Finished running prouct 13, took 1.3 seconds.
Running product 14
Finished running prouct 14, took 4.7 seconds.
Running product 15
Finished running prouct 15, took 30.0 seconds.
Running product 16
Finished running p

In [None]:
# Index the summary by product ID. Guarded so that re-running this cell (or
# running it on an empty summary) is a no-op instead of a KeyError.
if 'product_id' in df.columns:
    df = df.set_index('product_id')
df

Unnamed: 0_level_0,size,quality,battery,design,beam,price,size2,quality2,battery2,design2,beam2,price2,product_name,sale_price,review count,not verified
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
B07K2MXXTV,,1.000000,0.333333,1.000000,1.000000,,,1.000000,-0.333333,1.000000,1.000000,,Acebeam H30 XHP70.2 LED Rechargeable Headlamp ...,167.86,6,0
B07K1XQ8M2,0.750000,0.875000,0.777778,0.857143,1.000000,,0.500000,0.750000,0.555556,0.714286,1.000000,,NITECORE NU32 550 Lumen LED Rechargeable Headl...,59.99,27,0
B00I14HLLS,0.759259,0.698745,0.759843,0.766102,0.884910,0.7840,0.518519,0.397490,0.519685,0.532203,0.769821,0.568,Fenix E12 Flashlight 130 Lumens,75.01,881,0
B07HFCRDMW,1.000000,1.000000,1.000000,0.000000,1.000000,1.0000,1.000000,1.000000,1.000000,-1.000000,1.000000,1.000,ACEBEAM H30 LED Headlamp Rechargeable 3-Color ...,176.55,16,0
B00M94WSVY,0.800000,1.000000,0.666667,0.777778,0.722222,1.0000,0.600000,1.000000,0.333333,0.555556,0.444444,1.000,Nitecore NITEU No-P10 Cree XM-L2 T6 800-Lumen ...,84.99,41,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
B074TH86J2,,1.000000,1.000000,0.000000,0.666667,0.0000,,1.000000,1.000000,-1.000000,0.333333,-1.000,Acebeam X80 CREE XPE2-R2 630nm Flashlight/Sear...,,7,0
B00SRDTBF2,1.000000,0.760000,0.575758,0.857143,0.935484,0.8000,1.000000,0.520000,0.151515,0.714286,0.870968,0.600,NiteCore MH10 USB Rechargeable Flashlight CREE...,,73,0
B075TYB67V,0.840000,0.836364,0.553571,0.642857,0.795455,0.7500,0.680000,0.672727,0.107143,0.285714,0.590909,0.500,ACEBEAM L30 Gen II Tactical Flashlight 4000 Lu...,,241,0
B06XRF2C81,1.000000,1.000000,0.888889,1.000000,1.000000,0.8000,1.000000,1.000000,0.777778,1.000000,1.000000,0.600,AceBeam X45 Flashlight 18000 Lumens 6500K LED ...,,30,0


In [None]:
# After set_index the product IDs live in the index, so writing with
# index=False alone would silently drop them; move them back to a column first.
summary_out = df.reset_index() if df.index.name == 'product_id' else df
summary_out.to_csv('summary.csv', index=False)