# Aspect Based Sentiment Analysis (ABSA) Rule Based Approach

### 1. Imports and Directory Setup

In [1]:
# Unzipping the review data.

!unzip LED_Flashlight.zip

unzip:  cannot find or open LED_Flashlight.zip, LED_Flashlight.zip.zip or LED_Flashlight.zip.ZIP.


In [None]:
# Creating directory for output.

!mkdir output

In [None]:
import glob
import os
import warnings

import numpy as np
import pandas as pd

In [None]:
!pip install spacy

In [None]:
import nltk
import spacy

# Fetch the NLTK resources used below (tokenizer models and stop-word lists).
nltk.download('punkt')
nltk.download('stopwords')
from tqdm.auto import trange

from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
import re

# English stop words from NLTK, stored as a set.
stop = set(stopwords.words('english')) 
# spaCy pipeline used for tagging and dependency parsing in later cells.
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Reading all the files in the input directory.

# Collect the 'reviews' column of every CSV, then join them in one step.
# This replaces the dummy first element that previously seeded the array
# and had to be sliced off afterwards.
review_arrays = [
  np.array(pd.read_csv(name)['reviews'])
  for name in glob.glob('main_product/*csv')
]

# np.concatenate rejects an empty sequence, so fall back to an empty array
# when no input files are found.
if review_arrays:
  temp = np.concatenate(review_arrays, axis=None)
else:
  temp = np.array([], dtype=object)

### 2. Text Preprocessing

Steps:<br>
a) Lower case the data.<br>
b) Remove links, then replace punctuation and any other non-alphanumeric characters with spaces.

In [None]:
def preprocess(text_data):
  """Normalise raw review texts before they are parsed.

  Each entry is lower-cased, stripped of URLs and 'www....com' addresses,
  and every character other than a-z, 0-9 or a space is replaced by a space.

  Args:
    text_data: iterable of review texts. Missing values (None or NaN, as
      pandas yields for empty cells) are treated as empty strings; any other
      non-string value is converted with str().

  Returns:
    A list of cleaned strings, one per input entry, in the same order.
  """
  reviews = list()
  for raw in text_data:
    # NaN is the only float that differs from itself; without this guard a
    # missing review would crash on .lower().
    if raw is None or (isinstance(raw, float) and raw != raw):
      raw = ''
    text = str(raw).lower()
    # Drop URLs (scheme optional), then bare www addresses.
    text = re.sub(r'(https|http)?:\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', text)
    text = re.sub(r'www\.\S+\.com','', text)
    # Anything that is not a lowercase letter, digit or space becomes a space.
    text = re.sub(r'[^a-z0-9 ]', ' ', text)
    reviews.append(text)
  return reviews

### 3. POS Tagging

POS tagging identifies aspects (nouns) apart from the ones mentioned in 'corpus.txt' (handled below).

In [None]:
# Only tokens with the fine-grained tag 'NN' are kept: a noun names a specific
# thing or set of things, so nouns are the candidates for domain aspects.
def pos_tagging(reviews):
    """Collect the unique lemmas of noun tokens found in the reviews.

    Each review is parsed with the module-level spaCy pipeline `nlp`. A token
    contributes its lemma when its tag is 'NN' and its shape is not 'x', 'xx'
    or 'xxx' (spaCy's shape for one to three lowercase letters).

    Args:
        reviews: indexable sequence of review strings.

    Returns:
        List of lemmas in order of first appearance, without duplicates.
        If parsing raises, the error is printed and the lemmas gathered so
        far are returned (best-effort behaviour).
    """
    print("pos tagging")
    req_tag = ['NN']
    short_shapes = ('x', 'xx', 'xxx')
    extracted_words = []
    # Mirrors extracted_words for O(1) duplicate checks; the list alone keeps
    # first-seen order but made every lookup a linear scan.
    seen = set()
    i = 0  # number of tokens visited, reported if an error occurs
    try:
        for index in trange(len(reviews)):
            doc = nlp(reviews[index])
            for token in doc:
                i += 1
                if token.tag_ not in req_tag or token.shape_ in short_shapes:
                    continue
                lemma = token.lemma_
                if lemma not in seen:
                    seen.add(lemma)
                    extracted_words.append(lemma)
        return extracted_words
    except Exception as e:
        # Deliberately best-effort: report and return what was collected.
        print("here for" + str(i) + str(e))
        return extracted_words

In [None]:
# Clean every review, then collect the unique noun lemmas as aspect candidates.
rev = preprocess(temp)
extract_words_all = pos_tagging(rev)

pos tagging


HBox(children=(FloatProgress(value=0.0, max=17808.0), HTML(value='')))




Extracted aspects (extract_words_all) are added into a set (extract_words) to remove duplicates if any.

In [None]:
# Build the aspect set from the extracted words themselves. Adding the loop
# index instead would fill the set with the integers 0..n-1, which can never
# equal a token's text when aspects are looked up later.
extract_words = set(extract_words_all)

print(len(extract_words), len(extract_words_all))

HBox(children=(FloatProgress(value=0.0, max=5481.0), HTML(value='')))


5481 5481


Aspects from 'corpus.txt' are added into the set. This step may be skipped depending upon the situation. It allows us to add custom aspects that may or may not have been picked up by the POS tagging.

In [None]:
# Read the custom aspect list. 'with' closes the file even if reading fails.
# Every non-empty line is kept (not only the last one). Surrounding whitespace
# and a trailing comma are stripped from each line, so the final aspect does
# not carry a trailing newline.
with open('corpus.txt') as f:
  corpus_lines = [line.strip().rstrip(',') for line in f]
l = ", ".join(line for line in corpus_lines if line)

In [None]:
# Entries in corpus.txt are expected to be separated by ", " (comma + space).
cor = l.split(", ")

In [None]:
# Lower-case each custom aspect and trim stray whitespace (e.g. the newline
# that ends a line read from a file); blank entries are dropped.
corpora = [w.strip().lower() for w in cor if w.strip()]

In [None]:
# Hand-picked aspects, added to the set in a single call.
extract_words.update((
    'grip',
    'battery',
    'life',
    'runtime',
    'corona',
    'pattern',
    'warranty',
    'float',
    'brightness',
    'setting',
    'mode',
    'charging',
    'durable',
    'light',
))

In [None]:
# Merge the aspects read from corpus.txt into the aspect set.
extract_words.update(corpora)

Reading 'pos.txt' and 'neg.txt' to input a list of positive and negative adjectives.

In [None]:
# This function reads text files line by line.
def readFile(fileName):
  """Read a word list from a text file, one entry per line.

  Square brackets, commas, single quotes and the newline are removed from
  every line, so lines written like a Python list literal are reduced to the
  bare words.

  Args:
    fileName: path of the text file to read.

  Returns:
    List with one cleaned string per line of the file (a blank line yields
    an empty string).
  """
  arr = list()
  # 'with' guarantees the handle is closed even if reading raises.
  with open(fileName, "r") as f:
    for i in f:
      arr.append(re.sub(r"\[|\]|\,|\'|\n", '', i))
  return arr

In [2]:
# Reading the positive and negative adjective lists.
# Both are lists of strings; getSentiment reads them as module-level globals.
pos = readFile('pos.txt')
neg = readFile('neg.txt')

### 4. The following method performs the ABSA. Please read the code comments for further details.

In [None]:
# def getSentiment(sentence, pos, neg):
def getSentiment(sentence):
    '''
    Extract aspect -> sentiment pairs from one review text.

    input: sentence - review text (string). It is parsed with the
           module-level spaCy pipeline `nlp`; the globals `pos` and `neg`
           (opinion word lists) and `extract_words` (known aspects) are
           read as well.
    function: every token found in the opinion lists starts with a score of
              +1 (positive list) or -1 (otherwise), then:
                * "advmod" opinion words are skipped;
                * "amod" opinion words give their score to their head;
                * any other opinion word has its score multiplied by 1.5
                  for each opinion-word modifier and sign-flipped for each
                  "neg" dependent (among its own children and its head's
                  children); the score is then assigned to its direct
                  object(s) when it is a verb, and to the known-aspect
                  nouns among its head's children.
    output: dictionary mapping aspect text to sentiment score
    '''
    sent_dict = dict()
    sentence = nlp(sentence)
    # Sets give constant-time membership tests for every token, instead of
    # scanning a concatenated list each time.
    pos_words = set(pos)
    opinion_words = pos_words.union(neg)
    for token in sentence:
        # check if the word is an opinion word, then assign sentiment
        if token.text in opinion_words:
            sentiment = 1 if token.text in pos_words else -1
            # if target is an adverb modifier (i.e. pretty, highly, etc.)
            # but happens to be an opinion word, ignore and pass
            if (token.dep_ == "advmod"):
                continue
            elif (token.dep_ == "amod"):
                sent_dict[token.head.text] = sentiment
            # for opinion words that are adjectives, adverbs, verbs...
            else:
                for child in token.children:
                    # if there's a adj modifier (i.e. very, pretty, etc.) add more weight to sentiment
                    # This could be better updated for modifiers that either positively or negatively emphasize
                    if ((child.dep_ == "amod") or (child.dep_ == "advmod")) and (child.text in opinion_words):
                        sentiment *= 1.5
                    # check for negation words and flip the sign of sentiment
                    if child.dep_ == "neg":
                        sentiment *= -1
                for child in token.children:
                    # if verb, check if there's a direct object
                    if (token.pos_ == "VERB") and (child.dep_ == "dobj"):
                        sent_dict[child.text] = sentiment
                        # check for conjugates (a AND b), then add both to dictionary
                        subchildren = []
                        conj = 0
                        for subchild in child.children:
                            if subchild.text == "and":
                                conj = 1
                            if (conj == 1) and (subchild.text != "and"):
                                subchildren.append(subchild.text)
                                conj = 0
                        for subchild in subchildren:
                            sent_dict[subchild] = sentiment

                # check for modifiers / negation attached to the head.
                # A root token is its own head in spaCy, so its children were
                # already handled above; repeating the pass would apply each
                # modifier twice and make a negation cancel itself out.
                if token.head.i != token.i:
                    for child in token.head.children:
                        if ((child.dep_ == "amod") or (child.dep_ == "advmod")) and (child.text in opinion_words):
                            sentiment *= 1.5
                        # check for negation words and flip the sign of sentiment
                        if (child.dep_ == "neg"):
                            sentiment *= -1

                # check for nouns
                for child in token.head.children:
                    # only nouns that are known aspects and not yet scored
                    if (child.text in extract_words) and (child.pos_ == "NOUN") and (child.text not in sent_dict):
                        noun = child.text
                        # Check for compound nouns
                        for subchild in child.children:
                            if subchild.dep_ == "compound":
                                noun = subchild.text + " " + noun
                        sent_dict[noun] = sentiment
    return sent_dict

Read each file in the input folder, perform the analysis, and save the result in the output folder.

In [None]:
# The output folder must exist before to_csv is called; exist_ok keeps the
# cell re-runnable.
os.makedirs('output', exist_ok=True)

for name in glob.glob('main_product/*csv'):
  print(name)
  df = pd.read_csv(name)
  rev = np.array(df['reviews'])
  reviews = preprocess(rev)
  # One {aspect: sentiment} dictionary per review.
  asba = [getSentiment(i) for i in reviews]
  df['asba'] = asba
  # basename() keeps only the file name whatever the input folder is called;
  # the former name[13:] slice assumed a 13-character 'main_product/' prefix.
  final = os.path.join('output', os.path.basename(name))
  df.to_csv(final, index=False)

main_product/B000LJWV4S.csv
main_product/B07G3SJPLZ.csv
main_product/B07TZ3BMKW.csv
main_product/B00GZYNX8G.csv
main_product/B06VTLLC13.csv
main_product/B010ESCLHW.csv
main_product/B07JCFY3N4.csv
main_product/B07TJ41TMC.csv
main_product/B00OYKXTDW.csv
main_product/B07568DFCH.csv
main_product/B002XTCAXG.csv
main_product/B07DQKV38W.csv
main_product/B07X5TTTJT.csv
main_product/B01J8B219O.csv


In [None]:
# Zip the output files into a final zip.
# -r recurses into the output directory so every generated CSV is included.
!zip -r output.zip output

  adding: output/ (stored 0%)
  adding: output/B000LJWV4S.csv (deflated 77%)
  adding: output/B07G3SJPLZ.csv (deflated 79%)
  adding: output/B07TZ3BMKW.csv (deflated 80%)
  adding: output/B00GZYNX8G.csv (deflated 73%)
  adding: output/B06VTLLC13.csv (deflated 77%)
  adding: output/B010ESCLHW.csv (deflated 72%)
  adding: output/B07JCFY3N4.csv (deflated 77%)
  adding: output/B07TJ41TMC.csv (deflated 76%)
  adding: output/B00OYKXTDW.csv (deflated 73%)
  adding: output/B07568DFCH.csv (deflated 80%)
  adding: output/B002XTCAXG.csv (deflated 75%)
  adding: output/B07DQKV38W.csv (deflated 77%)
  adding: output/B07X5TTTJT.csv (deflated 83%)
  adding: output/B01J8B219O.csv (deflated 81%)
