In [1]:
!unzip asba.zip
!unzip stanford-tagger-4.1.0.zip

Archive:  asba.zip
965a97fbfd933d0b7e94885da6d6cfaa5a0e714d
   creating: Aspect-Based-Sentiment-Analysis-master/
   creating: Aspect-Based-Sentiment-Analysis-master/.ipynb_checkpoints/
  inflating: Aspect-Based-Sentiment-Analysis-master/.ipynb_checkpoints/Aspect Based Sentiment Analysis-checkpoint.ipynb  
  inflating: Aspect-Based-Sentiment-Analysis-master/Aspect Based Sentiment Analysis.ipynb  
  inflating: Aspect-Based-Sentiment-Analysis-master/README.md  
  inflating: Aspect-Based-Sentiment-Analysis-master/tagged_text_list_test.pkl  
  inflating: Aspect-Based-Sentiment-Analysis-master/tagged_text_list_train.pkl  
Archive:  stanford-tagger-4.1.0.zip
   creating: stanford-postagger-full-2020-08-06/
  inflating: stanford-postagger-full-2020-08-06/stanford-postagger-4.1.0-sources.jar  
  inflating: stanford-postagger-full-2020-08-06/TaggerDemo2.java  
  inflating: stanford-postagger-full-2020-08-06/TaggerDemo.java  
  inflating: stanford-postagger-full-2020-08-06/build.xml  
   creating

In [2]:
import pandas as pd
from sklearn.externals import joblib
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
import xml.etree.ElementTree as ET
from lxml import etree
from scipy.sparse import hstack
import numpy as np
import warnings


path_train = r'ABSA16_Laptops_Train_English_SB2.xml'
path_test = r'EN_LAPT_SB2_TEST.xml'

#For stanford POS Tagger
# home = r'C:\Users\THe_strOX\Anaconda3\stanford-postagger-full-2017-06-09'
home = r'stanford-postagger-full-2020-08-06'
from nltk.tag.stanford import StanfordPOSTagger as POS_Tag
from nltk import word_tokenize
_path_to_model = home + '/models/english-bidirectional-distsim.tagger' 
_path_to_jar = home + '/stanford-postagger.jar'
stanford_tag = POS_Tag(model_filename=_path_to_model, path_to_jar=_path_to_jar)

The StanfordTokenizer will be deprecated in version 3.2.5.
Please use [91mnltk.tag.corenlp.CoreNLPPOSTagger[0m or [91mnltk.tag.corenlp.CoreNLPNERTagger[0m instead.
  super(StanfordPOSTagger, self).__init__(*args, **kwargs)


In [3]:
#xml parser
def get_list(path):
    tree=ET.parse(path)
    root = tree.getroot()
    text_list = []
    opinion_list = []
    for review in root.findall('Review'):
        text_string=""
        opinion_inner_list=[]
        for sent in review.findall('./sentences/sentence'):
            text_string= text_string+ " "+ sent.find('text').text
        text_list.append(text_string)
        for opinion in review.findall('./Opinions/Opinion'):
            opinion_dict = {
                opinion.get('category').replace('#','_'): opinion.get('polarity')
            }
            opinion_inner_list.append(opinion_dict)
        opinion_list.append(opinion_inner_list)
    return text_list,opinion_list

In [4]:
#Selecting only 20 most common aspect.
def get_most_common_aspect(opinion_list):
    import nltk
    opinion= []
    for inner_list in opinion_list:
        for _dict in inner_list:
            for key in _dict:
                opinion.append(key)
    most_common_aspect = [k for k,v in nltk.FreqDist(opinion).most_common(20)]
    return most_common_aspect

In [5]:
#generate data frame
def get_data_frame(text_list,opinion_list,most_common_aspect):
    data={'Review':text_list}
    df = pd.DataFrame(data)
    if opinion_list:
        for inner_list in opinion_list:
            for _dict in inner_list:
                for key in _dict:
                    if key in most_common_aspect:
                        df.loc[opinion_list.index(inner_list),key]=_dict[key]
    return df

In [6]:
#generate data frame for aspect extraction task
def get_aspect_data_frame(df,most_common_aspect):
    for common_aspect in most_common_aspect:
        df[common_aspect]=df[common_aspect].replace(['positive','negative','neutral','conflict'],[1,1,1,1])
    df = df.fillna(0)
    return df

In [7]:
def get_positive_data_frame(df,most_common_aspect):
    for common_aspect in most_common_aspect:
        df[common_aspect]=df[common_aspect].replace(['positive'],[1])
        df[common_aspect]=df[common_aspect].replace(['negative','neutral','conflict'],[0,0,0])
    df = df.fillna(0)
    return df

In [8]:
def get_negative_data_frame(df,most_common_aspect):
    for common_aspect in most_common_aspect:
        df[common_aspect]=df[common_aspect].replace(['negative'],[1])
        df[common_aspect]=df[common_aspect].replace(['positive','neutral','conflict'],[0,0,0])
    df = df.fillna(0)
    return df

In [9]:
def get_neutral_data_frame(df,most_common_aspect):
    for common_aspect in most_common_aspect:
        df[common_aspect]=df[common_aspect].replace(['neutral','conflict'],[1,1])
        df[common_aspect]=df[common_aspect].replace(['negative','positive'],[0,0])
    df = df.fillna(0)
    return df

In [10]:
#To tag using stanford pos tagger
def posTag(review):
    tagged_text_list=[]
    for text in review:
        tagged_text_list.append(stanford_tag.tag(word_tokenize(text)))
    return tagged_text_list
#posTag("this is random text")

In [11]:
#Filter the word with tag- noun,adjective,verb,adverb
def filterTag(tagged_review):
    final_text_list=[]
    for text_list in tagged_review:
        final_text=[]
        for word,tag in text_list:
            if tag in ['NN','NNS','NNP','NNPS','RB','RBR','RBS','JJ','JJR','JJS','VB','VBD','VBG','VBN','VBP','VBZ']:
                final_text.append(word)
        final_text_list.append(' '.join(final_text))
    return final_text_list

In [12]:
def get_dict_aspect(y,most_common_aspect):
    position=[]
    for innerlist in y:
        position.append([i for i, j in enumerate(innerlist) if j == 1])
    sorted_common=sorted(most_common_aspect)
    dict_aspect=[]
    for innerlist in position:
        inner_dict={}
        for word in sorted_common:
            if sorted_common.index(word) in innerlist:
                inner_dict[word]= 5
            else:
                inner_dict[word]=0
        dict_aspect.append(inner_dict)
    return dict_aspect

In [13]:
#Stage 1:
#Making list to train
train_text_list,train_opinion_list = get_list(path_train)
most_common_aspect = get_most_common_aspect(train_opinion_list)

In [14]:
type(train_text_list)
text_data = np.array(train_text_list)
text_data.shape

(395,)

In [15]:
!pip install spacy



In [16]:
import nltk
import spacy

nltk.download('punkt')
nltk.download('stopwords')

from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

stop = set(stopwords.words('english')) 
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [17]:
raw = text_data
low = [i.lower() for i in raw]

In [18]:
import re

reviews = list()

for i in low:
    i = re.sub(r'(https|http)?:\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', i)
    i = re.sub(r'www\.\S+\.com','', i)
    i = re.sub(r'[^a-z0-9 ]', ' ', i)
    reviews.append(i)

In [19]:
train_opinion_list

[[{'LAPTOP_GENERAL': 'positive'},
  {'LAPTOP_OPERATION_PERFORMANCE': 'positive'},
  {'HARD_DISC_DESIGN_FEATURES': 'positive'},
  {'LAPTOP_QUALITY': 'positive'},
  {'DISPLAY_QUALITY': 'negative'}],
 [{'LAPTOP_DESIGN_FEATURES': 'positive'},
  {'KEYBOARD_GENERAL': 'positive'},
  {'LAPTOP_OPERATION_PERFORMANCE': 'positive'},
  {'LAPTOP_GENERAL': 'positive'},
  {'LAPTOP_USABILITY': 'positive'},
  {'LAPTOP_QUALITY': 'positive'},
  {'LAPTOP_PRICE': 'positive'}],
 [{'LAPTOP_GENERAL': 'positive'},
  {'LAPTOP_DESIGN_FEATURES': 'positive'},
  {'COMPANY_GENERAL': 'positive'},
  {'LAPTOP_USABILITY': 'negative'}],
 [{'LAPTOP_GENERAL': 'negative'},
  {'LAPTOP_PRICE': 'negative'},
  {'SUPPORT_QUALITY': 'negative'},
  {'COMPANY_GENERAL': 'negative'}],
 [{'LAPTOP_GENERAL': 'positive'},
  {'LAPTOP_PRICE': 'negative'},
  {'LAPTOP_USABILITY': 'neutral'}],
 [{'LAPTOP_PRICE': 'positive'},
  {'LAPTOP_DESIGN_FEATURES': 'conflict'},
  {'LAPTOP_OPERATION_PERFORMANCE': 'positive'},
  {'LAPTOP_GENERAL': 'positive'

In [20]:
import re

reviews = list()

for i in low:
    i = re.sub(r'(https|http)?:\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', i)
    i = re.sub(r'www\.\S+\.com','', i)
    i = re.sub(r'[^a-z0-9 ]', ' ', i)
    reviews.append(i)

In [21]:
def readFile(fileName):
  f = open(fileName, "r")
  arr = list()
  for i in f:
    st = re.sub(r"\[|\]|\,|\'|\n", '', i)
    arr.append(st)
  f.close()
  return arr

In [22]:
pos = readFile('pos.txt')
neg = readFile('neg.txt')

In [23]:
from tqdm.auto import trange
def pos_tagging():
    print("pos tagging")
    req_tag = ['NN']
    extracted_words = []
    i = 0
    x = ""
    try:
        for index in trange(len(reviews)):
            x = reviews[index]
            doc = nlp(x)
            for token in doc:
                i += 1
                if token.tag_ in req_tag and token.shape_ != 'x' and token.shape_ != 'xx' and token.shape_ != 'xxx':
                  if token.lemma_ in extracted_words:
                    continue
                  else:
                    extracted_words.append(token.lemma_)
        return extracted_words
    except Exception as e:
        print("here for" + str(i) + str(e))
        return extracted_words
        
extract_words_all = pos_tagging()

pos tagging


HBox(children=(FloatProgress(value=0.0, max=395.0), HTML(value='')))




In [24]:
extract_words = set()
for i in trange(len(extract_words_all)):
    extract_words.add(i)

print(len(extract_words), len(extract_words_all))

HBox(children=(FloatProgress(value=0.0, max=970.0), HTML(value='')))


970 970


In [25]:
extract_words.add('graphics')
extract_words.add('design')
extract_words.add('portability')
extract_words.add('laptop')

In [26]:
# def getSentiment(sentence, pos, neg):
def getSentiment(sentence):
    '''
    input: dictionary and sentence
    function: appends dictionary with new features if the feature
              did not exist previously,then updates sentiment to
              each of the new or existing features
    output: updated dictionary
    '''
    sent_dict = dict()
    sentence = nlp(sentence)
    opinion_words = neg + pos
    debug = 0
    for token in sentence:
        # check if the word is an opinion word, then assign sentiment
        if token.text in opinion_words:
            sentiment = 1 if token.text in pos else -1
            # if target is an adverb modifier (i.e. pretty, highly, etc.)
            # but happens to be an opinion word, ignore and pass
            if (token.dep_ == "advmod"):
                continue
            elif (token.dep_ == "amod"):
                sent_dict[token.head.text] = sentiment
            # for opinion words that are adjectives, adverbs, verbs...
            else:
                for child in token.children:
                    # if there's a adj modifier (i.e. very, pretty, etc.) add more weight to sentiment
                    # This could be better updated for modifiers that either positively or negatively emphasize
                    if ((child.dep_ == "amod") or (child.dep_ == "advmod")) and (child.text in opinion_words):
                        sentiment *= 1.5
                    # check for negation words and flip the sign of sentiment
                    if child.dep_ == "neg":
                        sentiment *= -1
                for child in token.children:
                    # if verb, check if there's a direct object
                    if (token.pos_ == "VERB") & (child.dep_ == "dobj"):                        
                        sent_dict[child.text] = sentiment
                        # check for conjugates (a AND b), then add both to dictionary
                        subchildren = []
                        conj = 0
                        for subchild in child.children:
                            if subchild.text == "and":
                                conj=1
                            if (conj == 1) and (subchild.text != "and"):
                                subchildren.append(subchild.text)
                                conj = 0
                        for subchild in subchildren:
                            sent_dict[subchild] = sentiment

                # check for negation
                for child in token.head.children:
                    noun = ""
                    if ((child.dep_ == "amod") or (child.dep_ == "advmod")) and (child.text in opinion_words):
                        sentiment *= 1.5
                    # check for negation words and flip the sign of sentiment
                    if (child.dep_ == "neg"): 
                        sentiment *= -1
                
                # check for nouns
                for child in token.head.children:
                    noun = ""
                    # if (child.pos_ == "NOUN") and (child.text not in sent_dict):
                    if (child.text in extract_words) and (child.pos_ == "NOUN") and (child.text not in sent_dict):
                        noun = child.text
                        # Check for compound nouns
                        for subchild in child.children:
                            if subchild.dep_ == "compound":
                                noun = subchild.text + " " + noun
                        sent_dict[noun] = sentiment
                    debug += 1
    return sent_dict

In [27]:
asba = list()
for i in trange(len(reviews)):
  asba.append(getSentiment(reviews[i]))

HBox(children=(FloatProgress(value=0.0, max=395.0), HTML(value='')))




In [28]:
ans = pd.DataFrame(columns=['review', 'rule_based_sentiment'])

In [29]:
ans['review'] = reviews
ans['rule_based_sentiment'] = asba

In [30]:
ans['actual'] = train_opinion_list

In [31]:
ans.to_csv('test.csv')

In [32]:
len(extract_words)

974