In [1]:
import json
import nltk
from nltk.corpus import stopwords
import os 
import glob
from nltk.stem.porter import *
from collections import defaultdict
import string
import heapq
import numpy as np
from nltk import FreqDist

In [2]:
modelDataDir = "modelData/"
path="Reviews/"
utilits="utilits/"

In [3]:
class Review:
    def __init__(self):
        self.sentences = [] #list of objects of class Sentence
        self.reviewId = ""
        self.ratings = {} #true ratings provided by the user
        
    def __str__(self):
        retStr = ""
        for sentence in self.sentences:
            retStr += sentence.__str__() + '\n'
        retStr += "###"+self.reviewId+"###"+str(self.ratings)+"\n"
        return retStr

In [4]:
class Sentence:
    def __init__(self, wordList):
        self.wordFreqDict = FreqDist(wordList)#Dictionary of words in the sentence and corres. frequency
        self.assignedAspect = [] #list of aspects assigned to this sentence
    def __str__(self):
        return self.wordFreqDict.pformat(10000) + '##' + str(self.assignedAspect)

In [5]:
class ReadData:
    def __init__(self):
        self.aspectKeywords = {} #aspect name <--> keywords list
        self.stopWords = []
        self.wordFreq = {} #dict with of all words and their freq in the corpus
        self.lessFrequentWords=set() #words which have frequency<5 in the corpus
        self.allReviews = [] #list of Review objects from the whole corpus
        self.aspectSentences = defaultdict(list) #aspect to Sentences mapping
        
    def readAspectSeedWords(self):
        with open(utilits+"SeedWords.json") as fd:
            seedWords = json.load(fd)
            for aspect in seedWords["aspects"]:
                self.aspectKeywords[aspect["name"]] = aspect["keywords"]
                
    def readStopWords(self):
        with open(utilits+"stopwords.dat") as fd:
            for stopWord in fd:
                self.stopWords.append(stopWord.strip())
        for stopWord in stopwords.words('english'):
            if stopWord not in self.stopWords:
                self.stopWords.append(stopWord)
        #print(self.stopWords)

    def stemmingStopWRemoval(self, review, vocab):
        ''' Does Following things:
        1. Tokenize review into sentences, and then into words
        2. Remove stopwords, punctuation and stem each word
        3. Add words into vocab 
        4. Make Sentence objects and corresponding Review object
        '''
        reviewObj = Review()
        #copying ratings into reviewObj
        for key, value in review["ratings"].items():
            reviewObj.ratings[key] = value
        reviewObj.reviewId = str(review["id"])
        
        stemmer = PorterStemmer()
        reviewContent = review["text"]
        #TODO: Append title too!
        sentencesInReview = nltk.sent_tokenize(reviewContent)
        puncs = set(string.punctuation) #punctuation marks 
        for sentence in sentencesInReview:
            wordList=[]
            words = nltk.word_tokenize(sentence)
            for word in words:
                if not all(c.isdigit() or c in puncs for c in word):
                    word = word.lower()
                    if word not in self.stopWords:
                        word=stemmer.stem(word.lower())
                        vocab.append(word)
                        wordList.append(word)
            if wordList:
                sentenceObj=Sentence(wordList)
                reviewObj.sentences.append(sentenceObj)
        if reviewObj.sentences:
            self.allReviews.append(reviewObj)
            # print(reviewObj)

    def readReviewsFromJson(self):
        ''' Reads reviews frm the corpus, calls stemmingStopWRemoval
        and creates list of lessFrequentWords (frequency<5)
        '''
        vocab=[]
        i = 0
        for filename in glob.glob(os.path.join(path, '*.json')):
            with open(filename, 'r') as file:
                # print(file)
                for line in file:
                    try:
                        data=json.loads(line)
                        self.stemmingStopWRemoval(data,vocab)
                    except:
                        print('error')
                        continue
                    i+=1
                    if i > 1000:
                        break
            print(data)
            # for review in data:
            #     print(type(review))
            #     print(review)
            # self.stemmingStopWRemoval(data,vocab)
        self.wordFreq = FreqDist(vocab)
        for word,freq in self.wordFreq.items():
            if freq < 4:
                self.lessFrequentWords.add(word)
        for word in self.lessFrequentWords:
            del self.wordFreq[word]
        
        print("Less Frequent Words ",self.lessFrequentWords)
        print("Vocab ", self.wordFreq.pformat(10000))
                 
    def removeLessFreqWords(self):
        emptyReviews = set()
        for review in self.allReviews:
            emptySentences = set()
            for sentence in review.sentences:
                deleteWords = set()
                for word in sentence.wordFreqDict.keys():
                    if word in self.lessFrequentWords:
                        deleteWords.add(word)
                for word in deleteWords:
                    del sentence.wordFreqDict[word]
                if not sentence.wordFreqDict:
                    emptySentences.add(sentence)
            review.sentences[:] = [x for x in review.sentences if x not in emptySentences]
            if not review.sentences:
                emptyReviews.add(review)  
        self.allReviews[:] = [x for x in self.allReviews if x not in emptyReviews]

In [6]:
class BootStrap:
    def __init__(self, readDataObj):
        self.corpus = readDataObj
        #Aspect,Word -> freq matrix - frequency of word in that aspect
        self.aspectWordMat = defaultdict(lambda: defaultdict(int)) 
        #Aspect --> total count of words tagged in that aspect
        # = sum of all row elements in a row in aspectWordMat matrix
        self.aspectCount = defaultdict(int)
        #Word --> frequency of jth tagged word(in all aspects) 
        # = sum of all elems in a column in aspectWordMat matrix
        self.wordCount = defaultdict(int)
        
        #Top p words from the corpus related to each aspect to update aspect keyword list
        self.p=5
        self.iter=7
        
        #List of W matrix
        self.wList=[]
        #List of ratings Dictionary belonging to review class
        self.ratingsList=[]
        #List of Review IDs
        self.reviewIdList=[]
        
        '''def calcC1_C2_C3_C4(self):
            for aspect, sentence in self.corpus.aspectSentences.items():
                for sentence in sentences:
                    for word in self.corpus.wordFreq.keys() and not in sentence.wordFreqDict.keys():
                        self.aspectNotWordMat[aspect][word]+=1
                    for word,freq in sentence.wordFreqDict.items():
                        self.aspectWordMat[aspect][word]+=freq
        '''
        
    def assignAspect(self, sentence): #assigns aspects to sentence
        sentence.assignedAspect = []
        count = defaultdict(int) #count used for aspect assignment as in paper
        #print("IN ASSIGN ASPECT FUNCTION:",len(sentence.wordFreqDict))
        for word in sentence.wordFreqDict.keys():
            for aspect, keywords in self.corpus.aspectKeywords.items():
                if word in keywords:
                    count[aspect]+=1
        if count: #if count is not empty
            maxi = max(count.values())
            for aspect, cnt in count.items():
                if cnt==maxi:
                    sentence.assignedAspect.append(aspect)
        if(len(sentence.assignedAspect)==1): #if only 1 aspect assigned to it
            self.corpus.aspectSentences[sentence.assignedAspect[0]].append(sentence)
            
    def populateAspectWordMat(self):
        self.aspectWordMat.clear()
        for aspect, sentences in self.corpus.aspectSentences.items():
            for sentence in sentences:
                for word,freq in sentence.wordFreqDict.items():
                    self.aspectWordMat[aspect][word]+=freq
                    self.aspectCount[aspect]+=freq
                    self.wordCount[word]+=freq
    
    def chiSq(self, aspect, word):
        #Total number of (tagged) word occurrences
        C = sum(self.aspectCount.values())
        
        #Frequency of word W in sentences tagged with aspect Ai
        C1 = self.aspectWordMat[aspect][word]
        
        #Frequency of word W in sentences NOT tagged with aspect Ai
        C2 = self.wordCount[word]-C1
        
        #Number of sentences of aspect A, NOT contain W
        C3 = self.aspectCount[aspect]-C1 
        
        #Number of sentences of NOT aspect A, NOT contain W
        C4 = C-C1
        
        deno = (C1+C3)*(C2+C4)*(C1+C2)*(C3+C4)
        #print(aspect, word, C, C1, C2, C3, C4)
        if deno!=0:
            return (C*(C1*C4 - C2*C3)*(C1*C4 - C2*C3))/deno
        else:
            return 0.0
        
    def calcChiSq(self):
        topPwords = {}
        for aspect in self.corpus.aspectKeywords.keys():
            topPwords[aspect] = []
        for word in self.corpus.wordFreq.keys():
            maxChi = 0.0 #max chi-sq value for this word
            maxAspect = "" #corresponding aspect
            for aspect in self.corpus.aspectKeywords.keys():
                self.aspectWordMat[aspect][word] = self.chiSq(aspect,word)
                if self.aspectWordMat[aspect][word] > maxChi:
                    maxChi = self.aspectWordMat[aspect][word]
                    maxAspect = aspect
            if maxAspect!="":
                topPwords[maxAspect].append((maxChi, word))
                
        changed=False
        for aspect in self.corpus.aspectKeywords.keys():
            for t in heapq.nlargest(self.p,topPwords[aspect]):
                if t[1] not in self.corpus.aspectKeywords[aspect]:
                    changed=True
                    self.corpus.aspectKeywords[aspect].append(t[1])
        return changed
    
    # Populate wList,ratingsList and reviewIdList
    def populateLists(self):
        for review in self.corpus.allReviews:
            #Computing W matrix for each review
            W = defaultdict(lambda: defaultdict(int))
            for sentence in review.sentences:
                if len(sentence.assignedAspect)==1:
                    for word,freq in sentence.wordFreqDict.items():
                        W[sentence.assignedAspect[0]][word]+=freq
            if len(W)!=0:
                self.wList.append(W)
                self.ratingsList.append(review.ratings)
                self.reviewIdList.append(review.reviewId)  
                
        
    def bootStrap(self):
        changed=True
        while self.iter>0 and changed:
            self.iter-=1
            self.corpus.aspectSentences.clear()
            for review in self.corpus.allReviews:
                for sentence in review.sentences:
                    self.assignAspect(sentence)
            self.populateAspectWordMat()
            changed=self.calcChiSq()
        self.corpus.aspectSentences.clear()
        for review in self.corpus.allReviews:
            for sentence in review.sentences:
                self.assignAspect(sentence)
        print(self.corpus.aspectKeywords)
    
    # Saves the object into the given file
    def saveToFile(self,fileName,obj):
        with open(modelDataDir+fileName,'w') as fp:
            json.dump(obj,fp)
            fp.close()

In [7]:
rd = ReadData()
rd.readAspectSeedWords()
rd.readStopWords()
rd.readReviewsFromJson()
rd.removeLessFreqWords()      

{'ratings': {'service': 5.0, 'cleanliness': 5.0, 'overall': 5.0, 'value': 5.0, 'location': 5.0, 'sleep_quality': 5.0, 'rooms': 5.0}, 'title': '“Nice modern new rooms, comfy, clean, great location! Very nice stay”', 'text': 'Overall it is a pleasant stay at Westin. Staffs are quiet friendly and helpful. Rooms are modern, new and clean. The club lounge is also really nice with a view. I am very pleased with my stay.\nI do notice, if you do the club level rooms, it is on the 17th floor, ask for a room not below the gym!!!! or else you will hear weight dropping on your ceiling!!! \nParking is easy, skybridge over to the mall, it is a good value!', 'author': {'username': 'lammylammy', 'num_cities': 13, 'num_helpful_votes': 38, 'num_reviews': 23, 'num_type_reviews': 17, 'id': 'AAA3F46FCEE39DAB599CCC05BE9C63A8', 'location': 'Houston'}, 'date_stayed': 'March 2012', 'offering_id': 1966350, 'num_helpful_votes': 2, 'date': 'March 8, 2012', 'id': 125810806, 'via_mobile': False}
Less Frequent Words

In [8]:
bootstrapObj = BootStrap(rd)
bootstrapObj.bootStrap()
bootstrapObj.populateLists()
bootstrapObj.saveToFile("wList.json",bootstrapObj.wList)
bootstrapObj.saveToFile("ratingsList.json",bootstrapObj.ratingsList)
bootstrapObj.saveToFile("reviewIdList.json",bootstrapObj.reviewIdList)
bootstrapObj.saveToFile("vocab.json",list(bootstrapObj.corpus.wordFreq.keys()))
bootstrapObj.saveToFile("aspectKeywords.json",bootstrapObj.corpus.aspectKeywords)

{'service': ['service', 'manager'], 'cleanliness': ['clean', 'dirty', 'dust', 'maintain', 'towel', 'quiet', 'room', 'servic', 'nice', 'great', 'bed', 'view', 'excel', 'stay', 'well', 'spaciou', 'houston', 'definit', 'univers', 'i-45', 'como', 'rice', 'yeah'], 'value': ['value', 'price', 'reason', 'motel', 'bw', 'paid', 'time', 'squar', 'min', 'clock', 'ahead', '34th', 'herald', 'potato'], 'location': ['location', 'locate'], 'sleep_quality': ['peaceful', 'noise-free'], 'room': ['room', 'space', 'servic', 'view', 'nice', 'bed', 'excel', 'stay', 'great', 'well', 'spaciou', 'clean', 'time']}
