### Needed Imports

In [None]:
!pip install nltk
!pip install textblob
!pip install pymongo

In [1]:
from getpass import getpass

import pymongo, nltk, re
from textblob import TextBlob, Word, Sentence
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from dataFields import field

In [2]:
# Fetch the NLTK data packages used by this notebook.
# The value of the last call is what the cell displays as its output.
# Stop-word lists, read via stopwords.words("english") in the pre-processing step.
nltk.download('stopwords')
# Presumably backs the Word.lemmatize() calls — TODO confirm.
nltk.download('wordnet')
# Presumably required by TextBlob's tokenizing/tagging — TODO confirm.
nltk.download('brown')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kalyv\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kalyv\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\kalyv\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\kalyv\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

### Database class used to manage the access to the database layer

In [3]:
class Database:
    '''
    Thin wrapper around one collection of a Mongo database.
    '''
    def __init__(self, mongo_uri, db_name, col_name):
        '''
        Constructor:
            mongo_uri   : the mongoDB URI to connect to
            db_name     : name of the mongo database to use
            col_name    : name of the collection inside that database
        Any exception raised while creating the client or resolving the
        database/collection propagates unchanged to the caller.
        '''
        self.client = pymongo.MongoClient(mongo_uri)
        self.database = self.client[db_name]
        self.collection = self.database[col_name]

    def getDocument(self, id):
        '''
        Query the collection for the given _id.
        Returns the result of collection.find() for that filter
        (not a single document).
        '''
        query = {'_id': id}
        return self.collection.find(query)

    def changeDocument(self, id, property, value):
        '''
        Set one property of the document whose _id equals id.
        '''
        selector = {'_id': id}
        change = {"$set": {property: value}}
        # upsert is disabled: a document that does not exist is NOT inserted
        self.collection.update_one(selector, change, upsert=False)

    def getAllDocuments(self):
        '''
        Return the result of an unfiltered find() on the collection.
        '''
        return self.collection.find()

### Connect to database

In [4]:
# Read the database password from the first line of pass.txt. If the file is
# missing, unreadable or its first line is empty, ask for it interactively.
try:
    with open("pass.txt", 'r') as f:
        password = f.readline().rstrip('\n')
except (OSError, UnicodeDecodeError):
    # Only file problems fall back to the prompt; anything else
    # (e.g. KeyboardInterrupt) is no longer swallowed.
    password = ""
if not password:
    # getpass prompts without echoing, unlike input()
    password = getpass("Give database password: ")
database = Database(f"mongodb+srv://scraper:{password}@scraper.vbkzf.mongodb.net/test","Cluster0","reviews")

### Perform pre-processing on the database reviews and save them in a new "PROCESSED_TEXT" field

In [5]:
# Pre-process the TEXT of every document that has one and store the result
# in the PROCESSED_TEXT field of the same document.
stopWords = set(stopwords.words("english"))
stemmer = PorterStemmer()  # not used in this cell; stemming below goes through Word.stem()
for document in database.getAllDocuments():
    if field.TEXT not in document:
        continue
    #Convert to lower case:
    text = str(document[field.TEXT]).lower()
    #Remove links (raw strings avoid invalid-escape warnings for \S):
    text = re.sub(r'http\S+','',text)
    #Remove emails :
    text = re.sub(r'\S*@\S*\s?','',text)
    text = TextBlob(text)
    #Lemmatize and stem words :
    newText = []
    for sentence in text.sentences:
        #Remove special characters:
        sentence = Sentence(re.sub(r'[^A-Za-z0-9 ]+','',sentence.raw))
        #Remove stop words. A list filter (instead of a set difference) keeps
        #the original word order and repeated words, so the output is
        #deterministic from run to run.
        sentence = Sentence(' '.join(word for word in sentence.words if word not in stopWords))
        #Apply lemmatization and stemming :
        newSentence = []
        for word in sentence.words:
            word = Word(word)
            word = word.correct()            #Correct the spelling of the word
            word = Word(word.lemmatize('v')) #Lemmatize verbs
            word = Word(word.lemmatize('n')) #Lemmatize Nouns
            word = Word(word.stem())         #Get the stem of the word
            word = word.correct()            #Correct any stemming issues
            newSentence.append(word)
        newText.append(' '.join(newSentence) + ".")

    database.changeDocument(document[field.ID],field.PROCESSED_TEXT,' '.join(newText))

### Create some classes to handle our local review data

In [8]:
class Review:
    '''
    A single review: its text together with a polarity value.
    '''
    # Keys of the internal record
    REVIEW = 'review'
    POLARITY = 'polarity'

    def __init__(self, review, polarity):
        '''
        Store the review text and its polarity.
        '''
        record = {}
        record[Review.REVIEW] = review
        record[Review.POLARITY] = polarity
        self.__review = record

    def getPolarity(self):
        '''
        Return the polarity stored for this review
        '''
        record = self.__review
        return record[Review.POLARITY]

    def getText(self):
        '''
        Return the text stored for this review
        '''
        record = self.__review
        return record[Review.REVIEW]

class Reviews:
    def __init__(self):
        '''
        This class is used to manage the reviews of the different establishments
        '''
        self.__reviews = {}

    def addReview(self,establishment:str,review:Review):
        '''
        Add a review to of a specific establishment
        '''
        if not establishment in self.__reviews:
            self.__reviews[establishment] = []
        self.__reviews[establishment].append(review)

    def getReviews(self,establishment:str = None):
        '''
        Get reviews of a specific establishment.
        If no establishment is provided, all reviews are returned.
        '''
        reviews = []
        for est in self.__reviews:
            if establishment != None and est != establishment:
                continue
            reviews.append(self.__reviews[est])
        return reviews

### Gather reviews from database and score them based on sentiment

In [9]:
# Build the local Reviews store: one Review per document, scored with the
# TextBlob sentiment polarity of its pre-processed text.
reviews = Reviews()
for document in database.getAllDocuments():
    # The pre-processing step skips documents without TEXT, so those never
    # receive PROCESSED_TEXT; skip them here instead of failing with KeyError.
    if field.TEXT not in document or field.PROCESSED_TEXT not in document:
        continue
    text = TextBlob(document[field.PROCESSED_TEXT])
    review = Review(document[field.TEXT], text.sentiment.polarity)
    reviews.addReview(document[field.POI_NAME],review)