# Loading Items

In [1]:
import spacy
import unidecode
import re
import pandas as pd
from collections import Counter

# from itertools import chain
from collections import defaultdict

In [2]:
# Load the 'en_core_web_sm' spaCy pipeline once at module level;
# normalize_text() below relies on this global `nlp` object.
nlp = spacy.load('en_core_web_sm')

In [3]:
# Read the sample review used throughout the notebook.
# The encoding is passed explicitly because the text contains non-ASCII
# characters (e.g. "é"); without it, decoding depends on the platform locale.
# NOTE(review): assumes the fixture is stored as UTF-8 -- confirm.
with open("../tests/sample_text/text1.txt", encoding="utf-8") as f:
    sample_text = f.read()

In [4]:
# Display the raw review text exactly as it was read from disk.
sample_text

'Oné of my favorite places to pick something up in Chicago. If you\'re a coffee or tea drinker, you need to stop in take a look at Coffee & Tea Exchange. I\'m so happy this place still remains the same. Barrels of coffee where you can see and smell varietals from all over the world -Peru, Brazil, US or the elusive Blue Mountain ($99/lb) can be found here. ~`#$%^&*()_+=\\{\\}\\[\\];:"<>@-/,\n\nI used to buy green coffee beans to roast at home. It\'s actually really fun to home roast with a pot and then shaking the skins outside with a colander for the freshest roasted bean you can get! farmer market\n\nJars of black, white,  bakery green and herbal tea line the back walls of the counter. The staff always encourages you to sniff as many as you like before you purchase. Most loose leaf teas are roughly $3/oz which gets you several servings, they also have the flowering teas that will be a premium price. The best part is you can find a wide range of flavors and  getting a small bag to try 

In [5]:
# Run the spaCy pipeline over the raw (un-normalized) sample text;
# the trailing bare name displays the resulting object as the cell output.
sample_text_doc = nlp(sample_text)
sample_text_doc

Oné of my favorite places to pick something up in Chicago. If you're a coffee or tea drinker, you need to stop in take a look at Coffee & Tea Exchange. I'm so happy this place still remains the same. Barrels of coffee where you can see and smell varietals from all over the world -Peru, Brazil, US or the elusive Blue Mountain ($99/lb) can be found here. ~`#$%^&*()_+=\{\}\[\];:"<>@-/,

I used to buy green coffee beans to roast at home. It's actually really fun to home roast with a pot and then shaking the skins outside with a colander for the freshest roasted bean you can get! farmer market

Jars of black, white,  bakery green and herbal tea line the back walls of the counter. The staff always encourages you to sniff as many as you like before you purchase. Most loose leaf teas are roughly $3/oz which gets you several servings, they also have the flowering teas that will be a premium price. The best part is you can find a wide range of flavors and  getting a small bag to try is such an a

In [6]:
# Preview unidecode's ASCII transliteration of the sample text
# (in the captured output, "Oné" becomes "One").
unidecode.unidecode(sample_text)

'One of my favorite places to pick something up in Chicago. If you\'re a coffee or tea drinker, you need to stop in take a look at Coffee & Tea Exchange. I\'m so happy this place still remains the same. Barrels of coffee where you can see and smell varietals from all over the world -Peru, Brazil, US or the elusive Blue Mountain ($99/lb) can be found here. ~`#$%^&*()_+=\\{\\}\\[\\];:"<>@-/,\n\nI used to buy green coffee beans to roast at home. It\'s actually really fun to home roast with a pot and then shaking the skins outside with a colander for the freshest roasted bean you can get! farmer market\n\nJars of black, white,  bakery green and herbal tea line the back walls of the counter. The staff always encourages you to sniff as many as you like before you purchase. Most loose leaf teas are roughly $3/oz which gets you several servings, they also have the flowering teas that will be a premium price. The best part is you can find a wide range of flavors and  getting a small bag to try 

In [7]:
def normalize_text(text):
    """
    Normalize text so that reviews and keywords can be compared.

    Steps, in the order they are applied:
        1. transliterate to ASCII with unidecode (removes diacritics)
        2. lowercase
        3. replace digits and the symbols matched by ``rm_char`` with spaces
        4. collapse runs of whitespace into single spaces
        5. lemmatize every token with the module-level spaCy pipeline ``nlp``

    :param text: the text to normalize
    :type  text: str
    :return: the lemma of every token, joined by single spaces
    :rtype:  str
    """

    # Transliterate BEFORE filtering: unidecode can turn non-ASCII input into
    # ASCII symbols or digits (e.g. an en dash becomes '-'), and those must
    # pass through the removal pattern below instead of slipping past it.
    text = unidecode.unidecode(text)
    text = text.lower()

    # Characters NOT covered by this pattern include: ' , . ? ! | /
    rm_char = r'[~`#$%^&*()_+=\[\]{}\\;:<>@-]|\d'
    text = re.sub(rm_char, ' ', text)

    # str.split() with no argument splits on any whitespace run, so this
    # removes duplicate spaces and newlines.
    text = ' '.join(text.split())

    # convert to spacy text
    spacy_text = nlp(text)
    # -PRON- refers to pronouns
    norm_text = " ".join(token.lemma_ for token in spacy_text)
    return norm_text
        
class KeyWords:
    """
    Keywords to search in Yelp reviews
    """

    def __init__(self, word_list):
        """
        :param word_list: list of the keywords to search
        :type  word_list: list
        """
        self.word_list = word_list
        # Start with an empty *set*; a bare ``{}`` would create a dict, which
        # does not match how normalize_words() fills this attribute.
        self.word_set = set()

    def normalize_words(self):
        """
        Normalize each distinct word with normalize_text() and store the
        resulting strings in ``self.word_set``.
        """

        # De-duplicate first so each raw word is normalized only once; the
        # outer set comprehension merges words that normalize to the same text.
        self.word_set = {normalize_text(word) for word in set(self.word_list)}
        
class Review:
    """
    Class for storing information of a single review.

    Attributes:
        raw_text        -- the unmodified review (str)
        normalized_text -- text after normalization (str); starts empty and
                           is assigned by the caller
        all_word_freq   -- frequency of each space-separated token
        key_word_freq   -- frequency of each keyword
    """

    def __init__(self, raw_text):
        """
        :param raw_text: the unmodified review
        :type  raw_text: str
        """

        self.raw_text = raw_text
        self.normalized_text = ''
        self.all_word_freq = {}
        self.key_word_freq = {}
        #self.rating = rating
        #self.time = time

    def count_word_freq(self, keywords):
        """
        Count the frequency of each individual word, split by spaces.
        Then count the frequency of each keyword.

        Keywords are matched as whole, whitespace-delimited tokens (or token
        sequences), so "tea" is not counted inside "team" or "steam".

        :param keywords: normalized keywords; a keyword may contain spaces
        :type  keywords: set
        """

        # key_word_freq is not a subset of all_word_freq
        # this is because key_word_freq may include spaces in them
        # all_word_frequency does not
        self.all_word_freq = Counter(self.normalized_text.split())

        key_word_freq = {}
        for keyword in keywords:
            if not keyword:
                # An empty keyword cannot occur in the text; a plain
                # str.count('') would report len(text) + 1 instead.
                key_word_freq[keyword] = 0
                continue
            # The lookarounds require whitespace (or the text boundary) on
            # both sides, so only complete tokens are counted.
            pattern = r'(?<!\S)' + re.escape(keyword) + r'(?!\S)'
            key_word_freq[keyword] = len(re.findall(pattern, self.normalized_text))
        self.key_word_freq = key_word_freq

In [8]:
# Build the keyword set: read the 'WORDS' column of the keyword CSV, wrap it
# in KeyWords, then normalize every entry so keywords go through the same
# normalize_text() as the reviews they are matched against.
# Note: despite its name, `word_list` holds the object returned by
# pd.read_csv, not a plain list.
word_list = pd.read_csv('../word_lists/key_words.csv')
keywords = KeyWords(list (word_list['WORDS']))
keywords.normalize_words()
# keywords.word_set

In [9]:
# Walk a single review through the pipeline by hand: wrap the raw text,
# normalize it, count word/keyword frequencies, then display the normalized
# text as the cell output.
review1 = Review(sample_text)
review1.normalized_text = normalize_text(review1.raw_text)
review1.count_word_freq(keywords.word_set)
# Counter(review1.normalized_text.split())
review1.normalized_text

'one of -PRON- favorite place to pick something up in chicago . if -PRON- be a coffee or tea drinker , -PRON- need to stop in take a look at coffee tea exchange . -PRON- be so happy this place still remain the same . barrel of coffee where -PRON- can see and smell varietal from all over the world peru , brazil , -PRON- or the elusive blue mountain /lb can be find here . " / , i use to buy green coffee bean to roast at home . -PRON- be actually really fun to home roast with a pot and then shake the skin outside with a colander for the fresh roasted bean -PRON- can get ! farmer market jar of black , white , bakery green and herbal tea line the back wall of the counter . the staff always encourage -PRON- to sniff as many as -PRON- like before -PRON- purchase . most loose leaf tea be roughly /oz which get -PRON- several serving , -PRON- also have the flower tea that will be a premium price . the good part be -PRON- can find a wide range of flavor and get a small bag to try be such an affor

In [10]:
# Sanity check: show which class review1 is an instance of.
type(review1)

__main__.Review

In [11]:
# review1.all_word_freq
# for token in sample_text_doc:
#     print(token.text)

In [12]:
class Business:
    """
    Class for containing the information of a single business

    Attributes:
        name          -- value passed to the constructor
        review_info   -- list of Review objects, filled by find_review_info()
        all_word_freq -- word counts summed over all reviews
        key_word_freq -- keyword counts summed over all reviews
    """

    def __init__(self, name):
        """
        :param name: name of the business
        """

        self.name = name
        self.review_info = []
        self.all_word_freq = {}
        self.key_word_freq = {}
        # Not stored yet: num_reviews, hours_opened (will be a dict),
        # price_range, address.

    def find_review_info(self, reviews, keywords=None):
        """
        Find the review data information.

        Builds one Review per input string, normalizes it, counts its word
        and keyword frequencies, and stores the results in
        ``self.review_info`` in iteration order.

        :param reviews: a set of strings,
                        each of which is a review
        :type  reviews: set
        :param keywords: object whose ``word_set`` holds the normalized
                         keywords; when omitted, the module-level
                         ``keywords`` is looked up at call time
        """

        if keywords is None:
            # Resolve the default when the method is called rather than when
            # the class is defined, so re-running the keyword cell takes
            # effect without having to re-run this class definition.
            try:
                keywords = globals()["keywords"]
            except KeyError:
                raise NameError("name 'keywords' is not defined") from None

        all_reviews = []
        for review in reviews:
            review_ins = Review(review)
            review_ins.normalized_text = normalize_text(review_ins.raw_text)
            review_ins.count_word_freq(keywords.word_set)
            all_reviews.append(review_ins)
        self.review_info = all_reviews

    def aggregate_word_freq(self):
        """
        Combine all the word count for each review

        Stores the totals in ``self.all_word_freq`` and
        ``self.key_word_freq`` as Counter objects, so looking up a word that
        never occurred yields 0 and does not add an entry.
        """

        self.all_word_freq = self._sum_counts(
            review_info.all_word_freq for review_info in self.review_info
        )
        self.key_word_freq = self._sum_counts(
            review_info.key_word_freq for review_info in self.review_info
        )

    @staticmethod
    def _sum_counts(freq_dicts):
        """Add up several {word: count} mappings into a single Counter."""
        total = Counter()
        for freq in freq_dicts:
            # Counter.update adds counts instead of replacing them and keeps
            # keys whose total is 0.
            total.update(freq)
        return total

In [13]:
# Read the two sample reviews used to exercise the Business class.
# The encoding is passed explicitly so decoding does not depend on the
# platform locale.
# NOTE(review): assumes both fixtures are stored as UTF-8 -- confirm.
with open("../tests/sample_text/text1.txt", encoding="utf-8") as f:
    sample_text1 = f.read()

with open("../tests/sample_text/text2.txt", encoding="utf-8") as f:
    sample_text2 = f.read()

In [14]:
# Collect the sample reviews into a set (identical texts collapse into one,
# and iteration order is not guaranteed).
sample_reviews = {sample_text1, sample_text2}

In [15]:
# sample_reviews
# Create a business and build one Review per sample text; the `keywords`
# argument is omitted, so find_review_info() falls back to its default.
business1 = Business("Cafe1")
business1.find_review_info(sample_reviews)

In [16]:

# Sum the per-review word and keyword counts into business1.all_word_freq
# and business1.key_word_freq.
business1.aggregate_word_freq()
# business1.review_info[1].raw_text

In [17]:
# business1.review_info[0].all_word_freq

In [18]:
# business1.all_word_freq#['!']