# Imports and Installs

## Installs

In [1]:
try:
    import spacy
except:
    ! pip install spacy

In [2]:
try:
    import textblob
except:
    ! pip install textblob

## Imports

In [3]:
import pandas as pd
import json
import urllib.request
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
import spacy
import nltk
import re
from nltk.corpus import stopwords
from tabulate import tabulate

In [4]:
%matplotlib notebook
import matplotlib.pyplot as plt
import seaborn as sns

# Download Data

## NLTK Downloads

In [5]:
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("popular")
    nltk.download("brown")
    nltk.download('stopwords')

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/names.zip.
[nltk_data]    | Downloading package shakespeare to /root/nltk_data...
[nlt

## Yelp Academic Dataset Review

https://github.com/knowitall/yelp-dataset-challenge/blob/master/data/yelp_phoenix_academic_dataset/yelp_academic_dataset_review.json

In [6]:
urllib.request.urlretrieve("https://raw.githubusercontent.com/knowitall/yelp-dataset-challenge/master/data/yelp_phoenix_academic_dataset/yelp_academic_dataset_review.json", "yelp_academic_dataset_review.json")

('yelp_academic_dataset_review.json',
 <http.client.HTTPMessage at 0x7fc6dcdd9350>)

# ICE-4 Text Data: Flattening, Filtering, and Chunking

## (Tutorial) Bag of X
Following is a sample of applying bag of n-grams to Yelp academic dataset review, please download it with following link:

https://github.com/knowitall/yelp-dataset-challenge/blob/master/data/yelp_phoenix_academic_dataset/yelp_academic_dataset_review.json

In [7]:
f = open('yelp_academic_dataset_review.json')
js = []
for i in range(10000):
    js.append(json.loads(f.readline()))
f.close()
review_df = pd.DataFrame(js)
review_df.shape

(10000, 8)

In [8]:
review_df.head()

Unnamed: 0,votes,user_id,review_id,stars,date,text,type,business_id
0,"{'funny': 0, 'useful': 5, 'cool': 2}",rLtl8ZkDX5vH5nAx9C3q5Q,fWKvX83p0-ka4JS3dc6E5A,5,2011-01-26,My wife took me here on my birthday for breakf...,review,9yKzy9PApeiPPOUJEtnvkg
1,"{'funny': 0, 'useful': 0, 'cool': 0}",0a2KyEL0d3Yb1V6aivbIuQ,IjZ33sJrzXqU-0X6U8NwyA,5,2011-07-27,I have no idea why some people give bad review...,review,ZRJwVLyzEJq1VAihDhYiow
2,"{'funny': 0, 'useful': 1, 'cool': 0}",0hT2KtfLiobPvh6cDC8JQg,IESLBzqUCLdSzSqm0eCSxQ,4,2012-06-14,love the gyro plate. Rice is so good and I als...,review,6oRAC4uyJCsJl1X0WZpVSA
3,"{'funny': 0, 'useful': 2, 'cool': 1}",uZetl9T0NcROGOyFfughhg,G-WvGaISbqqaMHlNnByodA,5,2010-05-27,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,_1QQZuf4zZOyFCvXc0o6Vg
4,"{'funny': 0, 'useful': 0, 'cool': 0}",vYmM4KTsC8ZfQBg-j5MWkw,1uJFq2r5QfJG_6ExMRCaGw,5,2012-01-05,General Manager Scott Petello is a good egg!!!...,review,6ozycU1RpktNG2-1BroVtw


note: in the default settings of CountVectorizer, the token_pattern = '(?u)\\b\\w\\w+\\b', which ignores single-character words. Whe employ the token_pattern = '(?u)\\b\\w+\\b' to include the single-character words. 

In [9]:
bow_converter = CountVectorizer(token_pattern='(?u)\\b\\w+\\b')
x = bow_converter.fit_transform(review_df['text'])

In [10]:
unigram = bow_converter.get_feature_names()



In [11]:
bigram_converter = CountVectorizer(ngram_range=(2,2), token_pattern='(?u)\\b\\w+\\b')
x2 = bigram_converter.fit_transform(review_df['text'])

In [12]:
bigram = bigram_converter.get_feature_names()

In [13]:
trigram_converter = CountVectorizer(ngram_range=(3,3), token_pattern='(?u)\\b\\w+\\b')
x3 = trigram_converter.fit_transform(review_df['text'])

In [14]:
trigram = trigram_converter.get_feature_names()

In [15]:
unigram

['0',
 '00',
 '000',
 '007',
 '00a',
 '00am',
 '00pm',
 '01',
 '02',
 '03',
 '03342',
 '04',
 '05',
 '06',
 '07',
 '08',
 '09',
 '0buxoc0crqjpvkezo3bqog',
 '0l',
 '0tzg',
 '1',
 '10',
 '100',
 '1000',
 '1000x',
 '1001',
 '100lbs',
 '100s',
 '100th',
 '101',
 '102',
 '102729',
 '1030',
 '104',
 '105',
 '1070',
 '107f',
 '108',
 '109',
 '10am',
 '10ish',
 '10k',
 '10min',
 '10mins',
 '10minutes',
 '10oz',
 '10p',
 '10pm',
 '10th',
 '10x',
 '10yo',
 '11',
 '110',
 '1100',
 '111',
 '111th',
 '112',
 '113',
 '1130',
 '114',
 '1145',
 '115',
 '115th',
 '116',
 '117',
 '118',
 '11a',
 '11am',
 '11p',
 '11pm',
 '11th',
 '11year',
 '12',
 '120',
 '1200',
 '12000',
 '1202',
 '123',
 '124',
 '125',
 '128i',
 '129',
 '12a',
 '12am',
 '12k',
 '12oz',
 '12pm',
 '12th',
 '13',
 '130',
 '1300',
 '13331',
 '135',
 '13th',
 '13yr',
 '14',
 '140',
 '147',
 '149',
 '14lbs',
 '15',
 '150',
 '1500',
 '150k',
 '150mm',
 '157',
 '15am',
 '15ft',
 '15min',
 '15mins',
 '15pm',
 '15th',
 '16',
 '160',
 '1600',
 

In [16]:
bigram

['0 0',
 '0 20',
 '0 39',
 '0 5',
 '0 50',
 '0 6',
 '0 75',
 '0 90',
 '0 95',
 '0 99',
 '0 after',
 '0 eye',
 '0 inch',
 '0 or',
 '0 ph',
 '0 simply',
 '0 so',
 '0 star',
 '0 stars',
 '0 that',
 '00 25',
 '00 3',
 '00 6',
 '00 9',
 '00 a',
 '00 after',
 '00 am',
 '00 amazing',
 '00 and',
 '00 apiece',
 '00 appetizers',
 '00 at',
 '00 banana',
 '00 basket',
 '00 beam',
 '00 before',
 '00 bill',
 '00 brown',
 '00 bucks',
 '00 burger',
 '00 burgers',
 '00 but',
 '00 charge',
 '00 cheaper',
 '00 cover',
 '00 credit',
 '00 diagnostic',
 '00 did',
 '00 draft',
 '00 drink',
 '00 drinks',
 '00 each',
 '00 entree',
 '00 etc',
 '00 everyday',
 '00 extra',
 '00 food',
 '00 for',
 '00 game',
 '00 get',
 '00 gets',
 '00 gift',
 '00 great',
 '00 had',
 '00 happy',
 '00 held',
 '00 here',
 '00 i',
 '00 into',
 '00 is',
 '00 ish',
 '00 it',
 '00 jeans',
 '00 machine',
 '00 machines',
 '00 margaritas',
 '00 minimum',
 '00 my',
 '00 night',
 '00 no',
 '00 normally',
 '00 not',
 '00 of',
 '00 off',
 '00 

In [17]:
trigram

['0 0 eye',
 '0 20 less',
 '0 39 oz',
 '0 39 pizza',
 '0 5 i',
 '0 50 to',
 '0 6 can',
 '0 75 oysters',
 '0 75 that',
 '0 75 to',
 '0 90 well',
 '0 95 if',
 '0 99 for',
 '0 after rebates',
 '0 eye candy',
 '0 inch sub',
 '0 or 1',
 '0 ph version',
 '0 simply put',
 '0 so i',
 '0 star option',
 '0 stars for',
 '0 stars i',
 '0 stars just',
 '0 stars this',
 '0 that s',
 '00 25 00',
 '00 3 burger',
 '00 6 00',
 '00 9 00',
 '00 a burger',
 '00 a cupcake',
 '00 a day',
 '00 a m',
 '00 a night',
 '00 a piece',
 '00 a pitcher',
 '00 after i',
 '00 am before',
 '00 am every',
 '00 am like',
 '00 am on',
 '00 am or',
 '00 am they',
 '00 am where',
 '00 amazing appetizer',
 '00 and 4',
 '00 and dinner',
 '00 and it',
 '00 and that',
 '00 and the',
 '00 and there',
 '00 and they',
 '00 and was',
 '00 and you',
 '00 apiece however',
 '00 appetizers we',
 '00 at 9',
 '00 banana republic',
 '00 basket fries',
 '00 beam 5',
 '00 before tip',
 '00 bill they',
 '00 brown rice',
 '00 bucks for',
 '00 b

In [18]:
print (len(unigram), len(bigram), len(trigram))

29222 368943 881620


In [19]:
sns.set_style("darkgrid")
counts = [len(unigram), len(bigram), len(trigram)]
plt.plot(counts, color='cornflowerblue')
plt.plot(counts, 'bo')
plt.margins(0.1)
plt.xticks(range(3), ['unigram', 'bigram', 'trigram'])
plt.tick_params(labelsize=14)
plt.title('Number of ngrams in the first 10,000 reviews of the Yelp dataset', {'fontsize':16})
plt.show()

<IPython.core.display.Javascript object>

## Task 1. 1 Applying the unigram, bigram, and trigram tokenization methods to the given text below. 

In [20]:
train_text = """My wife took me here on my birthday for breakfast and it was excellent.  The weather was perfect which made sitting outside overlooking their grounds an absolute pleasure.  
Our waitress was excellent and our food arrived quickly on the semi-busy Saturday morning.  It looked like the place fills up pretty quickly so the earlier you get here the better.\n\nDo yourself a favor and get their Bloody Mary.  It was phenomenal and simply the best I\'ve ever had.  
I\'m pretty sure they only use ingredients from their garden and blend them fresh when you order it.  It was amazing.\n\nWhile EVERYTHING on the menu looks excellent, I had the white truffle scrambled eggs vegetable skillet and it was tasty and delicious.  
It came with 2 pieces of their griddled bread with was amazing and it absolutely made the meal complete.  
It was the best "toast" I\'ve ever had.\n\nAnyway, I can\'t wait to go back!"""

t_pattern = "(?u)\\b\\w+\\b"
train = []
for line in " ".join(train_text.split("\n")).split("."):
    if len(line) > 0:
        train.append(line.strip())
        if train[-1][-1].isalpha() or train[-1][-1].isdigit():
            train[-1] = train[-1] + "."
train = train_text.split("\n")

# Unigram
bow_converter = CountVectorizer(token_pattern=t_pattern)
x1 = bow_converter.fit_transform(train)
unigram = bow_converter.get_feature_names()

# Bigram
bigram_converter = CountVectorizer(ngram_range=(2,2), token_pattern=t_pattern)
x2 = bigram_converter.fit_transform(train)
bigram = bigram_converter.get_feature_names()

# Trigram
trigram_converter = CountVectorizer(ngram_range=(3,3), token_pattern=t_pattern)
x3 = trigram_converter.fit_transform(train)
trigram = trigram_converter.get_feature_names()

print (len(unigram), len(bigram), len(trigram))

103 139 141


In [21]:
unigram

['2',
 'a',
 'absolute',
 'absolutely',
 'amazing',
 'an',
 'and',
 'anyway',
 'arrived',
 'back',
 'best',
 'better',
 'birthday',
 'blend',
 'bloody',
 'bread',
 'breakfast',
 'busy',
 'came',
 'can',
 'complete',
 'delicious',
 'do',
 'earlier',
 'eggs',
 'ever',
 'everything',
 'excellent',
 'favor',
 'fills',
 'food',
 'for',
 'fresh',
 'from',
 'garden',
 'get',
 'go',
 'griddled',
 'grounds',
 'had',
 'here',
 'i',
 'ingredients',
 'it',
 'like',
 'looked',
 'looks',
 'm',
 'made',
 'mary',
 'me',
 'meal',
 'menu',
 'morning',
 'my',
 'of',
 'on',
 'only',
 'order',
 'our',
 'outside',
 'overlooking',
 'perfect',
 'phenomenal',
 'pieces',
 'place',
 'pleasure',
 'pretty',
 'quickly',
 'saturday',
 'scrambled',
 'semi',
 'simply',
 'sitting',
 'skillet',
 'so',
 'sure',
 't',
 'tasty',
 'the',
 'their',
 'them',
 'they',
 'to',
 'toast',
 'took',
 'truffle',
 'up',
 'use',
 've',
 'vegetable',
 'wait',
 'waitress',
 'was',
 'weather',
 'when',
 'which',
 'while',
 'white',
 'wife

In [22]:
bigram

['2 pieces',
 'a favor',
 'absolute pleasure',
 'absolutely made',
 'amazing and',
 'an absolute',
 'and blend',
 'and delicious',
 'and get',
 'and it',
 'and our',
 'and simply',
 'anyway i',
 'arrived quickly',
 'best i',
 'best toast',
 'birthday for',
 'blend them',
 'bloody mary',
 'bread with',
 'breakfast and',
 'busy saturday',
 'came with',
 'can t',
 'do yourself',
 'earlier you',
 'eggs vegetable',
 'ever had',
 'everything on',
 'excellent and',
 'excellent i',
 'excellent the',
 'favor and',
 'fills up',
 'food arrived',
 'for breakfast',
 'fresh when',
 'from their',
 'garden and',
 'get here',
 'get their',
 'go back',
 'griddled bread',
 'grounds an',
 'had the',
 'here on',
 'here the',
 'i can',
 'i had',
 'i m',
 'i ve',
 'ingredients from',
 'it absolutely',
 'it came',
 'it it',
 'it looked',
 'it was',
 'like the',
 'looked like',
 'looks excellent',
 'm pretty',
 'made sitting',
 'made the',
 'mary it',
 'me here',
 'meal complete',
 'menu looks',
 'morning it',

In [23]:
trigram

['2 pieces of',
 'a favor and',
 'absolutely made the',
 'amazing and it',
 'an absolute pleasure',
 'and blend them',
 'and get their',
 'and it absolutely',
 'and it was',
 'and our food',
 'and simply the',
 'anyway i can',
 'arrived quickly on',
 'best i ve',
 'best toast i',
 'birthday for breakfast',
 'blend them fresh',
 'bloody mary it',
 'bread with was',
 'breakfast and it',
 'busy saturday morning',
 'came with 2',
 'can t wait',
 'do yourself a',
 'earlier you get',
 'eggs vegetable skillet',
 'everything on the',
 'excellent and our',
 'excellent i had',
 'excellent the weather',
 'favor and get',
 'fills up pretty',
 'food arrived quickly',
 'for breakfast and',
 'fresh when you',
 'from their garden',
 'garden and blend',
 'get here the',
 'get their bloody',
 'griddled bread with',
 'grounds an absolute',
 'had the white',
 'here on my',
 'here the better',
 'i can t',
 'i had the',
 'i m pretty',
 'i ve ever',
 'ingredients from their',
 'it absolutely made',
 'it came

## Task 1.2 Create your own naive tokenization method (whitespace-based), and apply it to the text given in the task 1.1
note: 1. do not use the existing togkenization methods given by NLP; 2. split the words by whitespace character, the output is more likely as the unigram; 3. no repeating elements in the output.

In [24]:
def custom_tokenize(text):
    text = re.sub("\s+", " ", text)
    text = text.split(" ")
    t = set(text)
    return list(t)

tok_train_text = custom_tokenize(train_text)
tok_train_text

['my',
 'like',
 'excellent,',
 'delicious.',
 'pretty',
 '2',
 'simply',
 'phenomenal',
 'weather',
 'tasty',
 'vegetable',
 'waitress',
 'them',
 'I',
 'sitting',
 'when',
 'white',
 'an',
 'to',
 'quickly',
 'semi-busy',
 'me',
 'it.',
 'Mary.',
 'griddled',
 'skillet',
 'The',
 'morning.',
 'from',
 'better.',
 'get',
 'food',
 'had.',
 'had',
 'for',
 'absolute',
 'excellent',
 'made',
 'bread',
 'came',
 'blend',
 'pleasure.',
 'overlooking',
 'wait',
 'absolutely',
 'eggs',
 '"toast"',
 'the',
 'meal',
 'grounds',
 'amazing',
 'Our',
 'best',
 "I'm",
 'ever',
 'our',
 'it',
 'Bloody',
 'on',
 'fresh',
 'outside',
 'a',
 'EVERYTHING',
 'complete.',
 'earlier',
 'with',
 'garden',
 'looked',
 'It',
 'ingredients',
 'sure',
 'Saturday',
 'which',
 "can't",
 'and',
 'truffle',
 'up',
 'only',
 'amazing.',
 'was',
 'use',
 'so',
 'you',
 'looks',
 'scrambled',
 'of',
 'birthday',
 'their',
 'excellent.',
 'My',
 'Anyway,',
 'menu',
 'pieces',
 'they',
 'took',
 'Do',
 'fills',
 'Whil

## **Question 1**. Given a sentence "He likes cat". In unigram representation, it could be "He", "likes", "cat". In bigram representation, it could be "He likes", "likes cat". In trigram representation, it could be "He likes cat". Explain why the storage and computation cost increase with the growth of n in n-gram methods. 

Answer to Q1: As n increases, the number of *possible* n-grams grows exponentially. This is because the number of possible n-grams consist of every combination of unigrams possible (e.g. he likes cat, he cat likes, likes he cat, likes cat he, cat he likes, cat likes he). Even though most of these n-grams don't occur, the probabilities for them need to be calculated and storage space needs to be assigned. Given that so many never occur, this is often wasted storage space and computational power. On top of this, looking at the n-grams in the text is more computationally expesive for larger n values because as n increases, we visit each token in a string more often (e.g. for "likes"; unigram: occurs in window "likes", bigram: occurs in windows "he likes" and "likes cats", trigram: occurs in all windows in previous e.g.). Each re-vist costs more computational power than smaller n-grams with fewer visits.

---

## (Tutorial) Stemming and Lemmatization

In [25]:
# import PorterStemmer class form nltk.stem.porter module
stemmer = PorterStemmer() 

stem = stemmer.stem('flowers')  
print(f"'flowers' after stemming: {stem}")

stem = stemmer.stem('zeroes')
print(f"'zeroes' after stemming: {stem}")

stem = stemmer.stem('better')
print(f"'better' after stemming: {stem}")

stem = stemmer.stem('sixties')
print(f"'sixties' after stemming: {stem}")

stem = stemmer.stem('goes')
print(f"'goes' after stemming: {stem}")

stem = stemmer.stem('go')
print(f"'go' after stemming: {stem}")

'flowers' after stemming: flower
'zeroes' after stemming: zero
'better' after stemming: better
'sixties' after stemming: sixti
'goes' after stemming: goe
'go' after stemming: go


In [26]:
# import lemmatizer class from nltk.stem module
lemmatizer = WordNetLemmatizer()  

lemma = lemmatizer.lemmatize('flowers')   
print(f"'flowers' after lemmatization: {lemma}")

lemma = lemmatizer.lemmatize('zeros')
print(f"'zeros' after lemmatization: {lemma}")

lemma = lemmatizer.lemmatize('better')
print(f"'better' after lemmatization: {lemma}")

lemma = lemmatizer.lemmatize('sixties')
print(f"'sixties' after lemmatization: {lemma}")

lemma = lemmatizer.lemmatize('goes')
print(f"'goes' after lemmatization: {lemma}")

lemma = lemmatizer.lemmatize('go')
print(f"'go' after lemmatization: {lemma}")

print("\n\n")
lemma = lemmatizer.lemmatize('better', pos='a')   # 'a' denoted ADJECTIVE part-of-speech
print(f"'better' (as an adjective) after lemmatization: {lemma}")

'flowers' after lemmatization: flower
'zeros' after lemmatization: zero
'better' after lemmatization: better
'sixties' after lemmatization: sixty
'goes' after lemmatization: go
'go' after lemmatization: go



'better' (as an adjective) after lemmatization: good


## Task 2. Text filtering for cleaner feature
1. clean the text used in the task 1; 2. remove all punctuations; 3. convert all characters to their lowercase; 4. remove all words in "stopwords"; 5. remove all relatively meaningless words like " 've ", " 's ", etc. 6. after finishing the above operations, apply stemming and lemmatization to the cleaned text respectively.

In [27]:
stop_words = stopwords.words('english')
tmp = set()
for i, tok in enumerate(tok_train_text):
    t = tok.lower()
    t = re.sub("[!@#$%^&*,.?;:'\"<>~`()-_=+\[\{\]\}\\\|]+", " ", t)
    t = t.strip()
    t = t.split(" ")
    for w in t:
        if w not in stop_words and w not in ["ve", "s", "m", "t"]:
            tmp.add(w)
strip_train_text = sorted(list(tmp))
strip_train_text

['',
 'absolute',
 'absolutely',
 'amazing',
 'anyway',
 'arrived',
 'back',
 'best',
 'better',
 'birthday',
 'blend',
 'bloody',
 'bread',
 'breakfast',
 'busy',
 'came',
 'complete',
 'delicious',
 'earlier',
 'eggs',
 'ever',
 'everything',
 'excellent',
 'favor',
 'fills',
 'food',
 'fresh',
 'garden',
 'get',
 'go',
 'griddled',
 'grounds',
 'ingredients',
 'like',
 'looked',
 'looks',
 'made',
 'mary',
 'meal',
 'menu',
 'morning',
 'order',
 'outside',
 'overlooking',
 'perfect',
 'phenomenal',
 'pieces',
 'place',
 'pleasure',
 'pretty',
 'quickly',
 'saturday',
 'scrambled',
 'semi',
 'simply',
 'sitting',
 'skillet',
 'sure',
 'tasty',
 'toast',
 'took',
 'truffle',
 'use',
 'vegetable',
 'wait',
 'waitress',
 'weather',
 'white',
 'wife']

In [28]:
stemmed_train_text = set()
for t in strip_train_text:
    stemmed_train_text.add(stemmer.stem(t))
stemmed_train_text = sorted(list(stemmed_train_text))
stemmed_train_text

['',
 'absolut',
 'amaz',
 'anyway',
 'arriv',
 'back',
 'best',
 'better',
 'birthday',
 'blend',
 'bloodi',
 'bread',
 'breakfast',
 'busi',
 'came',
 'complet',
 'delici',
 'earlier',
 'egg',
 'ever',
 'everyth',
 'excel',
 'favor',
 'fill',
 'food',
 'fresh',
 'garden',
 'get',
 'go',
 'griddl',
 'ground',
 'ingredi',
 'like',
 'look',
 'made',
 'mari',
 'meal',
 'menu',
 'morn',
 'order',
 'outsid',
 'overlook',
 'perfect',
 'phenomen',
 'piec',
 'place',
 'pleasur',
 'pretti',
 'quickli',
 'saturday',
 'scrambl',
 'semi',
 'simpli',
 'sit',
 'skillet',
 'sure',
 'tasti',
 'toast',
 'took',
 'truffl',
 'use',
 'veget',
 'wait',
 'waitress',
 'weather',
 'white',
 'wife']

In [29]:
lemma_train_text = set()
for t in strip_train_text:
    lemma_train_text.add(lemmatizer.lemmatize(t))
lemma_train_text = sorted(list(lemma_train_text))
lemma_train_text

['',
 'absolute',
 'absolutely',
 'amazing',
 'anyway',
 'arrived',
 'back',
 'best',
 'better',
 'birthday',
 'blend',
 'bloody',
 'bread',
 'breakfast',
 'busy',
 'came',
 'complete',
 'delicious',
 'earlier',
 'egg',
 'ever',
 'everything',
 'excellent',
 'favor',
 'fill',
 'food',
 'fresh',
 'garden',
 'get',
 'go',
 'griddled',
 'ground',
 'ingredient',
 'like',
 'look',
 'looked',
 'made',
 'mary',
 'meal',
 'menu',
 'morning',
 'order',
 'outside',
 'overlooking',
 'perfect',
 'phenomenal',
 'piece',
 'place',
 'pleasure',
 'pretty',
 'quickly',
 'saturday',
 'scrambled',
 'semi',
 'simply',
 'sitting',
 'skillet',
 'sure',
 'tasty',
 'toast',
 'took',
 'truffle',
 'use',
 'vegetable',
 'wait',
 'waitress',
 'weather',
 'white',
 'wife']

## **Question 2.** Based on the examples and the output of your code, which one has the better performance, Stemming or Lemmatization? Try to analyze it.

In [30]:
len(stemmed_train_text)

67

In [31]:
len(lemma_train_text)

69

In [32]:
print("stem\tlemma")
for i, t in enumerate(lemma_train_text):
    s = ""
    if i < len(stemmed_train_text):
        s = stemmed_train_text[i]
    print("%s\t%s"%(s, t))

stem	lemma
	
absolut	absolute
amaz	absolutely
anyway	amazing
arriv	anyway
back	arrived
best	back
better	best
birthday	better
blend	birthday
bloodi	blend
bread	bloody
breakfast	bread
busi	breakfast
came	busy
complet	came
delici	complete
earlier	delicious
egg	earlier
ever	egg
everyth	ever
excel	everything
favor	excellent
fill	favor
food	fill
fresh	food
garden	fresh
get	garden
go	get
griddl	go
ground	griddled
ingredi	ground
like	ingredient
look	like
made	look
mari	looked
meal	made
menu	mary
morn	meal
order	menu
outsid	morning
overlook	order
perfect	outside
phenomen	overlooking
piec	perfect
place	phenomenal
pleasur	piece
pretti	place
quickli	pleasure
saturday	pretty
scrambl	quickly
semi	saturday
simpli	scrambled
sit	semi
skillet	simply
sure	sitting
tasti	skillet
toast	sure
took	tasty
truffl	toast
use	took
veget	truffle
wait	use
waitress	vegetable
weather	wait
white	waitress
wife	weather
	white
	wife


**Answer to Q2**: Lemmatizationkeeps more words complete and unmangled. Stemming, on the other hand, produces fewer words that tend to be shorter, but many cut short or horrendously mispelled. I would choose lemmatization over stemming, as it retains more of the informaiton that was in the original tokens.

---

## (Tutorial) PoS tagging and chunking

**note:** you need to install spacy and textblob modules first for the following codes
If you have problem to install spacy module, try to follow the instruction in the following link:
https://stackoverflow.com/questions/66149878/e053-could-not-read-config-cfg-resumeparser
If you have problem to use textblob module, try to install nltk libraries as shown in the following link:
http://www.nltk.org/data.html

In [33]:
# Load the first 10 reviews
f = open('yelp_academic_dataset_review.json')
js = []
for i in range(10):
    js.append(json.loads(f.readline()))
f.close()
review_df = pd.DataFrame(js)
review_df.shape

(10, 8)

In [34]:
# chunking in spaCy
spacy.info('en_core_web_sm')

{'lang': 'en',
 'name': 'core_web_sm',
 'version': '3.4.0',
 'description': 'English pipeline optimized for CPU. Components: tok2vec, tagger, parser, senter, ner, attribute_ruler, lemmatizer.',
 'author': 'Explosion',
 'email': 'contact@explosion.ai',
 'url': 'https://explosion.ai',
 'license': 'MIT',
 'spacy_version': '>=3.4.0,<3.5.0',
 'spacy_git_version': 'dd038b536',
 'vectors': {'width': 0, 'vectors': 0, 'keys': 0, 'name': None},
 'labels': {'tok2vec': [],
  'tagger': ['$',
   "''",
   ',',
   '-LRB-',
   '-RRB-',
   '.',
   ':',
   'ADD',
   'AFX',
   'CC',
   'CD',
   'DT',
   'EX',
   'FW',
   'HYPH',
   'IN',
   'JJ',
   'JJR',
   'JJS',
   'LS',
   'MD',
   'NFP',
   'NN',
   'NNP',
   'NNPS',
   'NNS',
   'PDT',
   'POS',
   'PRP',
   'PRP$',
   'RB',
   'RBR',
   'RBS',
   'RP',
   'SYM',
   'TO',
   'UH',
   'VB',
   'VBD',
   'VBG',
   'VBN',
   'VBP',
   'VBZ',
   'WDT',
   'WP',
   'WP$',
   'WRB',
   'XX',
   '_SP',
   '``'],
  'parser': ['ROOT',
   'acl',
   'acomp',


In [35]:
nlp = spacy.load("en_core_web_sm")
doc_df = review_df['text'].apply(nlp)
type(doc_df)

pandas.core.series.Series

In [36]:
type(doc_df[0])

spacy.tokens.doc.Doc

In [37]:
doc_df[4]

General Manager Scott Petello is a good egg!!! Not to go into detail, but let me assure you if you have any issues (albeit rare) speak with Scott and treat the guy with some respect as you state your case and I'd be surprised if you don't walk out totally satisfied as I just did. Like I always say..... "Mistakes are inevitable, it's how we recover from them that is important"!!!

Thanks to Scott and his awesome staff. You've got a customer for life!! .......... :^)

In [38]:
for doc in doc_df[4]:
    print(doc.text, doc.pos_, doc.tag_)

General PROPN NNP
Manager PROPN NNP
Scott PROPN NNP
Petello PROPN NNP
is AUX VBZ
a DET DT
good ADJ JJ
egg NOUN NN
! PUNCT .
! PUNCT .
! PUNCT .
Not PART RB
to PART TO
go VERB VB
into ADP IN
detail NOUN NN
, PUNCT ,
but CCONJ CC
let VERB VB
me PRON PRP
assure VERB VB
you PRON PRP
if SCONJ IN
you PRON PRP
have VERB VBP
any DET DT
issues NOUN NNS
( PUNCT -LRB-
albeit SCONJ IN
rare ADJ JJ
) PUNCT -RRB-
speak VERB VBP
with ADP IN
Scott PROPN NNP
and CCONJ CC
treat VERB VB
the DET DT
guy NOUN NN
with ADP IN
some DET DT
respect NOUN NN
as SCONJ IN
you PRON PRP
state VERB VBP
your PRON PRP$
case NOUN NN
and CCONJ CC
I PRON PRP
'd AUX MD
be AUX VB
surprised ADJ JJ
if SCONJ IN
you PRON PRP
do AUX VBP
n't PART RB
walk VERB VB
out ADP RP
totally ADV RB
satisfied ADJ JJ
as SCONJ IN
I PRON PRP
just ADV RB
did VERB VBD
. PUNCT .
Like INTJ UH
I PRON PRP
always ADV RB
say VERB VBP
..... PUNCT .
" PUNCT ``
Mistakes NOUN NNS
are AUX VBP
inevitable ADJ JJ
, PUNCT ,
it PRON PRP
's AUX VBZ
how SCONJ WRB
we 

In [39]:
# spaCy also does some basic noun chunking
print([chunk for chunk in doc_df[4].noun_chunks])

[General Manager Scott Petello, a good egg, detail, me, you, you, any issues, Scott, the guy, some respect, you, your case, I, you, I, I, Mistakes, it, we, them, that, Thanks, Scott, his awesome staff, You, a customer, life]


In [40]:
# chunking in textblob
blob_df = review_df['text'].apply(TextBlob)
type(blob_df)

pandas.core.series.Series

In [41]:
type(blob_df[4])

textblob.blob.TextBlob

In [42]:
blob_df[4].tags

[('General', 'NNP'),
 ('Manager', 'NNP'),
 ('Scott', 'NNP'),
 ('Petello', 'NNP'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('good', 'JJ'),
 ('egg', 'NN'),
 ('Not', 'RB'),
 ('to', 'TO'),
 ('go', 'VB'),
 ('into', 'IN'),
 ('detail', 'NN'),
 ('but', 'CC'),
 ('let', 'VB'),
 ('me', 'PRP'),
 ('assure', 'VB'),
 ('you', 'PRP'),
 ('if', 'IN'),
 ('you', 'PRP'),
 ('have', 'VBP'),
 ('any', 'DT'),
 ('issues', 'NNS'),
 ('albeit', 'IN'),
 ('rare', 'NN'),
 ('speak', 'NN'),
 ('with', 'IN'),
 ('Scott', 'NNP'),
 ('and', 'CC'),
 ('treat', 'VB'),
 ('the', 'DT'),
 ('guy', 'NN'),
 ('with', 'IN'),
 ('some', 'DT'),
 ('respect', 'NN'),
 ('as', 'IN'),
 ('you', 'PRP'),
 ('state', 'NN'),
 ('your', 'PRP$'),
 ('case', 'NN'),
 ('and', 'CC'),
 ('I', 'PRP'),
 ("'d", 'MD'),
 ('be', 'VB'),
 ('surprised', 'VBN'),
 ('if', 'IN'),
 ('you', 'PRP'),
 ('do', 'VBP'),
 ("n't", 'RB'),
 ('walk', 'VB'),
 ('out', 'RP'),
 ('totally', 'RB'),
 ('satisfied', 'JJ'),
 ('as', 'IN'),
 ('I', 'PRP'),
 ('just', 'RB'),
 ('did', 'VBD'),
 ('Like', 'IN'),
 ('

In [43]:
# textblob can do some basic noun chunking 
print([np for np in blob_df[4].noun_phrases])

['general manager', 'scott petello', 'good egg', 'scott', "n't walk", 'mistakes', 'thanks', 'scott', 'awesome staff']


## Task 3. Apply spacy and textblob chunking to the text used in tesk 1 respectively, and output the noun phrase chunking results

In [44]:
# Spacy ------------------------------------------------------------------------
print("Spacy: ----------------------------------------------------------------------------")
nlp_train_text = nlp(train_text)

# POS
disp = []
for tok in nlp_train_text:
    disp.append([tok.text, tok.pos_, tok.tag_])
print(tabulate(disp, headers=["Text", "POS", "POS Tag"]))

# Chunking
print([chunk for chunk in nlp_train_text.noun_chunks])

Spacy: ----------------------------------------------------------------------------
Text         POS    POS Tag
-----------  -----  ---------
My           PRON   PRP$
wife         NOUN   NN
took         VERB   VBD
me           PRON   PRP
here         ADV    RB
on           ADP    IN
my           PRON   PRP$
birthday     NOUN   NN
for          ADP    IN
breakfast    NOUN   NN
and          CCONJ  CC
it           PRON   PRP
was          AUX    VBD
excellent    ADJ    JJ
.            PUNCT  .
             SPACE  _SP
The          DET    DT
weather      NOUN   NN
was          AUX    VBD
perfect      ADJ    JJ
which        PRON   WDT
made         VERB   VBD
sitting      VERB   VBG
outside      ADV    RB
overlooking  VERB   VBG
their        PRON   PRP$
grounds      NOUN   NNS
an           DET    DT
absolute     ADJ    JJ
pleasure     NOUN   NN
.            PUNCT  .
             SPACE  _SP
Our          PRON   PRP$
waitress     NOUN   NN
was          AUX    VBD
excellent    ADJ    JJ
and        

In [45]:
# TextBlob ---------------------------------------------------------------------
print("\TextBlob: -------------------------------------------------------------------------")
blob_train_text = TextBlob(train_text)
for t in blob_train_text.tags:
    print(t)

print([np for np in blob_train_text.noun_phrases])

\TextBlob: -------------------------------------------------------------------------
('My', 'PRP$')
('wife', 'NN')
('took', 'VBD')
('me', 'PRP')
('here', 'RB')
('on', 'IN')
('my', 'PRP$')
('birthday', 'NN')
('for', 'IN')
('breakfast', 'NN')
('and', 'CC')
('it', 'PRP')
('was', 'VBD')
('excellent', 'JJ')
('The', 'DT')
('weather', 'NN')
('was', 'VBD')
('perfect', 'JJ')
('which', 'WDT')
('made', 'VBD')
('sitting', 'VBG')
('outside', 'IN')
('overlooking', 'VBG')
('their', 'PRP$')
('grounds', 'NNS')
('an', 'DT')
('absolute', 'JJ')
('pleasure', 'NN')
('Our', 'PRP$')
('waitress', 'NN')
('was', 'VBD')
('excellent', 'JJ')
('and', 'CC')
('our', 'PRP$')
('food', 'NN')
('arrived', 'VBD')
('quickly', 'RB')
('on', 'IN')
('the', 'DT')
('semi-busy', 'JJ')
('Saturday', 'NNP')
('morning', 'NN')
('It', 'PRP')
('looked', 'VBD')
('like', 'IN')
('the', 'DT')
('place', 'NN')
('fills', 'VBZ')
('up', 'RP')
('pretty', 'RB')
('quickly', 'RB')
('so', 'IN')
('the', 'DT')
('earlier', 'JJR')
('you', 'PRP')
('get', 'V

## **Question 3**. Comparing the outputs of spacy and textblob chunking in tast 3, which one would you like to use in your application? Explain it.

**Answer to Q3**: I would prefer to use spacy. Not only does it have more extensive tagging, for chunking it produces a larger list of potential chunks that cover more of the text. Textblob's chunking algorithm produces a paltry list of chunks that don't cover a huge portion of the original text. A lot of information is being missed by Textblob that spacy is picking up.

---

## Question 4. Whats the disadvantage in bag of words  . Please explain in your own words with an example .

## Write code for the example.

### Answer to Q4: A major disadvantage is that the order of the words is destroyed. Since a great deal of language's meaning come from word order, bag of words automatically destroys a huge portion of the data contained within the text. After bag of words is applied, it is impossible to do any nlp tasks involving more than one token.

In [46]:
# Build basic bow
counts = {}
for tok in lemma_train_text:
    if tok not in counts.keys():
        counts[tok] = 1
    else:
        counts[tok] += 1

In [47]:
nlp_train_text = nlp(train_text)

# POS
disp = []
for tok in counts:

    # POS - fails
    t = nlp(tok)
    print(t.text)
    # print(t.pos_) # Throws errors, uncomment to see
    # print(t.tag_) # Throws errors, uncomment to see

    # Chunking - doesn't crash but output is useless if it isn't empty
    print([chunk for chunk in t.noun_chunks])
    print()


[]

absolute
[]

absolutely
[]

amazing
[]

anyway
[]

arrived
[]

back
[]

best
[]

better
[]

birthday
[birthday]

blend
[]

bloody
[]

bread
[bread]

breakfast
[breakfast]

busy
[]

came
[]

complete
[]

delicious
[delicious]

earlier
[]

egg
[egg]

ever
[]

everything
[everything]

excellent
[]

favor
[]

fill
[]

food
[food]

fresh
[]

garden
[garden]

get
[]

go
[]

griddled
[]

ground
[ground]

ingredient
[ingredient]

like
[]

look
[]

looked
[]

made
[]

mary
[mary]

meal
[meal]

menu
[]

morning
[morning]

order
[order]

outside
[]

overlooking
[]

perfect
[]

phenomenal
[]

piece
[piece]

place
[place]

pleasure
[pleasure]

pretty
[]

quickly
[]

saturday
[saturday]

scrambled
[]

semi
[]

simply
[]

sitting
[]

skillet
[]

sure
[]

tasty
[]

toast
[toast]

took
[]

truffle
[]

use
[]

vegetable
[vegetable]

wait
[]

waitress
[]

weather
[weather]

white
[]

wife
[wife]

