Author: Abel Stanley

NIM: 13517068

### 1. Tokenization

#### a. NLTK

In [6]:
# NLTK
import nltk
sentence = "I love you, you love me, we're happy fat maw leek. With a big-big U.K. buck."
tokens = nltk.word_tokenize(sentence)
print(tokens)

['I', 'love', 'you', ',', 'you', 'love', 'me', ',', 'we', "'re", 'happy', 'fat', 'maw', 'leek', '.', 'With', 'a', 'big-big', 'U.K.', 'buck', '.']


#### b. spaCy

In [7]:
# spaCy
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
nlp = English()
tokenizer = Tokenizer(nlp.vocab)

tokens = tokenizer(sentence)
print(list(tokens))

[I, love, you,, you, love, me,, we're, happy, fat, maw, leek., With, a, big-big, U.K., buck.]


### 2. Lemmatization

#### a. NLTK

In [19]:
# NLTK
# Setups:
import nltk
# nltk.download('wordnet') # download first!
sentence = u"Apples and oranges are similar. Boots and hippos aren't."
tokens = nltk.word_tokenize(sentence)
lemmatizer = nltk.stem.WordNetLemmatizer()
for word in tokens:
    print(word, ">>", lemmatizer.lemmatize(word.lower()))
    
# NOTE:
# for nltk Lemmatization to work properly, the word has to be in lowercase!

Apples >> apple
and >> and
oranges >> orange
are >> are
similar >> similar
. >> .
Boots >> boot
and >> and
hippos >> hippo
are >> are
n't >> n't
. >> .


#### b. spaCy

In [10]:
# spaCy
import spacy
nlp = spacy.load('en_core_web_sm')

doc = nlp(sentence)

for token in doc:
    print("Token:", token, "| Lemma:", token.lemma, "| Lemma_:", token.lemma_)

Token: Apples | Lemma: 8566208034543834098 | Lemma_: apple
Token: and | Lemma: 2283656566040971221 | Lemma_: and
Token: oranges | Lemma: 2208928596161743350 | Lemma_: orange
Token: are | Lemma: 10382539506755952630 | Lemma_: be
Token: similar | Lemma: 18166476740537071113 | Lemma_: similar
Token: . | Lemma: 12646065887601541794 | Lemma_: .
Token: Boots | Lemma: 9918665227421442029 | Lemma_: boot
Token: and | Lemma: 2283656566040971221 | Lemma_: and
Token: hippos | Lemma: 4133693291145879083 | Lemma_: hippos
Token: are | Lemma: 10382539506755952630 | Lemma_: be
Token: n't | Lemma: 447765159362469301 | Lemma_: not
Token: . | Lemma: 12646065887601541794 | Lemma_: .


### 3. NER

#### a. NLTK

In [25]:
# NLTK
# Setups:
import nltk
# DOWNLOAD:
# nltk.download('averaged_perceptron_tagger')
# nltk.download('maxent_ne_chunker')
# nltk.download('words')

sentence = "WASHINGTON -- In the wake of a string of abuses by New York police officers in the 1990s, Loretta E. Lynch, the top federal prosecutor in Brooklyn, spoke forcefully about the pain of a broken trust that African-Americans felt and said the responsibility for repairing generations of miscommunication and mistrust fell to law enforcement."
for sent in nltk.sent_tokenize(sentence):
   for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
      if hasattr(chunk, 'label'):
         print(chunk.label(), ' '.join(c[0] for c in chunk))

GPE WASHINGTON
GPE New York
PERSON Loretta E. Lynch
GPE Brooklyn


#### b. spaCy

In [27]:
# spaCy
import spacy
nlp = spacy.load('en_core_web_sm')

doc = nlp(sentence)

for ent in doc.ents:
    print("Text:", ent.text, "| Start:", ent.start_char, "| End:", ent.end_char, "| Label:", ent.label_)

Text: WASHINGTON | Start: 0 | End: 10 | Label: GPE
Text: New York | Start: 51 | End: 59 | Label: GPE
Text: the 1990s | Start: 79 | End: 88 | Label: DATE
Text: Loretta E. Lynch | Start: 90 | End: 106 | Label: PERSON
Text: Brooklyn | Start: 138 | End: 146 | Label: GPE
Text: African-Americans | Start: 203 | End: 220 | Label: NORP


### 4. POS Tagger

#### a. NLTK

In [66]:
# NLTK
# Setups:
import nltk
# DOWNLOAD:
# nltk.download('averaged_perceptron_tagger')
# nltk.download('maxent_ne_chunker')
# nltk.download('words')

sentence = "WASHINGTON -- In the wake of a string of abuses by New York police officers in the 1990s, Loretta E. Lynch, the top federal prosecutor in Brooklyn, spoke forcefully about the pain of a broken trust that African-Americans felt and said the responsibility for repairing generations of miscommunication and mistrust fell to law enforcement."
text = nltk.word_tokenize(sentence)
nltk.pos_tag(text)

[('WASHINGTON', 'NNP'),
 ('--', ':'),
 ('In', 'IN'),
 ('the', 'DT'),
 ('wake', 'NN'),
 ('of', 'IN'),
 ('a', 'DT'),
 ('string', 'NN'),
 ('of', 'IN'),
 ('abuses', 'NNS'),
 ('by', 'IN'),
 ('New', 'NNP'),
 ('York', 'NNP'),
 ('police', 'NN'),
 ('officers', 'NNS'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('1990s', 'CD'),
 (',', ','),
 ('Loretta', 'NNP'),
 ('E.', 'NNP'),
 ('Lynch', 'NNP'),
 (',', ','),
 ('the', 'DT'),
 ('top', 'JJ'),
 ('federal', 'JJ'),
 ('prosecutor', 'NN'),
 ('in', 'IN'),
 ('Brooklyn', 'NNP'),
 (',', ','),
 ('spoke', 'VBD'),
 ('forcefully', 'RB'),
 ('about', 'IN'),
 ('the', 'DT'),
 ('pain', 'NN'),
 ('of', 'IN'),
 ('a', 'DT'),
 ('broken', 'JJ'),
 ('trust', 'NN'),
 ('that', 'IN'),
 ('African-Americans', 'NNP'),
 ('felt', 'VBD'),
 ('and', 'CC'),
 ('said', 'VBD'),
 ('the', 'DT'),
 ('responsibility', 'NN'),
 ('for', 'IN'),
 ('repairing', 'VBG'),
 ('generations', 'NNS'),
 ('of', 'IN'),
 ('miscommunication', 'NN'),
 ('and', 'CC'),
 ('mistrust', 'NN'),
 ('fell', 'VBD'),
 ('to', 'TO'),
 ('l

#### b. spaCy

In [64]:
# spaCy
import spacy
nlp = spacy.load('en_core_web_sm')

sentence = "WASHINGTON -- In the wake of a string of abuses by New York police officers in the 1990s, Loretta E. Lynch, the top federal prosecutor in Brooklyn, spoke forcefully about the pain of a broken trust that African-Americans felt and said the responsibility for repairing generations of miscommunication and mistrust fell to law enforcement."
doc = nlp(sentence)

pos_tagging_result = [token.tag_ for token in doc]
print(pos_tagging_result)

['NNP', ':', 'IN', 'DT', 'NN', 'IN', 'DT', 'NN', 'IN', 'NNS', 'IN', 'NNP', 'NNP', 'NN', 'NNS', 'IN', 'DT', 'NNS', ',', 'NNP', 'NNP', 'NNP', ',', 'DT', 'JJ', 'JJ', 'NN', 'IN', 'NNP', ',', 'VBD', 'RB', 'IN', 'DT', 'NN', 'IN', 'DT', 'JJ', 'NN', 'IN', 'NNP', 'HYPH', 'NNPS', 'VBD', 'CC', 'VBD', 'DT', 'NN', 'IN', 'VBG', 'NNS', 'IN', 'NN', 'CC', 'NN', 'VBD', 'IN', 'NN', 'NN', '.']


### 5. Parser

#### a. NLTK (Grammar Parser + Chart Parsing)

In [85]:
grammar = nltk.CFG.fromstring("""
S -> V NP | S CONJ S
V -> 'describe' | 'present'
NP -> PRP N | DT N PP | DT N | ADJ N PP | DT NP
PRP -> 'your' 
N -> 'work' | 'step' | 'results' | 'Word_Document'
PP -> P NP
P -> 'of' | 'in'
DT -> 'every' | 'a' | 'all'
ADJ -> 'final' | 'intermediate' | ADJ CONJ ADJ
CONJ -> 'and'
""")

grammar.start()


S

In [86]:
grammar.productions()

[S -> V NP,
 S -> S CONJ S,
 V -> 'describe',
 V -> 'present',
 NP -> PRP N,
 NP -> DT N PP,
 NP -> DT N,
 NP -> ADJ N PP,
 NP -> DT NP,
 PRP -> 'your',
 N -> 'work',
 N -> 'step',
 N -> 'results',
 N -> 'Word_Document',
 PP -> P NP,
 P -> 'of',
 P -> 'in',
 DT -> 'every',
 DT -> 'a',
 DT -> 'all',
 ADJ -> 'final',
 ADJ -> 'intermediate',
 ADJ -> ADJ CONJ ADJ,
 CONJ -> 'and']

In [89]:
print('Chart Parsing in NLTK')
sentence = 'present all intermediate and final results in a Word_Document'.split()
parser = nltk.ChartParser(grammar)
trees = list(parser.parse(sentence))

print(trees)

Chart Parsing in NLTK
[Tree('S', [Tree('V', ['present']), Tree('NP', [Tree('DT', ['all']), Tree('NP', [Tree('ADJ', [Tree('ADJ', ['intermediate']), Tree('CONJ', ['and']), Tree('ADJ', ['final'])]), Tree('N', ['results']), Tree('PP', [Tree('P', ['in']), Tree('NP', [Tree('DT', ['a']), Tree('N', ['Word_Document'])])])])])])]


#### b. spaCy (Chunking)

In [59]:
# spaCy
import spacy
nlp = spacy.load('en_core_web_sm')

sentence = "I love you, you love me, we're happy fat maw leek. With a big-big U.K. buck."
doc = nlp(sentence)
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_,
            chunk.root.head.text)

I I nsubj love
you you dobj love
you you nsubj love
me me dobj love
we we nsubj 're
happy fat maw leek leek attr 're
a big-big U.K. buck buck pobj With


### 6. Word Sense Disambiguation

#### a. NLTK

In [29]:
# NLTK
print("definitions of 'bank':")
for ss in nltk.corpus.wordnet.synsets('bank'):
    print(ss, ss.definition())

definitions of 'bank':
Synset('bank.n.01') sloping land (especially the slope beside a body of water)
Synset('depository_financial_institution.n.01') a financial institution that accepts deposits and channels the money into lending activities
Synset('bank.n.03') a long ridge or pile
Synset('bank.n.04') an arrangement of similar objects in a row or in tiers
Synset('bank.n.05') a supply or stock held in reserve for future use (especially in emergencies)
Synset('bank.n.06') the funds held by a gambling house or the dealer in some gambling games
Synset('bank.n.07') a slope in the turn of a road or track; the outside is higher than the inside in order to reduce the effects of centrifugal force
Synset('savings_bank.n.02') a container (usually with a slot in the top) for keeping money at home
Synset('bank.n.09') a building in which the business of banking transacted
Synset('bank.n.10') a flight maneuver; aircraft tips laterally about its longitudinal axis (especially in turning)
Synset('bank.

In [28]:
sentence = ['I', 'went', 'to', 'the', 'bank', 'to', 'deposit', 'money', '.']
# disambiguate the word "bank"
print(nltk.wsd.lesk(sentence, 'bank'))

Synset('savings_bank.n.02')


#### b. spaCy

In [62]:
# spaCy
# NOT SUPPORTED
print("spaCy does not have direct support for Word Sense Disambiguation")

spaCy does not have direct support for Word Sense Disambiguation


### 7. Sentiment Analysis

#### a. NLTK

In [34]:
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
# DOWNLOADS:
# nltk.download('subjectivity')

n_instances = 100
subj_docs = [(sentence, 'subj') for sentence in subjectivity.sents(categories='subj')[:n_instances]]
obj_docs = [(sentence, 'obj') for sentence in subjectivity.sents(categories='obj')[:n_instances]]
print(len(subj_docs), len(obj_docs))

100 100


In [35]:
print("Example docs content:")
print(subj_docs[0])

Example docs content:
(['smart', 'and', 'alert', ',', 'thirteen', 'conversations', 'about', 'one', 'thing', 'is', 'a', 'small', 'gem', '.'], 'subj')


In [36]:
print("Separately split subjective and objective instances to keep a balanced uniform class distribution in both train and test sets.")
train_subj_docs = subj_docs[:80]
test_subj_docs = subj_docs[80:100]
train_obj_docs = obj_docs[:80]
test_obj_docs = obj_docs[80:100]
training_docs = train_subj_docs+train_obj_docs
testing_docs = test_subj_docs+test_obj_docs
sentim_analyzer = SentimentAnalyzer()
all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])

Training model:


In [92]:
#  simple unigram word features, handling negation:
unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
len(unigram_feats)
sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

In [93]:
# apply features to obtain a feature-value representation of our datasets:
training_set = sentim_analyzer.apply_features(training_docs)
test_set = sentim_analyzer.apply_features(testing_docs)

In [39]:
# train classifier on the training set, and subsequently output the evaluation results:
trainer = NaiveBayesClassifier.train
classifier = sentim_analyzer.train(trainer, training_set)

# train classifier:
for key,value in sorted(sentim_analyzer.evaluate(test_set).items()):
    print('{0}: {1}'.format(key, value))

Training classifier
Evaluating NaiveBayesClassifier results...
Accuracy: 0.8
F-measure [obj]: 0.8
F-measure [subj]: 0.8
Precision [obj]: 0.8
Precision [subj]: 0.8
Recall [obj]: 0.8
Recall [subj]: 0.8


#### b. spaCy

In [42]:
import re
from spacy.tokens import Token

In [45]:
from enum import Enum

class Topic(Enum):
    AMBIENCE = 1
    FOOD = 2
    HYGIENE = 3
    SERVICE = 4
    VALUE = 5
    
class Rating(Enum):
    VERY_BAD = -3
    BAD = -2
    SOMEWHAT_BAD = -1
    SOMEWHAT_GOOD = 1
    GOOD = 2
    VERY_GOOD = 3

#### Lexicon Entry
- mostly a data container
- but we also want to be able to compare if it matches a spaCy Token --> we need a matching() function.
- tokens can match exactly or only after transformations (for example upper/lower case) --> score between 0 (no match) and 1 (perfect match)


In [48]:

class LexiconEntry:
    _IS_REGEX_REGEX = re.compile(r'.*[.+*\[$^\\]')

    def __init__(self, lemma: str, topic: Topic, rating: Rating):
        assert lemma is not None
        self.lemma = lemma
        self._lower_lemma = lemma.lower()
        self.topic = topic
        self.rating = rating
        self.is_regex = bool(LexiconEntry._IS_REGEX_REGEX.match(self.lemma))
        self._regex = re.compile(lemma, re.IGNORECASE) if self.is_regex else None

    def matching(self, token: Token) -> float:
        """
        A weight between 0.0 and 1.0 on how much ``token`` matches this entry.
        """
        assert token is not None
        result = 0.0
        if self.is_regex:
            if self._regex.match(token.text):
                result = 0.6
            elif self._regex.match(token.lemma_):
                result = 0.5
        else:
            if token.text == self.lemma:
                result = 1.0
            elif token.text.lower() == self.lemma:
                result = 0.9
            elif token.lemma_ == self.lemma:
                result = 0.8
            elif token.lemma_.lower() == self.lemma:
                result = 0.7
        return result

    def __str__(self) -> str:
        result = 'LexiconEntry(%s' % self.lemma
        if self.topic is not None:
            result += ', topic=%s' % self.topic.name
        if self.rating is not None:
            result += ', rating=%s' % self.rating.name
        if self.is_regex:
            result += ', is_regex=%s' % self.is_regex
        result += ')'
        return result

    def __repr__(self) -> str:
        return self.__str__()

#### The lexicon in Python
- Contains a list of LexiconEntry
- Can find the best matching entry for a Token (or None)
- In the beginning entries have to be added
- usually data comes from e.g .CSV (NOT MANUALLY INSERTED LIKE NOW)

from math import isclose

class Lexicon:
    def __init__(self):
        self.entries: List[LexiconEntry] = []

    
    def append(self, lemma: str, topic: Topic, rating: Rating):
        lexicon_entry = LexiconEntry(lemma, topic, rating)
        self.entries.append(lexicon_entry)

    def lexicon_entry_for(self, token: Token) -> LexiconEntry:
        """
        Entry in lexicon that best matches ``token``.
        """
        result = None
        lexicon_size = len(self.entries)
        lexicon_entry_index = 0
        best_matching = 0.0
        while lexicon_entry_index < lexicon_size and not isclose(best_matching, 1.0):
            lexicon_entry = self.entries[lexicon_entry_index]
            matching = lexicon_entry.matching(token)
            if matching > best_matching:
                result = lexicon_entry
                best_matching = matching
            lexicon_entry_index += 1
        return result

#### build a small lexicon

In [55]:

lexicon = Lexicon()
lexicon.append('waiter'     , Topic.SERVICE , None)
lexicon.append('waitress'   , Topic.SERVICE , None)
lexicon.append('wait'       , None          , Rating.BAD)
lexicon.append('quick'      , None          , Rating.GOOD)
lexicon.append('.*schnitzel', Topic.FOOD    , None)
lexicon.append('music'      , Topic.AMBIENCE, None)
lexicon.append('loud'       , None          , Rating.BAD)
lexicon.append('tasty'      , Topic.FOOD    , Rating.GOOD)
lexicon.append('polite'     , Topic.SERVICE , Rating.GOOD)

#### Matching tokens in a sentence to a lexicon entry

In [52]:
feedback_text = 'The music was very loud.'
feedback = nlp(feedback_text)
for token in next(feedback.sents):
    lexicon_entry = lexicon.lexicon_entry_for(token)
    print(f'{token!s:10} {lexicon_entry}')

The        None
music      LexiconEntry(music, topic=AMBIENCE)
was        None
very       None
loud       LexiconEntry(loud, rating=BAD)
.          None


#### Simple Sentiment Analysis: Add filters and format to ouput

In [54]:
feedback_text = 'The music was very loud.'
feedback = nlp(feedback_text)
for sent in feedback.sents:
    print(sent)
    for token in sent:
        lexicon_entry = lexicon.lexicon_entry_for(token)
        if lexicon_entry is not None:
            if lexicon_entry.topic is not None:
                print('    ', lexicon_entry.topic)
            if lexicon_entry.rating is not None:
                print('    ', lexicon_entry.rating)

The music was very loud.
     Topic.AMBIENCE
     Rating.BAD


### 8. Sentence Splitter

#### a. NLTK

In [91]:
text = "I love you, you love me, we're happy fat maw leek. With a big-big U.K. buck."
print(nltk.tokenize.sent_tokenize(text))


["I love you, you love me, we're happy fat maw leek.", 'With a big-big U.K. buck.']


NOTE: as you can see, NLTK succeeds to differentiate dots in initials like (U.K.) from full-stop dots.

#### b. spaCy

In [61]:
# spaCy
import spacy
nlp = spacy.load('en_core_web_sm')

sentence = "I love you, you love me, we're happy fat maw leek. With a big-big U.K. buck."
doc = nlp(sentence)
    
splitted_result = list(doc.sents)
print(splitted_result)

[I love you, you love me, we're happy fat maw leek., With a big-big U.K. buck.]


NOTE: as you can see, spaCy fails to differentiate dots in initials like (U.K.) from full-stop dots.