Preparation

In [1]:
import nltk

# Fetch the `twitter_samples` corpus into the local nltk_data directory
# (the call reports "already up-to-date" when the package is present).
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     /Users/nazarlenisin/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

In [2]:
from nltk.corpus import twitter_samples

In [3]:
# Read the raw tweet texts of both labelled corpora in one pass:
# positive first, negative second.
positive_tweets, negative_tweets = [
    twitter_samples.strings(corpus_file)
    for corpus_file in ('positive_tweets.json', 'negative_tweets.json')
]

positive_tweets[50]

'@groovinshawn they are rechargeable and it normally comes with a charger when u buy it :)'

In [4]:
# Load the positive tweets again, this time already split into tokens
# (one list of strings per tweet), and show the sample tweet at index 50.
tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
print(tweet_tokens[50])

['@groovinshawn', 'they', 'are', 'rechargeable', 'and', 'it', 'normally', 'comes', 'with', 'a', 'charger', 'when', 'u', 'buy', 'it', ':)']


In [5]:
# Fetch the English averaged-perceptron tagger data package.
nltk.download('averaged_perceptron_tagger_eng')
from nltk.tag import pos_tag

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/nazarlenisin/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [6]:
# Tag the sample tweet; the result is a list of (token, tag) pairs.
# Note in the output that informal tokens such as 'u' and ':)' both come back as 'JJ'.
pos_tag(tweet_tokens[50])

[('@groovinshawn', 'NN'),
 ('they', 'PRP'),
 ('are', 'VBP'),
 ('rechargeable', 'JJ'),
 ('and', 'CC'),
 ('it', 'PRP'),
 ('normally', 'RB'),
 ('comes', 'VBZ'),
 ('with', 'IN'),
 ('a', 'DT'),
 ('charger', 'NN'),
 ('when', 'WRB'),
 ('u', 'JJ'),
 ('buy', 'VB'),
 ('it', 'PRP'),
 (':)', 'JJ')]

In [7]:
# Fetch the WordNet corpus used by the synset lookups and the lemmatizer below.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nazarlenisin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
from nltk.corpus import wordnet as wn

# Look up every WordNet synset (sense) recorded for the word "car".
word_synset = wn.synsets("car")
print("synsets:", word_synset)
# Lemma names are the words that share the first sense (car, auto, automobile, ...).
print("lemma names:", word_synset[0].lemma_names())

synsets: [Synset('car.n.01'), Synset('car.n.02'), Synset('car.n.03'), Synset('car.n.04'), Synset('cable_car.n.01')]
lemma names: ['car', 'auto', 'automobile', 'machine', 'motorcar']


In [9]:
# Show the gloss and the usage examples for the first two senses of "car".
for sense in (word_synset[0], word_synset[1]):
    print(sense.definition())
    print(sense.examples())

a motor vehicle with four wheels; usually propelled by an internal combustion engine
['he needs a car to get to work']
a wheeled vehicle adapted to the rails of railroad
['three cars had jumped the rails']


In [10]:
# Hyponyms: the more specific concepts listed under the first "car" sense
# (ambulance, sedan, ... in the output below).
word_synset[0].hyponyms()

[Synset('ambulance.n.01'),
 Synset('beach_wagon.n.01'),
 Synset('bus.n.04'),
 Synset('cab.n.03'),
 Synset('compact.n.03'),
 Synset('convertible.n.01'),
 Synset('coupe.n.01'),
 Synset('cruiser.n.01'),
 Synset('electric.n.01'),
 Synset('gas_guzzler.n.01'),
 Synset('hardtop.n.01'),
 Synset('hatchback.n.01'),
 Synset('horseless_carriage.n.01'),
 Synset('hot_rod.n.01'),
 Synset('jeep.n.01'),
 Synset('limousine.n.01'),
 Synset('loaner.n.02'),
 Synset('minicar.n.01'),
 Synset('minivan.n.01'),
 Synset('model_t.n.01'),
 Synset('pace_car.n.01'),
 Synset('racer.n.02'),
 Synset('roadster.n.01'),
 Synset('sedan.n.01'),
 Synset('sport_utility.n.01'),
 Synset('sports_car.n.01'),
 Synset('stanley_steamer.n.01'),
 Synset('stock_car.n.01'),
 Synset('subcompact.n.01'),
 Synset('touring_car.n.01'),
 Synset('used-car.n.01')]

In [11]:
# Hypernyms: the more general concept directly above the first "car" sense.
word_synset[0].hypernyms()

[Synset('motor_vehicle.n.01')]

In [12]:
# Take the first WordNet sense of "tree" and collect its hypernym paths.
tree = wn.synsets("tree")[0]
paths = tree.hypernym_paths()
# Print each path as a list of synset names; the output runs from
# 'entity.n.01' down to 'tree.n.01'.
for p in paths:
  print([synset.name() for synset in p])

['entity.n.01', 'physical_entity.n.01', 'object.n.01', 'whole.n.02', 'living_thing.n.01', 'organism.n.01', 'plant.n.02', 'vascular_plant.n.01', 'woody_plant.n.01', 'tree.n.01']


In [13]:
# Part meronyms: synsets for the parts of a tree (trunk, limb, crown, ...).
tree.part_meronyms()

[Synset('burl.n.02'),
 Synset('crown.n.07'),
 Synset('limb.n.02'),
 Synset('stump.n.01'),
 Synset('trunk.n.01')]

In [14]:
# Member holonyms: the collective a tree is a member of (a forest).
tree.member_holonyms()

[Synset('forest.n.01')]

In [15]:
from nltk.stem.wordnet import WordNetLemmatizer
# Reuse the tokenized sample tweet (index 50) as the running example.
tokens = tweet_tokens[50]

In [16]:
# Instantiate NLTK's WordNet-backed lemmatizer.
lemmatizer = WordNetLemmatizer()

In [21]:
from NLP.Lab3.helper.lemmatization.lemmatization_adapter import NLTKLemmatizationAdatper, LemmatizationAdatper

In [22]:
# Wrap a WordNetLemmatizer in the project's adapter so it can be used wherever
# a LemmatizationAdatper is expected. The wrapped object is built right here,
# rather than re-wrapping whatever `lemmatizer` currently refers to, so the
# cell is idempotent: re-running it never nests one adapter inside another.
lemmatizer = NLTKLemmatizationAdatper(WordNetLemmatizer())

In [74]:
from typing import Iterable
from NLP.Lab3.logger import logging

def lemmatize_sentence(tokens: Iterable,
                       pos: str = 'v',
                       lemmatizer: LemmatizationAdatper = lemmatizer,
                       max_cache_size: int = 10000):
  """Lazily yield the lemma of every token, memoizing repeated tokens.

  Args:
    tokens: Iterable of tokens. Tokens are used as dictionary keys, so they
      must be hashable.
    pos: Accepted but not forwarded to the lemmatizer; `lemmatize` is called
      with the token only.
      NOTE(review): the adapter's `lemmatize` signature is not visible here;
      confirm whether it accepts a part-of-speech argument before wiring
      this parameter through.
    lemmatizer: Object exposing `lemmatize(token)`. The default is the
      notebook-level `lemmatizer` as bound when this function was defined.
    max_cache_size: Maximum number of distinct tokens memoized during one
      call. Once the cache is full, further unseen tokens are still
      lemmatized but no longer stored.

  Yields:
    The lemma of each token, in input order.

  Note:
    Any exception raised while iterating or lemmatizing is logged with
    `logging.exception` and ends the generator early instead of propagating.
  """
  lemmatized_words = {}

  try:
    for token in tokens:
      if token in lemmatized_words:
        # Cache hit: return the stored lemma, not the raw token.
        yield lemmatized_words[token]
        continue

      lemma = lemmatizer.lemmatize(token)
      # Only memoize while there is room; the lemma is yielded either way.
      if len(lemmatized_words) < max_cache_size:
        lemmatized_words[token] = lemma
      yield lemma

  except Exception as e:
    logging.exception(e)

# lemmatize_sentence is a generator; materialize it to display the lemmas
# of the sample tweet.
list(lemmatize_sentence(tokens))

['@groovinshawn',
 'they',
 'are',
 'rechargeable',
 'and',
 'it',
 'normally',
 'come',
 'with',
 'a',
 'charger',
 'when',
 'u',
 'buy',
 'it',
 ':)']

In [75]:
# Fetch NLTK's stop-word lists.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nazarlenisin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [76]:
from nltk.corpus import stopwords
# English stop-word list: report its size, then preview the first ten
# entries, one per line.
stop_words = stopwords.words('english')
print(len(stop_words))
print(*stop_words[:10], sep='\n')

198
a
about
above
after
again
against
ain
all
am
an


In [None]:
import re, string

def process_tokens(tweet_tokens: Iterable,
                   stop_words: set = set(stop_words),
                   invalid_symbols: set = set(string.punctuation),
                   invalid_pattern: str = r'https?://\S+|@\w+',
                   lemmatizer: LemmatizationAdatper = None,
                   ):
    """Clean one tokenized tweet: strip noise, lemmatize, drop stop words.

    Each token is part-of-speech tagged, stripped of any text matching
    `invalid_pattern`, lower-cased and lemmatized with a WordNet POS derived
    from its tag. Tokens that end up empty, are a single punctuation symbol,
    or are stop words are discarded.

    Args:
        tweet_tokens: Iterable of string tokens belonging to one tweet.
        stop_words: Lower-case words to drop. Defaults to the notebook-level
            English stop-word list, converted to a set when the function is
            defined.
        invalid_symbols: Whole tokens to drop. Defaults to the individual
            characters of `string.punctuation`, so multi-character tokens
            such as ':)' are kept.
        invalid_pattern: Regular expression whose matches are removed from
            every token. The default removes URLs and @mentions.
        lemmatizer: Object exposing `lemmatize(word, pos)`. When omitted, a
            `WordNetLemmatizer` is created.
            NOTE(review): a custom adapter passed here must accept the POS
            argument; confirm against the adapter's definition.

    Returns:
        List of cleaned, lower-cased, lemmatized tokens in input order.
    """
    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()

    # Compile once instead of on every token.
    noise = re.compile(invalid_pattern)
    # First letter of the Penn Treebank tag -> WordNet POS; anything else is
    # treated as a noun, which is WordNetLemmatizer's own default.
    wordnet_pos = {'N': 'n', 'V': 'v', 'J': 'a', 'R': 'r'}

    cleaned_tokens = []
    # pos_tag is given a concrete list so any iterable (e.g. a generator) works.
    for token, tag in pos_tag(list(tweet_tokens)):
        token = noise.sub('', token).lower()
        if not token:
            # The whole token was noise (e.g. a URL or an @mention).
            continue

        token = lemmatizer.lemmatize(token, wordnet_pos.get(tag[:1], 'n'))

        if token and token not in invalid_symbols and token not in stop_words:
            cleaned_tokens.append(token)
    return cleaned_tokens