Preparation

In [None]:
import nltk

nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     /Users/nazarlenisin/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

In [191]:
from nltk.corpus import twitter_samples

In [192]:
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')

positive_tweets[50]

'@groovinshawn they are rechargeable and it normally comes with a charger when u buy it :)'

In [193]:
tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
print(tweet_tokens[50])

['@groovinshawn', 'they', 'are', 'rechargeable', 'and', 'it', 'normally', 'comes', 'with', 'a', 'charger', 'when', 'u', 'buy', 'it', ':)']


In [194]:
nltk.download('averaged_perceptron_tagger_eng')
from nltk.tag import pos_tag

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/nazarlenisin/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [195]:
pos_tag(tweet_tokens[50])

[('@groovinshawn', 'NN'),
 ('they', 'PRP'),
 ('are', 'VBP'),
 ('rechargeable', 'JJ'),
 ('and', 'CC'),
 ('it', 'PRP'),
 ('normally', 'RB'),
 ('comes', 'VBZ'),
 ('with', 'IN'),
 ('a', 'DT'),
 ('charger', 'NN'),
 ('when', 'WRB'),
 ('u', 'JJ'),
 ('buy', 'VB'),
 ('it', 'PRP'),
 (':)', 'JJ')]

In [196]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nazarlenisin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [197]:
from nltk.corpus import wordnet as wn

word_synset = wn.synsets("car")
print("synsets:", word_synset)
print("lemma names:", word_synset[0].lemma_names())

synsets: [Synset('car.n.01'), Synset('car.n.02'), Synset('car.n.03'), Synset('car.n.04'), Synset('cable_car.n.01')]
lemma names: ['car', 'auto', 'automobile', 'machine', 'motorcar']


In [198]:
print(word_synset[0].definition())
print(word_synset[0].examples())
print(word_synset[1].definition())
print(word_synset[1].examples())

a motor vehicle with four wheels; usually propelled by an internal combustion engine
['he needs a car to get to work']
a wheeled vehicle adapted to the rails of railroad
['three cars had jumped the rails']


In [199]:
word_synset[0].hyponyms()

[Synset('ambulance.n.01'),
 Synset('beach_wagon.n.01'),
 Synset('bus.n.04'),
 Synset('cab.n.03'),
 Synset('compact.n.03'),
 Synset('convertible.n.01'),
 Synset('coupe.n.01'),
 Synset('cruiser.n.01'),
 Synset('electric.n.01'),
 Synset('gas_guzzler.n.01'),
 Synset('hardtop.n.01'),
 Synset('hatchback.n.01'),
 Synset('horseless_carriage.n.01'),
 Synset('hot_rod.n.01'),
 Synset('jeep.n.01'),
 Synset('limousine.n.01'),
 Synset('loaner.n.02'),
 Synset('minicar.n.01'),
 Synset('minivan.n.01'),
 Synset('model_t.n.01'),
 Synset('pace_car.n.01'),
 Synset('racer.n.02'),
 Synset('roadster.n.01'),
 Synset('sedan.n.01'),
 Synset('sport_utility.n.01'),
 Synset('sports_car.n.01'),
 Synset('stanley_steamer.n.01'),
 Synset('stock_car.n.01'),
 Synset('subcompact.n.01'),
 Synset('touring_car.n.01'),
 Synset('used-car.n.01')]

In [200]:
word_synset[0].hypernyms()

[Synset('motor_vehicle.n.01')]

In [201]:
tree = wn.synsets("tree")[0]
paths = tree.hypernym_paths()
for p in paths:
  print([synset.name() for synset in p])

['entity.n.01', 'physical_entity.n.01', 'object.n.01', 'whole.n.02', 'living_thing.n.01', 'organism.n.01', 'plant.n.02', 'vascular_plant.n.01', 'woody_plant.n.01', 'tree.n.01']


In [202]:
tree.part_meronyms()

[Synset('burl.n.02'),
 Synset('crown.n.07'),
 Synset('limb.n.02'),
 Synset('stump.n.01'),
 Synset('trunk.n.01')]

In [203]:
tree.member_holonyms()

[Synset('forest.n.01')]

In [204]:
from nltk.stem.wordnet import WordNetLemmatizer
tokens = tweet_tokens[50]

In [205]:
# Create a lemmatizer
lemmatizer = WordNetLemmatizer()

In [206]:
from NLP.Lab3.helper.lemmatization.lemmatization_adapter import NLTKLemmatizationAdatper, LemmatizationAdatper

In [207]:
# adapt lemmatizer to lemmatization Interface
lemmatizer = NLTKLemmatizationAdatper(lemmatizer)

In [208]:
from typing import Iterable
from NLP.Lab3.logger import logging

def lemmatize_sentence(tokens: Iterable, 
                       pos: str = 'v',
                       lemmatizer: LemmatizationAdatper = lemmatizer,
                       max_cache_size: int = 10000):
  lemmatized_words = {}
  
  try:
    for token in tokens:
      if token in lemmatized_words:
        yield token
        
      else:
        if len(lemmatized_words) < max_cache_size:
          lemmatized_words[token] = lemmatizer.lemmatize(token)
          yield lemmatized_words[token]
        else:
          yield lemmatizer.lemmatize(token)
      
  except Exception as e:
    logging.exception(e)

list(lemmatize_sentence(tokens))

['@groovinshawn',
 'they',
 'are',
 'rechargeable',
 'and',
 'it',
 'normally',
 'come',
 'with',
 'a',
 'charger',
 'when',
 'u',
 'buy',
 'it',
 ':)']

In [209]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nazarlenisin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [210]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
print(len(stop_words))
for i in range(10):
    print(stop_words[i])

198
a
about
above
after
again
against
ain
all
am
an


In [211]:
import re, string

def process_tokens(tweet_tokens: Iterable,
                   stop_words: set = set(stop_words),
                   invalid_symbols: set = set(string.punctuation),
                   invalid_pattern: str = r'(?:https?://\S+|www\.\S+)|@\w+',
                   lemmatizer: LemmatizationAdatper = lemmatizer,
                   replace_pattern: str = '#',
                   replace_value: str = '',
                   max_cache_size: int = 10000
                   ):
    
    tokens_memo = {}

    for token, tag in pos_tag(tweet_tokens):
        token = token.lower()
        
        if not (token in stop_words or token in invalid_symbols or re.search(invalid_pattern, token)):
            if replace_pattern:
                token = re.sub(replace_pattern, replace_value, token)
                
            if token in tokens_memo:
                yield tokens_memo[token]
            else:
                if len(tokens_memo) < max_cache_size:
                    tokens_memo[token] = lemmatizer.lemmatize(token)
                    yield tokens_memo[token]
                    
                else:
                    yield lemmatizer.lemmatize(token)

In [212]:
import re, string
from NLP.Lab3.helper.synsets.synsets_adapter import NLTKSynsetsAdapter, SynsetsAdapter

def process_tokens_(tweet_tokens: Iterable,
                   stop_words: set = set(stop_words),
                   invalid_symbols: set = set(string.punctuation),
                   invalid_pattern: str = r'(?:https?://\S+|www\.\S+)|@\w+',
                   synsets_processor: SynsetsAdapter = NLTKSynsetsAdapter(wn),
                   replace_pattern: str = '#',
                   replace_value: str = '',
                   max_cache_size: int = 10000
                   ):
    
    tokens_memo = {}

    for token, tag in pos_tag(tweet_tokens):
        token = token.lower()
        
        if not (token in stop_words or token in invalid_symbols or re.search(invalid_pattern, token)):
            if replace_pattern:
                token = re.sub(replace_pattern, replace_value, token)
                
            if token in tokens_memo:
                yield tokens_memo[token]
            else:
                if len(tokens_memo) < max_cache_size:
                    processed_token = synsets_processor.synsets(token)
                    tokens_memo[token] = processed_token[0].lemmas()[0].name() if processed_token else processed_token
                    yield tokens_memo[token]
                    
                else:
                    processed_token = synsets_processor.synsets(token)
                    yield processed_token[0].lemmas()[0].name() if processed_token else processed_token

In [213]:
print("Before:", tweet_tokens[50])
print("After", list(process_tokens(tweet_tokens[50])))

Before: ['@groovinshawn', 'they', 'are', 'rechargeable', 'and', 'it', 'normally', 'comes', 'with', 'a', 'charger', 'when', 'u', 'buy', 'it', ':)']
After ['rechargeable', 'normally', 'come', 'charger', 'u', 'buy', ':)']


In [214]:
print("Before:", tweet_tokens[50])
print("After:", list(process_tokens_(tweet_tokens[50])))

Before: ['@groovinshawn', 'they', 'are', 'rechargeable', 'and', 'it', 'normally', 'comes', 'with', 'a', 'charger', 'when', 'u', 'buy', 'it', ':)']
After: [[], [], 'come', [], [], 'buy', []]


In [215]:
# Function to compute semantic distance between two words
def semantic_distance(word1, word2):
    # Get the synsets for both words
    synsets1 = wn.synsets(word1)
    synsets2 = wn.synsets(word2)
    
    if not synsets1 or not synsets2:
        return "No synsets found for one or both words."

    # Initialize minimum distance as infinity
    min_distance = float('inf')
    
    # Iterate over all synset combinations (cross-product of both words' synsets)
    for synset1 in synsets1:
        for synset2 in synsets2:
            # Get the hypernyms of both synsets
            hypernyms1 = synset1.hypernyms()
            hypernyms2 = synset2.hypernyms()

            # Compute path similarity between the two hypernyms
            for hyper1 in hypernyms1:
                for hyper2 in hypernyms2:
                    # Find the shortest path length to the common ancestor (root)
                    distance = synset1.shortest_path_distance(synset2)
                    if distance and distance < min_distance:
                        min_distance = distance
    
    if min_distance == float('inf'):
        return "No semantic path found."
    
    return min_distance

# Example usage
word1 = "dog"
word2 = "cat"
print(f"Semantic distance between '{word1}' and '{word2}': {semantic_distance(word1, word2)}")



Semantic distance between 'dog' and 'cat': 4


In [216]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from nltk import pos_tag

# Ensure necessary NLTK data is downloaded
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

def get_wordnet_pos(treebank_tag):
    """
    Convert the POS tag from treebank format to WordNet format.
    """
    if treebank_tag.startswith('J'):
        return wn.ADJ
    elif treebank_tag.startswith('V'):
        return wn.VERB
    elif treebank_tag.startswith('N'):
        return wn.NOUN
    elif treebank_tag.startswith('R'):
        return wn.ADV
    else:
        return None

def process_tokens(text):
    """
    Process the tokens in the text by finding the most appropriate WordNet synset.
    """
    # Tokenize the text and get POS tags
    tokens = word_tokenize(text)
    tagged_tokens = pos_tag(tokens)
    
    processed_tokens = []
    
    for word, tag in tagged_tokens:
        # Get the corresponding WordNet POS tag
        wordnet_pos = get_wordnet_pos(tag)
        
        if wordnet_pos:
            # Find synsets for the word with the specific part of speech
            synsets = wn.synsets(word, pos=wordnet_pos)
            
            if synsets:
                # Pick the first synset (you can use more sophisticated selection strategies here)
                best_synset = synsets[0]
                # Get the lemma name (the most canonical form of the word)
                processed_tokens.append(best_synset.lemmas()[0].name())
            else:
                # If no synsets found, append the original word
                processed_tokens.append(word)
        else:
            # If no suitable POS found, just append the original word
            processed_tokens.append(word)
    
    return processed_tokens

# Example usage:
text = "The dogs are running fast."
processed = process_tokens(text)
print(processed)


['The', 'dog', 'be', 'run', 'fast', '.']


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nazarlenisin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nazarlenisin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/nazarlenisin/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
