In [1]:
import string
import random
import time
from typing import List

In [2]:
def tokenize(text: str) -> List[str]:
    """
    :param text: Takes input sentence
    :return: tokenized sentence
    """
    for punct in string.punctuation:
        text = text.replace(punct, ' '+punct+' ')
    t = text.split()
    return t

In [3]:
def get_ngrams(n: int, tokens: list) -> list:
    """
    :param n: n-gram size
    :param tokens: tokenized sentence
    :return: list of ngrams
    ngrams of tuple form: ((previous wordS!), target word)
    """
    # tokens.append('<END>')
    tokens = (n-1)*['<START>']+tokens
    l = [(tuple([tokens[i-p-1] for p in reversed(range(n-1))]), tokens[i]) for i in range(n-1, len(tokens))]
    return l

In [4]:
class NgramModel(object):

    def __init__(self, n):
        self.n = n

        # dictionary that keeps list of candidate words given context
        self.context = {}

        # keeps track of how many times ngram has appeared in the text before
        self.ngram_counter = {}

    def update(self, sentence: str) -> None:
        """
        Updates Language Model
        :param sentence: input text
        """
        n = self.n
        ngrams = get_ngrams(n, tokenize(sentence))
        for ngram in ngrams:
            if ngram in self.ngram_counter:
                self.ngram_counter[ngram] += 1.0
            else:
                self.ngram_counter[ngram] = 1.0

            prev_words, target_word = ngram
            if prev_words in self.context:
                self.context[prev_words].append(target_word)
            else:
                self.context[prev_words] = [target_word]

    def prob(self, context, token):
        """
        Calculates probability of a candidate token to be generated given a context
        :return: conditional probability
        """
        try:
            count_of_token = self.ngram_counter[(context, token)]
            count_of_context = float(len(self.context[context]))
            result = count_of_token / count_of_context

        except KeyError:
            result = 0.0
        return result

    def random_token(self, context):
        """
        Given a context we "semi-randomly" select the next word to append in a sequence
        :param context:
        :return:
        """
        r = random.random()
        map_to_probs = {}
        token_of_interest = self.context[context]
        for token in token_of_interest:
            map_to_probs[token] = self.prob(context, token)

        summ = 0
        for token in sorted(map_to_probs):
            summ += map_to_probs[token]
            if summ > r:
                return token

    def generate_text(self, token_count: int):
        """
        :param token_count: number of words to be produced
        :return: generated text
        """
        n = self.n
        context_queue = (n - 1) * ['<START>']
        result = []
        for _ in range(token_count):
            obj = self.random_token(tuple(context_queue))
            result.append(obj)
            if n > 1:
                context_queue.pop(0)
                if obj == '.':
                    context_queue = (n - 1) * ['<START>']
                else:
                    context_queue.append(obj)
        return ' '.join(result)


def create_ngram_model(n, path):
    m = NgramModel(n)
    with open(path, 'r') as f:
        text = f.read()
        text = text.split('.')
        for sentence in text:
            # add back the fullstop
            sentence += '.'
            m.update(sentence)
    return m



if __name__ == "__main__":
    start = time.time()
    m = create_ngram_model(6, '01Genesis.txt')

    print (f'Language Model creating time: {time.time() - start}')
    start = time.time()
    random.seed(7)
    print(f'{"="*50}\nGenerated text:')
    print(m.generate_text(20))
    print(f'{"="*50}')

Language Model creating time: 0.26215481758117676
Generated text:
29He gave them his last charge and said , ‘I shall soon be gathered to my father’s kin ; bury


In [6]:
import nltk

Ex 2

In [7]:
nltk.download("brown")
nltk.download("universal_tagset")

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


True

In [9]:
rel = nltk.corpus.brown.tagged_words(categories="religion", tagset="universal")
rom = nltk.corpus.brown.tagged_words(categories="romance", tagset="universal")

In [20]:
rel_clean = sorted(
    set([word for (word, tag) in rel if tag == "NOUN" and word.isalpha() and not word[0].isupper()])
)

romn_clean = sorted(
    set([word for (word, tag) in rom if tag == "NOUN" and word.isalpha() and not word[0].isupper()])
)


In [11]:
from nltk.corpus import wordnet as wn

In [14]:
nltk.download('wordnet')



[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [12]:
type = "n"

In [15]:
synsets = wn.all_synsets(type)

In [16]:
def find_polysemy(text):

    count = 0

    for w in text:
        count += len(wn.synsets(w))

    return count / len(text)

In [21]:
n_rel = find_polysemy(rel_clean)
print(n_rel)



5.917355371900826


In [22]:
n_romn = find_polysemy(romn_clean)
print(n_romn)

6.017577500798978


In [23]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()


def find_meronyms(text):

    count_part_m = 0
    count_subst_m = 0

    for w in text:

        try:

            word = wn.synset(f"{lemmatizer.lemmatize(w)}.n.01")

            count_part_m += len(word.part_meronyms())
            count_subst_m += len(word.substance_meronyms())

        except Exception as e:
            print(e)

    return count_part_m, count_subst_m


In [24]:
count_part_m, count_subst_m = find_meronyms(rel_clean)

no lemma 'anchoritism' with part of speech 'n'
no lemma 'anyone' with part of speech 'n'
no lemma 'anything' with part of speech 'n'
no lemma 'bonzes' with part of speech 'n'
no lemma 'carryovers' with part of speech 'n'
no lemma 'compromising' with part of speech 'n'
no lemma 'deadweight' with part of speech 'n'
no lemma 'doing' with part of speech 'n'
no lemma 'electrodynamics' with part of speech 'n'
no lemma 'enjoinder' with part of speech 'n'
no lemma 'everybody' with part of speech 'n'
no lemma 'everyone' with part of speech 'n'
no lemma 'everything' with part of speech 'n'
no lemma 'falling' with part of speech 'n'
no lemma 'filmstrips' with part of speech 'n'
no lemma 'finite' with part of speech 'n'
no lemma 'foreshortening' with part of speech 'n'
no lemma 'heightening' with part of speech 'n'
no lemma 'historicity' with part of speech 'n'
no lemma 'indigation' with part of speech 'n'
no lemma 'metaphysic' with part of speech 'n'
no lemma 'others' with part of speech 'n'
no l

In [25]:
print(count_part_m + count_subst_m)

786


In [26]:
count_part_m, count_subst_m = find_meronyms(rel_clean)
print(count_part_m + count_subst_m)

no lemma 'anchoritism' with part of speech 'n'
no lemma 'anyone' with part of speech 'n'
no lemma 'anything' with part of speech 'n'
no lemma 'bonzes' with part of speech 'n'
no lemma 'carryovers' with part of speech 'n'
no lemma 'compromising' with part of speech 'n'
no lemma 'deadweight' with part of speech 'n'
no lemma 'doing' with part of speech 'n'
no lemma 'electrodynamics' with part of speech 'n'
no lemma 'enjoinder' with part of speech 'n'
no lemma 'everybody' with part of speech 'n'
no lemma 'everyone' with part of speech 'n'
no lemma 'everything' with part of speech 'n'
no lemma 'falling' with part of speech 'n'
no lemma 'filmstrips' with part of speech 'n'
no lemma 'finite' with part of speech 'n'
no lemma 'foreshortening' with part of speech 'n'
no lemma 'heightening' with part of speech 'n'
no lemma 'historicity' with part of speech 'n'
no lemma 'indigation' with part of speech 'n'
no lemma 'metaphysic' with part of speech 'n'
no lemma 'others' with part of speech 'n'
no l

In [27]:
q = "worries"
tree = wn.synset(f"{q}.n.01")

tree.part_meronyms()


WordNetError: ignored

In [28]:
wn.synsets("worries", wn.NOUN)

[Synset('concern.n.04'), Synset('worry.n.02')]

Ex 3

In [31]:
lemmas = []
for synset in synsets:
  for lemma in synset.lemmas():
    lemmas.append(lemma.name())

In [32]:
lemmas = set(lemmas)

In [33]:
count = 0
for lemma in lemmas:
  count = count + len(wn.synsets(lemma, type))

In [34]:
print('Total distinct lemmas: ', len(lemmas))
print('Total senses :',count)
print('Average Polysemy of ', type,': ' , count/len(lemmas))

Total distinct lemmas:  119034
Total senses : 152763
Average Polysemy of  n :  1.2833560159282222
