<a href="https://colab.research.google.com/github/Henok-Matheas/ngrams/blob/main/ngrams.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [128]:
corpus_file = "/content/GPAC.txt"

with open(corpus_file, "r", encoding="utf-8", errors="ignore") as f:
    corpus_text = f.read()

In [129]:
import nltk
import re
nltk.download("punkt")
from nltk import ngrams
from nltk.probability import FreqDist, ConditionalFreqDist, ConditionalProbDist
from nltk.tokenize import word_tokenize
import string

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [130]:
def pre_process(text, pad_size = 0, pad_symbol = ''):
  stopwords_file = "/content/stopwords.txt"

  with open(stopwords_file, "r", encoding="utf-8", errors="ignore") as f:
      stopwords = f.read()


  # Remove punctuation
  text = re.sub('[\!\@\#\$\%\^\«\»\&\*\(\)\…\[\]\{\}\;\“\”\›\’\‘\"\'\:\,\.\‹\/\<\>\?\\\\|\`\´\~\-\=\+\፡\፤\;\፦\፥\፧\፨\፠\፣]', '', text)

  # change \። with padding
  padding = (" " + pad_symbol + " ") * pad_size
  text = re.sub('\።', padding if padding else "", text)

  # Tokenize the text into words
  tokens = word_tokenize(text)

  # Remove stopwords
  stop_words = set(stopwords)
  tokens = [token for token in tokens if token.lower() not in stop_words]

  return tokens

In [131]:
def create_ngram(tokens, gram_size, pad_symbol, pad_size):
  return list(ngrams(list(nltk.pad_sequence(tokens, pad_size, pad_left=True, pad_right=True, left_pad_symbol=pad_symbol, right_pad_symbol=pad_symbol)), gram_size))

In [134]:
test_size = 0.6

unigram_total_tokens = pre_process(corpus_text)
unigram_tokens, unigram_test_tokens = unigram_total_tokens[:int(len(unigram_total_tokens) * test_size)], unigram_total_tokens[int(len(unigram_total_tokens) * test_size):]
unigrams = create_ngram(unigram_tokens, 1, "<PAD>", 1)
print(unigrams[:5])

['ምን', 'መሰላችሁ', 'አንባቢያን', 'ኢትዮጵያ', 'በተደጋጋሚ']
[('ምን',), ('መሰላችሁ',), ('አንባቢያን',), ('ኢትዮጵያ',), ('በተደጋጋሚ',)]


In [135]:
bigram_total_tokens = pre_process(corpus_text, 1, "<PAD>")
bigram_tokens, bigram_test_tokens = bigram_total_tokens[:int(len(bigram_total_tokens) * test_size)], bigram_total_tokens[int(len(bigram_total_tokens) * test_size):]
bigrams = create_ngram(bigram_tokens, 2, "<PAD>", 1)
print(bigrams[:5])

[('ምን', 'መሰላችሁ'), ('መሰላችሁ', 'አንባቢያን'), ('አንባቢያን', 'ኢትዮጵያ'), ('ኢትዮጵያ', 'በተደጋጋሚ'), ('በተደጋጋሚ', 'ጥሪው')]


In [136]:
trigram_total_tokens = pre_process(corpus_text, 2, "<PAD>")
trigram_tokens, trigram_test_tokens = trigram_total_tokens[:int(len(trigram_total_tokens) * test_size)], trigram_total_tokens[int(len(trigram_total_tokens) * test_size):]
trigrams = create_ngram(trigram_tokens, 3, "<PAD>", 1)
print(trigrams[:5])

[('ምን', 'መሰላችሁ', 'አንባቢያን'), ('መሰላችሁ', 'አንባቢያን', 'ኢትዮጵያ'), ('አንባቢያን', 'ኢትዮጵያ', 'በተደጋጋሚ'), ('ኢትዮጵያ', 'በተደጋጋሚ', 'ጥሪው'), ('በተደጋጋሚ', 'ጥሪው', 'ደርሷት')]


In [137]:
fourgram_total_tokens = pre_process(corpus_text, 3, "<PAD>")
fourgram_tokens, fourgram_test_tokens = fourgram_total_tokens[:int(len(fourgram_total_tokens) * test_size)], fourgram_total_tokens[int(len(fourgram_total_tokens) * test_size):]
fourgrams = create_ngram(fourgram_tokens, 4, "<PAD>", 1)
print(fourgrams[:5])

[('ምን', 'መሰላችሁ', 'አንባቢያን', 'ኢትዮጵያ'), ('መሰላችሁ', 'አንባቢያን', 'ኢትዮጵያ', 'በተደጋጋሚ'), ('አንባቢያን', 'ኢትዮጵያ', 'በተደጋጋሚ', 'ጥሪው'), ('ኢትዮጵያ', 'በተደጋጋሚ', 'ጥሪው', 'ደርሷት'), ('በተደጋጋሚ', 'ጥሪው', 'ደርሷት', 'ልትታደመው')]


In [138]:
from collections import Counter

def top_n_frequent(ngrams, top_n):
  # Calculate counts
  ngram_counts = Counter(ngrams)

  ngram_total_count = sum(ngram_counts.values())

  ngram_probabilities = {gram: count / ngram_total_count for gram, count in ngram_counts.items()}

  # return the n most frequent n-grams
  return sorted(ngram_probabilities.items(), key=lambda x: x[1], reverse=True)[:top_n]

In [139]:
#top 10 n-grams
top_n = 10

print("Top 10 Unigrams:", top_n_frequent(unigrams, top_n))
print("Top 10 Bigrams:", top_n_frequent(bigrams, top_n))
print("Top 10 Trigrams:", top_n_frequent(trigrams, top_n))
print("Top 10 Fourgrams:", top_n_frequent(fourgrams, top_n))

Top 10 Unigrams: [(('ነው',), 0.017193487115863902), (('ላይ',), 0.009529978810070157), (('ነበር',), 0.0056433284658710375), (('ግን',), 0.005423743700662048), (('ወደ',), 0.005259055126755305), (('ውስጥ',), 0.004764989405035078), (('ጋር',), 0.004270923683314852), (('እና',), 0.004029380441584963), (('ነገር',), 0.0038866503441991195), (('እንደ',), 0.0032827922398743973)]
Top 10 Bigrams: [(('ነገር', 'ግን'), 0.0007795344751866491), (('ብቻ', 'ሳይሆን'), 0.0006038647342995169), (('ማለት', 'ነው'), 0.0005928853754940712), (('ብቻ', 'ነው'), 0.0005379885814668423), (('ናርኮ', 'ጋንጐች'), 0.0003952569169960474), (('ኢብን', 'ባቱታ'), 0.0003184014053579271), (('ወደ', 'ኋላ'), 0.0002964426877470356), (('አዲስ', 'አበባ'), 0.0002854633289415898), (('በስድስት', 'ወር'), 0.0002854633289415898), (('አንድ', 'ቀን'), 0.00027448397013614406)]
Top 10 Trigrams: [(('ሀሁ', 'በስድስት', 'ወር'), 0.00021958958706178153), (('ሀሁ', 'ወይም', 'ፐፑ'), 0.00016469219029633614), (('ያም', 'ሆነ', 'ይህ'), 0.00012077427288397984), (('ጋር', 'ሆኖ', 'ኢህአዴግን'), 0.00010979479353089076), (('ሆኖ', 'ኢህአ

In [140]:
def create_model(ngrams, n = 1):
  # Create conditional frequency distribution for n-grams
  dist_list = []
  for words in ngrams:
    if n == 1:
      dist_list.append(list(words)[0])
    elif n == 2:
      dist_list.append((list(words)[0], words[-1]))
    else:
      dist_list.append((tuple(list(words[:-1])), words[-1]))

  if n == 1:
    return FreqDist(dist_list)

  # Calculate the conditional probability of each n-gram
  return nltk.ConditionalProbDist(ConditionalFreqDist(dist_list), nltk.MLEProbDist)

In [141]:
# create models
unigram_model = create_model(unigrams)
bigram_model = create_model(bigrams, n = 2)
trigram_model = create_model(trigrams, n = 3)
fourgram_model = create_model(fourgrams, n = 4)

In [142]:
def find_probability(model, sentence_ngram, n = 1):
  probability = 1.0
  total = None if n > 1 else model.N()

  for words in sentence_ngram:
      if n < 3:
        condition = list(words)[0]
      else:
        condition = tuple(list(words)[:-1])

      event = words[-1] if n > 1 else None
      if n > 1:
        word_probability = model[condition].prob(event)
      else:
        word_probability = model[condition] / total
      probability *= word_probability

  return probability

In [144]:
sentence = "ጋር ሆኖ ኢህአዴግን ወጋ"

sentence_unigram_tokens = pre_process(sentence)
sentence_unigrams = create_ngram(sentence_unigram_tokens, 1, "<PAD>", 1)

sentence_bigram_tokens = pre_process(sentence, 1, "<PAD>")
sentence_bigrams = create_ngram(sentence_bigram_tokens, 2, "<PAD>", 1)

sentence_trigram_tokens = pre_process(sentence, 2, "<PAD>")
sentence_trigrams = create_ngram(sentence_trigram_tokens, 3, "<PAD>", 1)

sentence_fourgram_tokens = pre_process(sentence, 3, "<PAD>")
sentence_fourgrams = create_ngram(sentence_fourgram_tokens, 4, "<PAD>", 1)

In [145]:
unigram_probability = find_probability(unigram_model, sentence_unigrams)
bigram_probability = find_probability(bigram_model, sentence_bigrams, 2)
trigram_probability = find_probability(trigram_model, sentence_trigrams, 3)
fourgram_probability = find_probability(fourgram_model, sentence_fourgrams, 4)

print("Unigram Probability:", unigram_probability)
print("Bigram Probability:", bigram_probability)
print("Trigram Probability:", trigram_probability)
print("Fourgram Probability:", fourgram_probability)

Unigram Probability: 1.6539133866028692e-13
Bigram Probability: 0.004392847039308952
Trigram Probability: 0.4
Fourgram Probability: 1.0


In [146]:
import random

def generate_sentence(ngram_models, start_words = None, word_size = 100, n = 1):
    sentences = []
    while word_size:
      if n == 1:
        start_word = random.choice(list(ngram_models.keys()))
        sentences.append(start_word)

      elif start_words is None:
          if n == 2:
            start_words = random.choice(list(ngram_models.conditions()))
            sentences.append(start_words)
          else:
            start_words = list(random.choice(list(ngram_models.conditions())))
            sentences.extend(start_words)
      else:
          if n == 2:
              cpd = ngram_models[start_words]
          else:
            cpd = ngram_models[tuple(start_words)]

          if cpd is None:
              break
          next_word = cpd.generate()
          if next_word is None:
              break
          sentences.append(next_word)
          start_words = sentences[-(n-1):] if n != 2 else sentences[-1]

      word_size -= 1


    return " ".join(sentences)


As n increases the content of the generates sentences make more and more sense, this is due to having more of a history to work with.

In [147]:
unigram_generated_sentence = generate_sentence(unigram_model, start_words = None, word_size = 10, n = 1)
bigram_generated_sentence = generate_sentence(bigram_model, start_words = None, word_size = 20, n = 2)
trigram_generated_sentence = generate_sentence(trigram_model, start_words = None, word_size = 20, n = 3)
fourgram_generated_sentence = generate_sentence(fourgram_model, start_words = None, word_size = 20, n = 4)


print("Unigram Generated sentence:", unigram_generated_sentence)
print("Bigram Generated sentence:", bigram_generated_sentence)
print("Trigram Generated sentence:", trigram_generated_sentence)
print("Fourgram Generated sentence:", fourgram_generated_sentence)

Unigram Generated sentence: እንደጻፈው ተቀምጧልኢጣሊያዊ ግጥሚያም በዘመናት ሞኝነት የሳሚን እያንቀላፋ አምቆ ሮገር ይደርስ
Bigram Generated sentence: ከሚጠቀሱባቸው ምሳሌያዊ አነጋገር መነሻ ጥያቄዎቹን ሥርዓት አምነው የተጠመቁ መንግሥታትም ዋነኛ መልእክታቸው የተሸከምነውን አድገን ተምረንና ስራ እንዳላከናወነ እንዲሰማቸው ሊያደርግ ይችላል የሚል
Trigram Generated sentence: ጥላው የዚያ የራስህን መልክ ይዞ በሚያንፀባርቅ ነገር ላይ ትልከሰከሳለህ እንዴ የአህያ ስጋ አልጋ ሲሉት አመድ አቶ ያየህ ጥናቱን በተሰጠው ጊዜ ውስጥ
Fourgram Generated sentence: ህልም ከችላ ባይነት ከስንፍና ወይም ከፍላጐተ ቢስነት አይፈጠርም የስኬታማ ሰዎች ሌላው መርህ ለመሥራት ፈቃደኛ መሆን ነው አንድ ድንቅ የሆነ ነገር ለመፍጠር ሁሌም


**Evaluate these Language Models Using Intrinsic Evaluation Method**

In [172]:
import math

def calculate_perplexity(ngram_model, test_tokens, n):
    total_probability = 0
    total = len(test_tokens)

    for idx in range(total - n + 1):
        if n == 2:
          condition = list(test_tokens[idx: idx + n - 1])[0]
        else:
          condition = tuple(test_tokens[idx: idx + n - 1])

        event = test_tokens[idx + n - 1]
        if n > 1:
          probability = math.log(ngram_model[condition].prob(event)) if ngram_model[condition].prob(event) > 0 else 1 / total
        else:
          probability = math.log(ngram_model[event] / total) if ngram_model[event] > 0 else 1 / total
        total_probability -= probability

    return math.exp(total_probability / total)

In [173]:
print("Unigram Perplexity: ", calculate_perplexity(unigram_model, unigram_test_tokens, 1))
print("Bigram Perplexity: ", calculate_perplexity(bigram_model, bigram_test_tokens, 2))
print("Trigram Perplexity: ", calculate_perplexity(trigram_model, trigram_test_tokens, 3))
print("Fourgram Perplexity: ", calculate_perplexity(fourgram_model, fourgram_test_tokens, 4))

Unigram Perplexity:  277.9982302804208
Bigram Perplexity:  1.3341067891325795
Trigram Perplexity:  1.0057939456694318
Fourgram Perplexity:  1.0001197066987138


**Evaluate these Language Models Using Extrinsic Evaluation Method**