## Project 2: N-Gram Model

**Students:** Guillem Amat (ga98), Sebastián Soriano Pérez (ss1072)

### Importing Packages 

In [223]:
from typing import List, Tuple
from scipy.sparse import dok_matrix
import re
import pdb
import nltk
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/sebastiannw/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

### Algorithm: Markov Text Generation

**Algorithm**

In [220]:
def compute_n_matrix(n: int, corpus: List[str]):
    '''
    Computes a matrix (numpy array) with the conditional probabilities of a word appearing after an n-gram as
    determined by the corpus passed to this function. Returns the matrix (n_matrix), a dictionary of all the
    n-grams found in the corpus and their assigned indexes (n_gram_dictionary), and a dictionary of all the
    words (or tokens) found in the corpus and their assigned indexes (word_dictionary).
    
    Parameters
    ----------
    n : int 
        The length of n-grams.
    corpus : list of strings 
        A source corpus (list of tokens).
    
    Returns
    ----------
    n_matrix : numpy array 
        Array of conditional probabilities of a word appearing after an n-gram, given that n-gram. The columns
        correspond to unique words and the rows correspond to unique n-grams (as found in the corpus).
    n_gram_dictionary : dict
        Dictionary containing all unique n-grams found in the corpus as the keys. The values contain an index
        or int identifier for the n-gram.
    word_dictionary : dict
        Dictionary containing all unique words (or tokens) in the corpus as the keys. The values contain an
        index or int identifier for the words.
        
    '''
    # Returns n-grams from a corpus
    #pdb.set_trace()
    n_gram_list = [tuple(corpus[i:i + n]) for i in range(len(corpus[:-n]))]
    
    # Creates unique n-gram indexes
    distinct_n_grams  = list(set(n_gram_list))                                   # List of unique n-grams in corpus
    n_gram_dictionary = {n_gram: i for i, n_gram in enumerate(distinct_n_grams)} # Stores n-gram's distinct_n_gram index
    
    # Creates unique word indexes
    distinct_words  = list(set(corpus))                                  # List of unique words in corpus
    word_dictionary = {word: i for i, word in enumerate(distinct_words)} # Stores the word's distinct_words index
        
    # Creates an empty word-n-gram matrix to store the number of times each word follows an n-gram later on
    n_count_matrix = np.zeros((len(distinct_n_grams), len(distinct_words)))
    
    # Loops through each n-gram in corpus to fill out n_gram_matrix
    for i, n_gram in enumerate(n_gram_list[:-1]):
        # Finds the n-gram's index in distinct_n_grams
        n_gram_index = n_gram_dictionary[n_gram]
        
        # Finds the distinct_words's index of the word that follows the i-th n-gram (in the corpus)
        word_index = word_dictionary[corpus[i + n]]
        
        # Adds +1 to the count of the n_gram_index, word_index value in n_gram_matrix
        # (counts the number of times each word follows each n-gram as they appear in the corpus)
        n_count_matrix[n_gram_index, word_index] += 1
    
    # Creates matrix of conditional probabilities of each word following an n-gram given that n_gram appears
    n_matrix = n_count_matrix / np.sum(n_count_matrix, axis=1).reshape(-1, 1)
    n_matrix = np.nan_to_num(n_matrix)
    
    return n_matrix, n_gram_dictionary, word_dictionary

In [233]:
def finish_sentence(sentence: List[str], n: int, corpus: List[str], deterministic: bool =False) -> List[str]:
    '''
    Returns an extended sentence until the first ., ?, or ! is found OR until it has 15 total tokens.
    
    Parameters
    ----------
    sentence : list of strings 
        List of tokens that we’re trying to build on.
    n : int 
        The length of n-grams to use for prediction.
    corpus : list of strings 
        A source corpus (list of tokens).
    deterministic : bool
        A flag indicating whether the process should be deterministic.
    
    Returns
    ----------
    complete_sentence : list of strings 
        Extended sentence built with this Markov Text Generator.
    '''
    # Creates n_matrix to be filled with conditional probabilities of a word following an n-gram given that n-gram
    n_matrix          = {} # Dictionary with conditional probabilities
    n_gram_dictionary = {} # Dictionary with n-grams in corpus
    word_dictionary   = {} # Dictionary with words in corpus
    
    new_sentence = sentence.copy()      # Copy of sentence passed to finish_sentece, will be appended with new tokens
    n_gram       = tuple(sentence[-n:]) # First n-gram found in the original sentence
    
    while len(new_sentence) <= 15:
        #pdb.set_trace()
        for i in range(n, 0, -1):
            
            if i not in n_matrix.keys(): 
                # Stores dictionaries for the i value
                n_matrix[i], n_gram_dictionary[i], word_dictionary[i] = compute_n_matrix(i, corpus)
            
            if n_gram in n_gram_dictionary[i]:
                n_gram_index = n_gram_dictionary[i][n_gram]

                if deterministic:
                    new_word = list(word_dictionary[i].keys())[np.argmax(n_matrix[i][n_gram_index, :])]
                else:
                    new_word = np.random.choice(list(word_dictionary[i].keys()), p=n_matrix[i][n_gram_index, :])
                
                break
            
            n_gram = tuple(new_sentence[-(i - 1):])
            
            if i == 1: new_word = ','

        new_sentence.append(new_word)
        n_gram = tuple(new_sentence[-n:])
        
        if re.match(r'^[.?!]$', new_word): break
    
    return new_sentence

In [217]:
if re.match(r'^[.?!]$', '.'): print(True)

True


In [230]:
finish_sentence(sentence, n, corpus, True)

['Dessy',
 'is',
 'a',
 'very',
 'large',
 'one',
 ',',
 'i',
 'know',
 ',',
 'if',
 'the',
 'sum',
 'were',
 'diminished',
 'one']

<br>

### Test Cases

**Algorithm Test Cases**

In [231]:
sentence = ['she', 'was', 'not']
n        = 3
corpus   = [w.lower() for w in nltk.corpus.gutenberg.words('austen-sense.txt')]

In [20]:
finish_sentence(sentence, 3, corpus)

NameError: name 'finish_sentence' is not defined

<br>

### Appendix