# Project 2: Markov Text Generation

**Team members:** Guillem Amat (ga98), Sebastián Soriano Pérez (ss1072)

## Importing packages 

In [1]:
'''Importing packages'''
import numpy as np
import re
import pdb
import nltk
from typing import List, Tuple
import warnings
warnings.filterwarnings('ignore')
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/sebastiannw/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

## Algorithm: Markov Text Generation

**Defining functions**

In [2]:
def compute_n_matrix(n: int, corpus: List[str]):
    '''
    Computes a matrix (numpy array) with the conditional probabilities of a word appearing after an n-gram as
    determined by the corpus passed to this function. Returns the matrix (n_matrix), a dictionary of all the n-grams
    found in the corpus and their assigned indexes (n_gram_dictionary), and a dictionary of all the words (or tokens)
    found in the corpus and their assigned indexes (word_dictionary).
    
    Parameters
    ----------
    n : int 
        The length of n-grams.
    corpus : list of strings 
        A source corpus (list of tokens).
    
    Returns
    ----------
    n_matrix : numpy array 
        Array of conditional probabilities of a word appearing after an n-gram, given that n-gram. The columns
        correspond to unique words and the rows correspond to unique n-grams (as found in the corpus).
    n_gram_dictionary : dict
        Dictionary containing all unique n-grams found in the corpus as the keys. The values contain an index or int
        identifier for the n-gram.
    word_dictionary : dict
        Dictionary containing all unique words (or tokens) in the corpus as the keys. The values contain an index or
        int identifier for the words.
        
    '''
    # Returns n-grams from a corpus
    #pdb.set_trace()
    n_gram_list = [tuple(corpus[i:i + n]) for i in range(len(corpus[:-n]))]
    
    # Creates unique n-gram indexes
    distinct_n_grams  = list(set(n_gram_list))                                   # List of unique n-grams in corpus
    n_gram_dictionary = {n_gram: i for i, n_gram in enumerate(distinct_n_grams)} # Stores n-gram's distinct_n_gram 
                                                                                 # index
    
    # Creates unique word indexes
    distinct_words  = list(set(corpus))                                  # List of unique words in corpus
    word_dictionary = {word: i for i, word in enumerate(distinct_words)} # Stores the word's distinct_words index
        
    # Creates an empty word-n-gram matrix to store the number of times each word follows an n-gram later on
    n_count_matrix = np.zeros((len(distinct_n_grams), len(distinct_words)))
    
    # Loops through each n-gram in corpus to fill out n_gram_matrix
    for i, n_gram in enumerate(n_gram_list[:-1]):
        # Finds the n-gram's index in distinct_n_grams
        n_gram_index = n_gram_dictionary[n_gram]
        
        # Finds the distinct_words's index of the word that follows the i-th n-gram (in the corpus)
        word_index = word_dictionary[corpus[i + n]]
        
        # Adds +1 to the count of the n_gram_index, word_index value in n_gram_matrix (counts the number of times
        # each word follows each n-gram as they appear in the corpus)
        n_count_matrix[n_gram_index, word_index] += 1
    
    # Creates matrix of conditional probabilities of each word following an n-gram given that n_gram appears
    n_matrix = n_count_matrix / np.sum(n_count_matrix, axis=1).reshape(-1, 1)
    n_matrix = np.nan_to_num(n_matrix)
    
    return n_matrix, n_gram_dictionary, word_dictionary

In [3]:
def finish_sentence(sentence: List[str], n: int, corpus: List[str], deterministic: bool =False) -> List[str]:
    '''
    Returns an extended sentence until the first ., ?, or ! is found OR until it has 15 total tokens.
    
    Parameters
    ----------
    sentence : list of strings 
        List of tokens that we’re trying to build on.
    n : int 
        The length of n-grams to use for prediction.
    corpus : list of strings 
        A source corpus (list of tokens).
    deterministic : bool
        A flag indicating whether the process should be deterministic.
    
    Returns
    ----------
    new_sentence : list of strings 
        Extended sentence built with this Markov Text Generator.
    '''
    # Creates n_matrix to be filled with conditional probabilities of a word following an n-gram given that n-gram
    n_matrix          = {} # Dictionary with conditional probabilities
    n_gram_dictionary = {} # Dictionary with n-grams in corpus
    word_dictionary   = {} # Dictionary with words in corpus
    
    # Defines new_sentece to which new words will be appended, and the first n-gram from sentence
    new_sentence = sentence.copy()      # Copy of sentence passed to finish_sentece, will be appended with new tokens
    n_gram       = tuple(sentence[-n:]) # First n-gram found in the original sentence
    
    # Loops while new_sentence has 15 words (tokens) or less; Each iteration appends a new_word
    while len(new_sentence) <= 15:
        #pdb.set_trace()
        
        # Loops for i with values from n to 1 to find the word with the highest conditional probability of
        # appearing after the last i-gram in the current iteration of new_sentence, as it appears in corpus
        for i in range(n, 0, -1):
            
            # Checks if dictionary n_matrix has the matrix of contitional probabilities for the current i-gram value
            if i not in n_matrix.keys(): 
                # Creates and stores dictionaries for the i value if it doesn't exist yet
                n_matrix[i], n_gram_dictionary[i], word_dictionary[i] = compute_n_matrix(i, corpus)
            
            # Enters if the current i-gram (n_gram) appears in corpus (n_gram_dictionary[i])
            if n_gram in n_gram_dictionary[i]:
                # Obtains the index of the current i-gram in the current n_matrix[i]
                n_gram_index = n_gram_dictionary[i][n_gram]
                
                # Retrieves the new_word to be appended to new_sentence as defined by input parameter 'deterministic'
                if deterministic:
                    # Retrieves the new_word with the maximum conditional probability of appearing after the i-gram
                    # in corpus
                    new_word = list(word_dictionary[i].keys())[np.argmax(n_matrix[i][n_gram_index, :])]
                else:
                    # Chooses a new_word randomly following the conditional probabilities defined in n_matrix[i]
                    new_word = np.random.choice(list(word_dictionary[i].keys()), p=n_matrix[i][n_gram_index, :])
                
                # Breaks the loop of i from n to i if the current i-gram (n_gram) appeared in corpus
                break
            
            # Gets here if current i-gram (n_gram) wasn't in corpus; Redefines it as an (i - 1)-gram (n_gram) 
            n_gram = tuple(new_sentence[-(i - 1):])
            
            # If the last i-gram (n_gram with n=1) wasn't found at the end of the loop, defines the new word to be ','
            if i == 1: new_word = ','
        
        # Appends the new_word to new_sentence
        new_sentence.append(new_word)
        
        # Breaks and stops appending to new_sentence if the last word (token) appended was '.', '?', or '!'
        if re.match(r'^[.?!]$', new_word): break
            
        # Redefines the n_gram to be the last n-gram found in new_sentece (with its new appeneded word)
        n_gram = tuple(new_sentence[-n:])
    
    return new_sentence

## Implementation

**Test cases**

In [4]:
'''Fixed variables for all test cases'''
corpus = [w.lower() for w in nltk.corpus.gutenberg.words('austen-sense.txt')]

In [8]:
'''Test case 1'''
sentence = ['she', 'was', 'not']
n        = 3
finish_sentence(sentence, n, corpus, True)

['she',
 'was',
 'not',
 'only',
 'without',
 'affection',
 'for',
 'the',
 'person',
 'who',
 'was',
 'to',
 'be',
 'the',
 'remains',
 'of']

In [6]:
'''Test case 2'''
sentence = ['she', 'was', 'not']
n        = 3
finish_sentence(sentence, n, corpus)

['she',
 'was',
 'not',
 'at',
 'liberty',
 'to',
 'tend',
 'to',
 'their',
 'orders',
 ';',
 'and',
 'they',
 'sat',
 'down',
 'together']