In [14]:
import numpy as np

from collections import Counter
from itertools import islice
import re
from pprint import pprint

import nltk
from nltk.collocations import *
from nltk import ngrams

# nltk.download('punkt')

In [184]:
class Restaurants(object) :
    def __init__(self, global_dish_list):
        self.restaurants_dict = {}
        self.global_dish_list = global_dish_list
        
    def add_restaurant(self, context, restaurant) :
        self.restaurants_dict[context] = restaurant
    
    def draw_new_customer(self, context, d, theta) :
        self.restaurants_dict[context].draw_new_customer(self, d, theta)
        
    def dish_probability(self, context, d, theta) :
        self.restaurants_dict[context].dish_probability(self, d, theta)
        
class Restaurant(object) :
    def __init__(self, customers_per_table , dishes, context):
        self.nb_tables = len(customers_per_table)
        self.nb_dishes = len(set(dishes))
        self.context = context
        
        self.customers_per_table = customers_per_table
        self.dishes = dishes
        self.dishes_counts = occurences_counts(dishes)
        
    def draw_new_customer(self, d, theta):
        # TODO increment the counts
        if len(context) == 0 :
            table = draw_table(self.customers_per_table, self.dishes_counts, d, theta)
            # If the table is a new table
            if table == nb_tables : 
                customer = np.random.choice(super.dish_list)
            else :
                # TODO : Check the chosen word
                customer = self.dishes[table]

        else :
            table = draw_table(self.customers_per_table, self.dishes_counts, d, theta)

            # If the table is a new table
            if table == nb_tables : 
                sub_context = self.context
                del sub_context[0]
                customer = super.draw_word(sub_context, d, theta)
            else :
                # TODO : Check the chosen word
                customer = self.dishes[table]

        return customer
    
    def dish_probability(self, d, theta):
    # TODO increment the counts
    if len(context) == 0 :
        table = draw_table(self.customers_per_table, self.dishes_counts, d, theta)
        # If the table is a new table
        if table == nb_tables : 
            customer = np.random.choice(super.dish_list)
        else :
            # TODO : Check the chosen word
            customer = self.dishes[table]

    else :
        table = draw_table(self.customers_per_table, self.dishes_counts, d, theta)

        # If the table is a new table
        if table == nb_tables : 
            sub_context = self.context
            del sub_context[0]
            customer = super.draw_word(sub_context, d, theta)
        else :
            # TODO : Check the chosen word
            customer = self.dishes[table]

    return customer

In [None]:
def word_probability(word, context, n_gram_counters, d, theta, word_list):
    """ Returns the probability that the word after the context will be w"""
    
    pitman_process_level = len(context)
    if pitman_process_level > 0 :
        context_word_counts = context_counts(context, n_gram_counters[pitman_process_level])
    else :
        context_word_counts = np.array([item for item in n_gram_counters[0].items()])
    
    nb_words = len(word_list)
    nb_tables = context_word_counts.shape[0]
        
    if len(context) == 0 :
        probability_distribution = compute_pyp_probabilities(context_word_counts, d, theta)
    
    else :
        table = draw_table(context_word_counts, d, theta)
        
        # If the table is a new table
        if table == nb_tables : 
            del context[0]
            customer = draw_word(context, n_gram_counters, d, theta, word_list)
        else :
            # TODO : Check the chosen word
            customer = context_word_counts[table, 0]

    return customer[0]

In [92]:
def n_gram_counter(file_data, n):
    counts = Counter()
    tokens = nltk.word_tokenize(file_data)
    ngram = ngrams(tokens, n)
    for n_gram in ngram :
        counts[n_gram] += 1
    return np.array([list(n_gram_count[0]) + [n_gram_count[1]]  for n_gram_count in list(counts.items())])

def context_counts(context, n_gram_counter):
    n = len(context)
    return np.array([[item[0][n], item[1]] 
                     for item in n_gram_counter.items() if "".join(item[0][0:n]) == "".join(context)])

def genrate_word_list(file_data) :
    tokens = nltk.word_tokenize(file_data)
    ngram = ngrams(tokens, n=1)
    return [word[0] for word in ngram]

def occurences_counts(input_list):
    counter = Counter(input_list)
    return [counter[input_list[i]] for i in range(len(input_list))]

def compute_pyp_probabilities(customer_counts, dish_counts, d, theta) :
    """ Compute the tables probabilities with respect to the PYP distribution
    The last probability is the new table probability"""
    # TODO : Adapt the algorithm so that more than one table can hold a word
    # TODO Adapt the table count size
    table_counts_per_type = list(np.ones(len(customer_counts)))

    nb_tables = len(table_counts_per_word)
    nb_customers = sum(customer_counts)

    old_tables_probabilities = [(customer_counts[i] - d * dish_counts[i] ) / (nb_customers + theta)
                                    for i in range(nb_tables)] 
    new_table_probability = [(theta + d * nb_tables) / (nb_customers + theta)]
    all_probabilities = old_tables_probabilities + new_table_probability
    
    assert np.sum(all_probabilities) == 1
    
    return all_probabilities

In [18]:
def draw_table(customer_list, d, theta):
    """ Draws a table with respect to PYP 
    The last table is a new table"""
    all_probabilities = compute_pyp_probabilities(customer_counts, dish_counts, d, theta)
    
    picked_table = np.argmax(np.random.multinomial(n=1, pvals=all_probabilities))
    return picked_table

In [106]:
def fill_restaurants(train_data) :
    
    # TODO : convert to strnig of tuple
    
    # Fill the restaurants :
    word_list = genrate_word_list(train_data)
    restaurants = Restaurants(global_dish_list=word_list)

    # Unigrams :
    unigram_table =  n_gram_counter(train_data, n=1)
    words = unigram_table[:, 0]
    counts = unigram_table[:, 1].astype(int)
    restaurants.add_restaurant(restaurant=Restaurant(customers_per_table=counts,
                                          dishes=words, 
                                          context=""),
                              context="")

    # Bigrams : 
    bigram_table =  n_gram_counter(train_data, n=2)
    nb_bigrams = bigram_table.shape[0] 
    words = bigram_table[:, 0:2]
    counts = bigram_table[:, 2].astype(int)
    unique_pregrams = np.array(list(set(words[:, 0])))

    for pregram in unique_pre_grams :
        to_keep = np.where(bigram_table[:, 0] == pregram)
        sub_counts = bigram_table[to_keep, 1:3][0]
        restaurants.add_restaurant(restaurant = Restaurant(customers_per_table=sub_counts[:,1],
                                                          dishes=sub_counts[:,0].astype(int), 
                                                          context=pregram),
                                   context=pregram) 

    # Trigrams :
    trigram_table =  n_gram_counter(train_data, n=3)
    nb_trigrams = trigram_table.shape[0] 
    words = trigram_table[:, 0:3]
    counts = trigram_table[:, 3].astype(int)

    pregrams = [str(item) for item in zip(words[:, 0], words[:, 1])]
    unique_pre_grams = list(set(pregrams))
    pregrams = np.array(pregrams)
    for pregram in unique_pre_grams :
        to_keep = np.where(pregrams == pregram)
        sub_counts = trigram_table[to_keep, 2:4][0]
        restaurants.add_restaurant(restaurant = Restaurant(customers_per_table=sub_counts[:,1],
                                                          dishes=sub_counts[:,0].astype(int), 
                                                          context=pregram),
                                   context=pregram)
    return restaurants

In [185]:
train_file = "Verne.5semaines.en"

with open("data/" + train_file, "r") as text_file :
    train_data = ''.join(text_file.read().split('\n'))

restaurants = fill_restaurants(train_data)
restaurants.draw_new_customer(context="he", d=.5, theta=.5)

AttributeError: 'Restaurants' object has no attribute 'draw_new_customer'

# Particle Filter

## Sentence sampling

In [None]:
# Importance sampling


