# Statistical Model Languages

Tarea 3 - Luis Eduardo Robles Jimenez

## Language Model and Evaluation

In [23]:
import nltk
import numpy as np
from nltk.tokenize import TweetTokenizer

### 1. Preprocessing

In [24]:
tokens = dict()
tokens['begin'], tokens['end'], tokens['unknown'] = '<s>', '</s>', '<unk>'

In [25]:
def _create_vocabulary(corpus, size): # Returns words sorted by frequency
    words, tokenizer, corpusByWords = [], TweetTokenizer(), []
    for doc in corpus:
        tokens = tokenizer.tokenize(doc)
        words += tokens
        corpusByWords += [tokens]
    count = nltk.FreqDist(words)
    count = sorted([(count[key], key) for key in count])[::-1]
    if size != -1: count = count[:size]
    return [word for _, word in count], corpusByWords

def load_corpus(corpus_select = "tweets", vocabSize = 100):
    corpus = []
    path_corpus = "../../data/agresividad/mex_train.txt"

    with open(path_corpus, "r") as f_corpus:
        for tuit in f_corpus:
            corpus += [tuit[:-1]]
    
    vocab, tokenized = _create_vocabulary(corpus, vocabSize)
    corpus = []
    for doc in tokenized:
        tweet = []
        tweet.append(tokens['begin'])
        for word in doc: 
            tweet.append(tokens['unknown'] if word.lower() not in vocab else word)
        tweet.append(tokens['end'])
        corpus.append(tweet)
    return corpus

In [26]:
corpus = load_corpus(vocabSize = -1)

#### Comment

### 2. Models Training

#### Good-Turing Smoothing

Formula

$$

P_{GT}(x) = 
\begin{cases}
    \frac{N_1}{N},    & \text{if } c = 0\\
    \frac{c^*}{N},    & \text{otherwise}
\end{cases}
$$

Where

$$ c^* = \frac{(c + 1)N_{c + 1}}{N_c} $$

and

$c \text{ is the number of times a word was seen}$

$N \text{ is the number of words in the corpus (not vocabulary)}$

$N_i \text{ is the number of words with frequency } i$

In [27]:
class LanguageModel:
    def __init__(self, corpus = None):
        self.corpus = corpus
        self.nGrams = dict()
        self.Ni = None
    def P(self, *words):
        pass
    def GoodTuring(self, *words):       # WHAT HAPPENS WHEN ASKING FOR THE MOST FREQUENT WORD?
        pass
class Unigram(LanguageModel):
    def __init__(self, corpus = None):
        super().__init__(corpus)
        self.N = 0
        for line in corpus:
            for word in line:
                if not word in self.nGrams: self.nGrams[word] = 0
                self.nGrams[word] += 1
                self.N += 1
        v, c = np.unique(list(self.nGrams.values()), return_counts = True)
        self.Ni = np.zeros((v[-1] + 3))
        for freq, count in zip(v, c): 
            self.Ni[freq] = count
        print(np.mean(self.Ni))
        print(np.unique(self.Ni, return_counts = True))
    '''
    def P(self, *words):    # Only one n-gram
        assert len(words) == 1, 'Trying to calculate the probability of more than one word'
        word = words[0] if words[0] in self.nGrams else tokens['unknown']
        return self.nGrams[word]
    '''
    def GoodTuring(self, *words):
        assert len(words) == 1, 'Trying to calculate the probability of more than one word'
        wFreq = self.nGrams[words[0]] if words[0] in self.nGrams else 0
        print(f'\t{wFreq}')
        numerator = self.Ni[1] if wFreq == 0 else (wFreq + 1)*self.Ni[wFreq + 1]/self.Ni[wFreq]
        print(f'\t{numerator}')
        return numerator / self.N

In [28]:
uni = Unigram(corpus)

2.4485307373354965
(array([0.000e+00, 1.000e+00, 2.000e+00, 3.000e+00, 4.000e+00, 5.000e+00,
       6.000e+00, 7.000e+00, 8.000e+00, 9.000e+00, 1.000e+01, 1.200e+01,
       1.400e+01, 1.600e+01, 2.100e+01, 2.300e+01, 2.400e+01, 2.800e+01,
       3.400e+01, 3.500e+01, 3.600e+01, 3.700e+01, 3.900e+01, 5.400e+01,
       7.300e+01, 8.400e+01, 1.030e+02, 1.200e+02, 1.480e+02, 1.830e+02,
       2.930e+02, 4.770e+02, 8.850e+02, 1.841e+03, 8.602e+03]), array([5357,  101,   23,    9,    7,    8,    3,    7,    1,    2,    3,
          1,    2,    1,    2,    1,    1,    1,    1,    1,    1,    1,
          1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
          1,    1], dtype=int64))
[0.000e+00 8.602e+03 1.841e+03 ... 2.000e+00 0.000e+00 0.000e+00]


In [29]:
print(uni.GoodTuring("de"))
print(uni.GoodTuring("que"))
print(uni.GoodTuring("<s>"))
print(uni.GoodTuring("</s>"))
print(uni.GoodTuring("<unk>"))
print(uni.GoodTuring("otorrino"))

	3357
	0.0
0.0
	3383
	0.0
0.0
	5544
	0.0
0.0
	5544
	0.0
0.0
	0
	8602.0
0.07648261758691206
	0
	8602.0
0.07648261758691206


#### Comment

### 3. Interpolated Model

#### Comment

## Text Generation

### 1. Tweet Functionality

#### Comment

### 2. AMLO model

#### Comment

### 3. Evaluation with custom phrases

#### Comment

### 4. More evaluation 

#### Comment

## El ahorcado

### 1. Norvig's Hangman

#### Comment

### 2. Follow-up

#### Comment