# Examen Parcial

## Pregunta 1

### Parte 1

In [77]:
from sklearn.feature_extraction.text import CountVectorizer
import collections
from typing import List, Tuple, Dict


#Corpus de entrenamiento no tiene la palabra "is" dado que
#el vocabulario especificado en el enunciado no lo contiene
#y la probabilidad de ngramas conteniendo dicha palabra va a ser
#interpolada
corpus = [
    "all models are wrong",
    "a model wrong",
    "some models are useful"
]

test = [
    "all models are wrong",
    "a model is wrong",
    "some models are useful"
]


tokens = [text.split() for text in corpus]
# print(tokens)

# Implementación de modelos N-grama
class NGramModel:
    def __init__(self, n: int):
        self.n = n
        self.ngram_counts = collections.Counter()
        self.context_counts = collections.Counter()
        self.vocab = set()
        self.total_ngrams = 0

    def train(self, corpus: List[List[str]]):
        for document in corpus:
            if(self.n == 1):
                tokens = ['<s>'] + document + ['</s>']
            else:
                tokens = ['<s>'] * (self.n - 1) + document + ['</s>']
            # print(tokens
            self.vocab.update(tokens)
            for i in range(len(tokens) - self.n + 1):
                ngram = tuple(tokens[i:i + self.n])
                context = tuple(tokens[i:i + self.n - 1])
                self.ngram_counts[ngram] += 1
                self.context_counts[context] += 1
                self.total_ngrams += 1

    def get_ngram_prob(self, ngram: Tuple[str, ...], k=0) -> float:
        count = self.ngram_counts.get(ngram, 0)
        context = ngram[:-1]
        context_count = self.context_counts.get(context, 0)
        if context_count == 0:
            return 0.0
        else:
            return (count + k) / (context_count + k*self.total_ngrams)

    def get_sentence_probability(self, sentence: List[str], k=0) -> float:
        tokens = ['<s>'] * (self.n - 1) + sentence + ['</s>']
        probability = 1.0
        for i in range(len(tokens) - self.n + 1):
            ngram = tuple(tokens[i:i + self.n])
            prob = self.get_ngram_prob(ngram, k)
            print(f"P({ngram[-1]}|{ngram[:-1]}) = {prob}")
            if prob > 0:
                probability *= prob
            else:
                # Asignamos una pequeña probabilidad para evitar cero
                probability *= 1e-6
        return probability

bigram_model = NGramModel(n=2)
bigram_model.train(tokens)
print(bigram_model.vocab)

print("a) Probabilidades de bigramas SIN SUAVIZADO")
for text in test:
    prob = bigram_model.get_sentence_probability(text.split())
    print(f"Texto: {text}, P: {prob}")
print("-"*10)
print("b) Probabilidades de bigramas CON SUAVIZADO ADD-ONE")
for text in test:
    prob = bigram_model.get_sentence_probability(text.split(), 1)
    print(f"Texto: {text}, P: {prob}")

print("-"*10)
print("c) Probabilidades de bigramas CON SUAVIZADO ADD-K")
for k in [0.05, 0.15]:
  print(f"K: {k}")
  for text in test:
    prob = bigram_model.get_sentence_probability(text.split(), k)
    print(f"-Texto: {text}, P: {prob}")





{'a', 'useful', 'model', '<s>', '</s>', 'models', 'wrong', 'some', 'are', 'all'}
a) Probabilidades de bigramas SIN SUAVIZADO
P(all|('<s>',)) = 0.3333333333333333
P(models|('all',)) = 1.0
P(are|('models',)) = 1.0
P(wrong|('are',)) = 0.5
P(</s>|('wrong',)) = 1.0
Texto: all models are wrong, P: 0.16666666666666666
P(a|('<s>',)) = 0.3333333333333333
P(model|('a',)) = 1.0
P(is|('model',)) = 0.0
P(wrong|('is',)) = 0.0
P(</s>|('wrong',)) = 1.0
Texto: a model is wrong, P: 3.333333333333333e-13
P(some|('<s>',)) = 0.3333333333333333
P(models|('some',)) = 1.0
P(are|('models',)) = 1.0
P(useful|('are',)) = 0.5
P(</s>|('useful',)) = 1.0
Texto: some models are useful, P: 0.16666666666666666
----------
b) Probabilidades de bigramas CON SUAVIZADO ADD-ONE
P(all|('<s>',)) = 0.11764705882352941
P(models|('all',)) = 0.13333333333333333
P(are|('models',)) = 0.1875
P(wrong|('are',)) = 0.125
P(</s>|('wrong',)) = 0.1875
Texto: all models are wrong, P: 6.893382352941176e-05
P(a|('<s>',)) = 0.11764705882352941
P

In [79]:
corpus = [
    "all models are wrong",
    "a model wrong",
    "some models are useful"
]

test = [
    "all models are wrong",
    "a model is wrong",
    "some models are useful"
]
tokens = [text.split() for text in corpus]

class BackoffNGramModel(NGramModel):
    def __init__(self, n: int, models: List[NGramModel]):
        super().__init__(n)
        self.models = models  # Lista de modelos de diferentes órdenes, ordenados de mayor a menor
        # Actualizamos self.vocab con la unión de los vocabularios de los modelos
        self.vocab = set()
        for model in self.models:
            self.vocab.update(model.vocab)

    def get_ngram_prob(self, ngram: Tuple[str, ...]) -> float:
        for model in self.models:
            ngram_adjusted = ngram[-model.n:]
            prob = model.get_ngram_prob(ngram_adjusted)
            if prob > 0:
                return prob
        # Si ningún modelo tiene el n-grama, asignamos una pequeña probabilidad
        return 1e-6

class StupidBackoffNGramModel(NGramModel):
    def __init__(self, n: int, models: List[NGramModel], alpha: float = 0.4):
        super().__init__(n)
        self.models = models  # Lista de modelos de diferentes órdenes, ordenados de mayor a menor
        self.alpha = alpha    # Factor de escala fijo
        # Actualizamos self.vocab con la unión de los vocabularios de los modelos
        self.vocab = set()
        for model in self.models:
            self.vocab.update(model.vocab)

    def get_ngram_prob(self, ngram: Tuple[str, ...]) -> float:
        for i, model in enumerate(self.models):
            ngram_adjusted = ngram[-model.n:]
            prob = model.get_ngram_prob(ngram_adjusted)
            if prob > 0:
                return (self.alpha ** i) * prob
        # Si ningún modelo tiene el n-grama, asignamos una pequeña probabilidad
        return (self.alpha ** len(self.models)) * (1.0 / len(self.vocab))


unigram_model = NGramModel(n=1)
unigram_model.train(tokens)

bigram_model = NGramModel(n=2)
bigram_model.train(tokens)

backoff_model = BackoffNGramModel(n=2, models=[bigram_model, unigram_model])
stupid_backoff_model = StupidBackoffNGramModel(n=2, models=[bigram_model, unigram_model], alpha=0.4)


print("d.1) Probabilidades con backoff")
for text in test:
    tokens_test = ['<s>'] * (2 - 1) + text.split() + ['</s>']
    print(f"Texto: {tokens_test}")
    for i in range(len(tokens_test) - 2 + 1):
      ngram = tuple(tokens_test[i:i + 2])
      # print(ngram)
      prob = backoff_model.get_ngram_prob(ngram)
      print(f"N-grama: {ngram}, P: {prob}")

print("\nd.2) Probabilidades con stupid backoff")
for text in test:
    tokens_test = ['<s>'] * (2 - 1) + text.split() + ['</s>']
    print(f"Texto: {tokens_test}")
    for i in range(len(tokens_test) - 2 + 1):
      ngram = tuple(tokens_test[i:i + 2])
      # print(ngram)
      prob = stupid_backoff_model.get_ngram_prob(ngram)
      print(f"N-grama: {ngram}, P: {prob}")

d.1) Probabilidades con backoff
Texto: ['<s>', 'all', 'models', 'are', 'wrong', '</s>']
N-grama: ('<s>', 'all'), P: 0.3333333333333333
N-grama: ('all', 'models'), P: 1.0
N-grama: ('models', 'are'), P: 1.0
N-grama: ('are', 'wrong'), P: 0.5
N-grama: ('wrong', '</s>'), P: 1.0
Texto: ['<s>', 'a', 'model', 'is', 'wrong', '</s>']
N-grama: ('<s>', 'a'), P: 0.3333333333333333
N-grama: ('a', 'model'), P: 1.0
N-grama: ('model', 'is'), P: 1e-06
N-grama: ('is', 'wrong'), P: 0.11764705882352941
N-grama: ('wrong', '</s>'), P: 1.0
Texto: ['<s>', 'some', 'models', 'are', 'useful', '</s>']
N-grama: ('<s>', 'some'), P: 0.3333333333333333
N-grama: ('some', 'models'), P: 1.0
N-grama: ('models', 'are'), P: 1.0
N-grama: ('are', 'useful'), P: 0.5
N-grama: ('useful', '</s>'), P: 1.0

d.2) Probabilidades con stupid backoff
Texto: ['<s>', 'all', 'models', 'are', 'wrong', '</s>']
N-grama: ('<s>', 'all'), P: 0.3333333333333333
N-grama: ('all', 'models'), P: 1.0
N-grama: ('models', 'are'), P: 1.0
N-grama: ('are', 

### Parte 2

In [87]:
import numpy as np
import math

def calculate_NC(ngram_counts: Dict[Tuple[str, ...], int]) -> Dict[int, int]:
    count_of_counts = collections.Counter()
    for count in ngram_counts.values():
        count_of_counts[count] += 1
    return count_of_counts

def sort_NC(NC: Dict[int, int]) -> Tuple[np.ndarray, np.ndarray]:
    counts = np.array(list(NC.keys()))
    frequencies = np.array([NC[count] for count in counts])
    sorted_indices = np.argsort(counts)
    return counts[sorted_indices], frequencies[sorted_indices]


# Suavizado Good-Turing
def good_turing_discounting(ngram_counts: Dict[Tuple[str, ...], int]) -> Dict[Tuple[str, ...], float]:
    # Calculamos N(C)
    NC = calculate_NC(ngram_counts)
    counts, frequencies = sort_NC(NC)

    # Ajuste de conteos
    total_ngrams = sum(ngram_counts.values())
    max_count = max(counts)
    adjusted_counts = {}

    for ngram, count in ngram_counts.items():
        if count < max_count:
            Nc = NC[count]
            Nc1 = NC.get(count + 1, 0)
            if Nc > 0:
                C_star = (count + 1) * (Nc1 / Nc)
                adjusted_counts[ngram] = C_star
            else:
                adjusted_counts[ngram] = count
        else:
            adjusted_counts[ngram] = count  # Para conteos máximos, no ajustamos
    return adjusted_counts

adjusted_bigram_counts = good_turing_discounting(bigram_model.ngram_counts)

#Cálculo de probabilidades ajustadas
def calculate_probabilities(adjusted_counts: Dict[Tuple[str, ...], float], n_minus1_counts: Dict[Tuple[str, ...], int]) -> Dict[Tuple[str, ...], float]:
    probabilities = {}
    for ngram, adjusted_count in adjusted_counts.items():
        context = ngram[:-1]
        context_count = n_minus1_counts.get(context, sum(n_minus1_counts.values()))
        probability = adjusted_count / context_count if context_count > 0 else 0.0
        probabilities[ngram] = probability
    return probabilities

def probability_of_unseen(NC: Dict[int, int], total_ngrams: int) -> float:
    N1 = NC.get(1, 0)
    return N1 / total_ngrams if total_ngrams > 0 else 0.0

def sentence_probability(sentence: str, bigram_probabilities: Dict[Tuple[str, str], float], P_unseen: float) -> float:
    tokens = ['<s>'] + sentence.lower().split() + ['</s>']
    probability_log_sum = 0.0
    for i in range(len(tokens) - 1):
        bigram = (tokens[i], tokens[i+1])
        prob = bigram_probabilities.get(bigram, P_unseen)
        probability_log_sum += math.log(prob) if prob > 0 else float('-inf')
    return math.exp(probability_log_sum)

# Calculamos las probabilidades ajustadas para bigramas

NC_ngram = calculate_NC(unigram_model.ngram_counts)
print("Conteos de unigramas")
for ngram, count in unigram_model.ngram_counts.items():
    print(f"{ngram}: {count}")

print("Conteo de conteos")
for count, freq in NC_ngram.items():
    print(f"r = {count}: {freq}")

adjusted_counts = good_turing_discounting(unigram_model.ngram_counts)
print("Conteos de unigramas ajustados")
for ngram, count in adjusted_counts.items():
    print(f"{ngram}: {count}")

probabilities = calculate_probabilities(adjusted_counts, unigram_model.ngram_counts)
print("Probabilidades de unigramas ajustados")
for ngram, count in probabilities.items():
    print(f"{ngram}: {count}")

total_ngrams = sum(bigram_model.ngram_counts.values())
P_unseen_ngram = probability_of_unseen(NC_ngram, total_ngrams)

for test_sentence in test:
    prob = sentence_probability(test_sentence, probabilities, P_unseen_ngram)
    print(f"La probabilidad de la oración '{test_sentence}' es: {prob}")


Conteos de unigramas
('<s>',): 3
('all',): 1
('models',): 2
('are',): 2
('wrong',): 2
('</s>',): 3
('a',): 1
('model',): 1
('some',): 1
('useful',): 1
Conteo de conteos
r = 3: 2
r = 1: 5
r = 2: 3
Conteos de unigramas ajustados
('<s>',): 3
('all',): 1.2
('models',): 2.0
('are',): 2.0
('wrong',): 2.0
('</s>',): 3
('a',): 1.2
('model',): 1.2
('some',): 1.2
('useful',): 1.2
Probabilidades de unigramas ajustados
('<s>',): 0.17647058823529413
('all',): 0.07058823529411765
('models',): 0.11764705882352941
('are',): 0.11764705882352941
('wrong',): 0.11764705882352941
('</s>',): 0.17647058823529413
('a',): 0.07058823529411765
('model',): 0.07058823529411765
('some',): 0.07058823529411765
('useful',): 0.07058823529411765
La probabilidad de la oración 'all models are wrong' es: 0.00581045100255846
La probabilidad de la oración 'a model is wrong' es: 0.00581045100255846
La probabilidad de la oración 'some models are useful' es: 0.00581045100255846


In [107]:
print("f) Probabilidades para r=3")
probs = []
for ngram, count in probabilities.items():
    if(unigram_model.ngram_counts[ngram] != 3):
        probs.append(count)
        print(f"{ngram}: {count}")
    else:
        probs.append(unigram_model.get_ngram_prob(ngram))
        print(f"{ngram}: {unigram_model.get_ngram_prob(ngram)}")
print(f"Suma de PROB: {np.sum(probs)}")


print("\nh) Normalizar las probabilidades")
for ngram, count in probabilities.items():
    print(f"{ngram}: {count/np.sum(probs)}")


f) Probabilidades para r=3
('<s>',): 0.17647058823529413
('all',): 0.07058823529411765
('models',): 0.11764705882352941
('are',): 0.11764705882352941
('wrong',): 0.11764705882352941
('</s>',): 0.17647058823529413
('a',): 0.07058823529411765
('model',): 0.07058823529411765
('some',): 0.07058823529411765
('useful',): 0.07058823529411765
Suma de PROB: 1.0588235294117647

h) Normalizar las probabilidades
('<s>',): 0.16666666666666669
('all',): 0.06666666666666667
('models',): 0.1111111111111111
('are',): 0.1111111111111111
('wrong',): 0.1111111111111111
('</s>',): 0.16666666666666669
('a',): 0.06666666666666667
('model',): 0.06666666666666667
('some',): 0.06666666666666667
('useful',): 0.06666666666666667


## Pregunta 2

In [128]:
class BrownClustering:
  def __init__(self, corpus):
      self.corpus = corpus
      self.word_count = collections.Counter()
      self.word_count.update(self.corpus.lower().split())
      self.classes = []

  def train(self):
      self.classes = [[item] for item in self.word_count.items()]
      print(len(self.classes))
      print(self.word_count)
      print(self.classes)


  def get_class(self, word):
      for i in range(len(self.classes)):
          if word in self.classes[i]:
              return i
      return None



  def prob_word_given_class(self, word):
      return self.word_count[word]/len(self.classes[self.get_class(word)])

  def prob_class(self, c):
      freq = np.sum([w[1] for w in self.classes[c]])
      return freq/len(self.word_count)

  def prob_sequence(self, list_words):
      prob = self.prob_class(self.get_class(list_words[0]))
      prob *= self.prob_word_given_class(list_words[1])
      for i in range(len(list_words[2:])):
          prob *= self.prob_class(self.get_class(list_words[i]))*self.prob_class(self.get_class(list_words[i-1]))
          prob *= self.prob_word_given_class(list_words[i])
      return prob

  def mutual_info(self):



corpus = "Mi libro, luna de pluton, ya esta disponible en todas las librerias de habla hispana"
# corpus = "Los lenguajes son una parte crucial de la inteligencia humana e importantes para la comunicación humana. Al investigar la comprensión automática y la generación de lenguajes humanos, el procesamiento del lenguaje natural (NLP) ha sido un subcampo central de la investigación en inteligencia artificial. Desde la década de 1950, la tecnología de NLP ha recibido una atención continua por parte de la investigación y se han logrado grandes avances. Hoy en día, la tecnología NLP se está convirtiendo en una parte indispensable de nuestro negocio y de nuestra vida diaria. Por ejemplo, los motores de búsqueda procesan automáticamente billones de documentos a través de Internet, obtienen conocimientos de ellos y responden a las consultas de los usuarios basándose en su comprensión. Los minoristas en línea procesan millones de descripciones de productos y comentarios de usuarios para recomendar el producto más adecuado según la búsqueda de un usuario. Los sistemas de diálogo automático y los sistemas de traducción son cada vez más utilizados para facilitar la comunicación. En los negocios, los motores de análisis de texto han estado reemplazando el trabajo manual en el análisis de grandes cantidades de documentos para una mejor toma de decisiones."

brown_clustering = BrownClustering(corpus)
brown_clustering.train()

14
Counter({'de': 2, 'mi': 1, 'libro,': 1, 'luna': 1, 'pluton,': 1, 'ya': 1, 'esta': 1, 'disponible': 1, 'en': 1, 'todas': 1, 'las': 1, 'librerias': 1, 'habla': 1, 'hispana': 1})
[[('mi', 1)], [('libro,', 1)], [('luna', 1)], [('de', 2)], [('pluton,', 1)], [('ya', 1)], [('esta', 1)], [('disponible', 1)], [('en', 1)], [('todas', 1)], [('las', 1)], [('librerias', 1)], [('habla', 1)], [('hispana', 1)]]
