<a href="https://colab.research.google.com/github/Jorgecuenca1/Tps_ProcesamientoLenguajeNatural/blob/main/clase1_tp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np

# 1. Obtener el vocabulario del corpus
def get_vocab(corpus):
    vocab = set()
    for doc in corpus:
        words = doc.split()
        vocab.update(words)
    return list(vocab)

corpus = np.array(['que dia es hoy', 'martes el dia de hoy es martes', 'martes muchas gracias'])
vocab = get_vocab(corpus)
print(vocab)
# 2. One-hot encoding
def one_hot_encode(corpus, vocab):
    vectorized_docs = []
    for doc in corpus:
        words = doc.split()
        vector = [1 if term in words else 0 for term in vocab]
        vectorized_docs.append(vector)
    return np.array(vectorized_docs)

one_hot_encoded = one_hot_encode(corpus, vocab)
print(one_hot_encoded)
# 3. Vectores de frecuencia
def term_frequency(corpus, vocab):
    vectorized_docs = []
    for doc in corpus:
        words = doc.split()
        vector = [words.count(term) for term in vocab]
        vectorized_docs.append(vector)
    return np.array(vectorized_docs)

term_frequencies = term_frequency(corpus, vocab)
print(term_frequencies)
# 4. TF-IDF
def tf_idf(corpus, vocab):
    N = len(corpus)
    term_frequencies = term_frequency(corpus, vocab)
    vectorized_docs = []
    for doc_tf in term_frequencies:
        df = np.count_nonzero(doc_tf)
        idf = np.log(N / (df + 1))
        tf_idf_vector = np.multiply(doc_tf, idf)
        vectorized_docs.append(tf_idf_vector)
    return np.array(vectorized_docs)

tf_idf_vectors = tf_idf(corpus, vocab)
print(tf_idf_vectors)
# 5. Comparación de documentos
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * (np.linalg.norm(b)))

def compare_docs(corpus, index):
    tf_idf_vectors = tf_idf(corpus, vocab)
    similarities = [cosine_similarity(tf_idf_vectors[index], vec) for vec in tf_idf_vectors]
    sorted_docs = np.argsort(similarities)[::-1]
    return sorted_docs

doc_similarities = compare_docs(corpus, 0)
print(doc_similarities)


['el', 'hoy', 'de', 'gracias', 'muchas', 'que', 'martes', 'dia', 'es']
[[0 1 0 0 0 1 0 1 1]
 [1 1 1 0 0 0 1 1 1]
 [0 0 0 1 1 0 1 0 0]]
[[0 1 0 0 0 1 0 1 1]
 [1 1 1 0 0 0 2 1 1]
 [0 0 0 1 1 0 1 0 0]]
[[-0.         -0.51082562 -0.         -0.         -0.         -0.51082562
  -0.         -0.51082562 -0.51082562]
 [-0.84729786 -0.84729786 -0.84729786 -0.         -0.         -0.
  -1.69459572 -0.84729786 -0.84729786]
 [-0.         -0.         -0.         -0.28768207 -0.28768207 -0.
  -0.28768207 -0.         -0.        ]]
[0 1 2]
