# Actividad 4.3. Similitud en textos mediante TF-IDF y Cadenas de Markov
---
**Por: Ian Joab Padron Corona - A01708940**

In [None]:
import math
import re
from collections import Counter

import numpy as np
import pandas as pd
from numpy.typing import ArrayLike

In [None]:
def df_file(file: str) -> pd.DataFrame:
    """
    Loads a CSV file from the local `content` folder into a pandas DataFrame.

    Parameters
    ----------
    file : `str` Name of the CSV file, relative to `./content/`.

    Returns
    -------
    `DataFrame` With the file contents; the first row is used as the header
    and the file is decoded as UTF-8.
    """
    csv_path = f'./content/{file}'
    return pd.read_csv(csv_path, header=0, encoding='utf-8')

In [None]:
def clean_text(text: str) -> str:
    """
    Cleans a text by replacing special characters with spaces and converting
    it to lowercase.

    Only ASCII letters, digits and whitespace are kept; every other character
    (punctuation, backslashes, accented letters, ...) is replaced by a single
    space, so the length of the text is preserved.

    Parameters
    ----------
    text : `str` Text to clean. Non-string values are converted with `str()`
           first (so a missing value such as NaN becomes the text 'nan').

    Returns
    -------
    `str` Cleaned, lowercased text.
    """
    if not isinstance(text, str):
        text = str(text)
    # Inside a raw string the whitespace class is written `\s`; the previous
    # `\\s` meant "a literal backslash or the letter s", which let
    # backslashes survive the cleaning.
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    return text.lower()

In [None]:
def array_words(text: str) -> list:
    """
    Tokenizes a text on whitespace.

    Parameters
    ----------
    text : `str` Text to tokenize.

    Returns
    -------
    `list` Words of the text in their original order; runs of whitespace are
    treated as a single separator, so no empty strings are produced.
    """
    tokens = text.split()
    return tokens

In [None]:
def uniqueWords(q1: str, q2: str) -> list:
    """
    Builds the shared vocabulary (bag of words) of two text strings.

    Parameters
    ----------
    q1 : `str` First text string.
    q2 : `str` Second text string.

    Returns
    -------
    `list` Unique words of both texts, in order of first appearance
    (words of `q1` first, then the new words of `q2`).
    """
    combined = array_words(q1) + array_words(q2)
    # A set would also remove duplicates, but its iteration order depends on
    # string hashing and can change between interpreter runs, which would
    # reorder every vector derived from this vocabulary. A dict keeps
    # insertion order, so the result is reproducible.
    return list(dict.fromkeys(combined))

In [None]:
def countWords(q: str, wordsq1q2: list) -> list:
    """
    Computes the term frequency (TF) of each vocabulary word in a text string.

    The frequency of a word is the number of times it appears in `q` divided
    by the total number of words in `q`.

    Parameters
    ----------
    q : `str` Text string.
    wordsq1q2 : `list` List of words to count.

    Returns
    -------
    `list` Relative frequency of each word of `wordsq1q2` in `q`, in the same
    order as `wordsq1q2`. If `q` contains no words, every frequency is 0.0.
    """
    q_words = array_words(q)
    total = len(q_words)
    if total == 0:
        # A text with no tokens (e.g. only punctuation before cleaning) would
        # otherwise divide by zero; it simply contains none of the words.
        return [0.0] * len(wordsq1q2)
    # Count every token once instead of rescanning the list for each word.
    occurrences = Counter(q_words)
    return [occurrences[word] / total for word in wordsq1q2]

In [None]:
def idf(words: list, q1: str, q2: str) -> list:
    """
    Calculates the IDF of a list of words, treating the two text strings as
    the whole document collection.

    For each word the value is `ln(N / (df + 1)) + 1`, where `N = 2` is the
    number of documents and `df` is the number of documents (0, 1 or 2) that
    contain the word.

    Parameters
    ----------
    words : `list` List of words.
    q1 : `str` First text string.
    q2 : `str` Second text string.

    Returns
    -------
    `list` IDF value of each word, in the same order as `words`.
    """
    # Tokenize each document once, outside the loop; sets make the
    # membership test independent of the document length.
    documents = (set(array_words(q1)), set(array_words(q2)))
    total_docs = len(documents)
    idfs = []
    for word in words:
        docs_count = sum(1 for doc_words in documents if word in doc_words)
        idfs.append(math.log(total_docs / (docs_count + 1)) + 1)
    return idfs

In [None]:
def tfByIDF(tfQ: list, idf: list) -> list:
    """
    Weights each term frequency by its IDF value (element-wise product).

    Parameters
    ----------
    tfQ : `list` Term frequencies, one per vocabulary word.
    idf : `list` IDF values, aligned position by position with `tfQ`.

    Returns
    -------
    `list` TF-IDF value of each word. If the inputs differ in length, the
    result is as long as the shorter one.
    """
    weighted = []
    for frequency, inverse_frequency in zip(tfQ, idf):
        weighted.append(frequency * inverse_frequency)
    return weighted

In [None]:
def cosine_similarity(tf_idf_q1: ArrayLike, tf_idf_q2: ArrayLike) -> float:
    """
    Computes the cosine of the angle between two TF-IDF vectors.

    Parameters
    ----------
    tf_idf_q1 : `ArrayLike` TF-IDF vector of the first text.
    tf_idf_q2 : `ArrayLike` TF-IDF vector of the second text.

    Returns
    -------
    Dot product of the vectors divided by the product of their Euclidean
    norms, or 0 when either vector has zero norm.
    """
    numerator = np.dot(tf_idf_q1, tf_idf_q2)
    length_q1 = np.linalg.norm(tf_idf_q1)
    length_q2 = np.linalg.norm(tf_idf_q2)

    # A zero vector has no direction, so the similarity is defined as 0.
    if length_q1 == 0 or length_q2 == 0:
        return 0
    return numerator / (length_q1 * length_q2)

In [None]:
def word_follow(text: str) -> dict:
    """
    Collects, for every word of a text, the words that come right after it.

    Parameters
    ----------
    text : `str` Text to analyze.

    Returns
    -------
    `dict` Maps each word that has a successor to the list of words that
    immediately follow it, one entry per occurrence (repetitions are kept).
    A word that only appears as the last token gets no key.
    """
    tokens = array_words(text)
    successors = {}
    # Pair every token with the one after it.
    for current, following in zip(tokens, tokens[1:]):
        successors.setdefault(current, []).append(following)
    return successors

In [None]:
def word_follow_matrix(bow: list, follows: dict) -> pd.DataFrame:
    """
    Creates the transition matrix of words that follow other words.

    Row `w`, column `v` holds the fraction of the followers of `w` that are
    `v`, so every row of a word that has followers sums to 1. Words without
    followers get a row of zeros.

    Parameters
    ----------
    bow : `list` List of words (Bag of Words); defines row and column order.
    follows : `dict` Dictionary with words and their following words.

    Returns
    -------
    `DataFrame` Square matrix indexed and labelled by `bow`.

    Raises
    ------
    ValueError
        If a following word is not part of `bow`.
    """
    # Column position of every word, computed once instead of scanning `bow`
    # for each follower. The first occurrence wins, like `list.index`.
    position = {}
    for idx, word in enumerate(bow):
        position.setdefault(word, idx)

    matrix = []
    for word in bow:
        row = [0] * len(bow)
        successors = follows.get(word)
        # An empty follower list is treated like a missing one; normalizing
        # it would divide by zero.
        if successors:
            for successor in successors:
                if successor not in position:
                    raise ValueError(f"{successor!r} is not in the bag of words")
                row[position[successor]] += 1
            row = [count / len(successors) for count in row]
        matrix.append(row)
    return pd.DataFrame(matrix, index=bow, columns=bow)

In [None]:
def cosine_similarity_matrix(prod_int: float, m1: ArrayLike, m2: ArrayLike) -> float:
    """
    Computes the cosine similarity of two matrices from their inner product.

    Parameters
    ----------
    prod_int : `float` Inner product of the matrices, computed by the caller.
    m1 : `ArrayLike` First matrix.
    m2 : `ArrayLike` Second matrix.

    Returns
    -------
    `prod_int` divided by the product of the norms of `m1` and `m2` as given
    by `np.linalg.norm`, or 0 when either norm is zero.
    """
    size_m1 = np.linalg.norm(m1)
    size_m2 = np.linalg.norm(m2)

    # An all-zero matrix has no direction; report no similarity.
    if size_m1 == 0 or size_m2 == 0:
        return 0
    return prod_int / (size_m1 * size_m2)

In [None]:
# Name of the input file (df_file looks for it inside ./content/)
archivo = 'questions.csv'

# Load the DataFrame and keep only its first row. The explicit copy makes
# `data` an independent frame, so the column assignments below do not write
# into a slice of the loaded DataFrame.
data = df_file(archivo).head(1).copy()

# Clean the text of the 'question1' and 'question2' columns
data['question1'] = data['question1'].apply(clean_text)
data['question2'] = data['question2'].apply(clean_text)

# Vocabulary (bag of words) shared by question1 and question2, without duplicates
data['BoWQ1Q2'] = data.apply(lambda x: uniqueWords(x['question1'], x['question2']), axis=1)

# Term frequency of every vocabulary word in q1 and in q2.
# The vocabulary lives in 'BoWQ1Q2'; there is no 'words' column in this frame.
data['TF_q1'] = data.apply(lambda x: countWords(x['question1'], x['BoWQ1Q2']), axis=1)
data['TF_q2'] = data.apply(lambda x: countWords(x['question2'], x['BoWQ1Q2']), axis=1)

# IDF of every vocabulary word
data['vecIDF'] = data.apply(lambda x: idf(x['BoWQ1Q2'], x['question1'], x['question2']), axis=1)

# TF-IDF vector of each question
data['q1_vecTFIDF'] = data.apply(lambda x: tfByIDF(x['TF_q1'], x['vecIDF']), axis=1)
data['q2_vecTFIDF'] = data.apply(lambda x: tfByIDF(x['TF_q2'], x['vecIDF']), axis=1)

# Cosine similarity between the TF-IDF vectors of q1 and q2
data['cos_TFID'] = data.apply(lambda x: cosine_similarity(x['q1_vecTFIDF'], x['q2_vecTFIDF']), axis=1)


# Dictionary of the words that follow each word, for q1 and for q2
data['q1_wordsFollow'] = data.apply(lambda x: word_follow(x['question1']), axis=1)
data['q2_wordsFollow'] = data.apply(lambda x: word_follow(x['question2']), axis=1)

# Word-transition (Markov) matrix of q1 and of q2 over the shared vocabulary
data['q1_markovMatrix'] = data.apply(lambda x: word_follow_matrix(x['BoWQ1Q2'], x['q1_wordsFollow']), axis=1)
data['q2_markovMatrix'] = data.apply(lambda x: word_follow_matrix(x['BoWQ1Q2'], x['q2_wordsFollow']), axis=1)

# Transpose the q2 matrix (the column keeps the transposed version from here on)
data['q2_markovMatrix'] = data.apply(lambda x: x['q2_markovMatrix'].T, axis=1)

# Matrix product of the q1 matrix and the transposed q2 matrix
data['dotMatrix'] = data.apply(lambda x: x['q1_markovMatrix'].dot(x['q2_markovMatrix']), axis=1)

# Trace of the product, used below as the inner product of the two matrices
data['trace_matrix'] = data.apply(lambda x: np.trace(x['dotMatrix']), axis=1)

# Cosine similarity between the q1 and q2 matrices, computed from the trace
data['cosine_distance_matrix'] = data.apply(
    lambda x: cosine_similarity_matrix(
        x['trace_matrix'],
        x['q1_markovMatrix'],
        x['q2_markovMatrix'],
    ),
    axis=1,
)


# Save the DataFrame to a CSV file
data.to_csv('./outputs/tf_IDF.csv', index=False)

In [15]:
# Sample sentences used to try out the Markov-chain helpers
text1 = "b a c a b d b a b"
text2 = "a c b a d c"

# word_follow: print the successor dictionary of each sample, one per line
result1, result2 = word_follow(text1), word_follow(text2)
print(result1, result2, sep='\n')

# word_follow_matrix on the first sample, using its own vocabulary
bow1 = list(set(array_words(text1)))
follows1 = word_follow(text1)
matrix1 = word_follow_matrix(bow1, follows1)
print(matrix1)

# word_follow_matrix on the second sample, using its own vocabulary
bow2 = list(set(array_words(text2)))
follows2 = word_follow(text2)
matrix2 = word_follow_matrix(bow2, follows2)
print(matrix2)

{'b': ['a', 'd', 'a'], 'a': ['c', 'b', 'b'], 'c': ['a'], 'd': ['b']}
{'a': ['c', 'd'], 'c': ['b'], 'b': ['a'], 'd': ['c']}
          a         b         d         c
a  0.000000  0.666667  0.000000  0.333333
b  0.666667  0.000000  0.333333  0.000000
d  0.000000  1.000000  0.000000  0.000000
c  1.000000  0.000000  0.000000  0.000000
     a    b    d    c
a  0.0  0.0  0.5  0.5
b  1.0  0.0  0.0  0.0
d  0.0  0.0  0.0  1.0
c  0.0  1.0  0.0  0.0
