# Actividad 4.4. Comparación entre técnicas de semejanza
---
Instituto Tecnológico y de Estudios Superiores de Monterrey

Campus Querétaro

TC3002B Desarrollo de aplicaciones avanzadas de ciencias computacionales

Módulo 4 Resultados de la comparación entre técnicas de semejanza.

Profesores:

Manuel Iván Casillas del Llano


Presenta:

**Ian Joab Padron Corona - A01708940**

Fecha:

Miércoles, 23 de abril del 2025

<a href="https://colab.research.google.com/github/Ian326/TC3002B_M4/blob/main/A01708940_Actividad%204.4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Librerías

In [None]:
import math
import os
import re
from collections import Counter

import numpy as np
import pandas as pd
from numpy.typing import ArrayLike

## Funciones

In [None]:
def txt_to_series(file: str) -> pd.Series:
    """
    Loads a TXT file from the `./content` folder into a single-entry pandas Series.

    Parameters
    ----------
    file : `str` Name of the TXT file, relative to the `./content` folder.

    Returns
    -------
    series : `pd.Series` One-element Series whose only entry is the whole file
        content with leading and trailing whitespace stripped.
    """
    with open(f'./content/{file}', 'r', encoding='utf-8') as handle:
        stripped = handle.read().strip()
    return pd.Series([stripped])

In [None]:
def df_file(file: str) -> pd.DataFrame:
    """
    Loads a CSV file from the `./content` folder into a pandas DataFrame.

    Parameters
    ----------
    file : `str` Name of the CSV file, relative to the `./content` folder.

    Returns
    -------
    data : `DataFrame` Parsed CSV content; the first line of the file is used
        as the column header.
    """
    csv_path = f'./content/{file}'
    return pd.read_csv(csv_path, header=0, encoding='utf-8')

In [None]:
def clean_text(text: str) -> str:
    """
    Normalizes a text: drops every character that is not an ASCII letter,
    a digit or whitespace, then lowercases the result.

    Parameters
    ----------
    text : `str` Text to clean. Non-string values are converted with `str()` first.

    Returns
    -------
    Cleaned, lowercase text.
    """
    raw = text if isinstance(text, str) else str(text)
    # Strip first, lowercase second: some non-ASCII characters lowercase into
    # ASCII letters, so swapping the two steps would change the output.
    stripped = re.sub(r'[^a-zA-Z0-9\s]', '', raw)
    return stripped.lower()

In [None]:
def array_words(text: str) -> list:
    """
    Tokenizes a text on whitespace.

    Parameters
    ----------
    text : `str` Text to tokenize.

    Returns
    -------
    List with the whitespace-separated words of the text, in order of
    appearance (empty list when the text has no words).
    """
    tokens = text.split()
    return tokens

In [None]:
def uniqueWords(q1: str, q2: str) -> list:
    """
    Builds the shared vocabulary of two text strings.

    Parameters
    ----------
    q1 : `str` First text string.
    q2 : `str` Second text string.

    Returns
    -------
    List with every distinct word of `q1` and `q2`, without duplicates, ordered
    by first occurrence (words of `q1` first, then the new words of `q2`).
    """
    words_q1 = array_words(q1)
    words_q2 = array_words(q2)
    # dict.fromkeys de-duplicates while keeping insertion order, so the
    # vocabulary order is the same on every run (a set's iteration order
    # changes with string hash randomization).
    return list(dict.fromkeys(words_q1 + words_q2))

In [None]:
def countWords(q: str, wordsq1q2: list, call: str) -> list:
    """
    Builds the frequency vector of a text string over a given vocabulary.

    Parameters
    ----------
    q : `str` Text string.
    wordsq1q2 : `list` Vocabulary: the words to count, in output order.
    call : `str` Use case for the function.
        `BoW` for Act4.2: raw number of occurrences of each word.

        `TF` for Act4.3: occurrences divided by the number of words in `q`
        (0 for every word when `q` has no words).

    Returns
    -------
    List with one frequency per word of `wordsq1q2`.

    Raises
    ------
    ValueError : if `call` is neither `BoW` nor `TF`.
    """
    # Validate up front so an invalid mode is reported even for an empty vocabulary.
    if call not in ('BoW', 'TF'):
        raise ValueError("Invalid call type. Use 'BoW' or 'TF'.")

    q_words = array_words(q)
    # Count every word once instead of rescanning the text for each vocabulary word.
    occurrences = Counter(q_words)

    if call == 'BoW':
        return [occurrences[word] for word in wordsq1q2]

    total_words = len(q_words)
    if total_words == 0:
        return [0 for _ in wordsq1q2]
    return [occurrences[word] / total_words for word in wordsq1q2]

In [None]:
def idf(words: list, q1: str, q2: str) -> list:
    """
    Calculates the IDF of a list of words over a corpus made of two text strings.

    Each value is `ln(2 / (docs_count + 1)) + 1`, where `docs_count` is the
    number of texts (0, 1 or 2) that contain the word.

    Parameters
    ----------
    words : `list` List of words.
    q1 : `str` First text string.
    q2 : `str` Second text string.

    Returns
    -------
    List of IDF values, one per word and in the same order as `words`.
    """
    # Tokenize each text once and keep the words in sets, instead of
    # re-splitting both texts for every word.
    documents = (set(array_words(q1)), set(array_words(q2)))
    idfs = []
    for word in words:
        docs_count = sum(1 for document in documents if word in document)
        idfs.append(math.log(2 / (docs_count + 1)) + 1)
    return idfs

In [None]:
def tfByIDF(tfQ: list, idf: list) -> list:
    """
    Multiplies term frequencies by their IDF weights, element by element.

    Parameters
    ----------
    tfQ : `list` List of word frequencies.
    idf : `list` List of IDF values, aligned with `tfQ`.

    Returns
    -------
    List of TF-IDF values; its length is that of the shorter input.
    """
    weighted = []
    for frequency, weight in zip(tfQ, idf):
        weighted.append(frequency * weight)
    return weighted

In [None]:
def cosine_similarity(tf_idf_q1: ArrayLike, tf_idf_q2: ArrayLike) -> float:
    """
    Computes the cosine of the angle between two vectors.

    Parameters
    ----------
    tf_idf_q1 : `list` Vector for the first text.
    tf_idf_q2 : `list` Vector for the second text.

    Returns
    -------
    Dot product divided by the product of the Euclidean norms, or 0 when
    either vector has zero norm.
    """
    numerator = np.dot(tf_idf_q1, tf_idf_q2)
    length_q1 = np.linalg.norm(tf_idf_q1)
    length_q2 = np.linalg.norm(tf_idf_q2)
    # A zero-norm vector has no direction; report no similarity instead of dividing by zero.
    if not (length_q1 != 0 and length_q2 != 0):
        return 0
    return numerator / (length_q1 * length_q2)

In [None]:
def word_follow(text: str) -> dict:
    """
    Maps every word of a text to the words that come right after it.

    Parameters
    ----------
    text : `str` Text to analyze.

    Returns
    -------
    Dictionary whose keys are the words that have a successor and whose values
    are the lists of successors, one entry per occurrence and in text order.
    """
    tokens = array_words(text)
    successors = {}
    # Pair each word with the one right after it; the last word has no
    # successor, so it only becomes a key if it also appears earlier.
    for current, following in zip(tokens, tokens[1:]):
        successors.setdefault(current, []).append(following)
    return successors

In [None]:
def word_follow_matrix(bow: list, follows: dict) -> pd.DataFrame:
    """
    Creates the transition matrix of words that follow other words.

    Parameters
    ----------
    bow : `list` List of words (Bag of Words); defines both rows and columns.
    follows : `dict` Dictionary with words and their following words.

    Returns
    -------
    Square DataFrame indexed by `bow` on both axes. Cell (w, v) is the share of
    the successors of `w` that are `v`; a word with no successors gets an
    all-zero row.

    Raises
    ------
    ValueError : if a successor listed in `follows` is not present in `bow`.
    """
    size = len(bow)
    # Column of each word, resolved once. setdefault keeps the first position
    # of a repeated word, which is what list.index would return.
    column_of = {}
    for position, word in enumerate(bow):
        column_of.setdefault(word, position)

    matrix = []
    for word in bow:
        row = [0] * size
        successors = follows.get(word)
        # Skipping empty successor lists avoids a division by zero below.
        if successors:
            for successor in successors:
                if successor not in column_of:
                    raise ValueError(f"{successor!r} is not in bow")
                row[column_of[successor]] += 1
            total = len(successors)
            # Normalize the counts so the row sums to 1.
            row = [count / total for count in row]
        matrix.append(row)
    return pd.DataFrame(matrix, index=bow, columns=bow)

In [None]:
def cosine_similarity_matrix(prod_int: float, m1: ArrayLike, m2: ArrayLike) -> float:
    """
    Computes the cosine similarity between two matrices from their inner product.

    Parameters
    ----------
    prod_int : `float` Inner product of the matrices, computed by the caller.
    m1 : `ArrayLike` First matrix.
    m2 : `ArrayLike` Second matrix.

    Returns
    -------
    `prod_int` divided by the product of the norms of `m1` and `m2` (as given
    by `np.linalg.norm`), or 0 when either norm is zero.
    """
    size_m1 = np.linalg.norm(m1)
    size_m2 = np.linalg.norm(m2)
    # An all-zero matrix has no direction; report no similarity instead of dividing by zero.
    if not (size_m1 != 0 and size_m2 != 0):
        return 0
    return prod_int / (size_m1 * size_m2)

In [None]:
def txts_to_df(original: str, similar_files: list) -> pd.DataFrame:
    """
    Creates a DataFrame pairing the original text with each edited text.

    Parameters
    ----------
    original : `str` Name of the TXT file that holds the original text.
    similar_files : `list` Names of the TXT files that hold the edited texts.
        The part of each name before the first `_` is stored as its
        similarity grade (e.g. `high_01.txt` -> `high`).

    Returns
    -------
    DataFrame with the columns `original`, `edited` and `simGrade`, built from
    one frame per file in `similar_files`. It is empty (same columns) when
    `similar_files` is empty.
    """
    # The original text is read once and reused for every pair.
    original_series = txt_to_series(original)

    frames = [
        pd.DataFrame({
            'original': original_series,
            'edited': txt_to_series(file),
            'simGrade': file.split('_')[0],  # Extracts 'high', 'low', or 'moderate'
        })
        for file in similar_files
    ]

    # pd.concat raises on an empty list, so return the empty frame explicitly.
    if not frames:
        return pd.DataFrame(columns=['original', 'edited', 'simGrade'])
    return pd.concat(frames, ignore_index=True)
    

In [None]:
def check_prediction(expected: str, predicted: float) -> bool:
    """
    Checks if the predicted similarity falls in the range of the expected grade.

    Ranges:
        `low`      : 0 <= predicted < 0.45
        `moderate` : 0.45 <= predicted < 0.85
        `high`     : predicted >= 0.85

    Parameters
    ----------
    expected : `str` Expected grade: 'low', 'moderate', or 'high'.
    predicted : `float` Predicted similarity value.

    Returns
    -------
    bool : True if the prediction is correct, False otherwise (including when
        `expected` is not one of the three known grades).
    """
    if expected == 'low':
        return bool(0 <= predicted < 0.45)
    if expected == 'moderate':
        # The lower bound is 0.45; a bound of 45 made this range impossible to satisfy.
        return bool(0.45 <= predicted < 0.85)
    if expected == 'high':
        return bool(predicted >= 0.85)
    return False

## Lectura y Limpieza

In [None]:
# Name of the file that holds the reference text
original = 'original.txt'

# Edited versions of the text, grouped by the similarity grade in their file-name prefix
high_similarity = [
    'high_00.txt',
    'high_01.txt',
    'high_02.txt',
    'high_03.txt',
]
low_similarity = [
    'low_01.txt',
    'low_02.txt',
    'low_03.txt',
]
moderate_similarity = [
    'moderate_01.txt',
    'moderate_02.txt',
    'moderate_03.txt',
]

similar_files = [*high_similarity, *low_similarity, *moderate_similarity]

# One row per edited file: original text, edited text and expected grade
data = txts_to_df(original, similar_files)

## BoW

In [None]:
# Build a column holding the row's vocabulary: every distinct word of 'original' and 'edited'
data['BoWQ1Q2'] = data.apply(lambda x: uniqueWords(x['original'], x['edited']), axis=1)

# Build the count vectors: number of times each vocabulary word appears in q1 (original) and q2 (edited)
data['q1_vecBoW'] = data.apply(lambda x: countWords(x['original'], x['BoWQ1Q2'], 'BoW'), axis=1)
data['q2_vecBoW'] = data.apply(lambda x: countWords(x['edited'], x['BoWQ1Q2'], 'BoW'), axis=1)

# Build a column with the cosine similarity between the BoW vectors of q1 and q2
data['cos_BOW'] = data.apply(lambda x: cosine_similarity(x['q1_vecBoW'], x['q2_vecBoW']), axis=1)

## Term Frequency / Inverse Document Frequency (TF-IDF)

In [None]:
# Build the term-frequency vectors: occurrences of each vocabulary word in q1 and q2,
# divided by the number of words of that text ('TF' mode of countWords)
data['TF_q1'] = data.apply(lambda x: countWords(x['original'], x['BoWQ1Q2'], 'TF'), axis=1)
data['TF_q2'] = data.apply(lambda x: countWords(x['edited'], x['BoWQ1Q2'], 'TF'), axis=1)

# Build a column with the IDF value of each vocabulary word (same vector for q1 and q2)
data['vecIDF'] = data.apply(lambda x: idf(x['BoWQ1Q2'], x['original'], x['edited']), axis=1)

# Build the TF-IDF vectors: each term frequency multiplied by the IDF of its word
data['q1_vecTFIDF'] = data.apply(lambda x: tfByIDF(x['TF_q1'], x['vecIDF']), axis=1)
data['q2_vecTFIDF'] = data.apply(lambda x: tfByIDF(x['TF_q2'], x['vecIDF']), axis=1)

# Build a column with the cosine similarity between the TF-IDF vectors of q1 and q2
data['cos_TFID'] = data.apply(lambda x: cosine_similarity(x['q1_vecTFIDF'], x['q2_vecTFIDF']), axis=1)

## Cadenas de Markov

In [None]:
# Build, for q1 and q2, a dictionary mapping each word to the words that follow it
data['q1_wordsFollow'] = data.apply(lambda x: word_follow(x['original']), axis=1)
data['q2_wordsFollow'] = data.apply(lambda x: word_follow(x['edited']), axis=1)

# Build, for q1 and q2, the word-follow matrix over the shared vocabulary
# (each row holds the share of times every word follows the row's word)
data['q1_vecMark'] = data.apply(lambda x: word_follow_matrix(x['BoWQ1Q2'], x['q1_wordsFollow']), axis=1)
data['q2_vecMark'] = data.apply(lambda x: word_follow_matrix(x['BoWQ1Q2'], x['q2_wordsFollow']), axis=1)

# Transpose the q2 matrix
data['q2T_vecMark'] = data.apply(lambda x: x['q2_vecMark'].T, axis=1)

# Multiply the q1 matrix by the transposed q2 matrix
data['dotMatrix'] = data.apply(lambda x: x['q1_vecMark'].dot(x['q2T_vecMark']), axis=1)

# Take the trace of the product; it is used below as the inner product of the two matrices
data['prod_int'] = data.apply(lambda x: np.trace(x['dotMatrix']), axis=1)

# Build a column with the cosine similarity between the q1 and q2 matrices,
# computed from that inner product and the norms of both matrices
data['cos_MARK'] = data.apply(lambda x: cosine_similarity_matrix(x['prod_int'], 
                                                                               x['q1_vecMark'], 
                                                                               x['q2_vecMark']), 
                                                                               axis=1)

## Output

In [None]:
# Flatten the q1, q2, q2T and dotMatrix matrices into 1-D arrays, as the activity requires
# NOTE(review): these four columns are dropped right below, so the flattened
# arrays never reach the output — confirm whether this step is still needed
data['q1_vecMark'] = data['q1_vecMark'].apply(lambda x: x.to_numpy().flatten())
data['q2_vecMark'] = data['q2_vecMark'].apply(lambda x: x.to_numpy().flatten())
data['q2T_vecMark'] = data['q2T_vecMark'].apply(lambda x: x.to_numpy().flatten())
data['dotMatrix'] = data['dotMatrix'].apply(lambda x: x.to_numpy().flatten())

# Remove the intermediate columns; only the texts, the expected grade and the
# three cosine scores (cos_BOW, cos_TFID, cos_MARK) are kept
data = data.drop(columns=['BoWQ1Q2', 'q1_vecBoW', 'q2_vecBoW', 
                          'TF_q1', 'TF_q2', 'vecIDF', 'q1_vecTFIDF', 'q2_vecTFIDF',
                          'q1_wordsFollow', 'q2_wordsFollow', 'q1_vecMark', 'q2_vecMark', 'q2T_vecMark', 'dotMatrix',
                          'prod_int'])

In [None]:
# Build a 'BOW_correct' column telling whether the cos_BOW score falls in the range of the expected grade
data['BOW_correct'] = data.apply(lambda x: check_prediction(x['simGrade'], x['cos_BOW']), axis=1)
# Build a 'TFIDF_correct' column telling whether the cos_TFID score falls in the range of the expected grade
data['TFIDF_correct'] = data.apply(lambda x: check_prediction(x['simGrade'], x['cos_TFID']), axis=1)
# Build a 'MARK_correct' column telling whether the cos_MARK score falls in the range of the expected grade
data['MARK_correct'] = data.apply(lambda x: check_prediction(x['simGrade'], x['cos_MARK']), axis=1)

In [None]:
# Show how many hits (True) and misses (False) the BoW method had
data['BOW_correct'].value_counts()

In [None]:
# Show how many hits (True) and misses (False) the TF-IDF method had
data['TFIDF_correct'].value_counts()

In [None]:
# Show how many hits (True) and misses (False) the Markov-chain method had
data['MARK_correct'].value_counts()

In [None]:
# Save the DataFrame to a CSV file. The output folder is created first because
# DataFrame.to_csv does not create missing parent directories.
os.makedirs('./outputs', exist_ok=True)
data.to_csv('./outputs/compare_BoW_TF-IDF_Markov.csv', index=False)

Analizando los resultados de esta implementación, es posible concluir que ninguna técnica es realmente efectiva si los textos son lo suficientemente distintos en las palabras que utilizan, aunque al final vengan diciendo lo mismo. Si bien cada uno de los métodos ofrece un enfoque distinto a los demás, es importante aclarar que no son completamente satisfactorios, pues, aunque ofrecen soluciones similares, aún tienen desempeños bastante pobres en cuanto a la detección del grado de semejanza entre los textos.