Código reutilizado de la actividad 4.4 | Módulo Métodos cuantitativos y Simulación.

Agregar columnas de cálculo BoW, TF-IDF y Markov

In [1]:
import pandas as pd
import numpy as np
import re, math
from sklearn.metrics.pairwise import cosine_similarity

INPUT_CSV  = "dataset.csv"
OUTPUT_CSV = "dataset_Similarity_calculation.csv"
N_ROWS     = 1000

def clean(text: str) -> str:
    """Normalise a raw snippet so it can be split into tokens.

    The value is lower-cased, every character that is not a digit, an
    unaccented ASCII letter, a Spanish accented letter (á é í ó ú ü ñ) or
    whitespace is replaced by a space, and runs of whitespace are collapsed
    into a single space.

    Args:
        text: The raw cell value. Missing values (``None``, ``NaN``,
            ``pd.NA``) are accepted, and any other non-string value is
            converted with ``str()`` before cleaning.

    Returns:
        The cleaned string, or ``""`` for a missing value.
    """
    if not isinstance(text, str):
        if pd.isna(text):
            return ""
        # A cell parsed as a number (or any other scalar) has no .lower();
        # coerce it instead of failing with AttributeError.
        text = str(text)
    text = text.lower()
    text = re.sub(r"[^0-9a-záéíóúüñ\s]", " ", text)
    return re.sub(r"\s+", " ", text).strip()

def markov_vector(text: str, alphabet: list[str]) -> list[float]:
    """Return the flattened character-transition matrix of *text*.

    Spaces are removed from *text* first, so transitions are counted between
    consecutive non-space characters. Entry ``(i, j)`` of the matrix counts
    how often ``alphabet[i]`` is immediately followed by ``alphabet[j]``;
    pairs involving a character that is not in *alphabet* are ignored. Each
    row with at least one transition is then divided by its sum, and rows
    without transitions stay all zeros.

    Args:
        text: The string whose character transitions are counted.
        alphabet: The symbols that define the rows and columns of the matrix.

    Returns:
        The ``len(alphabet) ** 2`` matrix entries in row-major order.
    """
    position = {symbol: k for k, symbol in enumerate(alphabet)}
    n = len(alphabet)
    counts = np.zeros((n, n), dtype=float)

    chars = text.replace(" ", "")
    for prev, nxt in zip(chars, chars[1:]):
        if prev not in position or nxt not in position:
            continue
        counts[position[prev], position[nxt]] += 1

    # Normalise every non-empty row in place; empty rows are left untouched.
    totals = counts.sum(axis=1, keepdims=True)
    np.divide(counts, totals, out=counts, where=totals != 0)
    return counts.flatten().tolist()


# Load at most N_ROWS rows of the input, then add a cleaned copy of each of
# the two code columns next to the raw one.
df = pd.read_csv(INPUT_CSV, nrows=N_ROWS)

for raw_col, clean_col in (("code1", "code1_clean"), ("code2", "code2_clean")):
    df[clean_col] = df[raw_col].apply(clean)

# Containers: for each representation, one vector per snippet and one cosine
# similarity per row.
code1_vecBoW, code2_vecBoW, cosBoW = [], [], []
code1_vecTFIDF, code2_vecTFIDF, cosTFIDF = [], [], []
code1_vecMark, code2_vecMark, cosMark = [], [], []


def _pair_cosine(u, v):
    """Return the cosine similarity of two equal-length vectors.

    Empty vectors (both cleaned snippets were empty) yield 0.0, because
    cosine_similarity rejects input that has zero features.
    """
    if not u:
        return 0.0
    return cosine_similarity([u], [v])[0, 0]


# Processing: every row is treated as its own two-document corpus.
for s1, s2 in zip(df["code1_clean"], df["code2_clean"]):
    tokens1, tokens2 = s1.split(), s2.split()

    # Bag of Words: the vocabulary is built in order of first appearance
    # across both snippets, and each vector holds raw token counts.
    vocab, index_of = [], {}
    for w in tokens1 + tokens2:
        if w not in index_of:
            index_of[w] = len(vocab)
            vocab.append(w)
    v1 = [0] * len(vocab)
    v2 = [0] * len(vocab)
    for w in tokens1:
        v1[index_of[w]] += 1
    for w in tokens2:
        v2[index_of[w]] += 1
    code1_vecBoW.append(v1)
    code2_vecBoW.append(v2)
    cosBoW.append(_pair_cosine(v1, v2))

    # TF-IDF: term frequency is count / total tokens ("or 1" avoids a
    # division by zero for an empty snippet).
    total1, total2 = sum(v1) or 1, sum(v2) or 1
    tf1 = [c / total1 for c in v1]
    tf2 = [c / total2 for c in v2]
    idf = []
    for c1, c2 in zip(v1, v2):
        # Document frequency comes from the token counts. Testing
        # `w in s1` on the raw string would be a substring match and would
        # count e.g. "a" as present in a snippet that only contains "class".
        df_w = (1 if c1 else 0) + (1 if c2 else 0)
        # The 2 in the numerator is presumably the number of documents
        # (the two snippets of the pair).
        idf.append(math.log(2 / (df_w + 1)) + 1)
    tfidf1 = [tf * weight for tf, weight in zip(tf1, idf)]
    tfidf2 = [tf * weight for tf, weight in zip(tf2, idf)]
    code1_vecTFIDF.append(tfidf1)
    code2_vecTFIDF.append(tfidf2)
    cosTFIDF.append(_pair_cosine(tfidf1, tfidf2))

    # Markov chains: both snippets share one alphabet (the sorted set of their
    # non-space characters) so the two transition vectors are comparable.
    alphabet = sorted(set(s1.replace(" ", "") + s2.replace(" ", "")))
    vecM1 = markov_vector(s1, alphabet)
    vecM2 = markov_vector(s2, alphabet)
    code1_vecMark.append(vecM1)
    code2_vecMark.append(vecM2)
    cosMark.append(_pair_cosine(vecM1, vecM2))

# Save results: attach every computed vector/similarity list as a new column
# (in this order), drop the temporary cleaned columns and write the CSV.
new_columns = {
    "code1_vecBoW": code1_vecBoW,
    "code2_vecBoW": code2_vecBoW,
    "cos_BOW": cosBoW,
    "code1_vecTFIDF": code1_vecTFIDF,
    "code2_vecTFIDF": code2_vecTFIDF,
    "cos_TFID": cosTFIDF,
    "code1_vecMark": code1_vecMark,
    "code2_vecMark": code2_vecMark,
    "cos_MARK": cosMark,
}
for column_name, column_values in new_columns.items():
    df[column_name] = column_values

df = df.drop(["code1_clean", "code2_clean"], axis=1)
df.to_csv(OUTPUT_CSV, index=False)
print("CSV guardado ->", OUTPUT_CSV)

# Quick check: print the similarities, raw snippets and Markov vectors of the
# first row.
print("Primera fila:")
first_row_report = (
    ("BoW   :", cosBoW[0]),
    ("TF-IDF:", cosTFIDF[0]),
    ("Markov A:", df["code1"][0]),
    ("Markov vector A:", code1_vecMark[0]),
    ("Markov B:", df["code2"][0]),
    ("Markov vector B:", code2_vecMark[0]),
    ("Markov:", cosMark[0]),
)
for label, value in first_row_report:
    print(label, value)


CSV guardado -> dataset_Similarity_calculation.csv
Primera fila:
BoW   : 0.706597242747782
TF-IDF: 0.6451639235437321
Markov A: \npublic class T1 {\n	public static void main(String[] args) {\n		System.out.println("Welcome to Java");\n		System.out.println("Welcome to Java");\n		System.out.println("Welcome to Java");\n		System.out.println("Welcome to Java");\n		System.out.println("Welcome to Java");\n	}\n\n}\n
Markov vector A: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 