Este primer script usa el modelo de embeddings all-MiniLM-L6-v2 y calcula la similitud coseno entre los embeddings de dos textos.

Es bastante simple: por ejemplo, si la Job Description es igual al CV, el resultado del matcheo es 1.0.

In [1]:
import pandas as pd
import tqdm
from sentence_transformers import SentenceTransformer, util

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the resume dataset CSV into a DataFrame and print how many rows it has.
df = pd.read_csv('../plain_text_resume_data.csv')
print(df.shape[0])

9544


In [4]:
# Load the pre-trained sentence-embedding model (fetched from Hugging Face).
# all-MiniLM-L6-v2 was chosen because it is fast for similarity work.
model = SentenceTransformer(model_name_or_path='all-MiniLM-L6-v2')

In [5]:
def calculate_similarity(row, score=True):
    """Return the cosine similarity between a CV and a job description.

    Both texts are embedded with the module-level ``model`` and compared
    with ``util.cos_sim``.

    Args:
        row: Record indexed positionally through ``.iloc``. Position 0 is
            used as the CV text and position 1 as the job description;
            position 2 is read as the reference score only when ``score``
            is true.
        score: When true, the reference score from ``row`` is returned
            together with the predicted one.

    Returns:
        ``(predicted_score, real_score)`` if ``score`` is true, otherwise
        only ``predicted_score``. The predicted score is a cosine
        similarity, so it lies between -1 and 1.
    """
    # Embed the CV first and the job description second, as tensors.
    cv_embedding, job_embedding = [
        model.encode(text, convert_to_tensor=True)
        for text in (row.iloc[0], row.iloc[1])
    ]

    # cos_sim yields a single-element result here; .item() unwraps the scalar.
    similarity = util.cos_sim(cv_embedding, job_embedding).item()

    if not score:
        return similarity

    return similarity, row.iloc[2]



In [24]:
# Run the similarity function on every row of the DataFrame and accumulate
# squared errors. At this point RMSE and RMSE_cero_a_cien are running SUMS of
# squared errors; the mean and square root still have to be applied afterwards.
cantidad_de_datos = len(df)
RMSE = RMSE_cero_a_cien = 0

for i in tqdm.tqdm(range(cantidad_de_datos)):
    row = df.iloc[i]
    score, real_score = calculate_similarity(row)

    # Same pair of scores pushed through (x + 1) / 2 * 100 and rounded to two
    # decimals, i.e. -1..1 mapped onto 0..100 (kept "just in case").
    scaled_scores = tuple(
        round((value + 1) / 2 * 100, 2) for value in (score, real_score)
    )

    error = score - real_score
    error_cero_a_cien = scaled_scores[0] - scaled_scores[1]
    RMSE += error ** 2
    RMSE_cero_a_cien += error_cero_a_cien ** 2


  cv_text = row[0]
  job_description = row[1]
  real_score = row[2]
100%|██████████| 9544/9544 [08:26<00:00, 18.85it/s]


In [25]:
# Turn the accumulated squared-error sums into root-mean-square errors:
# divide by the number of rows, then take the square root.
# NOTE: this overwrites the accumulators in place, so running it a second
# time without re-running the accumulation loop changes the values again.
mean_squared_error = RMSE / cantidad_de_datos
mean_squared_error_cero_a_cien = RMSE_cero_a_cien / cantidad_de_datos
RMSE = mean_squared_error ** 0.5
RMSE_cero_a_cien = mean_squared_error_cero_a_cien ** 0.5

for label, value in (("RMSE", RMSE), ("RMSE (0-100)", RMSE_cero_a_cien)):
    print(f"{label}: {value}")

RMSE: 0.22868232063846028
RMSE (0-100): 11.434088650019119


In [7]:
# Spot-check the scorer on two individual cases: one CSV expected to be a
# good CV / job-description fit and one expected to be a bad fit. Only the
# first row of each file is scored, and no reference score is read.

# Good case
df_good = pd.read_csv('cv_jd_good_fit.csv')
good_row = df_good.iloc[0]
good_score = calculate_similarity(good_row, score=False)

# Bad case
df_bad = pd.read_csv('cv_jd_bad_fit.csv')
bad_row = df_bad.iloc[0]
bad_score = calculate_similarity(bad_row, score=False)

print(f"Good case score: {good_score}")
print(f"Bad case score: {bad_score}")

Good case score: 0.7639485597610474
Bad case score: 0.4717787504196167
