Este primer script usa el modelo de embeddings all-MiniLM-L6-v2 y se pasan esos embeddings a una red neuronal para calcular la similitud entre dos textos.

La red nueronal tiene como entrada los embeddings de los dos textos, concatenados con otro embedding que es la diferencia entre los dos textos y otro que es la multiplicación entre ambos. Luego, la salida es de 1 dimensión, que representa la similitud entre los textos.

In [19]:
import pandas as pd
import tqdm
import torch
import torch.nn as nn
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics import mean_squared_error


In [20]:
# Leer el archivo CSV 
df = pd.read_csv('../plain_text_resume_data.csv')
print("Largo total del dataset:", len(df))
df_train = df[:int(len(df) * 0.8)]
df_test = df[int(len(df) * 0.8):]
print("Largo del dataset de entrenamiento:", len(df_train))
print("Largo del dataset de prueba:", len(df_test))

Largo total del dataset: 9544
Largo del dataset de entrenamiento: 7635
Largo del dataset de prueba: 1909


In [23]:
# Cargar el modelo pre-entrenado desde Hugging Face
#model = SentenceTransformer('all-MiniLM-L6-v2')  # Rápido para similaridad
# model = SentenceTransformer("BAAI/bge-base-en-v1.5")
from transformers import AutoModel
model = AutoModel.from_pretrained("BAAI/bge-large-en-v1.5", torch_dtype=torch.float16)


In [24]:
def calculate_embeddings(row):
    cv_text = row.iloc[0]
    job_description = row.iloc[1]
    real_score = row.iloc[2]

    # Obtener embeddings
    cv_embedding = model.encode(cv_text, convert_to_tensor=True, normalize_embeddings=True)
    job_embedding = model.encode(job_description, convert_to_tensor=True, normalize_embeddings=True)

    return cv_embedding, job_embedding, real_score

In [None]:
embedding_dim = model.get_sentence_embedding_dimension()
print("Dimensión de los embeddings:", embedding_dim)

AttributeError: 'BertModel' object has no attribute 'get_sentence_embedding_dimension'

In [26]:

class JobMatchingNN(nn.Module):
    def __init__(self, embedding_dim):
        super(JobMatchingNN, self).__init__()
        self.fc1 = nn.Linear(4 * embedding_dim, 256)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(256, 64)
        self.out = nn.Linear(64, 1)
        self.sigmoid = nn.Sigmoid()  # para score entre 0 y 1

    def forward(self, emb1, emb2):
        abs_diff = torch.abs(emb1 - emb2) 
        prod = emb1 * emb2
        # la diferencia absoluta y el producto se concatenan para tener más información sobre la relación entre los embeddings
        x = torch.cat([emb1, emb2, abs_diff, prod], dim=1)
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.sigmoid(self.out(x))
        return x


In [27]:
'''
# Crear el trainset 
trainset = []
for i in tqdm.tqdm(range(len(df_train))):
    row = df_train.iloc[i]
    cv_embedding, job_embedding, real_score = calculate_embeddings(row)
    trainset.append((torch.tensor(cv_embedding), torch.tensor(job_embedding), torch.tensor(real_score)))

# guardar el trainset
torch.save(trainset, 'trainset_first_nn.pt')
'''

"\n# Crear el trainset \ntrainset = []\nfor i in tqdm.tqdm(range(len(df_train))):\n    row = df_train.iloc[i]\n    cv_embedding, job_embedding, real_score = calculate_embeddings(row)\n    trainset.append((torch.tensor(cv_embedding), torch.tensor(job_embedding), torch.tensor(real_score)))\n\n# guardar el trainset\ntorch.save(trainset, 'trainset_first_nn.pt')\n"

In [28]:
'''
# Crear el testset
testset = []
for i in tqdm.tqdm(range(len(df_test))):
    row = df_test.iloc[i]
    cv_embedding, job_embedding, real_score = calculate_embeddings(row)
    testset.append((torch.tensor(cv_embedding), torch.tensor(job_embedding), torch.tensor(real_score)))

# guardar el testset
torch.save(testset, 'testset_first_nn.pt')
'''

"\n# Crear el testset\ntestset = []\nfor i in tqdm.tqdm(range(len(df_test))):\n    row = df_test.iloc[i]\n    cv_embedding, job_embedding, real_score = calculate_embeddings(row)\n    testset.append((torch.tensor(cv_embedding), torch.tensor(job_embedding), torch.tensor(real_score)))\n\n# guardar el testset\ntorch.save(testset, 'testset_first_nn.pt')\n"

In [29]:
trainset = torch.load('trainset_first_nn.pt')
testset = torch.load('testset_first_nn.pt')

In [36]:
# Crear el modelo
model = JobMatchingNN(embedding_dim)
# Definir la función de pérdida y el optimizador
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# Definir el número de épocas
num_epochs = 20
# Definir el tamaño del batch
batch_size = 16

# Crear un DataLoader para el conjunto de entrenamiento
train_loader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
# Crear un DataLoader para el conjunto de prueba
test_loader = torch.utils.data.DataLoader(testset, batch_size=1, shuffle=False)


In [37]:
# Entrenar el modelo
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for i, (cv_embedding, job_embedding, real_score) in enumerate(tqdm.tqdm(train_loader)):
        optimizer.zero_grad()
        outputs = model(cv_embedding.float(), job_embedding.float())
        loss = criterion(outputs, real_score.float().view(-1, 1))
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {running_loss / len(train_loader):.4f}")

100%|██████████| 478/478 [00:00<00:00, 570.42it/s]


Epoch [1/20], Loss: 0.0175


100%|██████████| 478/478 [00:01<00:00, 453.41it/s]


Epoch [2/20], Loss: 0.0130


100%|██████████| 478/478 [00:01<00:00, 454.81it/s]


Epoch [3/20], Loss: 0.0114


100%|██████████| 478/478 [00:00<00:00, 484.18it/s]


Epoch [4/20], Loss: 0.0106


100%|██████████| 478/478 [00:00<00:00, 508.07it/s]


Epoch [5/20], Loss: 0.0101


100%|██████████| 478/478 [00:01<00:00, 467.22it/s]


Epoch [6/20], Loss: 0.0093


100%|██████████| 478/478 [00:00<00:00, 502.21it/s]


Epoch [7/20], Loss: 0.0087


100%|██████████| 478/478 [00:01<00:00, 477.77it/s]


Epoch [8/20], Loss: 0.0083


100%|██████████| 478/478 [00:00<00:00, 496.13it/s]


Epoch [9/20], Loss: 0.0078


100%|██████████| 478/478 [00:01<00:00, 469.20it/s]


Epoch [10/20], Loss: 0.0078


100%|██████████| 478/478 [00:01<00:00, 439.74it/s]


Epoch [11/20], Loss: 0.0075


100%|██████████| 478/478 [00:01<00:00, 459.64it/s]


Epoch [12/20], Loss: 0.0070


100%|██████████| 478/478 [00:01<00:00, 477.00it/s]


Epoch [13/20], Loss: 0.0069


100%|██████████| 478/478 [00:00<00:00, 504.02it/s]


Epoch [14/20], Loss: 0.0067


100%|██████████| 478/478 [00:00<00:00, 507.36it/s]


Epoch [15/20], Loss: 0.0064


100%|██████████| 478/478 [00:01<00:00, 469.28it/s]


Epoch [16/20], Loss: 0.0064


100%|██████████| 478/478 [00:01<00:00, 453.25it/s]


Epoch [17/20], Loss: 0.0062


100%|██████████| 478/478 [00:01<00:00, 469.37it/s]


Epoch [18/20], Loss: 0.0060


100%|██████████| 478/478 [00:00<00:00, 484.02it/s]


Epoch [19/20], Loss: 0.0058


100%|██████████| 478/478 [00:01<00:00, 438.07it/s]

Epoch [20/20], Loss: 0.0056





In [39]:
# Evaluar el modelo
model.eval()
predictions = []
real_scores = []
with torch.no_grad():
    for i, (cv_embedding, job_embedding, real_score) in enumerate(tqdm.tqdm(test_loader)):
        outputs = model(cv_embedding.float(), job_embedding.float())
        predictions.extend(outputs)
        real_scores.extend(real_score)
        
# Calcular la precisión
rmse = mean_squared_error(real_scores, predictions) ** 0.5
print(f"RMSE: {rmse:.4f}")

100%|██████████| 1909/1909 [00:00<00:00, 6477.82it/s]


RMSE: 0.0950
