Este primer script usa el modelo de embeddings all-MiniLM-L6-v2 y se pasan esos embeddings a una red neuronal para calcular la similitud entre dos textos.

In [23]:
import pandas as pd
import tqdm
import torch
import torch.nn as nn
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics import mean_squared_error


In [24]:
# Leer el archivo CSV y extraer la primera línea y separar los campos
df = pd.read_csv('../plain_text_resume_data.csv')
print("Largo total del dataset:", len(df))
df_train = df[:int(len(df) * 0.8)]
df_test = df[int(len(df) * 0.8):]
print("Largo del dataset de entrenamiento:", len(df_train))
print("Largo del dataset de prueba:", len(df_test))

Largo total del dataset: 9544
Largo del dataset de entrenamiento: 7635
Largo del dataset de prueba: 1909


In [25]:
# Cargar el modelo pre-entrenado desde Hugging Face
model = SentenceTransformer('all-MiniLM-L6-v2')  # Rápido para similaridad

In [26]:
def calculate_embeddings(row):
    cv_text = row.iloc[0]
    job_description = row.iloc[1]
    real_score = row.iloc[2]

    # Obtener embeddings
    cv_embedding = model.encode(cv_text, convert_to_tensor=True)
    job_embedding = model.encode(job_description, convert_to_tensor=True)

    return cv_embedding, job_embedding, real_score

In [27]:
embedding_dim = model.get_sentence_embedding_dimension()
print("Dimensión de los embeddings:", embedding_dim)

Dimensión de los embeddings: 384


In [28]:

class JobMatchingNN(nn.Module):
    def __init__(self, embedding_dim):
        super(JobMatchingNN, self).__init__()
        self.fc1 = nn.Linear(4 * embedding_dim, 256)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(256, 64)
        self.out = nn.Linear(64, 1)
        self.sigmoid = nn.Sigmoid()  # para score entre 0 y 1

    def forward(self, emb1, emb2):
        abs_diff = torch.abs(emb1 - emb2) 
        prod = emb1 * emb2
        # la diferencia absoluta y el producto se concatenan para tener más información sobre la relación entre los embeddings
        x = torch.cat([emb1, emb2, abs_diff, prod], dim=1)
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.sigmoid(self.out(x))
        return x


In [29]:
'''
# Crear el trainset 
trainset = []
for i in tqdm.tqdm(range(len(df_train))):
    row = df_train.iloc[i]
    cv_embedding, job_embedding, real_score = calculate_embeddings(row)
    trainset.append((torch.tensor(cv_embedding), torch.tensor(job_embedding), torch.tensor(real_score)))

# guardar el trainset
torch.save(trainset, 'trainset_first_nn.pt')
'''

"\n# Crear el trainset \ntrainset = []\nfor i in tqdm.tqdm(range(len(df_train))):\n    row = df_train.iloc[i]\n    cv_embedding, job_embedding, real_score = calculate_embeddings(row)\n    trainset.append((torch.tensor(cv_embedding), torch.tensor(job_embedding), torch.tensor(real_score)))\n\n# guardar el trainset\ntorch.save(trainset, 'trainset_first_nn.pt')\n"

In [30]:
'''
# Crear el testset
testset = []
for i in tqdm.tqdm(range(len(df_test))):
    row = df_test.iloc[i]
    cv_embedding, job_embedding, real_score = calculate_embeddings(row)
    testset.append((torch.tensor(cv_embedding), torch.tensor(job_embedding), torch.tensor(real_score)))

# guardar el testset
torch.save(testset, 'testset_first_nn.pt')
'''

"\n# Crear el testset\ntestset = []\nfor i in tqdm.tqdm(range(len(df_test))):\n    row = df_test.iloc[i]\n    cv_embedding, job_embedding, real_score = calculate_embeddings(row)\n    testset.append((torch.tensor(cv_embedding), torch.tensor(job_embedding), torch.tensor(real_score)))\n\n# guardar el testset\ntorch.save(testset, 'testset_first_nn.pt')\n"

In [31]:
trainset = torch.load('trainset_first_nn.pt')
testset = torch.load('testset_first_nn.pt')

In [41]:
# Crear el modelo
model = JobMatchingNN(embedding_dim)
# Definir la función de pérdida y el optimizador
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# Definir el número de épocas
num_epochs = 15
# Definir el tamaño del batch
batch_size = 16

# Crear un DataLoader para el conjunto de entrenamiento
train_loader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
# Crear un DataLoader para el conjunto de prueba
test_loader = torch.utils.data.DataLoader(testset, batch_size=1, shuffle=False)


In [42]:
# Entrenar el modelo
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for i, (cv_embedding, job_embedding, real_score) in enumerate(tqdm.tqdm(train_loader)):
        optimizer.zero_grad()
        outputs = model(cv_embedding.float(), job_embedding.float())
        loss = criterion(outputs, real_score.float().view(-1, 1))
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {running_loss / len(train_loader):.4f}")

100%|██████████| 478/478 [00:01<00:00, 329.86it/s]


Epoch [1/15], Loss: 0.0184


100%|██████████| 478/478 [00:02<00:00, 236.32it/s]


Epoch [2/15], Loss: 0.0132


100%|██████████| 478/478 [00:02<00:00, 237.42it/s]


Epoch [3/15], Loss: 0.0116


100%|██████████| 478/478 [00:02<00:00, 238.82it/s]


Epoch [4/15], Loss: 0.0105


100%|██████████| 478/478 [00:01<00:00, 276.64it/s]


Epoch [5/15], Loss: 0.0101


100%|██████████| 478/478 [00:01<00:00, 261.91it/s]


Epoch [6/15], Loss: 0.0091


100%|██████████| 478/478 [00:01<00:00, 296.04it/s]


Epoch [7/15], Loss: 0.0086


100%|██████████| 478/478 [00:01<00:00, 298.71it/s]


Epoch [8/15], Loss: 0.0084


100%|██████████| 478/478 [00:01<00:00, 277.09it/s]


Epoch [9/15], Loss: 0.0080


100%|██████████| 478/478 [00:01<00:00, 283.14it/s]


Epoch [10/15], Loss: 0.0076


100%|██████████| 478/478 [00:02<00:00, 234.81it/s]


Epoch [11/15], Loss: 0.0074


100%|██████████| 478/478 [00:02<00:00, 228.26it/s]


Epoch [12/15], Loss: 0.0072


100%|██████████| 478/478 [00:01<00:00, 264.82it/s]


Epoch [13/15], Loss: 0.0068


100%|██████████| 478/478 [00:01<00:00, 286.86it/s]


Epoch [14/15], Loss: 0.0066


100%|██████████| 478/478 [00:01<00:00, 297.30it/s]

Epoch [15/15], Loss: 0.0064





In [43]:
# Evaluar el modelo
model.eval()
predictions = []
real_scores = []
with torch.no_grad():
    for i, (cv_embedding, job_embedding, real_score) in enumerate(tqdm.tqdm(test_loader)):
        outputs = model(cv_embedding.float(), job_embedding.float())
        predictions.extend(outputs)
        real_scores.extend(real_score)
        
# Calcular la precisión
rmse = mean_squared_error(real_scores, predictions) ** 0.5
print(f"RMSE: {rmse:.4f}")

100%|██████████| 1909/1909 [00:00<00:00, 4929.76it/s]


RMSE: 0.1024
