In [None]:
# Install the third-party packages used below (sentence encoder and Google Translate client).
# NOTE(review): sentence-transformers is not version-pinned here, unlike googletrans.
!pip install sentence-transformers googletrans==3.1.0a0

In [None]:
import torch
import numpy as np
import random
import os

import pandas as pd
import json
from tqdm import tqdm, trange
from datetime import datetime
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from googletrans import Translator

from sklearn.model_selection import train_test_split

import nltk
from nltk.corpus import stopwords

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Mount Google Drive at /content/drive so that files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Directory prefix (on the mounted Drive) that the JSON data files and saved weights are read from.
path = '/content/drive/MyDrive/Dimon/TalentMatch/Data/'

In [None]:
# Fixed seed value used to make results reproducible.
SEED = 42


# Call this function at the start of a model-training script.
def set_seed(seed=SEED):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    sets the cuDNN ``deterministic``/``benchmark`` flags and exports
    ``PYTHONHASHSEED``.

    Args:
        seed: integer seed; defaults to the module-level ``SEED``.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,  # relevant when several GPUs are used
    )
    for seeder in seeders:
        seeder(seed)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Also export the seed for hash functions.
    # NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so this
    # assignment presumably only affects child processes - confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)


set_seed()  # put all generators into their initial state using the default SEED

## utils

In [None]:
def concatenate_text_columns(row, columns):
    """Join the values of ``row`` found under ``columns`` into one space-separated string.

    Args:
        row: any object indexable by the entries of ``columns``.
        columns: iterable of keys, read in the given order.

    Returns:
        The ``str()`` of every selected value, separated by single spaces.
    """
    parts = [str(row[name]) for name in columns]
    return ' '.join(parts)

In [None]:
def concatenate_text_columns_for_dict(row, token_column_mapping):
    """Render selected fields of ``row`` as ``"<label>: <value>"`` pieces joined by spaces.

    Args:
        row: mapping-like object with a ``.get`` method (a dict or a pandas Series).
        token_column_mapping: dict mapping the label to print to the key to read
            from ``row``; pieces are emitted in the mapping's iteration order.

    Returns:
        One string with a ``"label: value"`` piece for every field that is present,
        is not ``None`` and has a non-empty string form. Missing keys are skipped
        instead of raising ``KeyError``, so items that lack an optional field can
        still be rendered.
    """
    pieces = []
    for token, column in token_column_mapping.items():
        raw_value = row.get(column)
        if raw_value is None:
            # Field absent or explicitly null: nothing to print for this label.
            continue
        value = str(raw_value)
        if value:
            pieces.append(f"{token}: {value}")
    return ' '.join(pieces)


## Data_work

In [None]:
# Read the training JSON file (UTF-8) from the data directory into `data`.
with open(path+'case_2_data_for_members.json', encoding='utf-8') as f:
   data = json.load(f)

In [None]:
# Turn the loaded JSON into a DataFrame (columns: vacancy, failed_resumes, confirmed_resumes).
df = pd.DataFrame(data)

In [None]:
df.sample(3)

Unnamed: 0,vacancy,failed_resumes,confirmed_resumes
27,{'uuid': 'a5d0e1fd-7baa-3a6f-98f4-b908ac7fce43...,[{'uuid': '74bce970-26fd-3cb3-84c0-8a2b636553a...,[{'uuid': '36c2de54-96dd-3168-a4c9-32f889434a2...
16,{'uuid': 'b1148bfd-c881-3e51-92fd-29912e58b38d...,[{'uuid': '3ef4544a-bab0-3fb4-b048-10ebb51e922...,[{'uuid': '205d6642-b0e1-3956-a51d-ea233c54336...
12,{'uuid': '61a5a940-c9f2-3f9f-bbda-9cf735697878...,[{'uuid': 'fa198827-f11b-3fd0-8666-175378229ec...,[{'uuid': '97116752-fe12-3633-9ace-4b5795440be...


In [None]:
# NB: flatten the data into [vacancy, resume, target] triplets.
# For every vacancy, each failed resume gets target 0 and each confirmed resume target 1.
# Columns are read by label: positional access on a label-indexed Series
# (`vacancy[0]`) is deprecated in recent pandas versions.
triplets = []
for _, vacancy_row in df.iterrows():
  vacancy_data = vacancy_row['vacancy']
  triplets.extend([vacancy_data, failed_resume, 0]
                  for failed_resume in vacancy_row['failed_resumes'])
  triplets.extend([vacancy_data, confirmed_resume, 1]
                  for confirmed_resume in vacancy_row['confirmed_resumes'])

In [None]:
#triplets[0][2]

In [None]:
# Build the working DataFrame with one row per (vacancy, resume, target) triplet.
# The frame is created in a single call: growing it with pd.concat inside a loop
# copies all previous rows on every iteration (quadratic time).
full_df = pd.DataFrame(triplets, columns=['vacancy', 'resume', 'target'])

In [None]:
#full_df.sample(3)

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence encoder used to turn the (translated) vacancy and resume texts into embeddings.
model_for_emb_firstly = SentenceTransformer('all-distilroberta-v1')

In [None]:
# Label -> key mapping used to render one work-experience item as text.
token_column_mapping_for_work = {
    'Работодатель': 'employer',
    'Город': 'city',
    'Профессия': 'position',
    'Описание': 'description'
}

# Label -> key mapping used to render one education item as text.
token_column_mapping_for_education = {
    'Год окончания обучения': 'year',
    'Учебное учереждение': 'organization',
    'Факультет': 'faculty',
    'Специальность': 'specialty',
    'Результат обучения': 'result',
    'Тип образования': 'education_type',
    'Уровень образования': 'education_level'
}

# --- Vacancy: copy the fields of every vacancy dict into flat columns ---
vac_uuid = [vac['uuid'] for vac in full_df['vacancy']]
vac_name = [vac['name'] for vac in full_df['vacancy']]
vac_keywords = [vac['keywords'] for vac in full_df['vacancy']]
vac_description = [vac['description'] for vac in full_df['vacancy']]
vac_comment = [vac['comment'] for vac in full_df['vacancy']]

for column_name, column_values in (
    ('vacancy_uuid', vac_uuid),
    ('vacancy_name', vac_name),
    ('vacancy_keywords', vac_keywords),
    ('vacancy_description', vac_description),
    ('vacancy_comment', vac_comment),
):
  full_df[column_name] = column_values

# --- Resume: collect the scalar fields of every resume dict ---
res_uuid = [res['uuid'] for res in full_df['resume']]
res_name = [res['first_name'] for res in full_df['resume']]
res_surname = [res['last_name'] for res in full_df['resume']]
res_birth = [res['birth_date'] for res in full_df['resume']]
res_country = [res['country'] for res in full_df['resume']]
res_city = [res['city'] for res in full_df['resume']]
res_about = [res['about'] for res in full_df['resume']]
res_key_skills = [res['key_skills'] for res in full_df['resume']]

# Work experience as one text per resume (weighted per-job embeddings were the
# original idea, but were dropped because of inference time).
work_exp = [
    '; Следующая работа: '.join(
        concatenate_text_columns_for_dict(job, token_column_mapping_for_work)
        for job in res['experienceItem']
    ) if 'experienceItem' in res else 'Нет опыта работы'
    for res in full_df['resume']
]
full_df['resume_work_exp'] = work_exp

# Education as one text per resume, with a placeholder when the key is absent.
education = [
    '; Следующее образование: '.join(
        concatenate_text_columns_for_dict(item, token_column_mapping_for_education)
        for item in res['educationItem']
    ) if 'educationItem' in res else 'Нет образования'
    for res in full_df['resume']
]
full_df['resume_educationItem'] = education

for column_name, column_values in (
    ('resume_uuid', res_uuid),
    ('resume_name', res_name),
    ('resume_surname', res_surname),
    ('resume_birth', res_birth),
    ('resume_country', res_country),
    ('resume_city', res_city),
    ('resume_about', res_about),
    ('resume_key_skills', res_key_skills),
):
  full_df[column_name] = column_values

In [None]:
def keywords(dataframe):
  """Find, for every row, the vacancy-description words that also occur in resume skills.

  The skill vocabulary is built from the ``resume_key_skills`` column of
  ``dataframe`` itself (the function no longer reads a global frame). Both the
  descriptions and the skills are normalised the same way: lower-cased,
  punctuation replaced by spaces and digits removed, using real regular
  expressions. Both stop-word lists are applied to the descriptions.

  Args:
    dataframe: DataFrame with a ``vacancy`` column holding dicts that have a
      ``description`` entry, and a ``resume_key_skills`` column.

  Returns:
    A list with one list of keywords per row of ``dataframe``, in row order.
    Keywords are unique within a row and keep the order of their first
    occurrence in the description, so the output is deterministic.
  """
  import re  # local import: only this function needs regular expressions

  def _normalise(text):
    """Lower-case ``text``, replace punctuation with spaces and strip digits."""
    without_punctuation = re.sub(r'[^\w\s]', ' ', str(text).lower())
    return re.sub(r'\d+', '', without_punctuation)

  stop = ['и', 'мы', 'а']
  other_stop_words = ['в', 'c', 'из', 'для', 'на', 'умение', 'коде', 'данных', 'умение', 'проектировани', 'или', 'опыт', 'коммерческий', 'коммерческой', 'разработки', 'английский']
  all_stop_words = set(stop) | set(other_stop_words)

  # Vocabulary of skill words seen in any resume of this frame.
  stack_set = set()
  for skills in dataframe['resume_key_skills']:
    if skills is None or isinstance(skills, float):
      # Missing value (None / NaN): contributes no skill words.
      continue
    stack_set.update(_normalise(skills).split())

  stack_for_train = []
  for vacancy in dataframe['vacancy']:
    tokens = _normalise(vacancy['description'] or '').split()
    # dict.fromkeys de-duplicates while preserving first-occurrence order.
    unique_tokens = dict.fromkeys(t for t in tokens if t not in all_stop_words)
    stack_for_train.append([t for t in unique_tokens if t in stack_set])

  return stack_for_train

In [None]:
# Keyword list for every row of full_df, as returned by keywords().
key_words_wow = keywords(full_df)

  test['description'] = test['description'].str.replace('[^\w\s]',' ')
  test['description'] = test['description'].str.replace('\d+', '')


In [None]:
# Replace the keywords taken from the vacancy JSON with the lists extracted by keywords().
full_df['vacancy_keywords'] = key_words_wow

In [None]:
# Fetch the NLTK stop-word corpus required by stopwords.words() below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Russian stop words from NLTK, stored as a set for fast membership tests.
stop_words = set(stopwords.words('russian'))

def remove_stopwords_for_keywords_column(text):
    """Drop stop words from an already tokenised value and join the rest with spaces.

    Args:
        text: iterable of words (it is iterated as-is, not split).

    Returns:
        A space-separated string of the words whose lower-cased form is not in
        the module-level ``stop_words`` set.
    """
    return ' '.join(word for word in text if word.lower() not in stop_words)

def remove_stopwords(text):
    """Remove stop words from a whitespace-separated string.

    Args:
        text: string that is split on whitespace.

    Returns:
        The remaining words joined by single spaces; a word is dropped when its
        lower-cased form is in the module-level ``stop_words`` set.
    """
    kept_words = []
    for word in text.split():
        if word.lower() in stop_words:
            continue
        kept_words.append(word)
    return ' '.join(kept_words)

In [None]:
# Strip Russian stop words from the text columns. vacancy_keywords holds lists of
# words, so it uses the list variant; the other columns hold plain strings.
full_df['vacancy_keywords'] = full_df['vacancy_keywords'].apply(remove_stopwords_for_keywords_column)
full_df['vacancy_description'] = full_df['vacancy_description'].apply(remove_stopwords)
full_df['resume_work_exp'] = full_df['resume_work_exp'].apply(remove_stopwords)
full_df['resume_educationItem'] = full_df['resume_educationItem'].apply(remove_stopwords)

In [None]:
# Keep only the first four characters of the birth date (parsed as an int) behind a
# Russian "date of birth" label; None becomes a "date of birth: unknown" placeholder.
full_df['resume_birth'] = full_df['resume_birth'].apply(lambda x: f'Дата рождения: {int(x[:4])}' if x is not None else 'Дата рождения: неизвестно')

In [None]:
# Replace every remaining missing value with a single space so that text concatenation works.
full_df = full_df.fillna(' ')

In [None]:
# Columns concatenated (in this order) into the text that represents a resume.
resume_columns = [
    'resume_birth',
    'resume_country',
    'resume_city',
    'resume_about',
    'resume_key_skills',
    'resume_work_exp',
    'resume_educationItem'
]

# Columns concatenated (in this order) into the text that represents a vacancy.
vacancy_columns = [
    'vacancy_name',
    'vacancy_keywords',
    'vacancy_description',
    'vacancy_comment'
]

In [None]:
# googletrans client used to translate the combined texts before they are encoded.
translator = Translator()

In [None]:
full_df.head()

Unnamed: 0,vacancy,resume,target,vacancy_uuid,vacancy_name,vacancy_keywords,vacancy_description,vacancy_comment,resume_work_exp,resume_educationItem,resume_uuid,resume_name,resume_surname,resume_birth,resume_country,resume_city,resume_about,resume_key_skills
0,{'uuid': '779f3a59-206a-3241-adc4-d7db504f960b...,{'uuid': '74392e00-ecfb-335b-9fc1-c2652dca06e5...,0,779f3a59-206a-3241-adc4-d7db504f960b,Java разработчик команда Инвестиции,oracle систем boot работы java docker описание...,Описание расширяем команды ищем разработчиков ...,450 на руки,Работодатель: МФО ХмельИнфоОрион Город: Можайс...,Год окончания обучения: 1999 Учебное учережден...,74392e00-ecfb-335b-9fc1-c2652dca06e5,Данила,Прохоров,Дата рождения: 1979,Россия,Санкт-Петербург,,"Java, Spring Boot, Java EE, SQL, Hibernate, Gi..."
1,{'uuid': '779f3a59-206a-3241-adc4-d7db504f960b...,{'uuid': '2b5ad5e1-1f31-3f3f-8a66-43cd89233672...,0,779f3a59-206a-3241-adc4-d7db504f960b,Java разработчик команда Инвестиции,oracle систем boot работы java docker описание...,Описание расширяем команды ищем разработчиков ...,450 на руки,Работодатель: МФО Вод Город: Шаховская Професс...,Год окончания обучения: 2014 Учебное учережден...,2b5ad5e1-1f31-3f3f-8a66-43cd89233672,Савва,Исаев,Дата рождения: 1991,Россия,Москва,"(Гражданин Республики Казахстан, по России им...","ООП, Java, Java Spring Framework, Функциональн..."
2,{'uuid': '779f3a59-206a-3241-adc4-d7db504f960b...,{'uuid': 'ea1ac51a-e16b-367a-9216-52fb64809db1...,0,779f3a59-206a-3241-adc4-d7db504f960b,Java разработчик команда Инвестиции,oracle систем boot работы java docker описание...,Описание расширяем команды ищем разработчиков ...,450 на руки,Работодатель: МФО Обл Город: Щёлково Профессия...,Год окончания обучения: 2009 Учебное учережден...,ea1ac51a-e16b-367a-9216-52fb64809db1,Николай,Новиков,Дата рождения: 1986,Россия,,"С 1999 года, после появления дома первого ком...","Java, Spring Framework, Hibernate ORM, SQL, Ja..."
3,{'uuid': '779f3a59-206a-3241-adc4-d7db504f960b...,{'uuid': 'ecfc02a1-592c-3ed0-a801-1ad9ab3d30b8...,0,779f3a59-206a-3241-adc4-d7db504f960b,Java разработчик команда Инвестиции,oracle систем boot работы java docker описание...,Описание расширяем команды ищем разработчиков ...,450 на руки,Работодатель: ПАО МонтажЖелДор Город: Луховицы...,Год окончания обучения: 2010 Учебное учережден...,ecfc02a1-592c-3ed0-a801-1ad9ab3d30b8,Гавриил,Новикова,Дата рождения: 1985,Россия,Новосибирск,"- Oracle Certified Associate, Java SE 7 Progra...","Java, Git, SQL, HTML, JavaScript, CSS, MySQL, ..."
4,{'uuid': '779f3a59-206a-3241-adc4-d7db504f960b...,{'uuid': 'aff6b6bd-89c2-3b2c-ab2e-0b9f76ac367c...,0,779f3a59-206a-3241-adc4-d7db504f960b,Java разработчик команда Инвестиции,oracle систем boot работы java docker описание...,Описание расширяем команды ищем разработчиков ...,450 на руки,Работодатель: МФО ЛифтГаражОрион Город: Щёлков...,Год окончания обучения: 2020 Учебное учережден...,aff6b6bd-89c2-3b2c-ab2e-0b9f76ac367c,Ярослава,Тихонова,Дата рождения: 1992,Россия,Тюмень,,"Java, Spring, Git, PostgreSQL, Hibernate ORM, ..."


In [None]:
def _translate_and_encode(text):
    """Translate ``text`` with googletrans and embed the translation with the sentence encoder."""
    return model_for_emb_firstly.encode(translator.translate(text).text)


def _embed_columns(frame, columns):
    """Return one embedding per row for the concatenation of ``columns``.

    Every distinct text is translated and encoded only once. The same vacancy
    text is repeated for each of its resumes, so this avoids many redundant
    translation requests and encoder calls.
    """
    texts = frame.apply(lambda row: concatenate_text_columns(row, columns), axis=1)
    embedding_by_text = {text: _translate_and_encode(text) for text in texts.unique()}
    # A callable (not the dict itself) is passed to map so that each array is
    # stored as a single cell value.
    return texts.map(embedding_by_text.__getitem__)


full_df['vacancy_combined_embeddings'] = _embed_columns(full_df, vacancy_columns)
full_df['resume_combined_embeddings'] = _embed_columns(full_df, resume_columns)

## SiameseFFN


In [None]:
# Hold out 20% of the rows for evaluation, with a fixed random_state.
# NOTE(review): rows are split individually and without stratification, so pairs of the
# same vacancy can end up in both parts and class balance may differ - confirm this is intended.
df_train, df_test = train_test_split(full_df, test_size=0.2, random_state=SEED)

In [None]:
class VacanciesResumesDataset(Dataset):
    """Dataset of (vacancy embedding, resume embedding, target) triples read from a DataFrame.

    The frame must provide the columns ``vacancy_combined_embeddings``,
    ``resume_combined_embeddings`` and ``target``. Items are addressed by
    position, not by index label.
    """

    def __init__(self, dataframe):
        # Keep references to the three columns; conversion to tensors happens per item.
        self.vacancies_embeddings = dataframe['vacancy_combined_embeddings']
        self.resumes_embeddings = dataframe['resume_combined_embeddings']
        self.targets = dataframe['target']

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.vacancies_embeddings)

    def __getitem__(self, idx):
        """Return the ``idx``-th row as a tuple of three tensors."""
        sources = (self.vacancies_embeddings, self.resumes_embeddings, self.targets)
        vacancy_embedding, resume_embedding, target = (
            torch.tensor(column.iloc[idx]) for column in sources
        )
        return vacancy_embedding, resume_embedding, target

In [None]:
# Training data: shuffled mini-batches of 8 pairs, loaded in the main process.
dataset = VacanciesResumesDataset(df_train)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True, num_workers=0)

In [None]:
# Held-out data: evaluated one pair at a time, in the original order (no shuffling).
dataset_test = VacanciesResumesDataset(df_test)
val_dataloader = DataLoader(dataset_test, batch_size=1)

In [None]:
# Cosine-similarity module with default arguments.
# NOTE(review): not referenced by the cells visible in this notebook - confirm it is still needed.
cos_sim = nn.CosineSimilarity()

In [None]:
def euclidean_distance(x, y):
    """Return the Euclidean (L2) distance between tensors ``x`` and ``y``.

    The sum runs over all elements, so the result is a single scalar tensor.
    """
    difference = x - y
    squared_sum = torch.sum(difference ** 2)
    return torch.sqrt(squared_sum)

def manhattan_distance(x, y):
    """Return the Manhattan (L1) distance between tensors ``x`` and ``y``.

    The sum runs over all elements, so the result is a single scalar tensor.
    """
    absolute_difference = torch.abs(x - y)
    return torch.sum(absolute_difference)

def pearson_correlation(x, y):
    """Return the Pearson correlation coefficient between tensors ``x`` and ``y``.

    Both tensors are centred on their overall mean; the result is the sum of the
    element-wise products divided by the product of the two centred norms. No
    guard is applied for constant inputs, whose norm is zero.
    """
    centered_x = x - torch.mean(x)
    centered_y = y - torch.mean(y)

    numerator = torch.sum(centered_x * centered_y)
    norm_x = torch.sqrt(torch.sum(centered_x ** 2))
    norm_y = torch.sqrt(torch.sum(centered_y ** 2))

    return numerator / (norm_x * norm_y)

In [None]:
def normalixe(data):
  """Min-max scale the first element of every ``(value, label)`` pair to [0, 1].

  Args:
    data: iterable of 2-item pairs whose first item is numeric.

  Returns:
    A list of ``(scaled_value, label)`` tuples in the input order. An empty
    input yields an empty list. When all values are equal there is no range to
    scale by, so every value maps to ``0.0`` instead of dividing by zero.
  """
  pairs = list(data)
  if not pairs:
    return []

  values = [pair[0] for pair in pairs]
  min_val = min(values)
  span = max(values) - min_val

  if span == 0:
    return [(0.0, y) for _, y in pairs]

  normalized_data = [((x - min_val) / span, y) for x, y in pairs]
  return normalized_data

## Models

In [None]:
# Input width of the classifier = dimensionality of the sentence embeddings.
in_size = model_for_emb_firstly.get_sentence_embedding_dimension() # encoder size 768

### Another model type

In [None]:
class SiameseFFN(nn.Module):
  """Two-branch feed-forward classifier for (vacancy, resume) embedding pairs.

  Each input goes through its own stack of three Linear layers
  (``fc11..fc13`` for the first input, ``fc21..fc23`` for the second); the two
  BatchNorm layers are shared by both branches. The branch outputs ``u`` and
  ``v`` are combined as ``(u, v, |u - v|)`` and fed to a linear classifier.

  Args:
    in_size: size of the input embeddings.
    hid_size: width of the first and last layer of each branch.
    out_size: number of output logits (2: classes 0 and 1).
  """

  def __init__(self, in_size, hid_size=100, out_size=2):
    super().__init__()
    bottleneck = hid_size // 2

    self.in_size = in_size
    self.hid_size = hid_size
    self.out_size = out_size

    # Branch for the first input.
    self.fc11 = nn.Linear(in_size, hid_size)
    self.fc12 = nn.Linear(hid_size, bottleneck)
    self.fc13 = nn.Linear(bottleneck, hid_size)
    # Branch for the second input.
    self.fc21 = nn.Linear(in_size, hid_size)
    self.fc22 = nn.Linear(hid_size, bottleneck)
    self.fc23 = nn.Linear(bottleneck, hid_size)

    self.relu = nn.ReLU()
    self.bn1 = nn.BatchNorm1d(hid_size)
    self.bn2 = nn.BatchNorm1d(bottleneck)

    self.classifier = nn.Linear(in_features=hid_size * 3, out_features=out_size)

  def _branch(self, emb, first, second, last):
    """Run one branch: Linear -> ReLU -> BN, twice, followed by a final Linear."""
    hidden = self.bn1(self.relu(first(emb)))
    hidden = self.bn2(self.relu(second(hidden)))
    return last(hidden)

  def forward(self, inp_emb1, inp_emb2):
    """Return the class logits for a batch of embedding pairs."""
    u = self._branch(inp_emb1, self.fc11, self.fc12, self.fc13)
    v = self._branch(inp_emb2, self.fc21, self.fc22, self.fc23)
    features = torch.cat((u, v, torch.abs(u - v)), dim=1)  # (u, v, |u-v|)
    return self.classifier(features)

In [None]:
# Re-seed all generators with seed 35 right before the model is created and trained.
set_seed(35)

In [None]:
# Fresh model, cross-entropy loss over the two logits, and AdamW with default settings.
model = SiameseFFN(in_size).to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters())

# Number of full passes over the training DataLoader.
num_epoch = 7

for epoch in trange(num_epoch):
    model.train()
    # Per-batch loss and accuracy values of the current epoch.
    epoch_losses = []
    epoch_acc = []
    for data in dataloader:
        # Each batch is (vacancy embeddings, resume embeddings, targets); move all to the device.
        vac_emb, res_emb, target = [d.to(device) for d in data]

        optimizer.zero_grad()
        logits = model(vac_emb, res_emb)
        loss_t = loss_fn(logits, target)
        epoch_losses.append(loss_t.item())
        loss_t.backward()
        optimizer.step()

        # accuracy
        preds = torch.argmax(logits.detach(), dim=1)  # argmax is used instead of a threshold
        accuracy = (preds == target).float().mean().item()
        epoch_acc.append(accuracy)

    # Reported values are plain means over batches (not weighted by batch size).
    print('Epoch: {}, Loss: {:.4f}, Accuracy: {:.4f}'.format(epoch + 1, np.mean(epoch_losses), np.mean(epoch_acc)))

 14%|█▍        | 1/7 [00:00<00:01,  4.24it/s]

Epoch: 1, Loss: 0.6183, Accuracy: 0.6477


 29%|██▊       | 2/7 [00:00<00:01,  4.15it/s]

Epoch: 2, Loss: 0.5877, Accuracy: 0.6686


 43%|████▎     | 3/7 [00:00<00:00,  4.11it/s]

Epoch: 3, Loss: 0.5644, Accuracy: 0.6951


 57%|█████▋    | 4/7 [00:00<00:00,  4.11it/s]

Epoch: 4, Loss: 0.5431, Accuracy: 0.7121


 71%|███████▏  | 5/7 [00:01<00:00,  4.12it/s]

Epoch: 5, Loss: 0.4933, Accuracy: 0.7367


 86%|████████▌ | 6/7 [00:01<00:00,  4.06it/s]

Epoch: 6, Loss: 0.4668, Accuracy: 0.7595


100%|██████████| 7/7 [00:01<00:00,  4.08it/s]

Epoch: 7, Loss: 0.4609, Accuracy: 0.7936





In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Evaluate on the held-out pairs. Predictions and the loss are collected in a
# single pass, so the model runs over the validation set only once.
model.eval()
loss_fn = nn.CrossEntropyLoss()
predictions = []
targets = []
val_loss = 0

with torch.no_grad():
    for data in val_dataloader:
        vac_emb, res_emb, target = [d.to(device) for d in data]
        logits = model(vac_emb, res_emb)
        val_loss += loss_fn(logits, target).item()
        # softmax is monotonic, so the argmax of the logits is the predicted class
        preds = torch.argmax(logits, dim=1)
        predictions.extend(preds.to('cpu').numpy())
        targets.extend(target.to('cpu').numpy())

# Mean loss per validation batch.
val_loss /= len(val_dataloader)

precision = precision_score(targets, predictions)
print(targets)
recall = recall_score(targets, predictions)
f1 = f1_score(targets, predictions)

print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}')
print('Validation Loss: ', val_loss)

[1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]
Precision: 0.3333, Recall: 0.3000, F1-Score: 0.3158
Validation Loss:  0.7203881322233373


In [None]:
# Save the trained weights into the data directory: the inference section loads
# them from `path + 'rrr.pth'`, so the file must be written to that same location.
torch.save(model.state_dict(), path + 'rrr.pth')

## Inference of SiameseFFN

In [None]:
# Input width of the classifier = dimensionality of the sentence embeddings.
in_size = model_for_emb_firstly.get_sentence_embedding_dimension() # encoder size 768

In [None]:
class SiameseFFN(nn.Module):
  """Two-branch feed-forward classifier for (vacancy, resume) embedding pairs.

  The layer names and sizes define the keys of the state dict, so they must
  match the checkpoint that is loaded into this class. Each input has its own
  three Linear layers (``fc1x`` / ``fc2x``); both branches share the two
  BatchNorm layers. The branch outputs are combined as ``(u, v, |u - v|)``
  before the final linear classifier.

  Args:
    in_size: size of the input embeddings.
    hid_size: width of the first and last layer of each branch.
    out_size: number of output logits (2: classes 0 and 1).
  """

  def __init__(self, in_size, hid_size=100, out_size=2):
    super().__init__()
    half_hidden = hid_size // 2

    self.in_size = in_size
    self.hid_size = hid_size
    self.out_size = out_size

    # Layers of the first-input branch.
    self.fc11 = nn.Linear(in_size, hid_size)
    self.fc12 = nn.Linear(hid_size, half_hidden)
    self.fc13 = nn.Linear(half_hidden, hid_size)
    # Layers of the second-input branch.
    self.fc21 = nn.Linear(in_size, hid_size)
    self.fc22 = nn.Linear(hid_size, half_hidden)
    self.fc23 = nn.Linear(half_hidden, hid_size)

    self.relu = nn.ReLU()
    self.bn1 = nn.BatchNorm1d(hid_size)
    self.bn2 = nn.BatchNorm1d(half_hidden)

    self.classifier = nn.Linear(in_features=hid_size * 3, out_features=out_size)

  def _run_branch(self, emb, layers):
    """Apply Linear -> ReLU -> BN twice, then the last Linear of ``layers``."""
    first, second, last = layers
    hidden = self.bn1(self.relu(first(emb)))
    hidden = self.bn2(self.relu(second(hidden)))
    return last(hidden)

  def forward(self, inp_emb1, inp_emb2):
    """Return the class logits for a batch of embedding pairs."""
    u = self._run_branch(inp_emb1, (self.fc11, self.fc12, self.fc13))
    v = self._run_branch(inp_emb2, (self.fc21, self.fc22, self.fc23))
    combined = torch.cat((u, v, torch.abs(u - v)), dim=1)  # (u, v, |u-v|)
    return self.classifier(combined)

In [None]:
# Rebuild the network and load the saved weights. map_location remaps the stored
# tensors onto the current device, so a checkpoint written on a GPU runtime can
# also be loaded on a CPU-only runtime.
the_model = SiameseFFN(in_size).to(device)
the_model.load_state_dict(torch.load(path + 'rrr.pth', map_location=device))

<All keys matched successfully>

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
# Read the reference JSON file used for inference (UTF-8) into `data_2`.
with open(path + 'case_2_reference_without_resume_sorted.json', encoding='utf-8') as f:
   data_2 = json.load(f)

In [None]:
# Wrap the loaded object in a list so that it becomes a single DataFrame row.
# Note: this rebinds `df`, which held the training data earlier in the notebook.
data_2 = [data_2]
df = pd.DataFrame(data_2)

In [None]:
# NB: build [vacancy, resume] pairs for inference (there is no target here).
# The vacancy is taken from the first column with an explicit positional accessor:
# plain `vacancy[0]` on a label-indexed Series is deprecated in recent pandas versions.
triplets = []
for _, vacancy_row in df.iterrows():
  vacancy_data = vacancy_row.iloc[0]
  triplets.extend([vacancy_data, resume] for resume in vacancy_row['resumes'])

In [None]:
# Build the inference DataFrame with one row per (vacancy, resume) pair.
# The frame is created in a single call: growing it with pd.concat inside a loop
# copies all previous rows on every iteration (quadratic time).
full_df_1 = pd.DataFrame(triplets, columns=['vacancy', 'resume'])

In [None]:
full_df_1.head()

Unnamed: 0,vacancy,resume
0,{'uuid': '8b9c8d16-c7f0-38a2-b80c-d94030c15a6f...,{'uuid': '0dfe8e63-d7a3-3fe4-b9d7-1b8122158f33...
1,{'uuid': '8b9c8d16-c7f0-38a2-b80c-d94030c15a6f...,{'uuid': 'f8b69e24-e2c0-3186-9578-380835eb2ee7...
2,{'uuid': '8b9c8d16-c7f0-38a2-b80c-d94030c15a6f...,{'uuid': 'e3976e74-e71b-34db-8e98-08dc422fa567...
3,{'uuid': '8b9c8d16-c7f0-38a2-b80c-d94030c15a6f...,{'uuid': '9a9c3ff1-49f8-30dd-a294-e56fc60cae64...
4,{'uuid': '8b9c8d16-c7f0-38a2-b80c-d94030c15a6f...,{'uuid': '6561771c-7ef3-3e50-ab3a-ba8547201480...


In [None]:
# Load the 'all-distilroberta-v1' sentence encoder for inference
# (the same model name is also loaded earlier in the notebook).
model_for_emb_firstly = SentenceTransformer('all-distilroberta-v1')

In [None]:
# Label -> key mapping used to render one work-experience item as text.
token_column_mapping_for_work = {
    'Работодатель': 'employer',
    'Город': 'city',
    'Профессия': 'position',
    'Описание': 'description'
}

# Label -> key mapping used to render one education item as text.
token_column_mapping_for_education = {
    'Год окончания обучения': 'year',
    'Учебное учереждение': 'organization',
    'Факультет': 'faculty',
    'Специальность': 'specialty',
    'Результат обучения': 'result',
    'Тип образования': 'education_type',
    'Уровень образования': 'education_level'
}

# --- Vacancy: copy the fields of every vacancy dict into flat columns ---
vac_uuid = [vac['uuid'] for vac in full_df_1['vacancy']]
vac_name = [vac['name'] for vac in full_df_1['vacancy']]
vac_keywords = [vac['keywords'] for vac in full_df_1['vacancy']]
vac_description = [vac['description'] for vac in full_df_1['vacancy']]
vac_comment = [vac['comment'] for vac in full_df_1['vacancy']]

for column_name, column_values in (
    ('vacancy_uuid', vac_uuid),
    ('vacancy_name', vac_name),
    ('vacancy_keywords', vac_keywords),
    ('vacancy_description', vac_description),
    ('vacancy_comment', vac_comment),
):
  full_df_1[column_name] = column_values

# --- Resume: collect the scalar fields of every resume dict ---
res_uuid = [res['uuid'] for res in full_df_1['resume']]
res_name = [res['first_name'] for res in full_df_1['resume']]
res_surname = [res['last_name'] for res in full_df_1['resume']]
res_birth = [res['birth_date'] for res in full_df_1['resume']]
res_country = [res['country'] for res in full_df_1['resume']]
res_city = [res['city'] for res in full_df_1['resume']]
res_about = [res['about'] for res in full_df_1['resume']]
res_key_skills = [res['key_skills'] for res in full_df_1['resume']]

# Work experience as one text per resume (weighted per-job embeddings were the
# original idea, but were dropped because of inference time).
work_exp = [
    '; Следующая работа: '.join(
        concatenate_text_columns_for_dict(job, token_column_mapping_for_work)
        for job in res['experienceItem']
    ) if 'experienceItem' in res else 'Нет опыта работы'
    for res in full_df_1['resume']
]
full_df_1['resume_work_exp'] = work_exp

# Education as one text per resume, with a placeholder when the key is absent.
education = [
    '; Следующее образование: '.join(
        concatenate_text_columns_for_dict(item, token_column_mapping_for_education)
        for item in res['educationItem']
    ) if 'educationItem' in res else 'Нет образования'
    for res in full_df_1['resume']
]
full_df_1['resume_educationItem'] = education

for column_name, column_values in (
    ('resume_uuid', res_uuid),
    ('resume_name', res_name),
    ('resume_surname', res_surname),
    ('resume_birth', res_birth),
    ('resume_country', res_country),
    ('resume_city', res_city),
    ('resume_about', res_about),
    ('resume_key_skills', res_key_skills),
):
  full_df_1[column_name] = column_values

In [None]:
# Keep only the first four characters of the birth date (parsed as an int) behind a
# Russian "date of birth" label; None becomes a "date of birth: unknown" placeholder.
full_df_1['resume_birth'] = full_df_1['resume_birth'].apply(lambda x: f'Дата рождения: {int(x[:4])}' if x is not None else 'Дата рождения: неизвестно')

In [None]:
# Replace every remaining missing value with a single space so that text concatenation works.
full_df_1 = full_df_1.fillna(' ')

In [None]:
# Columns concatenated (in this order) into the text that represents a resume.
resume_columns = [
    'resume_birth',
    'resume_country',
    'resume_city',
    'resume_about',
    'resume_key_skills',
    'resume_work_exp',
    'resume_educationItem'
]

# Columns concatenated (in this order) into the text that represents a vacancy.
vacancy_columns = [
    'vacancy_name',
    'vacancy_keywords',
    'vacancy_description',
    'vacancy_comment'
]

In [None]:
# googletrans client used to translate the combined texts before they are encoded.
translator = Translator()

In [None]:
full_df_1.head()

Unnamed: 0,vacancy,resume,vacancy_uuid,vacancy_name,vacancy_keywords,vacancy_description,vacancy_comment,resume_work_exp,resume_educationItem,resume_uuid,resume_name,resume_surname,resume_birth,resume_country,resume_city,resume_about,resume_key_skills
0,{'uuid': '8b9c8d16-c7f0-38a2-b80c-d94030c15a6f...,{'uuid': '0dfe8e63-d7a3-3fe4-b9d7-1b8122158f33...,8b9c8d16-c7f0-38a2-b80c-d94030c15a6f,Java разработчик,"Kafka, Java, RxJava, Hystrix, MongoDB",Требования: 4+ года опыта работы с Java 8+ или...,,Работодатель: МКК АсбоцементАлмаз Город: Кашир...,Год окончания обучения: 2021 Учебное учережден...,0dfe8e63-d7a3-3fe4-b9d7-1b8122158f33,Клим,Тетерина,Дата рождения: 1991,Россия,Москва,"Личные качества: ответственность, лидерство, г...","Java Core, Spring Framework, Hibernate ORM, Po..."
1,{'uuid': '8b9c8d16-c7f0-38a2-b80c-d94030c15a6f...,{'uuid': 'f8b69e24-e2c0-3186-9578-380835eb2ee7...,8b9c8d16-c7f0-38a2-b80c-d94030c15a6f,Java разработчик,"Kafka, Java, RxJava, Hystrix, MongoDB",Требования: 4+ года опыта работы с Java 8+ или...,,Работодатель: ПАО ХозАвто Город: Талдом Профес...,Нет образования,f8b69e24-e2c0-3186-9578-380835eb2ee7,Алиса,Ситникова,Дата рождения: 1990,Россия,Нижний Новгород,Обязанности: в составе группы поддержки я отв...,"Java 8-17, Java SE, Java EE, Spring Framework ..."
2,{'uuid': '8b9c8d16-c7f0-38a2-b80c-d94030c15a6f...,{'uuid': 'e3976e74-e71b-34db-8e98-08dc422fa567...,8b9c8d16-c7f0-38a2-b80c-d94030c15a6f,Java разработчик,"Kafka, Java, RxJava, Hystrix, MongoDB",Требования: 4+ года опыта работы с Java 8+ или...,,Работодатель: ООО Компания Рос Город: Мытищи П...,Год окончания обучения: 2009 Учебное учережден...,e3976e74-e71b-34db-8e98-08dc422fa567,Розалина,Андреев,Дата рождения: 1990,Россия,Санкт-Петербург,Обучаемость Коммуникабельность Организованнос...,"Java 8 и 11, osgi, postgresql, testng, mockito..."
3,{'uuid': '8b9c8d16-c7f0-38a2-b80c-d94030c15a6f...,{'uuid': '9a9c3ff1-49f8-30dd-a294-e56fc60cae64...,8b9c8d16-c7f0-38a2-b80c-d94030c15a6f,Java разработчик,"Kafka, Java, RxJava, Hystrix, MongoDB",Требования: 4+ года опыта работы с Java 8+ или...,,Работодатель: ОАО ПивТехCиб Город: Серпухов Пр...,Год окончания обучения: 2013 Учебное учережден...,9a9c3ff1-49f8-30dd-a294-e56fc60cae64,Антон,Кудрявцева,Дата рождения: 1990,Россия,Москва,Общий опыт разработки на различных языках про...,"Java EE, Spring Framework, Intellij IDEA, Рабо..."
4,{'uuid': '8b9c8d16-c7f0-38a2-b80c-d94030c15a6f...,{'uuid': '6561771c-7ef3-3e50-ab3a-ba8547201480...,8b9c8d16-c7f0-38a2-b80c-d94030c15a6f,Java разработчик,"Kafka, Java, RxJava, Hystrix, MongoDB",Требования: 4+ года опыта работы с Java 8+ или...,,Работодатель: ООО Компания ЮпитерХмельГлавЛизи...,Год окончания обучения: 2015 Учебное учережден...,6561771c-7ef3-3e50-ab3a-ba8547201480,Александра,Панова,Дата рождения: 1995,Россия,Москва,"О себе: Java/Kotlin-разработчик, знаю Spring ...",


In [None]:
def _translate_and_encode(text):
    """Translate ``text`` with googletrans and embed the translation with the sentence encoder."""
    return model_for_emb_firstly.encode(translator.translate(text).text)


def _embed_columns(frame, columns):
    """Return one embedding per row for the concatenation of ``columns``.

    Every distinct text is translated and encoded only once. The vacancy text is
    repeated on each resume row, so this avoids re-translating and re-encoding
    it for every resume.
    """
    texts = frame.apply(lambda row: concatenate_text_columns(row, columns), axis=1)
    embedding_by_text = {text: _translate_and_encode(text) for text in texts.unique()}
    # A callable (not the dict itself) is passed to map so that each array is
    # stored as a single cell value.
    return texts.map(embedding_by_text.__getitem__)


full_df_1['vacancy_combined_embeddings'] = _embed_columns(full_df_1, vacancy_columns)
full_df_1['resume_combined_embeddings'] = _embed_columns(full_df_1, resume_columns)

In [None]:
# Device used for inference: the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Probability of class 1 above which a resume is reported as a match.
MATCH_THRESHOLD = 0.25

the_model.eval()
probability = []
predictions = []
index_vac = []
index_res = []

with torch.no_grad():
  for ind, row in full_df_1.iterrows():
    # Add a batch dimension of size 1 to each embedding before the forward pass.
    vac_emb = torch.unsqueeze(torch.tensor(row['vacancy_combined_embeddings']).to(device), 0)
    res_emb = torch.unsqueeze(torch.tensor(row['resume_combined_embeddings']).to(device), 0)

    logits = the_model(vac_emb, res_emb)
    # Probability of class 1 for this pair, read once as a Python float.
    match_probability = torch.softmax(logits, dim=1)[0, 1].item()

    probability.append(match_probability)
    predictions.append(1 if match_probability > MATCH_THRESHOLD else 0)
    index_vac.append(row['vacancy_uuid'])
    index_res.append(row['resume_uuid'])

# One result row per (vacancy, resume) pair.
data = {
    'Probability': probability,
    'Predictions': predictions,
    'Index_vac': index_vac,
    'Index_res': index_res
}
df_win = pd.DataFrame(data)
df_win

Unnamed: 0,Probability,Predictions,Index_vac,Index_res
0,0.138056,0,8b9c8d16-c7f0-38a2-b80c-d94030c15a6f,0dfe8e63-d7a3-3fe4-b9d7-1b8122158f33
1,0.232937,0,8b9c8d16-c7f0-38a2-b80c-d94030c15a6f,f8b69e24-e2c0-3186-9578-380835eb2ee7
2,0.042568,0,8b9c8d16-c7f0-38a2-b80c-d94030c15a6f,e3976e74-e71b-34db-8e98-08dc422fa567
3,0.173835,0,8b9c8d16-c7f0-38a2-b80c-d94030c15a6f,9a9c3ff1-49f8-30dd-a294-e56fc60cae64
4,0.195359,0,8b9c8d16-c7f0-38a2-b80c-d94030c15a6f,6561771c-7ef3-3e50-ab3a-ba8547201480
...,...,...,...,...
108,0.085184,0,8b9c8d16-c7f0-38a2-b80c-d94030c15a6f,82df355a-235e-3046-9e6e-782ddf1600eb
109,0.171772,0,8b9c8d16-c7f0-38a2-b80c-d94030c15a6f,915597ce-24e5-31fa-8dca-29437f49f839
110,0.154896,0,8b9c8d16-c7f0-38a2-b80c-d94030c15a6f,f288a532-0b58-30cb-ac3c-f87e53984719
111,0.144432,0,8b9c8d16-c7f0-38a2-b80c-d94030c15a6f,3e3a379f-226e-305e-b7d8-cf341e00cbd7


In [None]:
# Rank the pairs from the highest to the lowest probability and write the ranking to a CSV file.
df_sorted = df_win.sort_values('Probability', ascending=False)
df_sorted.to_csv('sorted_data.csv', index=False)

In [None]:
df_sorted.head()

Unnamed: 0,Probability,Predictions,Index_vac,Index_res
79,0.498744,1,8b9c8d16-c7f0-38a2-b80c-d94030c15a6f,c70de373-9f3a-3647-ab66-f25e98c29409
112,0.464495,1,8b9c8d16-c7f0-38a2-b80c-d94030c15a6f,cc88bf96-f0b9-313a-abce-dbe60b6f1c98
74,0.403946,1,8b9c8d16-c7f0-38a2-b80c-d94030c15a6f,37cba700-eed6-3018-bad6-f720f8217aeb
56,0.324371,1,8b9c8d16-c7f0-38a2-b80c-d94030c15a6f,653dadb9-5c19-3f6a-8207-7e55e7db331a
41,0.313488,1,8b9c8d16-c7f0-38a2-b80c-d94030c15a6f,8b1dc5d0-dde1-31be-851f-a643ae235d50


In [None]:
# Show the description of the selected vacancy. `.iloc[0]` takes the first matching
# row by position, so the lookup does not depend on a row labelled 0 passing the filter.
full_df_1.loc[full_df_1['vacancy_uuid'] == '8b9c8d16-c7f0-38a2-b80c-d94030c15a6f', 'vacancy_description'].iloc[0]

'Требования: 4+ года опыта работы с Java 8+ или Kotlin 4+ года опыта работы с Spring и 2+ год работы с Spring Boot. Опыт работы с системами на микросервисной архитектуре (Spring Cloud, Kubernetes, Openshift или аналоги). Опыт работы с Docker. Опыт работы с 3-мя любыми из следующих технологий NoSQL (MongoDB, Elasticsearch, аналоги) SQL (PostgreSQL, Oracle, аналоги) Брокеры сообщений (Kafka, RabbitMQ, аналоги) Reactive programming (RxJava, Project Reactor) Cache (Redis, Hazelcast). Настройка CI/CD (GitlabCI, Jenkins, аналоги). Настройка средств мониторинга (Zabbix, Prometheus). Настройка средств логирования (Graylog, ELK). Будет хорошо, если кандидат: Знает все технологии из первого пункта; Участвовал в проработке архитектуры и может объяснить все решения на своем проекте; Готов драйвить техническое развитие систем, а не просто следовать текущим стандартам; Имеет дружеские отношения с DevOps, пайплайны, контейнеризация и оркестрация, Linux; Понимает микросервисную архитектуры и устройств

In [None]:
full_df_1[full_df_1['resume_uuid'] == 'c70de373-9f3a-3647-ab66-f25e98c29409']

Unnamed: 0,vacancy,resume,vacancy_uuid,vacancy_name,vacancy_keywords,vacancy_description,vacancy_comment,resume_work_exp,resume_educationItem,resume_uuid,resume_name,resume_surname,resume_birth,resume_country,resume_city,resume_about,resume_key_skills,vacancy_combined_embeddings,resume_combined_embeddings
79,{'uuid': '8b9c8d16-c7f0-38a2-b80c-d94030c15a6f...,{'uuid': 'c70de373-9f3a-3647-ab66-f25e98c29409...,8b9c8d16-c7f0-38a2-b80c-d94030c15a6f,Java разработчик,"Kafka, Java, RxJava, Hystrix, MongoDB",Требования: 4+ года опыта работы с Java 8+ или...,,Работодатель: ОАО ТелеИнжМор Город: Талдом Про...,Год окончания обучения: 2023 Учебное учережден...,c70de373-9f3a-3647-ab66-f25e98c29409,Тарас,Хохлова,Дата рождения: 1993,Россия,Москва,,"Java, Spring Framework (Data, MVC, Security), ...","[0.0046033477, -0.07264344, -0.044480376, -0.0...","[0.0058360705, -0.017241351, -0.009310142, -0...."


In [None]:
full_df_1[full_df_1['resume_uuid'] == 'cc88bf96-f0b9-313a-abce-dbe60b6f1c98']

Unnamed: 0,vacancy,resume,vacancy_uuid,vacancy_name,vacancy_keywords,vacancy_description,vacancy_comment,resume_work_exp,resume_educationItem,resume_uuid,resume_name,resume_surname,resume_birth,resume_country,resume_city,resume_about,resume_key_skills,vacancy_combined_embeddings,resume_combined_embeddings
112,{'uuid': '8b9c8d16-c7f0-38a2-b80c-d94030c15a6f...,{'uuid': 'cc88bf96-f0b9-313a-abce-dbe60b6f1c98...,8b9c8d16-c7f0-38a2-b80c-d94030c15a6f,Java разработчик,"Kafka, Java, RxJava, Hystrix, MongoDB",Требования: 4+ года опыта работы с Java 8+ или...,,Работодатель: ООО Компания РечТверь Город: Вид...,Год окончания обучения: 2023 Учебное учережден...,cc88bf96-f0b9-313a-abce-dbe60b6f1c98,Марат,Мясников,Дата рождения: 1988,Россия,Москва,,"Kotlin, Java, Project Reactor","[0.0046033477, -0.07264344, -0.044480376, -0.0...","[0.021093352, -0.02603522, -0.018342977, -0.00..."
