## Фильтрация вакансий по профессиям и длине описания.

In [None]:
from pathlib import Path

import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm

In [None]:
# --- Input / output locations -------------------------------------------------
# Excel sheet with the reference professions (read via pd.read_excel, column "профессия").
PROFESSIONS_PATH = "../INPUT_DATA/professions.xlsx"
# bz2-compressed CSV with the raw vacancies, read chunk by chunk.
INPUT_FILE = "../INPUT_DATA/hh_2023-01-01_2023-04-01.csv.bz2"
# gzip-compressed CSV that receives the filtered vacancies.
# NOTE(review): "resutls" looks like a typo for "results" — confirm against the
# real directory name before changing it; the path is used as-is at runtime.
OUTPUT_FILE = "resutls/filtered_vacancies.csv.gz"

# --- Filtering parameters -----------------------------------------------------
# Number of CSV rows per chunk handed to pd.read_csv(chunksize=...).
CHUNKSIZE = 16
# Minimum cosine similarity between a vacancy name and its closest profession.
SIM_THRESHOLD = 0.95
# Reading stops once this many matching vacancies have been collected.
MAX_RESULTS = 50
# Inclusive bounds on the description length, in characters.
MIN_DESC_LEN = 100
MAX_DESC_LEN = 2000
# Run the encoder on the GPU when one is available, otherwise on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# A lightweight model is used because there is a lot of data and we could afford
# to ignore the subtler nuances.
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2", device=DEVICE)

In [None]:
# Load the reference professions and embed them once; every vacancy chunk is
# later compared against these vectors.
prof_df = pd.read_excel(PROFESSIONS_PATH)

# Normalise the same way vacancy names are normalised in process_chunk
# (lower-case + strip), so the labels written to the output carry no stray
# whitespace, and drop cells that are blank after stripping.
professions = prof_df["профессия"].dropna().astype(str).str.lower().str.strip()
professions = professions[professions != ""].tolist()

# An empty reference list can never match anything; stop here instead of
# scanning the whole input file for nothing.
if not professions:
    raise ValueError(f"No professions found in column 'профессия' of {PROFESSIONS_PATH}")

profession_embeds = model.encode(professions, convert_to_tensor=True, normalize_embeddings=True)

In [None]:
def process_chunk(chunk: pd.DataFrame) -> pd.DataFrame:
    """Return the vacancies of *chunk* whose name closely matches a known profession.

    Steps:
      1. Drop rows with a missing ``name`` or ``description``.
      2. Keep rows whose description length lies in
         ``[MIN_DESC_LEN, MAX_DESC_LEN]`` (inclusive).
      3. Encode the lower-cased, stripped names with ``model`` and compare them
         to ``profession_embeds`` by cosine similarity.
      4. Keep rows whose best similarity is at least ``SIM_THRESHOLD``.

    Args:
        chunk: Raw vacancies; must contain the columns ``_id``, ``name`` and
            ``description``.

    Returns:
        A frame with the columns ``_id``, ``name``, ``best_profession`` and
        ``description``. It is empty (but has the same columns) when no row
        survives the filters.

    Raises:
        KeyError: If ``name`` or ``description`` is missing from *chunk*, or if
            ``_id`` is missing when rows are selected for output.
    """
    output_columns = ["_id", "name", "best_profession", "description"]

    chunk = chunk.dropna(subset=["name", "description"])
    # Bail out before touching the .str accessor: a column that was entirely
    # NaN has a float dtype, and .str raises AttributeError on it even when the
    # frame is empty.
    if chunk.empty:
        return pd.DataFrame(columns=output_columns)

    # astype(str) keeps the .str accessor usable when pandas inferred a
    # non-string dtype for this particular chunk.
    desc_len = chunk["description"].astype(str).str.len()
    chunk = chunk[(desc_len >= MIN_DESC_LEN) & (desc_len <= MAX_DESC_LEN)].copy()
    if chunk.empty:
        return pd.DataFrame(columns=output_columns)

    names = chunk["name"].astype(str).str.lower().str.strip().tolist()
    name_embeds = model.encode(names, convert_to_tensor=True, normalize_embeddings=True)

    # Rows are vacancies, columns are professions.
    sim_matrix = util.cos_sim(name_embeds, profession_embeds)
    max_scores, best_indices = torch.max(sim_matrix, dim=1)

    chunk["similarity"] = max_scores.cpu().numpy()
    # One bulk transfer to Python ints instead of indexing with a 0-d tensor
    # (and one device sync) per row.
    chunk["best_profession"] = [professions[i] for i in best_indices.tolist()]

    return chunk.loc[chunk["similarity"] >= SIM_THRESHOLD, output_columns].copy()

In [None]:
# Stream the input file chunk by chunk, collect matching vacancies until
# MAX_RESULTS is reached, then write them to OUTPUT_FILE.
print("Начинаем обработку")
total_filtered = []
total_count = 0

# The chunked reader is a context manager: `with` closes the underlying bz2
# handle even when the loop stops early.
with pd.read_csv(INPUT_FILE, compression="bz2", chunksize=CHUNKSIZE, dtype={"_id": str}) as reader:
    for chunk in tqdm(reader, desc="Чтение чанков"):
        try:
            filtered = process_chunk(chunk)
        except Exception as e:
            # Deliberate best-effort: one bad chunk is reported and skipped so
            # it cannot abort a long run.
            tqdm.write(f"Ошибка в чанке: {e}")
            continue

        if filtered.empty:
            continue

        total_filtered.append(filtered)
        total_count += len(filtered)
        tqdm.write(f"Добавлено {len(filtered)} вакансий (всего: {total_count})")
        if total_count >= MAX_RESULTS:
            break

if total_filtered:
    # The last chunk may push the total past the cap; trim so MAX_RESULTS is a
    # real upper bound.
    result_df = pd.concat(total_filtered, ignore_index=True).head(MAX_RESULTS)
else:
    # pd.concat raises ValueError on an empty list; write a header-only file
    # instead of crashing when nothing matched.
    result_df = pd.DataFrame(columns=["_id", "name", "best_profession", "description"])

# to_csv does not create missing directories; make sure the target exists so
# the finished run is not lost at the very last step.
Path(OUTPUT_FILE).parent.mkdir(parents=True, exist_ok=True)
result_df.to_csv(OUTPUT_FILE, index=False, encoding="utf-8-sig", compression="gzip")
print(f"Сохранено {len(result_df)} строк в {OUTPUT_FILE}")