In [None]:
import re

import bleach
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [None]:
def vacancy_from_hh(base_url = 'https://api.hh.ru/vacancies?text=', 
                    search_query = 'аналитик',
                    pages_number = 20,
                    per_page = 100):
    """Download detailed vacancy records for a search query from the hh.ru API.

    Each search page is requested with ``only_with_salary=true``; for every
    short vacancy entry on the page, the entry's ``url`` is fetched to obtain
    the detailed payload.

    Parameters
    ----------
    base_url : str
        Search endpoint; ``search_query`` is appended to it verbatim.
    search_query : str
        Text to search for.
    pages_number : int
        Number of result pages to request (pages ``0 .. pages_number - 1``).
    per_page : int
        Page size sent to the API.

    Returns
    -------
    pandas.DataFrame
        ``pd.json_normalize`` of all detailed vacancy payloads, one row each.
    """
    data = []
    for page_number in tqdm(range(pages_number)):
        response = requests.get(base_url + search_query, {'per_page': str(per_page),
                                                          'page': page_number,
                                                          'only_with_salary': 'true'})
        json_data = response.json()
        # A response without 'items' must be skipped BEFORE anything reads that
        # key, otherwise the guard can never prevent the failure.
        if 'items' not in json_data:
            continue
        for short_vacancy_data in json_data['items']:
            vacancy_data = requests.get(short_vacancy_data.get('url')).json()
            data.append(vacancy_data)

    return pd.json_normalize(data)

In [None]:
# Search queries, in the order their result frames are concatenated below.
profession_queries = ('Аналитик', 'Электрик', 'Бухгалтер', 'Официант',
                      'Администратор', 'Охранник', 'Разработчик')

# Download every profession first, one API crawl per query ...
profession_frames = [vacancy_from_hh(search_query=query) for query in profession_queries]
(analyst, electrician, accountant, waiter,
 administrator, security, developer) = profession_frames

# ... then tag each frame with the query that produced it.
for profession_frame, query in zip(profession_frames, profession_queries):
    profession_frame['query_profession'] = query

data = pd.concat(profession_frames)

In [None]:
# Load the vacancy table from a local JSON file; later cells read its
# `description` column.
# NOTE(review): no visible cell writes 'all_professions.json' — presumably `data`
# from the scraping cell was saved to it; confirm.
vacancy_hh = pd.read_json('all_professions.json')

In [None]:

# Strip every HTML tag from the first description (the result is discarded).
bleach.clean(vacancy_hh.description[0], tags=[], strip=True)

from googletrans import Translator
from deep_translator import GoogleTranslator

# `Series.progress_apply` only exists after tqdm has registered itself with
# pandas; without this call the translation line raises AttributeError.
tqdm.pandas()

skill_hard = pd.read_csv('hardskills.txt')

# Translate the English skill names to Russian, one request per row.
translator = GoogleTranslator(source='en', target='ru')
skill_hard['ru_skills'] = skill_hard['Skill'].progress_apply(translator.translate)

# NOTE(review): `preprocess_text` is not defined in the visible part of the
# notebook — confirm where it comes from.
skill_hard['ru_skill_a'] = skill_hard['ru_skills'].apply(preprocess_text)

# Round-trip through Excel so the translated table can be reloaded from disk.
skill_hard.to_excel("skill_hard.xlsx", index = False)
skill_hard = pd.read_excel("skill_hard.xlsx")

def contains_skills(row):
    """Build a one-column 0/1 indicator of which descriptions mention a skill.

    Parameters
    ----------
    row : object
        Skill name. It is converted to text for matching and is also used as
        the label of the returned column.

    Returns
    -------
    pandas.DataFrame
        A single column named ``row`` with one entry per element of
        ``vacancy_hh.description``: 1 where the skill text occurs in the
        description, 0 otherwise.
    """
    # Skill names are literal phrases, so escape them: otherwise names such as
    # "C++" are parsed as regex syntax and either fail or match the wrong text.
    # Compiling once also avoids re-parsing the pattern for every description.
    skill_pattern = re.compile(re.escape(f"{row}"))
    descriptions = vacancy_hh.description
    flags = [1 if skill_pattern.search(descriptions[i]) else 0
             for i in range(len(descriptions))]
    return pd.DataFrame(flags, columns=[row])

skill_soft = pd.read_csv('skills_index_final.csv')

# `Series.progress_apply` only exists after tqdm has registered itself with
# pandas (the call is harmless if it has already been made).
tqdm.pandas()

translator = GoogleTranslator(source='en', target='ru')
# Drop the word "ability" from the skill names before translating them.
skill_soft['Skill'] = skill_soft['Skill'].str.replace('ability', '')
skill_soft['ru_skills'] = skill_soft['Skill'].progress_apply(translator.translate)

# NOTE(review): `preprocess_text` is not defined in the visible part of the
# notebook — confirm where it comes from.
skill_soft['ru_skill_a'] = skill_soft['ru_skills'].apply(preprocess_text)

# Round-trip through Excel so the translated table can be reloaded from disk.
skill_soft.to_excel("skill_soft.xlsx", index = False)
skill_soft = pd.read_excel("skill_soft.xlsx")

# One 0/1 column per soft skill; keep only skills found in at least one
# description, then drop repeated column names.
skill_soft_one_hot = pd.concat(skill_soft['ru_skill_a'].map(contains_skills).tolist(), axis=1)
skill_soft_one_hot_ru = skill_soft_one_hot[skill_soft_one_hot.columns[skill_soft_one_hot.sum() != 0]]
one_hot_skill = skill_soft_one_hot_ru.loc[:,~skill_soft_one_hot_ru.columns.duplicated()]

# Same indicator matrix for hard skills: Russian names (filtered to skills that
# occur at least once) and the original English names (unfiltered).
skill_hard_one_hot_ru = pd.concat(skill_hard['ru_skill_a'].map(contains_skills).tolist(), axis=1)
skill_hard_one_hot_ru = skill_hard_one_hot_ru[skill_hard_one_hot_ru.columns[skill_hard_one_hot_ru.sum() != 0]]
skill_hard_one_hot = pd.concat(skill_hard['Skill'].map(contains_skills).tolist(), axis=1)

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from datasets.dataset_dict import DatasetDict
from datasets import Dataset

import pyarrow as pa
import pyarrow.dataset as ds

import datasets

from sklearn.utils.class_weight import compute_class_weight
import torch

# Token-per-row annotation sheet; its two header cells are renamed to the
# column names used by the rest of the notebook.
# NOTE(review): the path is on Google Drive, but `drive.mount` only appears in
# a later cell — on a fresh runtime the mount has to happen first.
data_500 = (
    pd.read_excel('/content/drive/MyDrive/lbgkjv/tt (1).xlsx')
    .rename(columns={'-DOCSTART-': 'tokens', 'O': 'ner_tags'})
)

token_column = data_500['tokens'].to_numpy()
tag_column = data_500['ner_tags'].to_numpy()

# Cut both columns at every row whose token is a double quote, so each chunk
# after the first starts with a quote token; [1::2] keeps the 2nd, 4th, ...
# chunks. NOTE(review): presumably every text is wrapped in a pair of quotes,
# which would make the kept chunks the texts themselves — confirm.
quote_rows = np.where(token_column == '"')[0]
data_train_tokens = np.split(token_column, quote_rows)[1::2]
data_train_ner_tags = np.split(tag_column, quote_rows)[1::2]

# One row per text: running id, token array, tag array.
data_for_ner = (
    pd.DataFrame([data_train_tokens, data_train_ner_tags])
    .T
    .reset_index()
    .rename(columns={'index': 'id', 0: 'tokens', 1: 'ner_tags'})
)

# BIO tag inventory: position in the list is the integer id of the tag.
id2label = dict(enumerate(
    ["O", "B-Hard", "I-Hard", "B-Soft", "I-Soft", "B-Another", "I-Another"]
))
# Inverse mapping, tag name -> integer id.
label2id = {tag_name: tag_id for tag_id, tag_name in id2label.items()}

# Replace tag names with their integer ids; a value that is not a known tag
# name is kept unchanged.
data_train_ner_tags_new = [
    np.array([label2id.get(tag, tag) for tag in sentence_tags])
    for sentence_tags in data_train_ner_tags
]

# 80/20 split of the texts with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    data_train_tokens, data_train_ner_tags_new, train_size=0.8, random_state=42
)

# Shared column naming for both splits: running id, tokens, tag ids.
ner_column_names = {'index': 'id', 0: 'tokens', 1: 'ner_tags'}

data_train = pd.DataFrame([X_train, y_train]).T.reset_index().rename(columns=ner_column_names)
data_test = pd.DataFrame([X_test, y_test]).T.reset_index().rename(columns=ner_column_names)

# Wrap both frames as Arrow-backed datasets and bundle them into one DatasetDict.
dataset_train = Dataset(pa.Table.from_pandas(data_train))
dataset_test = Dataset(pa.Table.from_pandas(data_test))
data_final = datasets.DatasetDict({"train": dataset_train, "test": dataset_test})

# AutoTokenizer is needed here, but the notebook's only other import of it sits
# in a later cell; import it here so the cell runs on a fresh kernel.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")

# Tokenize the first training example and map the ids back to sub-word tokens
# (kept in `tokens` for inspection).
example = data_final['train'][0]
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align word-level tags to sub-word tokens.

    ``examples`` is a batch with ``"tokens"`` (lists of words) and
    ``"ner_tags"`` (one tag per word). The first sub-word token of each word
    receives the word's tag; special tokens and the remaining sub-word tokens
    receive -100. The aligned tags are stored under ``"labels"`` in the
    tokenizer output, which is returned.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def _align(word_ids, word_tags):
        # One label per sub-word token of a single example.
        aligned = []
        last_word = None
        for current_word in word_ids:
            starts_new_word = current_word is not None and current_word != last_word
            aligned.append(word_tags[current_word] if starts_new_word else -100)
            last_word = current_word
        return aligned

    encoded["labels"] = [
        _align(encoded.word_ids(batch_index=batch_pos), word_tags)
        for batch_pos, word_tags in enumerate(examples["ner_tags"])
    ]
    return encoded

In [None]:
# Tokenize both splits in batches and attach the aligned `labels` column.
tokenized_wnut = data_final.map(tokenize_and_align_labels, batched=True)

In [None]:
from transformers import DataCollatorForTokenClassification

# Collator built from the tokenizer; it is handed to the Trainer further down.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
import evaluate

# Load the "seqeval" metric; compute_metrics calls `seqeval.compute`.
seqeval = evaluate.load("seqeval")

In [None]:
# Tag names in the insertion order of label2id, so position i holds the name
# of tag id i.
label_list = [*label2id]

In [None]:
def compute_metrics(p):
    """Score token-classification predictions with seqeval.

    ``p`` unpacks into ``(predictions, labels)``. The arg-max over the last
    axis of the predictions gives the predicted tag ids; positions whose gold
    label is -100 are ignored. Returns a dict with the overall precision,
    recall, f1 and accuracy reported by ``seqeval.compute``.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept_predictions = []
        kept_labels = []
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            if gold_id == -100:
                continue
            kept_predictions.append(label_list[predicted_id])
            kept_labels.append(label_list[gold_id])
        true_predictions.append(kept_predictions)
        true_labels.append(kept_labels)

    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    reported = (
        ("precision", "overall_precision"),
        ("recall", "overall_recall"),
        ("f1", "overall_f1"),
        ("accuracy", "overall_accuracy"),
    )
    return {short_name: results[result_key] for short_name, result_key in reported}

In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

# rubert-tiny2 checkpoint loaded for token classification with seven labels,
# named through id2label / label2id.
model = AutoModelForTokenClassification.from_pretrained(
    "cointegrated/rubert-tiny2", num_labels=7, id2label=id2label, label2id=label2id
)

In [None]:
# Training configuration: evaluation is scheduled by steps, checkpoints are
# saved once per epoch.
training_args = TrainingArguments(
    output_dir="my_awesome_wnut_model_",
    learning_rate=8.886156780071567e-05,
    per_device_train_batch_size=15,
    per_device_eval_batch_size=7,
    num_train_epochs=40,
    weight_decay=.0060777108654959874,
    evaluation_strategy="steps",
    #evaluation_strategy="epoch",
    save_strategy="epoch",
    # NOTE(review): a float below 1 — presumably read as a fraction of the total
    # training steps by the installed transformers version; confirm.
    logging_steps=0.1,
    #metric_for_best_model = 'f1',
    #load_best_model_at_end=True
)

# Trainer wired to the tokenized splits, the padding collator and the
# seqeval-based compute_metrics defined above.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_wnut["train"],
    eval_dataset=tokenized_wnut["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
import torch
#torch.cuda.empty_cache()

In [None]:
# Predict on the test split and reduce the per-tag scores to tag ids.
predictions, labels, _ = trainer.predict(tokenized_wnut["test"])
predictions = np.argmax(predictions, axis=2)

# Pair predicted and gold ids per sequence, dropping positions whose gold
# label is the ignored index -100.
kept_pairs = [
    [(predicted_id, gold_id) for predicted_id, gold_id in zip(predicted_row, gold_row) if gold_id != -100]
    for predicted_row, gold_row in zip(predictions, labels)
]

# Translate the surviving ids into tag names.
true_predictions = [
    [label_list[predicted_id] for predicted_id, _gold_id in sequence_pairs]
    for sequence_pairs in kept_pairs
]
true_labels = [
    [label_list[gold_id] for _predicted_id, gold_id in sequence_pairs]
    for sequence_pairs in kept_pairs
]

In [None]:
import nltk
import sklearn_crfsuite

from copy import deepcopy
from collections import defaultdict

from sklearn_crfsuite import metrics

In [None]:
# Fetch the NER-Evaluation repository and move its `ner_evaluation` package
# into the working directory so it can be imported below.
!git clone https://github.com/davidsbatista/NER-Evaluation.git
!mv NER-Evaluation/ner_evaluation ner_evaluation

In [None]:
from ner_evaluation.ner_eval import collect_named_entities
# NOTE(review): this rebinds the name `compute_metrics`, which an earlier cell
# defines and passes to the Trainer; re-running that Trainer cell afterwards
# would pick up this function instead.
from ner_evaluation.ner_eval import compute_metrics
from ner_evaluation.ner_eval import compute_precision_recall_wrapper

In [None]:
# NOTE(review): `true_labels` is a list of per-sequence tag lists; presumably
# collect_named_entities expects the tags of a single sequence — confirm
# whether `collect_named_entities(true_labels[0])` was intended.
collect_named_entities(true_labels)[0]

In [None]:
from transformers import AutoModelForTokenClassification
import torch
from transformers import AutoTokenizer
import pandas as pd
import numpy as np


# Load the rubert-tiny2 tokenizer for the inference part of the notebook.
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive.
# NOTE(review): earlier cells already read files under /content/drive/..., so
# on a fresh runtime this mount has to run before them.
drive.mount('/content/drive')

In [None]:
# BIO tag inventory: position in the list is the integer id of the tag.
id2label = dict(enumerate(
    ["O", "B-Hard", "I-Hard", "B-Soft", "I-Soft", "B-Another", "I-Another"]
))
# Inverse mapping, tag name -> integer id.
label2id = {tag_name: tag_id for tag_id, tag_name in id2label.items()}

In [None]:
# Rebuild the seven-label token-classification model from the base checkpoint.
model = AutoModelForTokenClassification.from_pretrained("cointegrated/rubert-tiny2",
                                                        num_labels=7, id2label=id2label, label2id=label2id)

In [None]:
# Replace the weights with the state dict stored on Drive (presumably the
# fine-tuned weights from the training run above — confirm).
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source; no map_location is given, so the tensors are restored to the
# device they were saved from.
model.load_state_dict(torch.load('/content/drive/MyDrive/lbgkjv/model'))

In [None]:
# Sample vacancy description (Russian) used as a smoke test for the NER
# pipeline; tokens and punctuation are separated by spaces.
text = '''
"Привет ! Ищем в команду Alanbase талантливого Frontend-разработчика на React уровня
не ниже Middle+ с широким кругозором и искренней любовью к своему делу . Мы ждем ,
что наш новый коллега поможет разрабатывать и поддерживать наш продукт , используя
самые современные технологии и методы разработки и самое главное - ему это будет нравится .
Что нужно делать : Разработка и поддержка пользовательского интерфейса сервиса Оптимизация
интерфейса для обеспечения высокой производительности и отзывчивости Участие в принятии
архитектурных и технических решений по разработке программного обеспечения Взаимодействие с
разработчиками , системным аналитиками Помощь в улучшении процесса разработки Нам важно :
Опыт от 3 лет Знание и опыт работы с TypeScript , React и Redux , с RESTful API , с Docker
Понимание адаптивного и кросс-браузерного дизайна Опыт оценки и принятия архитектурных решений
Опыт оптимизации производительности веб-приложений Желательно , но не обязательно Опыт работы с
фреймворком Next.js Знание принципов UX/UI дизайна Понимание основ бэкенд-разработки
Знакомство с affiliate сферой Что предлагаем : Оформление : ТК , ГПХ , ИП У нас
перспективы для личностного и профессионального развития , динамичная и креативная
рабочая среда , вдохновляющие проекты и возможности , и , важно , удаленка с гибким
началом рабочего дня : )"
'''

In [None]:
from transformers import pipeline

# Use the first GPU when one is available and fall back to the CPU (-1)
# otherwise, so the cell does not crash on a CPU-only runtime.
inference_device = 0 if torch.cuda.is_available() else -1

# NER pipeline over the loaded model; sub-word predictions are merged into
# entity groups with the 'average' aggregation strategy.
classifier = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy='average',
                      device=inference_device, batch_size=2)
# Smoke test on the sample vacancy text.
pp = classifier(text)

In [None]:
data_for_label = pd.read_csv('/content/drive/MyDrive/lbgkjv/description_without_index.txt', 
                             header=None).iloc[:500][0].to_list()

In [None]:
data_pipe = pd.DataFrame(classifier(data_for_label))

In [None]:
classi_data = classifier(data_for_label)

In [None]:
dict_data = pd.read_csv('/content/drive/MyDrive/lbgkjv/description_with_index.txt', 
                        header=None).iloc[:500][0].to_dict()

In [None]:
dataset = []

for i in range(len(dict_data)):
    dataset.append(list(map(lambda x: {**x, **{'id': dict_data[i]}}, classi_data[i])))

In [None]:
pd.DataFrame(dataset)

In [None]:
ssss1 = pd.Series()
for i in range(pd.DataFrame(dataset).columns.shape[0]):
    ssss1 = pd.concat([pd.DataFrame(dataset).iloc[i][pd.DataFrame(dataset).iloc[i].notna()],
                    ssss1], axis=0)

In [None]:
dataset_final = pd.DataFrame(ssss1.tolist())

In [None]:
data_hh = pd.read_json('/content/drive/MyDrive/lbgkjv/all_professions.json')

In [None]:
dataset_final = dataset_final.merge(data_hh.reset_index()[
    ['index', 'name', 'key_skills', 'salary.from',
     'salary.to', 'salary.currency', 'salary.gross', 'description', 'query_profession']
    ], left_on='id', right_on='index')

In [None]:
dataset_final['salary.currency'].value_counts()

In [None]:
# Rows whose salary is explicitly marked as net (`salary.gross` is False).
# Comparing with False instead of negating with `~` keeps the mask a clean
# boolean even when the column contains missing values; such rows are left
# untouched.
net_salary_rows = dataset_final['salary.gross'].eq(False)

# Convert net salaries to gross by scaling both bounds with 100/87
# (presumably a 13% income tax — confirm), then mark the rows as gross.
dataset_final.loc[net_salary_rows, 'salary.from'] = dataset_final.loc[net_salary_rows, 'salary.from'] * 100/87
dataset_final.loc[net_salary_rows, 'salary.to'] = dataset_final.loc[net_salary_rows, 'salary.to'] * 100/87
dataset_final.loc[net_salary_rows, 'salary.gross'] = True

In [None]:
dataset_final.loc[dataset_final['salary.from'].isna(), 'salary'] = dataset_final.loc[dataset_final['salary.from'].isna(), 'salary.to']

In [None]:
dataset_final.loc[dataset_final['salary.to'].isna(), 'salary'] = dataset_final.loc[dataset_final['salary.to'].isna(), 'salary.from']

In [None]:
dataset_final.loc[dataset_final['salary'].isna(), 'salary'] = (dataset_final.loc[dataset_final['salary'].isna(), 'salary.to'] +
    dataset_final.loc[dataset_final['salary'].isna(), 'salary.from']) / 2

In [None]:
dataset_final.to_excel('/content/drive/MyDrive/lbgkjv/dataset_final.xlsx')

In [None]:
import matplotlib.pyplot as plt

In [None]:
for_plt = dataset_final[['id', 'query_profession', 'salary.from', 
                         'salary.to']].drop_duplicates('id')[['query_profession', 'salary.from', 'salary.to']]

In [None]:
for_plt.boxplot(by='query_profession', rot = 90)

In [None]:
# pandas has no top-level `mean` function, so average the two bounds row-wise
# for the rows whose `salary` is still missing.
salary_missing = dataset_final['salary'].isna()
dataset_final.loc[salary_missing, ['salary.to', 'salary.from']].mean(axis=1)

In [None]:
ssss = pd.Series()
for i in range(data_pipe.columns.shape[0]):
    ssss = pd.concat([data_pipe.iloc[i][data_pipe.iloc[i].notna()],
                    ssss], axis=0)

In [None]:
data_an = pd.DataFrame(ssss.tolist())

In [None]:
data_hard = data_an.query('entity_group == "Hard"')
data_soft = data_an.query('entity_group == "Soft"')

In [None]:
from wordcloud import WordCloud, STOPWORDS
# All recognized hard-skill phrases joined into one space-separated string.
strr = ' '.join(data_hard['word'])

In [None]:
# Word cloud of the unfiltered hard-skill text; the canvas is 10000x10000
# pixels, shown in a 5x5 inch figure.
wordCloud = WordCloud(width = 10000, height = 10000, random_state=1, background_color='black',
                      colormap='Set2', collocations=False).generate(strr)

plt.figure(figsize=(5,5))
plt.imshow(wordCloud)

In [None]:
# Read the stop-word list, one entry per line; the `with` block closes the
# file handle, which was otherwise left open.
with open('/content/drive/MyDrive/lbgkjv/stop-ru.txt', 'r', encoding='utf8') as stop_words_file:
    stop_words = stop_words_file.read().split('\n')

In [None]:
clear_data=[]
for i in strr.split():
    if(i not in stop_words):
        clear_data.append(i)

In [None]:
clear_data_str = strr = ' '.join(clear_data)

In [None]:
wordCloud = WordCloud(width = 10000, height = 10000, random_state=1,
                      background_color='black', colormap='Set2', collocations=False).generate(clear_data_str)
plt.figure(figsize=(5,5))
plt.imshow(wordCloud)

In [None]:
# All recognized soft-skill phrases joined into one space-separated string.
strr_s = ' '.join(data_soft['word'])

In [None]:
# Word cloud of the unfiltered soft-skill text.
wordCloud = WordCloud(width = 10000, height = 10000, random_state=1, background_color='black',
                      colormap='Set2', collocations=False).generate(strr_s)

plt.figure(figsize=(5,5))
plt.imshow(wordCloud)

In [None]:
clear_data_s=[]
for i in strr_s.split():
    if(i not in stop_words):
        clear_data_s.append(i)

In [None]:
clear_data_str_s = strr = ' '.join(clear_data_s)

In [None]:
wordCloud = WordCloud(width = 10000, height = 10000, random_state=1,
                      background_color='black', colormap='Set2', collocations=False).generate(clear_data_str_s)
plt.figure(figsize=(5,5))
plt.imshow(wordCloud)