# Взвешивание признаков. Вакансия с наибольшим числом похожих вакансий

В файле vacancies.csv — сотни разных вакансий, некоторые из них похожи друг на друга. Нужно найти вакансию с наибольшим числом похожих вакансий.

## Import

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import pymorphy2

## Чтение файла

In [2]:
# Load the vacancy corpus; the descriptions are read from the `text` column
# further down.
VACANCIES_PATH = 'vacancies.csv'
df = pd.read_csv(VACANCIES_PATH)
df.head()

Unnamed: 0,text
0,Старший Java-разработчик в Музыку🎧\n\nВас ждет...
1,Python-разработчик в Яндекс.Лавку🍔\n\nЯндекс.Л...
2,Фронтенд-разработчик в Вертикали🏠\n\nВертикали...
3,iOS-разработчик в Вертикали (Буткемп)🍏\n\nВерт...
4,Старший разработчик в группу разработки бессер...


## Подготовка функции 

Подготовлю функцию, которая принимает текст с описанием вакансии, токенизирует его (в данном случае токен — это отдельное слово), удаляет из множества токенов (слов) стоп-слова, приводит каждый токен (слово) к нормальной форме (лемме), возвращает предобработанный текст, который состоит из токенов (слов) в нормальной форме и не содержит стоп-слов. В текстах вакансий есть знаки препинания, переносы строк, emoji и прочее, поэтому просто использовать функцию split не получится. Вместо split использую RegexpTokenizer из NLTK, который токенизирует текст с помощью регулярного выражения: всё, что ему удовлетворяет, считается токеном.

In [3]:
# Tokens are maximal runs of word characters. The raw string keeps the regex
# escape `\w` from being parsed as an (invalid) Python string escape, which
# triggers a warning on recent Python versions.
tokenizer = RegexpTokenizer(r'\w+')
# A set gives constant-time membership checks when filtering tokens.
russian_stopwords = set(stopwords.words('russian'))
# Morphological analyzer used to reduce each word to its normal form.
morph = pymorphy2.MorphAnalyzer()

In [4]:
def preprocess(text):
    """Normalize a vacancy description before TF-IDF vectorization.

    The text is split into tokens by the module-level ``tokenizer``, every
    token is lower-cased, tokens present in ``russian_stopwords`` are dropped,
    and each remaining token is replaced with the ``normal_form`` of the first
    parse returned by ``morph``.

    Returns the normalized tokens joined by single spaces.
    """
    lowered = (token.lower() for token in tokenizer.tokenize(text))
    lemmas = [
        morph.parse(token)[0].normal_form
        for token in lowered
        if token not in russian_stopwords
    ]
    return ' '.join(lemmas)

## Подготовка экземпляра TfidfVectorizer

Подготовлю экземпляр TfidfVectorizer. Передам ему функцию preprocess в качестве препроцессора и отключу нормализацию векторов (norm=None).

In [5]:
# `preprocess` replaces the vectorizer's built-in string preprocessing step;
# norm=None leaves the TF-IDF rows unnormalized.
vectorizer = TfidfVectorizer(norm=None, preprocessor=preprocess)

## Подготовка датафрейма result

Пропущу столбец text датафрейма df с текстами вакансий через TfidfVectorizer, затем создам датафрейм result на основе того, что вернёт векторизатор. Столбцы этого датафрейма — это слова, строки — документы, значения в ячейках — метрика TF-IDF для данного слова в данном документе.

In [6]:
# Fit the vectorizer on the vacancy texts and build the document-term matrix.
tfidf_matrix = vectorizer.fit_transform(df['text'])

# Dense table: one row per document, one column per vocabulary term.
feature_names = vectorizer.get_feature_names_out()
result = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

result

Unnamed: 0,000,06jjq6ru3cyxcp,0a0a1tvvgd6qw,0h4wwufcxmp3u,0ihk9s7cecxzn,0l4_0nwk3uwaku,0lrks90zuaid4,10,100,1000,...,юридический,явление,являться,ядро,язык,языковой,яндекс,яндекс360,яп,ящик
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.635329,0.0,2.741850,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,2.741850,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,1.370925,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
620,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.635329,0.0,1.370925,0.0,0.0,0.0
621,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,1.370925,0.0,0.0,0.0
622,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,1.370925,0.0,0.0,0.0
623,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,1.370925,0.0,0.0,0.0


## Подготовка датафрейма dist

Рассчитаю косинусное расстояние для всех векторов корпуса попарно. Для расчета использую функцию cosine_distances. Результат сохраню в датафрейм dist.

In [8]:
from sklearn.metrics.pairwise import cosine_distances

# Pairwise cosine distances between all document vectors, wrapped in a
# DataFrame so rows and columns can be addressed by document position.
distances = cosine_distances(result)

dist = pd.DataFrame(data=distances)

dist.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,615,616,617,618,619,620,621,622,623,624
0,0.0,0.867277,0.892791,0.898116,0.890252,0.924523,0.863116,0.004905,0.937734,0.744903,...,0.969723,0.962532,0.948985,0.937083,0.958178,0.85299,0.901102,0.950424,0.940666,0.940666
1,0.867277,0.0,0.850028,0.883378,0.932432,0.91026,0.872761,0.866623,0.851844,0.941676,...,0.942066,0.930739,0.945752,0.96684,0.940882,0.868696,0.930068,0.851514,0.873019,0.873019
2,0.892791,0.850028,0.0,0.705472,0.956243,0.895629,0.910963,0.910157,0.919442,0.954458,...,0.782189,0.694877,0.892991,0.798895,0.957063,0.945965,0.944117,0.950632,0.916042,0.916042
3,0.898116,0.883378,0.705472,0.0,0.953029,0.922851,0.918541,0.897613,0.956812,0.953168,...,0.784778,0.886813,0.906069,0.974369,0.950082,0.937717,0.952347,0.966035,0.972255,0.972255
4,0.890252,0.932432,0.956243,0.953029,0.0,0.911678,0.921108,0.889711,0.939244,0.929367,...,0.976503,0.975582,0.965005,0.937016,0.966852,0.897681,0.917412,0.966232,0.971561,0.971561


##  Вакансия с максимальным количеством похожих на неё вакансий

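Вектор косинусных расстояний у такой вакансии должен иметь больше всего элементов, значения которых меньше 0.5. Сама вакансия тоже попадает в этот счёт (расстояние до себя равно 0), но это одинаково для всех вакансий и на результат не влияет.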

In [9]:
# For every column, count the distances below 0.5, then take the label of the
# column with the largest count.
close_counts = (dist < 0.5).sum()
close_counts.idxmax()

143

In [10]:
# Position reported by the previous cell.
top_vacancy_pos = 143
df.iloc[top_vacancy_pos]

text    Разработчик-аналитик машинного обучения в Еду🍎...
Name: 143, dtype: object

In [11]:
# Additionally: every vacancy whose distance to vacancy 143 is below 0.5
# (the vacancy itself is included, at distance 0).
row_143 = dist.iloc[143]
row_143[row_143 < 0.5]

143    0.000000
221    0.197556
283    0.371167
293    0.415468
359    0.427564
365    0.404273
485    0.414981
486    0.414981
Name: 143, dtype: float64

In [12]:
# For example, one of the similar vacancies listed above.
example_pos = 485
df.iloc[example_pos]

text    ML-разработчик в Такси🚕\n\nНаша команда помога...
Name: 485, dtype: object