In [2]:
import pandas as pd
import numpy as np
import time, json, gzip, re
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import pymystem3
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from scipy.sparse import csc_matrix, csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cosine
from sklearn.metrics import pairwise_distances

In [3]:
# Compiled once instead of on every call. DOTALL lets a tag whose attributes
# wrap onto another line (e.g. '<a\nhref="...">') be matched as one tag.
_HTML_TAG_RE = re.compile(r'<.*?>', re.DOTALL)


def clean_html(raw_html):
    """Remove HTML tags from a string.

    Everything between a '<' and the nearest following '>' is dropped
    (non-greedy match); the text between tags is kept as is.

    Parameters
    ----------
    raw_html : str
        Text that may contain HTML markup.

    Returns
    -------
    str
        The text with tags removed. Non-string input (None, NaN, ...) is
        treated as a missing value and yields an empty string instead of
        raising TypeError.
    """
    if not isinstance(raw_html, str):
        return ''
    return _HTML_TAG_RE.sub('', raw_html)

## 1. read data

In [6]:
%%time
with gzip.open('vacancies.json.gzip') as f:
    data = json.load(f)
    
df = pd.DataFrame(data)

CPU times: user 1min 14s, sys: 37.4 s, total: 1min 51s
Wall time: 2min 38s


## 2. preprocessing

### 2.1 string feats

In [7]:
# Strip markup from the free-text description.
df['description'] = df['description'].apply(clean_html)

# Flatten the list-of-dict columns into space-separated strings.
# An empty list maps to None; for driver licences that gap is then
# replaced by the 'no_info' sentinel.
df['key_skills_str'] = df['key_skills'].apply(
    lambda skills: None if skills == [] else ' '.join(skill['name'] for skill in skills)
)
df['driver_license_str'] = (
    df['driver_license_types']
    .apply(lambda types: None if types == [] else ' '.join(lic['id'] for lic in types))
    .fillna('no_info')
)

### 2.2 salary feats

In [8]:
# --- most frequent salary currency per area --------------------------------
# Count vacancies per (area, currency), order by that count, and keep the
# top-ranked currency of every area.
currency_df = (
    df[['area.name', 'salary.currency', 'id']]
    .groupby(['area.name', 'salary.currency'])
    .count()
    .sort_values('id', ascending=False)
)
currency_df['rnk'] = currency_df.groupby('area.name').cumcount()
currency_df = (
    currency_df[currency_df['rnk'] == 0]
    .reset_index()
    .rename(columns={'salary.currency': 'freq_currency'})
    .drop(['rnk', 'id'], axis=1)
)

# Drop a 'freq_currency' column left by a previous run of this cell so the
# merge does not produce 'freq_currency_x' / 'freq_currency_y' on re-execution.
df = (
    df.drop(columns='freq_currency', errors='ignore')
      .merge(currency_df, on=['area.name'], how='left')
)

# Missing currency -> the area's most frequent one -> 'RUR' as last resort.
df['salary.currency'] = df['salary.currency'].fillna(df['freq_currency']).fillna('RUR')

# Hard-coded multipliers converting one unit of each currency to RUR.
currency_mapper = {'RUR': 1, 'BYR': 29.62, 'KZT': 0.16, 'UAH': 2.72,
                   'USD': 73, 'UZS': 0.0077, 'EUR': 80,
                   'KGS': 0.95, 'AZN': 43.41, 'GEL': 23.39}

# Fail loudly (same exception type as the former per-row dict lookup) when a
# currency has no rate, instead of silently producing NaN salaries.
unknown_currencies = set(df['salary.currency'].unique()) - set(currency_mapper)
if unknown_currencies:
    raise KeyError(f'no RUR rate in currency_mapper for: {sorted(unknown_currencies)}')

# Missing gross flag -> the most frequent value of the column.
df['salary.gross'] = df['salary.gross'].fillna(df['salary.gross'].value_counts().index[0])

# Vectorised conversion to RUR. Gross salaries are scaled by 0.83, others are
# left as they are. This replaces a row-wise apply that indexed each row
# positionally (x[0], x[1], x[2]), which pandas 2.x deprecates for
# label-indexed Series and which is slow on large frames.
# NOTE(review): if 0.83 is meant to remove a 13% income tax the factor would
# be 0.87 -- confirm the intended value.
rur_rate = df['salary.currency'].map(currency_mapper)
net_factor = np.where(df['salary.gross'].astype(bool), 0.83, 1.0)

df['salary_from_rur'] = df['salary.from'] * rur_rate * net_factor
df['salary_to_rur'] = df['salary.to'] * rur_rate * net_factor
df['salary_gap'] = df['salary_to_rur'] - df['salary_from_rur']

In [9]:
# NOTE: this cell recomputes the same three columns as the end of the previous
# cell (salary_from_rur, salary_to_rur, salary_gap) and is therefore redundant.
#
# The conversion is vectorised: the former row-wise apply indexed each row
# positionally (x[0], x[1], x[2]), which pandas 2.x deprecates for
# label-indexed Series and which is slow on large frames.
unknown_currencies = set(df['salary.currency'].unique()) - set(currency_mapper)
if unknown_currencies:
    # Same exception type the per-row dict lookup used to raise.
    raise KeyError(f'no RUR rate in currency_mapper for: {sorted(unknown_currencies)}')

rur_rate = df['salary.currency'].map(currency_mapper)
# Gross salaries are scaled by 0.83; all others are left unchanged.
net_factor = np.where(df['salary.gross'].astype(bool), 0.83, 1.0)

df['salary_from_rur'] = df['salary.from'] * rur_rate * net_factor
df['salary_to_rur'] = df['salary.to'] * rur_rate * net_factor
df['salary_gap'] = df['salary_to_rur'] - df['salary_from_rur']

### 2.2.1 bin feats

In [10]:
# Decile edges of salary_from_rur. Coinciding deciles are collapsed (and the
# edges re-sorted), exactly as the salary_to_rur / salary_gap cells do,
# because pd.cut raises on non-unique bin edges. With all-distinct deciles the
# result is the same as before.
bin_edges = sorted(set(df['salary_from_rur'].quantile(q=np.linspace(0, 1, 11)).values))
# One label per bin: 10, 20, ... (label 0 is reserved for missing salaries).
labels = [10 * i for i in range(1, len(bin_edges))]
df['salary_from_rur_bins'] = (
    pd.cut(df['salary_from_rur'], bins=bin_edges, labels=labels, include_lowest=True)
    .cat.add_categories('0')
    .fillna('0')   # rows without a salary fall into the extra '0' category
    .astype(int)
)

In [11]:
# Unique, ascending decile edges of salary_to_rur.
bin_edges = sorted(set(df['salary_to_rur'].quantile(q=np.linspace(0, 1, 11)).values))
# One label per bin: 10, 20, ...
labels = [10 * i for i in range(1, len(bin_edges))]
df['salary_to_rur_bins'] = (
    pd.cut(df['salary_to_rur'], bins=bin_edges, labels=labels, include_lowest=True)
    .cat.add_categories('0')
    .fillna('0')   # rows that fall in no bin get the extra '0' category
    .astype(int)
)

In [12]:
# Unique, ascending decile edges of salary_gap (to minus from, in RUR).
bin_edges = sorted(set(df['salary_gap'].quantile(q=np.linspace(0, 1, 11)).values))
# One label per bin: 10, 20, ...
labels = [10 * i for i in range(1, len(bin_edges))]
df['salary_gap_bins'] = (
    pd.cut(df['salary_gap'], bins=bin_edges, labels=labels, include_lowest=True)
    .cat.add_categories('0')
    .fillna('0')   # rows that fall in no bin get the extra '0' category
    .astype(int)
)

### 2.3 ohe

In [13]:
# Keep the most common areas as their own category and pool the rest
# under 'Other'.
n_freq_areas = 50
freq_areas = df['area.name'].value_counts().head(n_freq_areas).index
df['freq_area'] = df['area.name'].where(df['area.name'].isin(freq_areas), 'Other')

# Categorical columns to one-hot encode.
ohe_cols = [
    'salary.currency',
    'salary.gross',
    'experience.name',
    'schedule.name',
    'employment.name',
    'freq_area',
    'salary_from_rur_bins',
    'salary_to_rur_bins',
    'salary_gap_bins',
]

# fit() returns the encoder itself, so it can be chained onto construction.
ohe = OneHotEncoder(handle_unknown='ignore').fit(df[ohe_cols])
ohe_matrix = ohe.transform(df[ohe_cols])

In [14]:
# `get_feature_names` was removed from scikit-learn in 1.2. Keep using it where
# it still exists (so the column names are unchanged there) and fall back to
# `get_feature_names_out` on newer releases. Note that the fallback names the
# columns after the input columns rather than with generic x0_, x1_, ...
# prefixes.
_ohe_feature_names = getattr(ohe, 'get_feature_names', None) or ohe.get_feature_names_out
# toarray() gives a plain ndarray; todense() returned the legacy np.matrix type.
ohe_df = pd.DataFrame(ohe_matrix.toarray(), columns=_ohe_feature_names())

### 2.4 flag cols

In [15]:
def _present_flag(value):
    """Return 1 if `value` is present and truthy, otherwise 0.

    NaN is truthy in Python, so a plain `1 if value else 0` would flag a
    missing value stored as NaN as present; NaN is treated as missing here.
    None and empty strings yield 0 as before.
    """
    if isinstance(value, float) and value != value:  # NaN is the only value != itself
        return 0
    return 1 if value else 0


df['with_address'] = df['address.raw'].apply(_present_flag)
df['with_metro'] = df['address.metro.line_name'].apply(_present_flag)
# 'no_info' is the sentinel written when no driver licence types are listed.
df['driver_license_flg'] = (df['driver_license_str'] != 'no_info').astype(int)
df['salary_gross_flg'] = df['salary.gross'].astype(int)

In [16]:
# TODO: should driver_license_str be one-hot encoded as well?

### 2.5 tf-idf

In [17]:
def clean_text(data):
    """Turn a raw text string into a list of lemmatised, stop-word-free tokens.

    Steps: lower-case the text, replace every character that is not a Latin
    letter, a Cyrillic letter or a digit with a space, split on whitespace,
    lemmatise each token with `mystem` (taking the first item of its result),
    pass the result through `wordnet_lemmatizer` as a verb, and drop tokens
    found in `stop_words`.

    Relies on the module-level names `mystem`, `wordnet_lemmatizer` and
    `stop_words`, which must be defined before the function is called.

    Parameters
    ----------
    data : str
        Text to normalise.

    Returns
    -------
    list of str
        The remaining lemmas, in their original order.
    """
    # Build the lookup set once per call: set membership is O(1) per token,
    # whereas testing against the list scans it for every token.
    stop_set = set(stop_words)

    tokens = re.sub('[^a-zа-яё0-9]', ' ', data.lower()).split()
    mystem_lemmas = [mystem.lemmatize(token)[0] for token in tokens]
    lemmas = [wordnet_lemmatizer.lemmatize(lemma, pos=wordnet.VERB) for lemma in mystem_lemmas]
    return [lemma for lemma in lemmas if lemma not in stop_set]

In [18]:
# Shared NLP resources; clean_text reads all three of these names as globals.
mystem = pymystem3.Mystem()
wordnet_lemmatizer = WordNetLemmatizer()
# Russian stop words that clean_text removes from the lemmatised tokens.
# NOTE(review): "из" and "за" each appear twice (possibly "из-за" split into
# two entries); duplicates do not affect the membership test.
stop_words = ["еще", "него", "сказать", "а", "ж", "нее", "со", "без", "же", "ней", "совсем", "более", "жизнь", 
          "нельзя", "так", "больше", "за", "нет", "такой", "будет", "зачем", "ни", "там", "будто", "здесь", 
          "нибудь", "тебя", "бы", "и", "никогда", "тем", "был", "из", "ним", "теперь", "была", "из", "за",
          "них", "то", "были", "или", "ничего", "тогда", "было", "им", "но", "того", "быть", "иногда", "ну", 
          "тоже", "в", "их", "о", "только", "вам", "к", "об", "том", "вас", "кажется", "один", "тот", "вдруг",
          "как", "он", "три", "ведь", "какая", "она", "тут", "во", "какой", "они", "ты", "вот", "когда", "опять",
          "у", "впрочем", "конечно", "от", "уж", "все", "которого", "перед", "уже", "всегда", "которые", "по",
          "хорошо", "всего", "кто", "под", "хоть", "всех", "куда", "после", "чего", "всю", "ли", "потом", "человек",
          "вы", "лучше", "потому", "чем", "г", "между", "почти", "через", "где", "меня", "при", "что", "говорил",
          "мне", "про", "чтоб", "да", "много", "раз", "чтобы", "даже", "может", "разве", "чуть", "два", "можно",
          "с", "эти", "для", "мой", "сам", "этого", "до", "моя", "свое", "этой", "другой", "мы", "свою", "этом",
          "его", "на", "себе", "этот", "ее", "над", "себя", "эту", "ей", "надо", "сегодня", "я", "ему", "наконец",
          "сейчас", "если", "нас", "сказал", "есть", "не", "сказала"]

In [None]:
# %%time
# df['clean_name'] = df['name'].apply(clean_text)
# df['clean_descr'] = df['description'].apply(clean_text)
# df['clean_skills'] = df['key_skills_str'].fillna('').apply(clean_text)

In [19]:
# df['clean_name_wo_sw'] = [[word for word in x if word not in stop_words] for x in df['clean_name'].values]
# df['clean_descr_wo_sw'] = [[word for word in x if word not in stop_words] for x in df['clean_descr'].values]
# df['clean_skills_wo_sw'] = [[word for word in x if word not in stop_words] for x in df['clean_skills'].values]
# df[['clean_name_wo_sw', 'clean_descr_wo_sw', 'clean_skills_wo_sw']].to_pickle('clean_text.pickle') 

# text_name = [' '.join(x) for x in df['clean_name_wo_sw'].values]
# text_descr = [' '.join(x) for x in df['clean_descr_wo_sw'].values]
# text_skills = [' '.join(x) for x in df['clean_skills_wo_sw'].values]

# Load the token lists cached in clean_text.pickle (written by the
# commented-out to_pickle call above).
# NOTE(security): unpickling can execute arbitrary code -- only load a file
# that was produced locally by this notebook.
text_df = pd.read_pickle('clean_text.pickle')

# Re-join each token list into one space-separated document per row.
text_name = list(map(' '.join, text_df['clean_name_wo_sw']))
text_descr = list(map(' '.join, text_df['clean_descr_wo_sw']))
text_skills = list(map(' '.join, text_df['clean_skills_wo_sw']))

In [20]:
# TF-IDF over vacancy names, vocabulary capped at 500 terms.
# fit() returns the vectorizer itself, so it is chained onto construction.
tfidf_name = TfidfVectorizer(max_features=500).fit(text_name)
name_matrix = tfidf_name.transform(text_name)

In [21]:
# TF-IDF over vacancy descriptions, vocabulary capped at 2000 terms.
# fit() returns the vectorizer itself, so it is chained onto construction.
tfidf_descr = TfidfVectorizer(max_features=2000).fit(text_descr)
descr_matrix = tfidf_descr.transform(text_descr)

In [22]:
# TF-IDF over key skills, vocabulary capped at 1000 terms.
# fit() returns the vectorizer itself, so it is chained onto construction.
tfidf_skills = TfidfVectorizer(max_features=1000).fit(text_skills)
skills_matrix = tfidf_skills.transform(text_skills)

In [23]:
# Densify the three TF-IDF blocks and place them side by side
# (name | description | skills).
tfidf_matrix = np.hstack(
    [block.todense() for block in (name_matrix, descr_matrix, skills_matrix)]
)

In [24]:
# Column names for the stacked TF-IDF matrix: '<source>_<term>', in the same
# name | descr | skills order used to build the matrix.
# `get_feature_names` was removed from scikit-learn in 1.2; use it where it
# still exists and fall back to `get_feature_names_out` otherwise. Both
# return the same vocabulary terms for a TfidfVectorizer.
tfidf_features = []
for prefix, vectorizer in (('name', tfidf_name),
                           ('descr', tfidf_descr),
                           ('skills', tfidf_skills)):
    get_terms = getattr(vectorizer, 'get_feature_names', None) or vectorizer.get_feature_names_out
    tfidf_features.extend(f'{prefix}_{term}' for term in get_terms())

## 3. matrix

In [25]:
# Binary flag columns taken directly from df.
# Not included: 'address.lat', 'address.lng'
df_matrix = df[['with_address', 'with_metro', 'driver_license_flg', 'salary_gross_flg']].copy()

# Column names and values are assembled in the same order:
# flags | one-hot | tf-idf.
fin_feature_cols = [*df_matrix.columns, *ohe_df.columns, *tfidf_features]
fin_feature_matrix = np.hstack((df_matrix.values, ohe_df.values, tfidf_matrix))

In [26]:
# Convert the stacked feature matrix to CSR form; the distance computation
# in the next cell takes this sparse matrix as input.
sparse_feature_matrix = csr_matrix(fin_feature_matrix)

In [None]:
%%time
# cs = cosine_similarity(sparse_feature_matrix)
# Cosine distance between every pair of rows of the feature matrix.
# NOTE(review): the result has one row and one column per vacancy, so its size
# grows quadratically with the number of rows -- confirm it fits in memory, or
# consider a chunked / nearest-neighbour computation instead.
dist_out = pairwise_distances(sparse_feature_matrix, metric="cosine")