
### 1. Read & clean up data

The datasets are open-source and were taken from https://huggingface.co/datasets/arkodeep/jobdata (English vacancies) and https://www.kaggle.com/datasets/vyacheslavpanteleev1/hhru-it-vacancies-from-20211025-to-20211202 (Russian vacancies).


#### 1. read

We do not need all the columns in the tables, so we keep only the ones used later and drop the rest.


In [2]:
import pandas as pd

# Read the raw English and Russian vacancy tables from the local dataset folder.
data_en, data_ru = (
    pd.read_csv(csv_path)
    for csv_path in (
        'vacancies-skills_datasets/jobs_all.csv',
        'vacancies-skills_datasets/IT_vacancies_full.csv',
    )
)

In [3]:
# List every column of the raw English table to decide which ones to keep.
data_en.columns

Index(['job_title_short', 'job_title', 'job_location', 'job_via',
       'job_schedule_type', 'job_work_from_home', 'search_location',
       'job_posted_date', 'job_no_degree_mention', 'job_health_insurance',
       'job_country', 'salary_rate', 'salary_year_avg', 'salary_hour_avg',
       'company_name', 'job_skills', 'job_type_skills', 'key_id'],
      dtype='object')

In [4]:
# List every column of the raw Russian table to decide which ones to keep.
data_ru.columns

Index(['Ids', 'Employer', 'Name', 'Salary', 'From', 'To', 'Experience',
       'Schedule', 'Keys', 'Description', 'Area', 'Professional roles',
       'Specializations', 'Profarea names', 'Published at'],
      dtype='object')

In [5]:
# Keep only the title and skill columns. `.copy()` makes each result an
# independent frame, so assigning to its columns later does not raise pandas'
# SettingWithCopyWarning.
data_en = data_en[['job_title_short','job_title','job_skills', 'job_type_skills']].copy()
data_ru = data_ru[['Name','Specializations', 'Keys']].copy()

In [6]:
# Preview the first rows of the trimmed English table.
data_en.head()

Unnamed: 0,job_title_short,job_title,job_skills,job_type_skills
0,Senior Data Engineer,Senior Clinical Data Engineer / Principal Clin...,,
1,Data Analyst,Data Analyst,"['r', 'python', 'sql', 'nosql', 'power bi', 't...","{'analyst_tools': ['power bi', 'tableau'], 'pr..."
2,Data Engineer,"Data Engineer/Scientist/Analyst, Mid or Senior...","['python', 'sql', 'c#', 'azure', 'airflow', 'd...","{'analyst_tools': ['dax'], 'cloud': ['azure'],..."
3,Data Engineer,LEAD ENGINEER - PRINCIPAL ANALYST - PRINCIPAL ...,"['python', 'c++', 'java', 'matlab', 'aws', 'te...","{'cloud': ['aws'], 'libraries': ['tensorflow',..."
4,Data Engineer,Data Engineer- Sr Jobs,"['bash', 'python', 'oracle', 'aws', 'ansible',...","{'cloud': ['oracle', 'aws'], 'other': ['ansibl..."


In [7]:
# Preview the first rows of the trimmed Russian table.
data_ru.head()

Unnamed: 0,Name,Specializations,Keys
0,Golang Developer (Кипр),"['Программирование, Разработка']","['Docker', 'Golang', 'Redis', 'Английский язык..."
1,Е-mail маркетолог,['Маркетинг'],"['Грамотность', 'Написание текстов', 'Грамотна..."
2,Оператор call-центра (удаленно),"['Маркетинг', 'Продажи по телефону, Телемаркет...","['Клиентоориентированность', 'Ориентация на ре..."
3,Ведущий SMM специалист,"['Управление маркетингом', 'PR, Маркетинговые ...","['Продвижение бренда', 'Креативность', 'Adobe ..."
4,UX/UI Designer,"['Игровое ПО', 'Программирование, Разработка',...","['UI', 'UX', 'gamedev', 'game design', 'проект..."



#### 2. Clean up text

Convert everything to lowercase, replace the characters `-`, `/`, `(` and `)` with spaces, and remove rows containing NaNs.

In [18]:
import re

def clean(text):
    """Lower-case ``text`` and replace ``-``, ``/``, ``(`` and ``)`` with spaces.

    Non-string values (for example the float NaN that pandas uses for missing
    cells) are returned unchanged instead of raising ``AttributeError``, so the
    function is safe to apply to a column that still contains missing values.
    """
    if not isinstance(text, str):
        return text
    return re.sub(r'[-/()]', ' ', text.lower())

# Drop rows with missing values first, so that clean() only ever receives
# strings. `.copy()` gives independent frames, so the column assignment below
# does not raise SettingWithCopyWarning.
data_en = data_en.dropna().copy()
data_ru = data_ru.dropna().copy()

for dataset in (data_en, data_ru):
    for col in dataset.columns:
        dataset[col] = dataset[col].apply(clean)

In [21]:
# Preview the English table after cleaning.
data_en.head()

Unnamed: 0,job_title_short,job_title,job_skills,job_type_skills
1,data analyst,data analyst,"['r', 'python', 'sql', 'nosql', 'power bi', 't...","{'analyst_tools': ['power bi', 'tableau'], 'pr..."
2,data engineer,"data engineer scientist analyst, mid or senior...","['python', 'sql', 'c#', 'azure', 'airflow', 'd...","{'analyst_tools': ['dax'], 'cloud': ['azure'],..."
3,data engineer,lead engineer principal analyst principal ...,"['python', 'c++', 'java', 'matlab', 'aws', 'te...","{'cloud': ['aws'], 'libraries': ['tensorflow',..."
4,data engineer,data engineer sr jobs,"['bash', 'python', 'oracle', 'aws', 'ansible',...","{'cloud': ['oracle', 'aws'], 'other': ['ansibl..."
5,data engineer,gcp data engineer,"['python', 'sql', 'gcp']","{'cloud': ['gcp'], 'programming': ['python', '..."



#### 3. tokenize & lemmatize

spaCy failed to install in this environment, so we use Stanza for tokenization and lemmatization instead.


In [29]:
!pip install stanza





[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip





In [36]:
import stanza

# Fetch the default English and Russian model packages.
stanza.download('en')
stanza.download('ru')

# The models were fetched explicitly above, so each pipeline reuses the local
# resources.json instead of re-checking it over the network when it is built.
# Only tokenization, POS tagging and lemmatization are loaded; English also
# loads the multi-word-token (mwt) processor.
nlp_en = stanza.Pipeline(
    'en',
    processors='tokenize,mwt,pos,lemma',
    use_gpu=False,
    download_method=stanza.DownloadMethod.REUSE_RESOURCES,
)
nlp_ru = stanza.Pipeline(
    'ru',
    processors='tokenize,pos,lemma',
    use_gpu=False,
    download_method=stanza.DownloadMethod.REUSE_RESOURCES,
)


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-06-16 13:07:53 INFO: Downloaded file to C:\Users\user\stanza_resources\resources.json
2025-06-16 13:07:53 INFO: Downloading default packages for language: en (English) ...
2025-06-16 13:07:55 INFO: File exists: C:\Users\user\stanza_resources\en\default.zip
2025-06-16 13:08:02 INFO: Finished downloading models and saved to C:\Users\user\stanza_resources


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-06-16 13:08:02 INFO: Downloaded file to C:\Users\user\stanza_resources\resources.json
2025-06-16 13:08:02 INFO: Downloading default packages for language: ru (Russian) ...
2025-06-16 13:08:05 INFO: File exists: C:\Users\user\stanza_resources\ru\default.zip
2025-06-16 13:08:11 INFO: Finished downloading models and saved to C:\Users\user\stanza_resources
2025-06-16 13:08:11 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-06-16 13:08:12 INFO: Downloaded file to C:\Users\user\stanza_resources\resources.json
2025-06-16 13:08:14 INFO: Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |

2025-06-16 13:08:14 INFO: Using device: cpu
2025-06-16 13:08:14 INFO: Loading: tokenize
2025-06-16 13:08:14 INFO: Loading: mwt
2025-06-16 13:08:14 INFO: Loading: pos
2025-06-16 13:08:18 INFO: Loading: lemma
2025-06-16 13:08:19 INFO: Done loading processors!
2025-06-16 13:08:19 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-06-16 13:08:19 INFO: Downloaded file to C:\Users\user\stanza_resources\resources.json
2025-06-16 13:08:20 INFO: Loading these models for language: ru (Russian):
| Processor | Package            |
----------------------------------
| tokenize  | syntagrus          |
| pos       | syntagrus_charlm   |
| lemma     | syntagrus_nocharlm |

2025-06-16 13:08:20 INFO: Using device: cpu
2025-06-16 13:08:20 INFO: Loading: tokenize
2025-06-16 13:08:20 INFO: Loading: pos
2025-06-16 13:08:24 INFO: Loading: lemma
2025-06-16 13:08:28 INFO: Done loading processors!


In [45]:
def lemmatize_bulk(texts, nlp):
    """Lemmatize many texts in one call to the pipeline.

    Parameters
    ----------
    texts : iterable of str
        Raw texts; each one is lower-cased before processing.
    nlp : object
        Pipeline exposing ``bulk_process(list_of_str)`` and returning documents
        with ``.sentences`` -> ``.words`` -> ``.lemma`` / ``.text``.

    Returns
    -------
    list of str
        One string per input text: the lemmas of all its words joined by a
        single space. If a word has no lemma (``None`` or empty), its surface
        text is used, so the join never fails on a missing lemma.
    """
    docs = nlp.bulk_process([t.lower() for t in texts])  # lower first if needed
    return [
        " ".join(
            word.lemma or word.text
            for sentence in doc.sentences
            for word in sentence.words
        )
        for doc in docs
    ]

def lemmatize_list_bulk(items, nlp):
    """Lemmatize a list of items; a thin alias that delegates to ``lemmatize_bulk``."""
    lemmas = lemmatize_bulk(items, nlp)
    return lemmas

def process_df(pdf, nlp, title_field='job_title', skills_field='job_skills'):
    """Return a copy of ``pdf`` with lemmatized title and skill columns added.

    Parameters
    ----------
    pdf : pandas.DataFrame
        Vacancy table; it is not modified.
    nlp : object
        Pipeline passed through to ``lemmatize_bulk``.
    title_field : str
        Column holding the job title text.
    skills_field : str
        Column holding each row's skills, either as a real list or as the
        string repr of a list (e.g. ``"['python', 'sql']"``), which is how the
        CSV files store it.

    Returns
    -------
    pandas.DataFrame
        The copy with two extra columns: ``f'{title_field}_lemma'`` (str) and
        ``f'{skills_field}_lemma'`` (list of str, one entry per skill).
    """
    import ast  # local import: keeps this cell runnable on its own

    def _to_skill_list(cell):
        """Convert one cell of the skills column into a list of skill strings."""
        if isinstance(cell, str):
            try:
                cell = ast.literal_eval(cell)
            except (ValueError, SyntaxError):
                # Not a Python literal: keep the raw text as a single skill.
                return [cell]
        if isinstance(cell, (list, tuple)):
            return [str(skill) for skill in cell]
        if cell is None or (isinstance(cell, float) and cell != cell):
            return []  # None / NaN -> no skills
        return [str(cell)]

    pdf = pdf.copy()
    print('starting lemmatization of job titles')
    pdf[f'{title_field}_lemma'] = lemmatize_bulk(pdf[title_field].tolist(), nlp)

    print('starting lemmatization of skills')
    # Parse each cell into a list first. Iterating the raw string would yield
    # single characters (brackets, quotes, letters) instead of skills.
    skill_lists = [_to_skill_list(cell) for cell in pdf[skills_field]]

    # Flatten so that all skills go through the pipeline in one bulk call.
    flat_skills = [skill for row_skills in skill_lists for skill in row_skills]
    lemmatized_skills = lemmatize_bulk(flat_skills, nlp) if flat_skills else []

    # Cut the flat result back into per-row groups of the original lengths.
    lemmatized_skill_groups = []
    cursor = 0
    for row_skills in skill_lists:
        count = len(row_skills)
        lemmatized_skill_groups.append(lemmatized_skills[cursor:cursor + count])
        cursor += count

    pdf[f'{skills_field}_lemma'] = lemmatized_skill_groups

    return pdf


In [None]:
# Lemmatize the English vacancies (default columns: job_title / job_skills) and preview the result.
data_en_lemmatized = process_df(data_en, nlp_en)
data_en_lemmatized.head()

starting lemmatization of job titles


In [None]:
# Lemmatize the Russian vacancies, whose title and skills live in the 'Name' and 'Keys' columns, and preview the result.
data_ru_lemmatized = process_df(data_ru, nlp_ru, title_field='Name', skills_field='Keys')
data_ru_lemmatized.head()