In [2]:
! pip install python-dotenv



In [35]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import gdown
import os
from dotenv import load_dotenv

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

In [4]:
ROOT_DIR: str = os.path.dirname(os.path.abspath(os.curdir))
DATASET_PATH: str = f"{ROOT_DIR}/source_data/superset_hr.xlsx"
GOOGLE_COLAB_PATH: str = '/content/superset_hr.xlsx'
load_dotenv()

if DATASET_URL := os.environ.get('DATASET_URL') is None:
    DATASET_URL = input('>>> .env was not found! Provide url for downloading: ')

def get_original_dataframe() -> pd.DataFrame:
    try:
        df = pd.read_excel(DATASET_PATH)
    except FileNotFoundError:
        try:
            print(f"Can't open file from path: {DATASET_PATH}", end='\n\n')
            df = pd.read_excel(GOOGLE_COLAB_PATH)
        except FileNotFoundError:
            print(f"Can't open file from path: {GOOGLE_COLAB_PATH}", end='\n\n')
            file_name = gdown.download(DATASET_URL, fuzzy=True)
            os.makedirs(os.path.dirname(DATASET_PATH), exist_ok=True)
            os.rename(os.path.abspath(file_name), DATASET_PATH)
            print(f'Moved to: {DATASET_PATH}', end='\n\n')
            df = pd.read_excel(DATASET_PATH)
    finally:
        print('Success!')
        return df

In [5]:
df_original = get_original_dataframe()

Success!


In [6]:
print(df_original.shape)
df_original.sample(3)

(175455, 24)


Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,income_name,name,published_at,created_at,collected_at,url,...,address__city,schedule__name,grade,employment__name,key_skills__names,languages__names,exchange_rate,salary_from_gross,salary_to_gross,salary_average
19115,19313,19313,19313,93345077,1С программист,Программист 1С WMS,2024-02-16T13:17:14+0300,2024-02-16T13:17:14+0300,2024-02-17 08:34:55.057,https://api.hh.ru/vacancies/93345077?host=hh.ru,...,Нижний Новгород,Полный день,Middle (3-6),Синергетик,"['WMS', '1С: Логистика']",[],,,,
170610,173830,173830,173802,97744727,Frontend,Junior Frontend-developer (React/Vue.js),2024-04-23T17:12:48+0300,2024-04-23T17:12:48+0300,2024-04-25 08:45:47.060,https://api.hh.ru/vacancies/97744727?host=hh.ru,...,Санкт-Петербург,Полный день,Intern (0-1),Полная занятость,"['HTML5', 'CSS3', 'Ionic', 'WebRTC', 'React/Re...",[],1.0,90000.0,110000.0,100000.0
21918,22147,22147,22147,92973935,1C консультант,Специалист – Консультант 1С:ЗУП,2024-02-09T18:26:39+0300,2024-02-09T18:26:39+0300,2024-02-18 08:59:49.710,https://api.hh.ru/vacancies/92973935?host=hh.ru,...,,Полный день,Junior (1-3),"Светофор, Сеть магазинов низких цен","['1С: Зарплата и управление персоналом', 'Конс...",[],1.0,90400.0,,


In [75]:
def print_df_info(df: pd.DataFrame) -> None:
    print(f"Shape: {df.shape}")
    print('-' * 50)
    display(df.sample(3))
    print('-' * 50)
    display(df.info())
    print('-' * 50)
    display(df.isna().sum())
    print('-' * 50)


def get_clear_df_version(df: pd.DataFrame) -> pd.DataFrame:
    df_new = df[['id', 'income_name', 'area__name', 'schedule__name', 'grade',
                 'key_skills__names', 'salary__currency', 'employer__name', 'languages__names',
                 'exchange_rate', 'salary_from_gross', 'salary_to_gross', 'salary_average']].copy()

    df_new = df_new.dropna(subset=['salary_from_gross', 'salary_to_gross', 'salary_average'], how='all')
    df_new = df_new.dropna(subset=['income_name'])

    df_new['not_rur'] = df_new['exchange_rate'] != 1
    df_new['not_rur'] = df_new['not_rur'].astype(int)
    df_new = df_new.drop(['salary__currency'], axis=1)

    return df_new


def get_difference_percentiles(df: pd.DataFrame) -> tuple[float, float, float]:
    filtered_df = df.dropna(subset=['salary_from_gross', 'salary_to_gross']).copy()
    filtered_df.loc[:, 'difference'] = filtered_df['salary_to_gross'] - filtered_df['salary_from_gross']
    condition = filtered_df['difference'] > 0.8 * filtered_df['salary_to_gross']
    filtered_df.loc[condition, ['salary_from_gross', 'difference']] = np.nan

    filtered_df = filtered_df.dropna(subset=['salary_from_gross', 'salary_to_gross'])

    filtered_df = filtered_df[filtered_df['salary_from_gross'] >= 1000]
    filtered_df = filtered_df[filtered_df['salary_to_gross'] >= 10000]

    filtered_df.loc[:, 'difference_ratio'] = np.where(filtered_df['salary_to_gross'] != 0, 
                                                      filtered_df['difference'] / filtered_df['salary_to_gross'], 
                                                      np.nan)

    perc25 = filtered_df['difference_ratio'].quantile(0.25)
    perc50 = filtered_df['difference_ratio'].quantile(0.50)
    perc75 = filtered_df['difference_ratio'].quantile(0.75)

    return perc25, perc50, perc75


def fill_na_salary(df: pd.DataFrame, coef) -> pd.DataFrame:
    result_df = df.copy()
    result_df['salary_to_gross'] = result_df['salary_to_gross'].fillna(result_df['salary_from_gross'] / (1-coef))
    result_df['salary_from_gross'] = result_df['salary_from_gross'].fillna(result_df['salary_to_gross'] * (1-coef))
    result_df['salary_average'] = result_df['salary_average'].fillna((result_df['salary_to_gross'] + result_df['salary_from_gross']) / 2)
    return result_df


def exctract_features(df: pd.DataFrame) -> pd.DataFrame:
    transformers = [
        ('schedule_name', OneHotEncoder(sparse_output=False, drop='first'), ['schedule__name']),
        ('grade', OneHotEncoder(sparse_output=False, drop='first'), ['grade'])
    ]

    pipeline = Pipeline(steps=[
        ('column_transformer', ColumnTransformer(transformers=transformers, remainder='passthrough'))
    ])

    display(pipeline)

    df_transformed = pipeline.fit_transform(df)
    column_names = pipeline.named_steps['column_transformer'].get_feature_names_out()
    transofrmed_cols = [x for x in column_names if 'remainder' not in x]
    column_names = [x.replace('remainder__', '') if x not in transofrmed_cols else x for x in column_names]
    
    df_new = pd.DataFrame(df_transformed, columns=column_names)
    df_new[transofrmed_cols] = df_new[transofrmed_cols].astype(float)
    
    return df_new

In [None]:
df_clear = get_clear_df_version(df_original)
print_df_info(df_clear)

In [None]:
perc25, perc50, perc75 = get_difference_percentiles(df_clear)
perc25, perc50, perc75

In [None]:
df_perc25 = fill_na_salary(df_clear, perc25)
print_df_info(df_perc25)

In [76]:
extracted_df_perc25 = exctract_features(df_perc25)
print_df_info(extracted_df_perc25)

Shape: (109404, 18)
--------------------------------------------------


Unnamed: 0,schedule_name__schedule__name_Гибкий график,schedule_name__schedule__name_Полный день,schedule_name__schedule__name_Сменный график,schedule_name__schedule__name_Удаленная работа,grade__grade_Junior (1-3),grade__grade_Middle (3-6),grade__grade_Senior (>6),id,income_name,area__name,key_skills__names,employer__name,languages__names,exchange_rate,salary_from_gross,salary_to_gross,salary_average,not_rur
75096,0.0,1.0,0.0,0.0,1.0,0.0,0.0,93836906,Менеджер по продажам,Астана,"['Развитие продаж', 'Телефонные переговоры', '...",Stratton,[],0.206,41200.0,92700.0,66950.0,1
54405,0.0,1.0,0.0,0.0,1.0,0.0,0.0,94479939,Менеджер по продажам,Екатеринбург,[],Омега Холдинг,[],1.0,79100.0,169500.0,124300.0,0
672,0.0,0.0,0.0,1.0,0.0,1.0,0.0,91335751,Data Engineer,Москва,"['Python', 'SQL', 'Spark', 'Airflow', 'Orаcle'...",TOPSELLER,[],1.0,339000.0,427434.782609,383217.391304,0


--------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109404 entries, 0 to 109403
Data columns (total 18 columns):
 #   Column                                          Non-Null Count   Dtype  
---  ------                                          --------------   -----  
 0   schedule_name__schedule__name_Гибкий график     109404 non-null  float64
 1   schedule_name__schedule__name_Полный день       109404 non-null  float64
 2   schedule_name__schedule__name_Сменный график    109404 non-null  float64
 3   schedule_name__schedule__name_Удаленная работа  109404 non-null  float64
 4   grade__grade_Junior (1-3)                       109404 non-null  float64
 5   grade__grade_Middle (3-6)                       109404 non-null  float64
 6   grade__grade_Senior (>6)                        109404 non-null  float64
 7   id                                              109404 non-null  object 
 8   income_name                                     10940

None

--------------------------------------------------


schedule_name__schedule__name_Гибкий график       0
schedule_name__schedule__name_Полный день         0
schedule_name__schedule__name_Сменный график      0
schedule_name__schedule__name_Удаленная работа    0
grade__grade_Junior (1-3)                         0
grade__grade_Middle (3-6)                         0
grade__grade_Senior (>6)                          0
id                                                0
income_name                                       0
area__name                                        0
key_skills__names                                 0
employer__name                                    0
languages__names                                  0
exchange_rate                                     0
salary_from_gross                                 0
salary_to_gross                                   0
salary_average                                    0
not_rur                                           0
dtype: int64

--------------------------------------------------
