In [1]:
import os
import datetime
import sys
import yaml

import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
import langid
from random import shuffle

import ipywidgets as widgets
from IPython.display import clear_output
from IPython.display import display
from contextlib import contextmanager
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE

from src.model import get_model

In [2]:
# Load the notebook configuration and the parsed vacancies table.
# NOTE(review): yaml.FullLoader can build more than plain scalars/containers;
# if the config may come from an untrusted source, prefer yaml.safe_load.
with open("configs/config.yaml") as f:
    cfg = yaml.load(f, Loader=yaml.FullLoader)
filename = cfg['vacancy-file']          # CSV with the parsed vacancies
filename_out = cfg['vacancy-train-file']  # CSV the labelling results are written to

# os.path.isfile also works when the configured name contains a directory part,
# which a membership test against os.listdir('.') cannot handle.
if not os.path.isfile(filename):
    # FileNotFoundError is still an Exception, so broad handlers keep working.
    raise FileNotFoundError(f'There is no file {filename}, run "parse_data.py" first')

df = pd.read_csv(filename, index_col=0)

# Keep only vacancies that have a description, remove exact duplicate rows
# and renumber the rows from zero.
df = df.dropna(subset=['text']).drop_duplicates().reset_index(drop=True)

# Derive numeric salary bounds, the currency code and GBP-converted bounds
# from the free-text ``salary`` column.

def _leading_number(fragment):
    """Join every digit run found before the first comma of ``fragment``."""
    return ''.join(re.findall(r'\d+', str(fragment).split(',')[0]))


# The text before the first '-' is the lower bound, the text after it the upper one.
salary_bounds = df['salary'].str.split('-')

# pd.to_numeric(..., errors='coerce') turns "no digits found" (missing salary,
# missing upper bound, non-numeric text) into NaN instead of raising on ''.
df['salary_min'] = pd.to_numeric(
    salary_bounds.str[0].map(_leading_number), errors='coerce').astype(float)
df['salary_max'] = pd.to_numeric(
    salary_bounds.str[1].map(_leading_number), errors='coerce').astype(float)

# Whatever is left of the lower-bound text after dropping digits, commas, dots,
# spaces and parentheses is used as the currency code ('' when salary is missing).
df['salary_currency'] = salary_bounds.str[0].map(
    lambda x: ''.join(re.findall('[^(0-9,. )]', str(x).lower()))).replace('nan', '')

# Multipliers applied to convert an amount in the given currency to GBP.
currency_translation = {'':1, 'gbp': 1, 'usd': 0.79, 'sgd': 0.58, 
                        'jpy': 0.0055, 'mxn': 0.046, 'sek': 0.072, 
                        'eur': 0.86, 'cad': 0.59, 'aud': 0.52}
currency_to_add = set(df['salary_currency']) - set(currency_translation.keys())
if len(currency_to_add):
    # Currencies without a rate end up as NaN in the *_gbp columns.
    print("Add: ", currency_to_add)

gbp_rate = df['salary_currency'].map(currency_translation)
df['salary_max_gbp'] = gbp_rate * df['salary_max']
df['salary_min_gbp'] = gbp_rate * df['salary_min']

In [4]:
# Re-attach labels saved by a previous session (if any) to the parsed vacancies.
# os.path.isfile also works when the configured name contains a directory part.
if os.path.isfile(filename_out):
    train = pd.read_csv(filename_out, index_col=0).reset_index(drop=True)
    label_cols = ['dont_show', 'good', 'synthetic']
    key_cols = ['title', 'company_name', 'location']
    # Rows whose ``synthetic`` flag is set: they exist only in the label file.
    is_synthetic = (~train['synthetic'].isna()) & (train['synthetic'])

    # Drop any stale label columns, then pull the saved labels of the
    # non-synthetic rows back in by (title, company_name, location).
    df = df[df.columns.difference(label_cols)]
    df = df.merge(train.loc[~is_synthetic, key_cols + label_cols],
                  on=key_cols, how='left')
    # Give ``train`` every column ``df`` has so the synthetic rows can be appended.
    for col in set(df.columns) - set(train.columns):
        train[col] = np.nan
    df = pd.concat([df, train.loc[is_synthetic, df.columns]])
    df = df.reset_index(drop=True)
else:
    # First run: nothing has been labelled yet.
    df['dont_show'] = np.nan
    df['synthetic'] = np.nan
    df['good'] = np.nan
    
    
# Fit the TF-IDF vocabulary on "title\ntext" of every row, then obtain the model.
tf_idf = TfidfVectorizer(
    stop_words='english',
    max_df=0.99,
    min_df=0.01,
    norm='l1',
    ngram_range=(1, 3),
).fit(df['title'] + '\n' + df['text'])
model = get_model(df, tf_idf)

# Shared UI state used by the callbacks defined below.
outs = widgets.Output()      # output area the accordion is rendered into
accordion = indexes = None   # currently displayed accordion / ordered row labels
i = 0                        # offset of the first row of the current page


def calculate_palette(tf_idf, model, top_n=30):
    """Map the most influential TF-IDF features to highlight colours.

    Parameters
    ----------
    tf_idf : fitted vectorizer exposing ``get_feature_names_out()``.
    model : fitted estimator exposing ``coef_``; row 0 is used as the
        per-feature importance.
    top_n : int, default 30
        Number of features (ranked by absolute coefficient) to keep. Ties with
        the ``top_n``-th value are kept as well. When fewer than ``top_n``
        features exist, all of them are used instead of raising ``IndexError``.

    Returns
    -------
    dict
        ``{feature_name: RGBA tuple}`` taken from a diverging colour map that
        is normalised over the kept coefficients. Empty when there are no
        features or ``top_n`` is not positive.
    """
    result = pd.DataFrame({
        'names': tf_idf.get_feature_names_out(),
        'feature_importance': model.coef_[0],
    })
    if result.empty or top_n <= 0:
        return {}

    magnitude = result['feature_importance'].abs()
    # Smallest absolute coefficient that still belongs to the top ``top_n``.
    threshold = magnitude.sort_values().iloc[-min(top_n, len(magnitude))]
    result = result[magnitude >= threshold]

    cmap = sns.diverging_palette(20, 200, s=60, as_cmap=True)
    norm = plt.Normalize(vmin=result['feature_importance'].min(),
                         vmax=result['feature_importance'].max())
    return {name: cmap(norm(weight))
            for weight, name in zip(result['feature_importance'], result['names'])}

def highlight_words(text, palette, pattern='(?u)\\b\\w\\w+\\b'):
    """Wrap every token of ``text`` that has a palette colour in a coloured ``<span>``.

    Parameters
    ----------
    text : str
        Text (may already contain HTML) to decorate.
    palette : mapping
        Lower-cased token -> colour sequence whose first three items are
        red/green/blue fractions; each is multiplied by 255 in the output.
    pattern : str
        Regular expression that defines a token.

    Returns
    -------
    str
        ``text`` with each matching token wrapped exactly once.

    Notes
    -----
    The substitution is done in a single ``re.sub`` pass. Compared with
    calling ``str.replace`` once per found token this guarantees that

    * a token occurring several times is not wrapped in nested spans,
    * only whole tokens are wrapped, never substrings of longer words,
    * the inserted markup itself is never rescanned.

    Tokens are matched one at a time, so palette keys made of several words
    are never highlighted.
    """
    def _wrap(match):
        word = match.group(0)
        colour = palette.get(word.lower())
        if colour is None:
            return word
        red, green, blue = (channel * 255 for channel in colour[:3])
        return (f'<span style="background-color: rgba({red},{green},{blue},1);">'
                f'{word}</span>')

    return re.sub(pattern, _wrap, text)

def get_location(df):
    """List every location together with its coarser comma-separated suffixes.

    For a value such as ``'London, England, UK'`` the result also contains
    ``'England, UK'`` and ``'UK'``. Shorter suffixes come first, the original
    values last; every entry is stripped of surrounding whitespace and
    duplicates are removed.

    Parameters
    ----------
    df : pandas.DataFrame with a ``location`` column.

    Returns
    -------
    Array of unique location strings; empty when the column has no non-null
    values (previously this case raised while converting NaN to ``int``).
    """
    locations = df['location'].dropna().drop_duplicates()
    if locations.empty:
        return np.array([], dtype=object)

    max_parts = int(locations.str.split(',').str.len().max())
    # Walk from the longest proper suffix down to the last component only.
    for i in range(max_parts - 1, 0, -1):
        parts = locations.str.split(',')
        suffixes = parts[parts.str.len() > i].str[-i:].str.join(',').drop_duplicates()
        locations = pd.concat([suffixes, locations])
    return locations.str.strip().unique()


def get_widget_panel(df, i, palette=None):
    """Build the labelling panel for the vacancy stored under row label ``i``.

    Parameters
    ----------
    df : pandas.DataFrame with ``title``, ``salary``, ``text`` and ``link`` columns.
    i : row label of the vacancy.
    palette : optional mapping passed to ``highlight_words`` to colour tokens.

    Returns
    -------
    widgets.VBox
        Children, in order: vacancy HTML, good/bad dropdown, "good stuff"
        textarea, "bad stuff" textarea, "don't show again" checkbox. Other
        code addresses these children by position, so the order matters.
    """
    salary = df["salary"][i]
    # pd.notna recognises every NaN/None, not only the ``np.nan`` singleton that
    # an identity test would catch, so a missing salary never reaches the
    # string concatenation.
    salary_html = ('<pre>\n' + str(salary) + '</pre>') if pd.notna(salary) else ''
    text = ('<pre>\n\n</pre><b>' + df["title"][i] + '</b>' +
            salary_html +
            '<pre>\n\n' + df['text'][i] + '\n\n</pre>')
    if palette is not None:
        text = highlight_words(text, palette=palette)
    # The link is added after highlighting so its markup is never rewritten.
    link = f'<a href="{df["link"][i]}" target="_blank">link to the vacancy</a>'
    link = f'<p style="color:blue;">{link}</p>'
    vacancy = widgets.HTML(
        value = link + text,
        placeholder='',
        description='',
        layout = widgets.Layout(width='920px')
    )

    # NOTE(review): ``rows`` looks like a Select option rather than a Dropdown
    # one - confirm whether it has any effect here.
    good_bad = widgets.Dropdown(
        options=['good', 'bad'],
        value=None,
        rows=3,
        description='Good or bad?',
        disabled=False
    )
    why_good = widgets.Textarea(
        placeholder='Type something',
        description='good stuff',
        disabled=False,
        layout = widgets.Layout(width='920px', height='100px')

    )
    why_bad = widgets.Textarea(
        placeholder='Type something',
        description='bad stuff',
        disabled=False,
        layout = widgets.Layout(width='920px', height='100px')

    )
    show = widgets.Checkbox(
        value=False,
        description="Don't show it again (use only when you don't want to score)",
        disabled=False,
        indent=False,
        layout = widgets.Layout(width='920px', height='30px')
    )
    
    widg = widgets.VBox([vacancy, good_bad, why_good, why_bad, show])
    return widg


def get_filtered_df(df):
    """Return the rows whose ``dont_show`` and ``synthetic`` values are both missing."""
    not_hidden = df['dont_show'].isna()
    not_synthetic = df['synthetic'].isna()
    return df.loc[not_hidden & not_synthetic]


def get_relevant_df(df):
    """Apply the current values of the filter widgets to the unlabelled rows.

    Starts from ``get_filtered_df(df)`` and narrows it by minimum date, title,
    location, company, minimum ``salary_max_gbp``, language and free-text
    search, reading each value from the global ``filters`` widget tree.

    Returns
    -------
    pandas.DataFrame
        The matching rows. When nothing matches an empty DataFrame is
        returned, so the result can always be indexed by column name.

    Notes
    -----
    * Title/location matching is a case-insensitive literal substring test;
      rows whose value is missing do not match.
    * Search terms are separated by ``;`` and combined with OR. Blank terms
      are ignored.
    """
    general_row, choice_row, search_row = (box.children for box in filters.children[:3])
    min_date = general_row[2].value
    title = choice_row[0].value
    location = choice_row[1].value
    company = choice_row[2].value
    find_words = search_row[0].value
    min_salary = search_row[1].value
    language = search_row[2].value

    def _chosen(value):
        # An empty selection and the 'All' entry both mean "do not filter".
        return bool(value) and value != 'All'

    def _contains(column, needle):
        # Literal, case-insensitive substring test; missing values count as
        # "no match" (``find(...) != -1`` would let NaN rows through).
        return column.str.lower().str.contains(needle.lower(), regex=False, na=False)

    df_filt = get_filtered_df(df)
    if min_date:
        # ``date`` is compared as text against the picked date's string form.
        df_filt = df_filt[df_filt['date'] >= str(min_date)]
    if _chosen(title):
        df_filt = df_filt[_contains(df_filt['title'], title)]
    if _chosen(location):
        df_filt = df_filt[_contains(df_filt['location'], location)]
    if _chosen(company):
        df_filt = df_filt[df_filt['company_name'] == company]
    if min_salary != 0:
        df_filt = df_filt[df_filt['salary_max_gbp'] >= min_salary]
    if _chosen(language):
        df_filt = df_filt[df_filt['language'] == language]

    words = [word.strip() for word in find_words.lower().split(';')]
    words = [word for word in words if word]
    if words:
        found = pd.Series(False, index=df_filt.index)
        for word in words:
            # Each term gets its own boolean mask before being OR-ed in;
            # ``a | b != -1`` would be parsed as ``(a | b) != -1``.
            found = found | _contains(df_filt['text'], word)
        df_filt = df_filt[found]

    return df_filt

def refresh_indexes(df):
    """Return the row labels of the filtered vacancies in the selected order.

    The order is taken from the "Sort by" dropdown of the global ``filters``
    widget tree: ``model_score``, ``date`` and ``salary`` sort descending,
    ``location`` and ``title`` ascending, ``random`` shuffles.

    Returns
    -------
    A pandas Index (or a shuffled numpy array for ``random``) of row labels;
    an empty Index when no vacancy passes the filters.

    Raises
    ------
    ValueError
        If the dropdown holds a value this function does not know.
    """
    df_filt = get_relevant_df(df)
    # Defensive: accept an empty frame as well as any non-DataFrame
    # "no results" value instead of failing on column access below.
    if not isinstance(df_filt, pd.DataFrame) or df_filt.empty:
        return pd.Index([])

    sort_by = filters.children[0].children[1].value
    if sort_by == 'model_score':
        return df_filt['model_scores'].sort_values(ascending=False).index
    if sort_by == 'random':
        indexes = np.array(df_filt.index)
        shuffle(indexes)
        return indexes
    if sort_by == 'date':
        return df_filt['date'].sort_values(ascending=False).index
    if sort_by == 'location':
        return df_filt['location'].sort_values().index
    if sort_by == 'title':
        return df_filt['title'].sort_values().index
    if sort_by == 'salary':
        return df_filt.sort_values('salary_max_gbp', ascending=False).index
    raise ValueError(f'Unknown sort option: {sort_by!r}')
    
def get_accordion(indexes, from_ind, to_ind):
    """Build an accordion with one labelling panel per row of the requested page.

    ``indexes[from_ind:to_ind]`` selects the row labels of the page. Tokens are
    coloured only when the "Color features?" checkbox of the global ``filters``
    tree is ticked. Each section is titled with title, company, location, date
    and the stored model score of its row.
    """
    page = indexes[from_ind:to_ind]
    colour_features = filters.children[3].children[0].value
    palette = calculate_palette(tf_idf, model) if colour_features else None

    panels = [get_widget_panel(df, row, palette) for row in page]
    accordion = widgets.Accordion(children=panels, selected_index=None)

    for position, row in enumerate(page):
        details = ' --- '.join(
            str(df.loc[row, column]) for column in ('company_name', 'location', 'date'))
        score = f" (model_score={df.loc[row, 'model_scores']:.2f})"
        accordion.set_title(
            position, str(df.loc[row, 'title']) + '        ---   ' + details + score)
    return accordion

def show_data(refresh, from_ind, to_ind, top):
    """Save what was entered on the displayed page, then render the requested page.

    Parameters
    ----------
    refresh : bool
        When true the ordering in the global ``indexes`` is recomputed with
        ``refresh_indexes`` before rendering.
    from_ind, to_ind : int
        Slice bounds into ``indexes`` of the page to render.
    top : int
        Page size; only used as a fallback to locate the previous page when
        the displayed row labels were not recorded.

    Side effects
    ------------
    * Updates ``good`` / ``dont_show`` in the global ``df`` and appends one
      synthetic row per non-empty "good stuff" / "bad stuff" text.
    * Writes all labelled or hidden rows to ``filename_out``.
    * Replaces the global ``accordion`` and redraws it inside ``outs``.

    The row labels shown by the current accordion are remembered on the
    function itself (``show_data.shown_inds``). Deriving them from the slice
    bounds of the *next* page is only correct after "Next"; after "Previous",
    "Start", "End" or a refresh it addresses other rows or none at all.
    """
    global indexes
    global accordion
    global df
    if accordion:
        shown = getattr(show_data, 'shown_inds', None)
        if shown is None:
            # No record of the displayed rows: fall back to the slice arithmetic.
            if refresh:
                shown = indexes[from_ind:to_ind]
            else:
                shown = indexes[(from_ind-top):(to_ind-top)]
        for panel, ind in zip(accordion.children, shown):
            _, good_bad, why_good, why_bad, hide = panel.children
            if hide.value:
                df.loc[ind, 'dont_show'] = hide.value
            if good_bad.value is not None:
                df.loc[ind, 'good'] = (good_bad.value == 'good') * 1
                df.loc[ind, 'dont_show'] = True
            # Position in the list doubles as the label: 0 = bad text, 1 = good text.
            for is_good, text_highlight in enumerate([why_bad.value, why_good.value]):
                if text_highlight != '':
                    # Append a copy of the vacancy whose text is the typed snippet.
                    max_ind = int(df.index.max() + 1)
                    df.loc[max_ind, :] = df.loc[ind, :]
                    df.loc[max_ind, 'text'] = text_highlight
                    df.loc[max_ind, 'synthetic'] = True
                    df.loc[max_ind, 'date'] = str(datetime.datetime.now().date())
                    df.loc[max_ind, 'good'] = is_good
                    df.loc[ind, 'dont_show'] = True
        if len(shown):
            # One write after all panels are processed yields the same file as
            # rewriting it after every panel.
            df[(~df['good'].isna()) | (~df['dont_show'].isna())].reset_index(drop=True).to_csv(filename_out)
    
    with outs:
        clear_output()
        if refresh:
            indexes = refresh_indexes(df)
        accordion = get_accordion(indexes, from_ind, to_ind)
        # Remember exactly which rows the new accordion displays.
        show_data.shown_inds = list(indexes[from_ind:to_ind])
        run_widg = widgets.VBox([accordion, next_prev])
        display(run_widg)

def update_data(x):
    """Button callback: jump back to the first page and recompute the ordering.

    ``x`` is the clicked button and is not used.
    """
    global i
    i = 0
    page_size = filters.children[0].children[0].value
    show_data(refresh=True, from_ind=0, to_ind=page_size, top=page_size)
    
def show_next(x):
    """Button callback: move one page forward, clamped to ``[0, len(indexes)]``.

    ``x`` is the clicked button and is not used.
    """
    global i
    page_size = filters.children[0].children[0].value
    i = min(max(0, i + page_size), len(indexes))
    show_data(refresh=False, from_ind=i, to_ind=i + page_size, top=page_size)

def show_prev(x):
    """Button callback: move one page back, clamped to ``[0, len(indexes)]``.

    ``x`` is the clicked button and is not used.
    """
    global i
    page_size = filters.children[0].children[0].value
    i = min(max(0, i - page_size), len(indexes))
    show_data(refresh=False, from_ind=i, to_ind=i + page_size, top=page_size)

def show_start(x):
    """Button callback: show the first page without recomputing the ordering.

    ``x`` is the clicked button and is not used.
    """
    global i
    page_size = filters.children[0].children[0].value
    i = 0
    show_data(refresh=False, from_ind=0, to_ind=page_size, top=page_size)

def show_end(x):
    """Button callback: show the last ``page_size`` rows of the current ordering.

    ``x`` is the clicked button and is not used.
    """
    global i
    page_size = filters.children[0].children[0].value
    i = len(indexes) - page_size
    show_data(refresh=False, from_ind=i, to_ind=i + page_size, top=page_size)
    
def get_filters():
    """Build the filter panel shown above the vacancy list.

    Option lists are taken from the currently unlabelled rows of the global
    ``df`` (see ``get_filtered_df``).

    Returns
    -------
    widgets.VBox
        Five rows whose positions are relied upon by the callbacks:

        0. page size slider, "Sort by" dropdown, minimum date picker
        1. title, location and company combo boxes
        2. search text, minimum salary, language dropdown
        3. "Color features?" checkbox
        4. "Apply filters/Refresh" and "Retrain model" buttons
    """
    visible = get_filtered_df(df)

    def _with_all(values):
        # Every choice list starts with the "no filter" entry.
        return tuple(np.append('All', values))

    top_ = widgets.IntSlider(
        value=10,
        min=10,
        max=100,
        step=10,
        description='Top:',
        disabled=False,
        continuous_update=False,
        orientation='horizontal',
        readout=True,
        readout_format='d',
    )
    
    title = widgets.Combobox(
            placeholder='Choose title',
            options=_with_all(visible['title'].unique()),
            description='Title:',
            ensure_option=True,
            disabled=False
        )

    location = widgets.Combobox(
        placeholder='Choose location',
        options=_with_all(get_location(visible)),
        description='Location:',
        ensure_option=True,
        disabled=False
    )
    
    company = widgets.Combobox(
        placeholder='Choose company',
        options=_with_all(visible['company_name'].dropna().unique()),
        description='Company:',
        ensure_option=True,
        disabled=False
    )
    
    find_words = widgets.Text(
        value='',
        placeholder='Type something',
        description='Find words(;):',
        disabled=False
    )
    
    # NOTE(review): ``ensure_option`` looks like a Combobox option rather than a
    # Dropdown one - confirm whether it has any effect on the two dropdowns.
    sort_val = widgets.Dropdown(
        value='model_score',
        options=('random', 'model_score', 'date', 'location', 'title', 'salary'),
        description='Sort by:',
        ensure_option=True,
        disabled=False
    )
    
    min_date = widgets.DatePicker(
        description='Min Date',
        disabled=False
    )
    
    salary = widgets.IntText(
        value=0,
        description='Min salary:',
        disabled=False
    )

    language_options = _with_all(visible['language'].dropna().unique())
    language = widgets.Dropdown(
        # Preselect English when available; a value that is not among the
        # options would make the widget constructor fail.
        value='en' if 'en' in language_options else 'All',
        options=language_options,
        description='Language:',
        ensure_option=True,
        disabled=False
    )
    
    color_features = widgets.Checkbox(value=True, description='Color features?')
    retrain_model = widgets.Button(description='Retrain model', button_style='info')
    apply_filters = widgets.Button(description='Apply filters/Refresh', button_style='info')
    
    return widgets.VBox([widgets.HBox([top_, sort_val, min_date]), 
                         widgets.HBox([title, location, company]),
                         widgets.HBox([find_words, salary, language]), 
                         widgets.HBox([color_features]), 
                         widgets.HBox([apply_filters, retrain_model])
                        ], box_style='info')
@contextmanager
def show_loading():
    """Show "Running..." on the retrain button while the ``with`` body executes.

    The button is the second child of the last row of the global ``filters``
    tree. Its caption and style are restored in a ``finally`` clause, so the
    button does not stay in the busy state when the body raises.
    """
    button = filters.children[4].children[1]
    button.description = 'Running...'
    button.button_style = ''
    try:
        yield
    finally:
        button.description = 'Retrain model'
        button.button_style = 'info'
    

def retrain_model(x):
    """Button callback: retrain the model, rescore the vacancies and refresh the list.

    ``x`` is the clicked button; it is only passed on to ``update_data``.

    After the global ``model`` is replaced, ``model_scores`` is recomputed for
    the unlabelled rows. Without that step the "model_score" ordering and the
    scores shown in the section titles would still come from the old model.
    """
    global model
    with show_loading():
        model = get_model(df, tf_idf)
        ind = get_filtered_df(df).index
        if len(ind):
            documents = df.loc[ind, 'title'] + '\n' + df.loc[ind, 'text']
            df.loc[ind, 'model_scores'] = model.predict_proba(
                tf_idf.transform(documents))[:, 1]
        update_data(x)
    
# Score every unlabelled vacancy with the current model (probability of class 1).
ind = get_filtered_df(df).index
df.loc[ind, 'model_scores'] = model.predict_proba(
    tf_idf.transform(df.loc[ind, 'title'] + '\n' + df.loc[ind, 'text'])
)[:, 1]
indexes = ind

# Paging buttons, laid out as Start | Previous | Next | End.
start_button, prev_button, next_button, end_button = (
    widgets.Button(description=caption, button_style='info')
    for caption in ('Start', 'Previous', 'Next', 'End')
)
next_prev = widgets.HBox([start_button, prev_button, next_button, end_button])
filters = get_filters()

display(filters)
display(outs)

# Paging callbacks.
start_button.on_click(show_start)
prev_button.on_click(show_prev)
next_button.on_click(show_next)
end_button.on_click(show_end)

# Last filter row: "Apply filters/Refresh" first, "Retrain model" second.
filters.children[4].children[0].on_click(update_data)
filters.children[4].children[1].on_click(retrain_model)



ROC-AUC (all): 0.97, (true): 0.92


VBox(box_style='info', children=(HBox(children=(IntSlider(value=10, continuous_update=False, description='Top:…

Output()