In [None]:
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
import pandas as pd
import plotly.express as px

In [None]:
def highlight_entities_in_text(text, entities, colors, title=None):
    """
    Render `text` with its entity spans highlighted and wrap the markup for notebook display.

    text: the string that the span offsets refer to.
    entities: iterable of dicts shaped like {"start": 0, "end": 4, "label": 'eco'};
        they are ordered by "start" before rendering.
    colors: mapping handed to the spaCy entity renderer as its 'colors' option.
    title: optional title forwarded to the renderer.

    Returns an IPython.display.HTML object holding the rendered markup.
    """
    # Imported lazily so spaCy / IPython are only required when something is rendered.
    from spacy.displacy.render import EntityRenderer
    from IPython.display import HTML

    ent_renderer = EntityRenderer({'colors': colors})
    spans_by_start = sorted(entities, key=lambda span: span['start'])
    markup = ent_renderer.render_ents(text=text, spans=spans_by_start, title=title)
    return HTML(markup)

In [None]:
def transform_token_df_to_entity_df(df_tok_pred, bio_tag_col='tag', start_col='start', end_col='end', sent_id_col='id',
                                   is_bio=True):
    """
    Collapse token-level predictions into one row per entity span.

    df_tok_pred: DataFrame with one row per token, in reading order, holding at least
        the columns named by `bio_tag_col`, `start_col`, `end_col` and `sent_id_col`.
    bio_tag_col: column with the token tag. With `is_bio=True` it holds 'B-<label>' /
        'I-<label>' / 'O'; with `is_bio=False` it holds the plain label (or 'O').
    start_col, end_col: columns with the token's character offsets.
    sent_id_col: column identifying the sentence/document a token belongs to.
    is_bio: whether tags carry a B-/I- prefix.

    Returns a DataFrame with columns [sent_id_col, bio_tag_col, start_col, end_col]:
    the span's label (prefix stripped in BIO mode), the start offset of its first token
    and the largest end offset among its tokens. Rows are ordered by the group keys.

    Notes:
    * A span starts at a 'B-' tag (BIO mode) or wherever the label differs from the
      previous token of the same sentence (plain mode).
    * Tokens tagged 'O' or with a missing tag never belong to a span.
    * Continuation tokens that have no span start before them in their sentence
      (e.g. a leading 'I-' tag) are dropped.
    """
    tags = df_tok_pred[bio_tag_col]
    is_entity = tags.notna() & (tags != 'O')

    if is_bio:
        labels = tags.str.slice(2, None)
        starts_span = tags.str.slice(0, 1) == 'B'
    else:
        labels = tags
        # Compare with the previous token of the *same* sentence only, so a span at the
        # start of one sentence is never glued to the end of the previous sentence.
        starts_span = tags != tags.groupby(df_tok_pred[sent_id_col]).shift()
    is_begin = is_entity & starts_span

    entity_tokens = (
        df_tok_pred
        .assign(**{
            # Group on the prefix-free label so B- and I- tokens share one span.
            bio_tag_col: labels,
            # Keep the offset only on span-opening tokens; the rest is filled below.
            start_col: df_tok_pred[start_col].where(is_begin),
        })
        .loc[is_entity.to_numpy()]
    )

    # Propagate each span's start to its continuation tokens. Only this column is
    # filled, and only within a sentence.
    span_start = entity_tokens.groupby(sent_id_col)[start_col].ffill()

    return (
        entity_tokens
        .assign(**{start_col: span_start})
        .groupby([sent_id_col, bio_tag_col, start_col])[end_col].max()
        .reset_index()
        .astype({start_col: 'int'})
    )

In [None]:
# Label set of a previous model, kept for reference (superseded by `label_map` below):
# label_map = {'O': 0,
#  'I-estate_type': 1,
#  'I-city': 2,
#  'I-square': 3,
#  'I-bedrooms': 4,
#  'I-price_monthly': 5,
#  'I-district': 6,
#  'I-additional_costs': 7,
#  'I-price_daily': 8,
#  'I-price_arbitrary': 9}

# Tag -> class id. The literal used to list 'O' twice ('O': 0 and 'O': 6); Python keeps
# only the last value for a repeated key, so the effective mapping always was 'O' -> 6
# and class id 0 has no tag.
# NOTE(review): presumably id 0 is reserved/unused by the checkpoint — confirm against
# the model's config before relying on it.
label_map = {
    'I-city': 1,
    'I-district': 2,
    'I-estate_type': 3,
    'I-price_monthly': 4,
    'I-square': 5,
    'O': 6,
}

# Class id -> label with the two-character 'I-' prefix removed ('O' becomes '').
label_map_inv = {class_id: tag[2:] for tag, class_id in label_map.items()}

# One colour per label. Labels are sorted so every label gets the same colour on each
# run; iterating a bare set of strings is not stable across kernel restarts.
colors = dict(zip(sorted({tag[2:] for tag in label_map}), px.colors.qualitative.Set2))

# load model

In [None]:
# Checkpoint directory that holds both the model weights and the tokenizer files.
# Defined once so the model and the tokenizer cannot drift apart.
# Another checkpoint path that was left in a stray cell (presumably an earlier run — confirm):
#   '../models/rubert_conv_220811-1256/checkpoint-70/'
CHECKPOINT_DIR = '../models/rubert_conv_220811-1337/checkpoint-260/'

model = AutoModelForTokenClassification.from_pretrained(CHECKPOINT_DIR)
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT_DIR)

# Token-classification pipeline used for every prediction below.
pipe = pipeline(task='ner', model=model, tokenizer=tokenizer)

# example

In [None]:
# Sample rental-listing texts (Russian and English) used to try the pipeline by hand.
texts = [
    'Сдается квартира в Подгорице, 50м2, стоимость за месяц 900 евро',
    '#сдам #будва\n\nСдам апартаменты, за месяц 223 е',
    "\nСдаётся на 2 летних месяца квартира в Будве с одной спальней. \nС 1 июля по 31 августа. \nЦена 1100 в месяц + коммунальные платежи. \n2 этаж, площадь 40 м. \nСтиральная машина, кондиционер, интернет ADSL. \n\nЛокация \n",
    "#podgorica Apartaments available for rent in Podgorica, 500 euro, 50 m2",
    "Сниму Сдам Подгорица Сдаётся новая однокомнатная квартира в дом Дади, рядом с бульваром Иосипа Броз Тито, Старый Аэродром. Квартира имеет площадь 51 м2 и расположена на третьем этаже здания."
]
# Run the pipeline on the first sample; `text` and `res` are reused by the next cells.
text = texts[0]
res = pipe(text)

In [None]:
import sys
sys.path.append('../src')

In [None]:
# One row per item returned by the pipeline.
# Assumes `entity` looks like 'LABEL_<id>': everything after the 6-character prefix is
# the class id. Slicing to the end of the string keeps ids with more than one digit
# intact (a fixed one-character slice would turn 'LABEL_12' into 1).
df_res = pd.DataFrame(res).assign(
    tag=lambda tokens: tokens['entity'].str.slice(6).astype(int).map(label_map_inv)
)
df_res

In [None]:
# Merge consecutive tokens with the same tag into spans, then drop the spans whose
# tag is the empty string (the inverse label map turns 'O' into '').
df_res_ent = (
    transform_token_df_to_entity_df(df_res.assign(id=1), is_bio=False)
    .loc[lambda spans: spans['tag'] != ""]
)

In [None]:
# Show the entity spans extracted for the first sample text.
df_res_ent

In [None]:
# Render the first sample with its predicted spans; the renderer expects a `label`
# key, hence the rename of the `tag` column.
highlight_entities_in_text(text, 
                           df_res_ent.rename(columns={'tag': 'label'}).to_dict(orient='records'), 
                           colors=colors)

In [None]:
# Run every sample text through the pipeline and render its predicted spans.
# Loop-local names are used on purpose: reusing `text`, `res`, `df_res` and
# `df_res_ent` here would silently overwrite the single-example variables that the
# cells above display.
for sample_text in texts:
    sample_tokens = pd.DataFrame(pipe(sample_text)).assign(
        # Assumes `entity` looks like 'LABEL_<id>'; slice to the end so multi-digit
        # class ids are not truncated.
        tag=lambda tokens: tokens['entity'].str.slice(6).astype(int).map(label_map_inv)
    )

    # Drop spans whose tag is '' (the inverse label map turns 'O' into '').
    sample_ents = transform_token_df_to_entity_df(sample_tokens.assign(id=1), is_bio=False).query('tag != ""')

    display(highlight_entities_in_text(sample_text,
                                       sample_ents.rename(columns={'tag': 'label'}).to_dict(orient='records'),
                                       colors=colors))
    print('-'*80)

In [None]:
# Raw pipeline output for the last sample text, shown as a table for inspection.
pd.DataFrame(pipe(texts[-1]))

In [None]:
def transform_ner_output_to_ents(res):
    """
    Turn the pipeline output for ONE text into a table of entity spans.

    res: list of per-token dicts as returned by `pipe(text)`; each needs the keys
        'entity', 'start' and 'end'. Assumes 'entity' looks like 'LABEL_<id>'.

    Returns a DataFrame with columns ['id', 'tag', 'start', 'end'], where `id` is the
    constant 1 and `tag` is the label looked up in the module-level `label_map_inv`.
    Spans tagged '' (the 'O' class) are NOT removed here; callers filter them.
    An empty `res` yields an empty frame with the same columns.
    """
    df_res = pd.DataFrame(res)
    if df_res.empty:
        # No tokens: there is no 'entity' column to parse, so return the empty result
        # with the usual schema instead of failing.
        return pd.DataFrame({
            'id': pd.Series(dtype='int'),
            'tag': pd.Series(dtype='object'),
            'start': pd.Series(dtype='int'),
            'end': pd.Series(dtype='int'),
        })

    df_res = df_res.assign(
        id=1,
        # Everything after the 6-character 'LABEL_' prefix is the class id; slicing to
        # the end keeps multi-digit ids intact.
        tag=lambda tokens: tokens['entity'].str.slice(6).astype(int).map(label_map_inv),
    )
    return transform_token_df_to_entity_df(df_res, is_bio=False)

# texts

In [None]:
import json
# Load the list of ids used below to select rows of the dataset (presumably the
# validation split — confirm).
# NOTE(review): the ids come from the `model_ner_0802` directory while the model loaded
# above comes from `rubert_conv_220811-1337` — confirm both runs share the same split.
with open('../models/model_ner_0802/val_ids.json') as f:
    val_ids = json.load(f)

In [None]:
# df = pd.read_json('../data/output (1).manifest', lines=True).set_index('id')

# df = df.loc[val_ids]

In [None]:
# Read the dataset, index it by `id`, and keep only the rows listed in `val_ids`
# (in that order).
df = (
    pd.read_json('../data/df_final_5.json')
    .set_index('id')
    .loc[val_ids]
)

In [None]:
# Show the rows selected by `val_ids`.
df

In [None]:
# Ground-truth span table, indexed by document id: `explode` yields one row per element
# of each `ner` list, `dropna` removes documents without annotations, and
# `apply(pd.Series)` expands every element into columns.
# assumes each `ner` element is a dict with `startOffset`, `endOffset` and `label` — TODO confirm
df_val_true = df['ner'].explode().dropna().apply(pd.Series).rename(
    columns={'startOffset': 'start', 'endOffset': 'end'})
df_val_true

In [None]:
# NOTE(review): displays `df` again; it has not changed since it was last shown above.
df

In [None]:
%%time
# Run the pipeline on every selected document in one call; the text is lower-cased first.
# NOTE(review): spans are later drawn on the original `source` text — this assumes
# lower-casing does not change character offsets; confirm for the data at hand.
pred = pipe(df['source'].str.lower().to_list())

In [None]:
# Convert each document's pipeline output into entity spans and stack the results.
# `keys=df.index` adds the document id as the outer index level (named 'id') on top of
# each frame's own row index; the constant `id` column produced by
# transform_ner_output_to_ents is dropped because the real id comes from the keys.
df_val_pred = pd.concat([transform_ner_output_to_ents(res).drop(columns=['id']) for res in pred], keys=df.index, names=['id'])

In [None]:
# Drop spans whose tag is the empty string (the inverse label map turns 'O' into '').
# Re-running this cell is harmless: the filter gives the same result when applied twice.
df_val_pred = df_val_pred.query('tag != ""')

In [None]:
# Show the predicted entity spans for the selected documents.
df_val_pred

In [None]:
# Side-by-side check for one document: predicted spans first, annotated spans second.
idx = val_ids[9]

# The renderer expects a `label` key, so the predicted `tag` column is renamed.
predicted_ents = df_val_pred.loc[[idx]].rename(columns={'tag': 'label'}).to_dict(orient='records')
display(highlight_entities_in_text(df.loc[idx, 'source'], predicted_ents, colors=colors, title='Predicted'))

actual_ents = df_val_true.loc[[idx]].to_dict(orient='records')
display(highlight_entities_in_text(df.loc[idx, 'source'], actual_ents, colors=colors, title='Actual'))

In [None]:
# Side-by-side check for one document: predicted spans first, annotated spans second.
idx = val_ids[1]

# `.loc[[idx]]` (list key) always returns a DataFrame. A scalar key returns a Series
# when the id matches exactly one row, and `Series.to_dict` has no `orient` argument,
# so documents with a single annotated span would fail.
display(highlight_entities_in_text(df.loc[idx, 'source'],
                           df_val_pred.loc[[idx]].rename(columns={'tag': 'label'}).to_dict(orient='records'),
                           colors=colors,
                           title='Predicted'
                          ))

display(highlight_entities_in_text(df.loc[idx, 'source'],
                           df_val_true.loc[[idx]].to_dict(orient='records'),
                           colors=colors,
                           title='Actual'
                          ))

In [None]:
# Annotated spans labelled "city", ordered by document id.
df_val_true.query('label == "city"').sort_index()

In [None]:
# Predicted spans tagged "city", ordered by index, for comparison with the annotated ones.
df_val_pred.query('tag == "city"').sort_index()

In [None]:
# How many annotated spans exist per label.
df_val_true['label'].value_counts()

In [None]:
# Annotated spans, minus the labels excluded by the query string, flagged with true=1.
true_spans = df_val_true.query('label not in ["bedrooms", "additional_costs", "price_daily", "price_arbitrary"]').assign(true=1)

# Predicted spans flagged with pred=1; `tag` is renamed so both sides share `label`.
pred_spans = df_val_pred.assign(pred=1).rename(columns={'tag': 'label'})

# Outer join on exact (id, label, start, end): a row with both flags set is an exact
# match, a row with a missing flag exists on one side only.
df_comp = true_spans.merge(pred_spans, on=['id', 'label', 'start', 'end'], how='outer')

In [None]:
# Per-document count of one-sided rows from the outer join. Note the column meaning:
# `true` counts rows whose `true` flag is missing (spans that were only predicted), and
# `pred` counts rows whose `pred` flag is missing (annotated spans with no exact prediction).
# assumes `id` is available to groupby as an index level or column of df_comp — TODO confirm
df_comp_counts = df_comp[['true', 'pred']].isnull().groupby('id').sum()

In [None]:
# Documents ordered by their mismatch counts (ascending), so the worst cases appear last.
df_comp_counts.sort_values(['true', 'pred'])

In [None]:
# Side-by-side check for one hand-picked document id: predicted spans first,
# annotated spans second.
idx = 28029

# `.loc[[idx]]` (list key) always returns a DataFrame. A scalar key returns a Series
# when the id matches exactly one row, and `Series.to_dict` has no `orient` argument,
# so documents with a single annotated span would fail.
display(highlight_entities_in_text(df.loc[idx, 'source'],
                           df_val_pred.loc[[idx]].rename(columns={'tag': 'label'}).to_dict(orient='records'),
                           colors=colors,
                           title='Predicted'
                          ))

display(highlight_entities_in_text(df.loc[idx, 'source'],
                           df_val_true.loc[[idx]].to_dict(orient='records'),
                           colors=colors,
                           title='Actual'
                          ))