In [None]:
#hide
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
from nbdev import *

In [None]:
# default_exp annotate

# Annotate

> Functionality to support creating and processing annotations for samples of Newspaper Navigator data using Label Studio.

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
from tqdm.notebook import trange, tqdm
import pandas as pd
from pandas import json_normalize
import rapidjson as json
import requests
import re
from glob import glob
from pathlib import Path
import rapidjson as json

# Setup annotation task
The bulk of the annotation work is outsourced to Label Studio, a flexible annotation system which supports a range of data types, including images and text. This module provides a few steps to help process the annotations produced through Label Studio. For example, an image classification project can be initialised and then started with the following commands:

```bash
label-studio init advert_annotations --template=image_classification --input-path=images --input-format=image-dir --allow-serving-local-files
```

```bash
label-studio start ./advert_annotations
```

# Process annotations

In [None]:
#export
def load_df(json_file):
    """Load one Label Studio completion JSON file into a DataFrame.

    Parameters
    ----------
    json_file : str or path-like
        Path to a JSON file containing a top-level `completions` list and a
        top-level `data` entry.

    Returns
    -------
    pd.DataFrame
        One row per entry of `completions`, with the file's `data` value
        repeated in a `data` column. When a completion's `result` list holds
        exactly one item, the `result` cell is replaced by that item's
        `['value']['choices']`; otherwise (empty, or several items) the
        `result` cell is left as it was loaded.
    """
    # Read as UTF-8 explicitly rather than relying on the platform's default
    # text encoding, which differs between operating systems.
    with open(json_file, encoding='utf-8') as f:
        data = json.load(f)
    # The file handle is no longer needed once the JSON has been parsed.
    df = json_normalize(data, record_path=['completions'], meta=['data'])
    df['result'] = df['result'].apply(
        lambda x: x[0]['value']['choices'] if len(x) == 1 else x
    )
    return df

In [None]:
#export
def load_completions(path):
    """Load every completion JSON file of a Label Studio project into one DataFrame.

    Parameters
    ----------
    path : str or path-like
        Project directory; files matching `<path>/completions/*.json` are read.

    Returns
    -------
    pd.DataFrame
        The per-file frames produced by `load_df`, concatenated in sorted
        file-name order. The original per-file index values are kept.

    Raises
    ------
    ValueError
        If no completion files are found.
    """
    # `glob` returns matches in arbitrary order; sort so that the row order
    # (and anything order-sensitive done with it later) is reproducible.
    filenames = sorted(glob(f'{path}/completions/*.json'))
    if not filenames:
        raise ValueError(f'No completion JSON files found in {path}/completions')
    return pd.concat([load_df(f) for f in filenames])

In [None]:
#slow
# Load all completions found under the ads annotation directory and show the first row.
df = load_completions('../ph/ads/ad_annotations/')
df.head(1)

Unnamed: 0,created_at,id,lead_time,result,data
0,1602236711,379001,1.248,[text-only],{'image': 'http://localhost:8081/data/upload/9...


In [None]:
#slow
# Same loader on the multi-label photos directory; note this rebinds `df` from the previous cell.
df = load_completions('../ph/photos/multi_label/')
df.head(1)

Unnamed: 0,created_at,id,lead_time,result,data,skipped,was_cancelled
0,1596460221,3721001,8.94,"[human, landscape]",{'image': '/data/vi_yes_ver01_data_sn84025841_...,,


In [None]:
#export
def _df_to_csv(df,out_fn):
    df[['data','result']].to_csv(out_fn,header=['file','label',],index=False)

In [None]:
#export
def _df_to_json(df,out_fn):
    df[['data','value.choices']].to_json(out_fn)

In [None]:
#export
def _df_to_pkl(df,out_fn):
    df.to_pickle(out_fn)

In [None]:
#export
def get_og_filepath(x):
    """
    Transform a flattened image file name (as used for annotation in Label Studio)
    back to the original Newspaper Navigator filepath format.

    The name is split around `_data_`; every underscore before the trailing
    `ddd_d_dd.jpg` part is turned into a `/`, and that trailing part is kept
    unchanged, e.g.
    `batch_ver01_data_sn1_0002_001_6_98.jpg` -> `batch_ver01/data/sn1/0002/001_6_98.jpg`.

    Parameters
    ----------
    x : str
        Flattened file name containing `_data_` exactly once.

    Returns
    -------
    str
        The reconstructed filepath. Note that underscores in the part before
        `_data_` are left untouched.

    Raises
    ------
    ValueError
        If `_data_` does not occur exactly once (the three-way unpacking fails).
    IndexError
        If the name does not contain a `ddd_d_dd.jpg` part after `_data_`.
    """
    prefix, marker, remainder = re.split('(_data_)', x)
    marker = marker.replace('_', '/')
    # Raw string so `\d` is a regex escape rather than an invalid string
    # escape; the dot is escaped so only a literal `.jpg` extension matches.
    parts = re.split(r'(\d{3}_\d{1}_\d{2}\.jpg)', remainder)
    return prefix + marker + parts[0].replace('_', '/') + parts[1]

In [None]:
#export
def anno_sample_merge(sample_df: pd.DataFrame, annotation_df: pd.DataFrame) -> pd.DataFrame:
    """Join a Newspaper Navigator sample with its annotations.

    Each annotation's `data` value is converted with `get_og_filepath` and stored
    in an `id` column (replacing any existing `id` column in the result); the
    sample is then joined to the annotations where `filepath` equals that `id`.
    Neither input DataFrame is modified.

    Parameters
    ----------
    sample_df : pd.DataFrame
        A sample from Newspaper Navigator (generated by `sample.nnSample()`);
        must have a `filepath` column.
    annotation_df : pd.DataFrame
        Annotations as loaded via the `annotate.nnAnnotations` class; must have
        a `data` column.

    Returns
    -------
    pd.DataFrame
        A new DataFrame holding the rows present in both inputs.
    """
    original_paths = annotation_df['data'].map(get_og_filepath)
    keyed_annotations = annotation_df.assign(id=original_paths)
    return sample_df.merge(keyed_annotations, left_on='filepath', right_on='id')

In [None]:
# Read the sample CSV (first column used as the index); it is merged with the annotations further down.
sample_df= pd.read_csv('../ph/ads/sample.csv', index_col=0)

In [None]:
#export

class nnAnnotations:
    """Holds a DataFrame of annotations together with simple label statistics.

    Instances are normally built with `from_completions`, optionally merged
    with a sample via `merge_sample`, and written out with `export_annotations`
    or `export_merged`.
    """

    # Values accepted for the `kind` argument of `from_completions`.
    _KINDS = ('classification', 'label')

    def __init__(self, df):
        """Store `df` and derive label statistics from its `result` column."""
        self.annotation_df = df
        # Distinct values of `result`, and how often each one occurs.
        self.labels = df['result'].unique()
        self.label_counts = df['result'].value_counts()

    def __repr__(self):
        return (f'{self.__class__.__name__}'
                f' #annotations:{len(self.annotation_df)}')

    @staticmethod
    def _image_name(data):
        """Return the bare image file name for one `data` entry of a completion."""
        # Drop any query string, then keep only the last path component.
        name = Path(data['image'].split('?')[0]).name
        # Removes the labelstudio hash from data loaded via the web interface:
        # only the text up to the FIRST hyphen is dropped, and names without a
        # hyphen are returned unchanged.
        return name.split('-', 1)[-1]

    @classmethod
    def from_completions(cls, path, kind, drop_dupes=True, sample_df=None):
        """Build an instance from the completion files under `path`.

        Parameters
        ----------
        path : str or path-like
            Directory passed to `load_completions`.
        kind : str
            `'classification'`: rows with an empty `result` are dropped and the
            first choice is kept as the label.
            `'label'`: the choices are joined with `|` (an empty `result`
            becomes the empty string).
        drop_dupes : bool, default True
            Keep only the last row for each image file name.
        sample_df : optional
            Accepted for interface compatibility; not used by this method.
            Use `merge_sample` to merge a sample.

        Raises
        ------
        ValueError
            If `kind` is not one of the supported values.
        """
        # Fail early: with any other value `result` would stay a list and the
        # statistics computed in `__init__` would fail with an unhashable-type error.
        if kind not in cls._KINDS:
            raise ValueError(f'kind must be one of {cls._KINDS}, got {kind!r}')
        df = load_completions(path).reset_index(drop=True)  # add index
        df['data'] = df['data'].map(cls._image_name)
        if drop_dupes:
            df = df.drop_duplicates(subset='data', keep='last')
        if kind == 'classification':
            is_empty = df['result'].map(lambda choices: len(choices) == 0).astype(bool)
            # Copy so the assignment below acts on an independent frame.
            df = df[~is_empty].copy()
            df['result'] = df['result'].map(lambda choices: choices[0])
        else:  # kind == 'label'
            df = df.copy()
            # Joining an empty list already yields "", so no second pass is needed.
            df['result'] = df['result'].map(lambda choices: "|".join(map(str, choices)))
        return cls(df)

    def merge_sample(self, sample_df):
        """Merge `sample_df` with the annotations and store the result as `self.merged_df`."""
        self.merged_df = anno_sample_merge(sample_df,self.annotation_df)

    def export_merged(self, out_fn):
        """Write `self.merged_df` to `out_fn` as CSV; `merge_sample` must have been called first."""
        self.merged_df.to_csv(out_fn)

    def export_annotations(self, out_fn):
        """Write the annotations to `out_fn`, choosing the format from its suffix.

        Supported suffixes are `.csv`, `.json` and `.pkl`.

        Raises
        ------
        ValueError
            If the suffix is not supported. No file is created in that case.
        """
        df = self.annotation_df
        suffix = Path(out_fn).suffix
        if suffix == '.csv':
            _df_to_csv(df, out_fn)
        elif suffix == '.json':
            _df_to_json(df,out_fn)
        elif suffix == '.pkl':
            _df_to_pkl(df,out_fn)
        else:
            raise ValueError(
                f"Unsupported export suffix {suffix!r}; expected '.csv', '.json' or '.pkl'"
            )

In [None]:
# Build an annotations object from the ads completions, treating each result as a single class label.
annotations = nnAnnotations.from_completions('../ph/ads/ad_annotations/', 'classification')

In [None]:
# Display the object's repr: the class name followed by the number of annotations.
annotations

nnAnnotations #annotations:549

In [None]:
# Merge the sample loaded earlier with the annotations, then inspect the first two merged rows.
annotations.merge_sample(sample_df)
annotations.merged_df.head(2)

Unnamed: 0,filepath,pub_date,page_seq_num,edition_seq_num,batch,lccn,box,score,ocr,place_of_publication,geographic_coverage,name,publisher,url,page_url,created_at,id,lead_time,result,data
0,iahi_gastly_ver01/data/sn82015737/00279529091/...,1860-03-09,447,1,iahi_gastly_ver01,sn82015737,"[Decimal('0.30762831315880534'), Decimal('0.04...",0.950152,"['JTO', 'TMCE', 'An', 't%E', '3eott', 'County'...","Davenport, Iowa",['Iowa--Scott--Davenport'],Daily Democrat and news. [volume],"Maguire, Richardson & Co.",https://news-navigator.labs.loc.gov/data/iahi_...,https://chroniclingamerica.loc.gov/data/batche...,1602237486,iahi_gastly_ver01/data/sn82015737/00279529091/...,0.838,text-only,iahi_gastly_ver01_data_sn82015737_00279529091_...
1,ohi_cobweb_ver04/data/sn85026050/00280775848/1...,1860-08-17,359,1,ohi_cobweb_ver04,sn85026050,"[Decimal('0.5799164973813336'), Decimal('0.730...",0.985859,"['9', 'BI.', 'I', '.QJtf', 'A', 'never', 'fall...","Fremont, Sandusky County [Ohio]",['Ohio--Sandusky--Fremont'],Fremont journal. [volume],I.W. Booth,https://news-navigator.labs.loc.gov/data/ohi_c...,https://chroniclingamerica.loc.gov/data/batche...,1602236992,ohi_cobweb_ver04/data/sn85026050/00280775848/1...,7.593,illustrations,ohi_cobweb_ver04_data_sn85026050_00280775848_1...


In [None]:
# Write the merged DataFrame to a CSV file in the working directory.
annotations.export_merged('testmerge.csv')

In [None]:
#hide 
# Remove the file written by the export example above so the demo leaves nothing behind.
Path('testmerge.csv').unlink()

In [None]:
# Rebuild the annotations object (rebinding `annotations`) and look at the processed annotation rows.
annotations = nnAnnotations.from_completions('../ph/ads/ad_annotations/', 'classification')
annotations.annotation_df.head(2)

Unnamed: 0,created_at,id,lead_time,result,data
0,1602236711,379001,1.248,text-only,pst_fenske_ver02_data_sn84026497_00280776129_1...
1,1602237071,396001,0.87,text-only,scu_carlacox_ver01_data_sn84026965_00294551268...


In [None]:
# Run nbdev's notebook2script; the output below lists the notebooks it converted.
from nbdev.export import notebook2script
notebook2script()

Converted 00_core.ipynb.
Converted 01_sample.ipynb.
Converted 02_annotate.ipynb.
Converted index.ipynb.


In [None]:
# def process_completions(path, kind, drop_dupes=True):
#     df = load_completions(path)
#     df = df.reset_index(drop=True) # add index
#     df['data']= df['data'].map(lambda x: x['image'])
#     df['data'] = df['data'].map(lambda x: x.split('?')[0])
#     df['data'] = df['data'].apply(lambda x: Path(x).name)
#     if any(df['data'].str.contains('-')): # removes labelstudio hash from data loaded via web interface
#         df['data'] = df['data'].str.split('-',expand=True)[1]
#     if drop_dupes:
#         df = df.drop_duplicates(subset='data',keep='last')
#     if kind=='classification':
#         empty_rows = df[df['result'].apply(lambda x:len(x)==0)].index
#         df = df.drop(empty_rows)
#         df['result'] = df['result'].map(lambda x: x[0])
#     if kind=='label':
#         df['result'] = df['result'].map(lambda x: "|".join(map(str,x)) if len(x) >=1 else x)
#         df['result'] = df['result'].map(lambda x:"" if len(x)==0 else x)
#     return df

In [None]:
#hide
# #old
# def _process_completions(path, kind):
#     df = load_completions(path)
#     df['data']= df['data'].map(lambda x: x['image'])
#     df['data'] = df['data'].map(lambda x: x.split('?')[0])
#     df['data'] = df['data'].apply(lambda x: Path(x).name)
#     if kind=='classification':
#         df['value.choices'] = df['value.choices'].map(lambda x: x[0])
#     if kind=='label':
#         #df['value.choices'] = df['value.choices'].map(lambda x: "|".join(x))
#         df['result'].map(lambda x: "|".join(map(str,x)) if len(x) >=1 else x)
#     return df

In [None]:
#old
# def _load_df(json_file):
#     with open(json_file) as f:
#         data = json.load(f)
#         df = json_normalize(data,record_path=['completions','result'],meta=['data'])
#         return df