In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [3]:
from tools.collection_tools import Collection
from pathlib import Path
from collections import defaultdict
import pandas as pd
import pickle
from tools.helpers import range_to_pagenumbers
from tools.annotation_env import AnnotationEnv
#from tools.document_tools import TXTProcessor, WebAnnoProcessor
from tools.book_tools import NPD, Book

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


## Process corpus

In [4]:
ROOT = Path("/deezy_datadrive/kaspar-playground/npd")
DATA = ROOT / 'Data'
IN_PATH = DATA / "Original"
OUT_PATH = DATA /  "Processed"
MODELS_PATH = ROOT / 'Models'

In [5]:
editions_all = pickle.load(open('../editions_all.pickle','rb'))
#editions_all

In [6]:
#selected_years = [int(p.name.split("_")[1]) for p in list(IN_PATH.glob('MPD_*'))]
selected_years = [1846,1851,1856,1860,1865,1870,1875,1880,1885,1890,1895,1900,1905,1910,1915,1920] #
editions_orig = {y:editions_all[y] for y in selected_years}
#editions = {y:editions_all[y] for y in selected_year}

In [7]:
editions = range_to_pagenumbers(editions_orig)

In [8]:
editions.keys()

dict_keys([1846, 1851, 1856, 1860, 1865, 1870, 1875, 1880, 1885, 1890, 1895, 1900, 1905, 1910, 1915, 1920])

## Export pages for a specific year

In [None]:
year = 1875
print(editions[year])

In [None]:
npd = NPD(year,IN_PATH,OUT_PATH,
          editions[year],
          verbosity=1)

In [None]:
anno = AnnotationEnv(year,IN_PATH,OUT_PATH,editions[year])
anno.page_annotation_export(10,10)

## Export pages at the Collection level

In [None]:
def stratified_random_subsample(editions_range,size=4):
    editions_pages = defaultdict(list)
    for y, y_l in editions_range.items():
        for r in y_l:
            pages = list(range(*r))
            randoms.shuffle(pages)
            editions_pages[y].extend(pages[:size])
    return editions_pages

editions_orig

In [None]:
collection = Collection(editions,IN_PATH,OUT_PATH)
collection.page_annotation_export()

In [None]:
# go to local machine
# cd /Users/kbeelen/Documents/LivingwithMachines/Lab1/NPD/INCEpTION
# scp -r kbeelen@13.69.59.34:/deezy_datadrive/kaspar-playground/npd/Data/Processed .
# scp -r /Users/kbeelen/Documents/LivingwithMachines/Lab1/NPD/INCEpTION/Processed kbeelen@13.69.59.34:/deezy_datadrive/kaspar-playground/npd/Data

## Export segments for a specific year

In [None]:
year = 1895
structure_model = '/deezy_datadrive/kaspar-playground/npd/Models/structure_tagger/best-model.pt'

In [None]:
type(pickle.load(open(OUT_PATH / "MPD_1895" / "lemmas_raw" / "lemmas_raw.pickle",'rb'))[0][1])

In [None]:
npd = NPD(year,IN_PATH,OUT_PATH,
          editions[year],
          verbosity=1)

In [None]:
anno = AnnotationEnv(year,IN_PATH,OUT_PATH,editions[year])

In [None]:
anno.segment_annotation_export(10,model_path=structure_model)

## Export segments at the Collection level

In [9]:
structure_model = '/deezy_datadrive/kaspar-playground/npd/Models/structure_tagger/best-model.pt'

In [10]:
!ls -la {structure_model}

-rwxrwxrwx 1 kbeelen kbeelen 230600349 Feb 24 15:56 /deezy_datadrive/kaspar-playground/npd/Models/structure_tagger/best-model.pt


In [11]:
collection = Collection(editions,IN_PATH,OUT_PATH)
collection.segment_annotation_export(size=10,model_path=structure_model)

In [None]:
# go to local machine
# cd /Users/kbeelen/Documents/LivingwithMachines/Lab1/NPD/INCEpTION
# scp -r kbeelen@13.69.59.34:/deezy_datadrive/kaspar-playground/npd/Data/Processed .
# scp -r /Users/kbeelen/Documents/LivingwithMachines/Lab1/NPD/INCEpTION/Processed kbeelen@13.69.59.34:/deezy_datadrive/kaspar-playground/npd/Data

## Prioritize section of NPDs

As the London newspaper is somewhat absent from the general sample, we oversample the first quarter of each collection

In [22]:
structure_model = '/deezy_datadrive/kaspar-playground/npd/Models/structure_tagger/best-model.pt'

In [12]:
collection = Collection(editions,IN_PATH,OUT_PATH)
collection.segment_annotation_export(size=5,
                                     prioritize=.25, # prioritize the first quarter of pages
                                     model_path=structure_model)

## Parse structure

In [59]:

year = 1900
structure_model = '/deezy_datadrive/kaspar-playground/npd/Models/structure_tagger/best-model.pt'
print(year)
anno = AnnotationEnv(year,IN_PATH,OUT_PATH,editions[year])
df = anno.extract_structure(structure_model_path=structure_model, override=False, assume_london=True)

1900
Structure already parsed, loading data...
Done loading data.


In [21]:
anno.to_csv()

Unnamed: 0,token_id,token,tag
0,0,London,LOC
1,1,London,LOCDESCRIPTION
2,2,SPORTING,TITLE
3,3,LIFE,TITLE
4,4,(,TITLE
5,5,with,NEWSPAPERDESCR
6,6,which,NEWSPAPERDESCR
7,7,is,NEWSPAPERDESCR
8,8,incorporated,NEWSPAPERDESCR
9,9,Bell,NEWSPAPERDESCR


In [46]:
level_1_tags = ["LOC","LOCDESCRIPTION"] # # define hierarchy, the first tag should be the start element
level_2_tags = ["TITLE","NEWSPAPERDESCR"] # 
ignore_tags = ['O','HEADER']

In [47]:
import numpy as np

In [48]:
df[df.tag.isin(['LOC'])].token

0              London
36128     ABERGAVENNY
36129               .
36130               —
36131               (
             ...     
249507        Wicklow
249508              .
249509              )
251407      GCUERNSEY
251408              .
Name: token, Length: 6273, dtype: object

In [49]:
df.iloc[2510:2520]

Unnamed: 0,token_id,token,tag
2510,618,of,NEWSPAPERDESCR
2511,619,the,NEWSPAPERDESCR
2512,620,domestic,NEWSPAPERDESCR
2513,621,garden,NEWSPAPERDESCR
2514,622,",",NEWSPAPERDESCR
2515,623,from,LOCDESCR
2516,624,the,LOCDESCR
2517,625,orchard,LOCDESCR
2518,626,",",LOCDESCR
2519,627,flower,LOCDESCR


In [52]:
df_content = df[~df.tag.isin(ignore_tags)]
df_content.reset_index(drop=True, inplace=True)
        
offs_level_2 = np.where(df_content.tag.isin(level_2_tags))[0]
# start of higher level entities
# iterator over the offsets in the lower level entities
# if the offset + one position is not at level two
# add it as an offset for level 1
# include the last element as final offset
offs_level_1 = [0] + [o for o in offs_level_2
                        if o+1 not in offs_level_2] + [df_content.shape[0]]

level_1_boundaries = [(offs_level_1[i],offs_level_1[i+1]) for i in range(len(offs_level_1)-1)]
level_1_dfs = [df_content.iloc[s:e] for s,e in level_1_boundaries]

# add a previous name variable
# otherwise, if there is no LOC 
# then there will be an empty string
# for the level_1_name variable
# in this case use the previously
# encountered variable name
previous_name = ''

self.status['lemmas_raw'] = [] # make sure ingest is properly done
hierarchy_dict = defaultdict(list)
        
for level_1_df in level_1_dfs:
            
    level_1_name = ' '.join(level_1_df[level_1_df.tag==level_1_tags[0]].token)
    
    if not level_1_name:
        level_1_name = previous_name
    
    df_lemma = level_1_df[level_1_df.tag.isin(level_2_tags)]
    
    offs_head = list(np.where(df_lemma.tag==level_2_tags[0])[0]) #+ [df_lemma.shape[0]] 
            
    level_2_offs = [o for o in offs_head
                                if o-1 not in offs_head] + [df_lemma.shape[0]]
            
    level_2_boundaries = [(level_2_offs[i],level_2_offs[i+1]) for i in range(len(level_2_offs)-1)]
            
    level_2_text = [[df_lemma.iloc[s].token_id,' '.join(df_lemma.iloc[s:e].token)] 
                                        for s,e in level_2_boundaries]
    
    hierarchy_dict[level_1_name] = level_2_text
    previous_name = level_1_name
            
    self.status['lemmas_raw'].extend(level_2_text)
        
with open(self._out_path / 'lemmas_raw' / 'lemmas_raw.pickle','wb') as out_pickle:
        pickle.dump(self.status['lemmas_raw'], out_pickle)
        
        self.hierarchy_dict = hierarchy_dict

In [54]:
level_1_boundaries

[(0, 2514),
 (2514, 36127),
 (36127, 36271),
 (36271, 36424),
 (36424, 36545),
 (36545, 36885),
 (36885, 37021),
 (37021, 37117),
 (37117, 37239),
 (37239, 37283),
 (37283, 37532),
 (37532, 37632),
 (37632, 37733),
 (37733, 37903),
 (37903, 37992),
 (37992, 38149),
 (38149, 38234),
 (38234, 38327),
 (38327, 38493),
 (38493, 38556),
 (38556, 38596),
 (38596, 38706),
 (38706, 38880),
 (38880, 38990),
 (38990, 39009),
 (39009, 39080),
 (39080, 39332),
 (39332, 39396),
 (39396, 39724),
 (39724, 39882),
 (39882, 40179),
 (40179, 40228),
 (40228, 40466),
 (40466, 40555),
 (40555, 40889),
 (40889, 41087),
 (41087, 41219),
 (41219, 41246),
 (41246, 41575),
 (41575, 42011),
 (42011, 42060),
 (42060, 42509),
 (42509, 42914),
 (42914, 43073),
 (43073, 43174),
 (43174, 43775),
 (43775, 43811),
 (43811, 44058),
 (44058, 44319),
 (44319, 44366),
 (44366, 44601),
 (44601, 44820),
 (44820, 44984),
 (44984, 45470),
 (45470, 45580),
 (45580, 45667),
 (45667, 45890),
 (45890, 45975),
 (45975, 46073),
 (4

In [58]:
hierarchy_dict.keys()

dict_keys(['London', 'ABERGAVENNY . — ( Monmouthshire . )', 'ABERTILLERY . — ( Monmouthshire . )', 'ABINGDON . — ( Berkshire . )', 'ACCRINGTON . — ( Lancashire . ) Accrington', 'ACTON . — ( Middlesex . )', 'ADLINGTON . — ( Lancashire . ) Adlington isa', 'ALCESTER . — ( Waruwickshire . )', 'ALDERSHOT . — ( Hampshire . )', 'ALFORD . — ( Lincilnshire . )', '. — ( Derbyshire . )', 'ALNWICK . — ( Northumberland . )', 'ALTON . — ( Hants . )', 'ALTRINCHAM . — ( Cheshire . )', 'AMBLESIDE . — ( Westmorland . )', 'AMPTHILL . — ( Bedfordshire . )', 'ANDOVER . — ( Hampshire . )', 'ANERLEY . — ( Surrey . )', 'APPLEBY . — ( Westmorland . )', 'ARMLEY . — ( Yorkshire . )', 'ARUNDEL . — ( Sussex . )', 'ASHBOURNE . — ( Derbyshire . )', 'ASHBURTON . — ( Devonshire . )', '. ( See Burton Guardian . ) ASHFORD . — ( Kent . )', '. — ( Lancashire . )', 'ATHERSTONE . — ( Waearwickshire . )', 'AYLESBURY . — ( Buckinghamshire . )', 'BACUP . — ( Lancashire . )', 'BAKEWELL . — ( Derbyshire . )', '. ( See Buaton . )

In [57]:
hierarchy_dict['London']

[[679,
  'ARCHITECT AND CONTRACT REPORTER . THIN Friday , 4d . Established 1869 . This journal is fully established as the recognised repre sentative of architects , civil engineers , and builders . The best writers obtainable contribute articles on their special subjects . and the works of all the leading architects of Great Britai are illustrated in it from week to week . Published by P . A . Gilbert Wood , Imperial Buildings , Ludgatc Circus , E . C . ( Advt . , p . 266 . ) [ C . Mitchell & Co . , Advertising Contractors'],
 [780,
  'ARMY AND NAVY GAZETTE . Saturday , 6d . Established January 7 , 1860 . Under the editorship of Sir W , H . Russell , LL . D . This journal advocates the interests of all ranks , and the accuracy of its Service details is generally acknowledged . Published at 3 , York Street , Covent Garden , W . C .'],
 [844,
  "ATHEN UM . Saturday , 3d . Established Jan . , 1828 Principles : Those of sound intellectual culture , as thebest and most direct means of mora

# Debug

In [None]:
import numpy as np

In [None]:
level_1_tags = ["LOC","LOCDESCRIPTION"] # define hierarchy, the first tag should be the start element
level_2_tags = ["TITLE","NEWSPAPERDESCR"]
ignore_tags = ['O','HEADER']

In [None]:
# remove all tags listed under ignore_tags
df_content = df[~df.tag.isin(ignore_tags)]
# reset index
df_content.reset_index(drop=True, inplace=True)

# start or lower level entities
offs_level_2 = np.where(df_content.tag.isin(level_2_tags))[0]
# start of higher level entities
# iterator over the offsets in the lower level entities
# if the offset + one position is not at level two
# add it as an offset for level 1
# include the last element as final offset
offs_level_1 = [o for o in offs_level_2
                     if o+1 not in offs_level_2] + [df_content.shape[0]] # changed this to plus
 


In [None]:
offs_level_2

In [None]:
level_1_boundaries = [(offs_level_1[i],offs_level_1[i+1]) for i in range(len(offs_level_1)-1)]
level_1_dfs = [df_content.iloc[s:e] for s,e in level_1_boundaries]

#self.status['lemmas_raw'] = [] # make sure ingest is properly done
hierarchy_dict = defaultdict(list)
        
for level_1_df in level_1_dfs:
            
    level_1_name = ' '.join(level_1_df[level_1_df.tag==level_1_tags[0]].token)
            
    df_lemma = level_1_df[level_1_df.tag.isin(level_2_tags)]
    
    offs_head = list(np.where(df_lemma.tag==level_2_tags[0])[0]) #+ [df_lemma.shape[0]] 
            
    level_2_offs = [o for o in offs_head
                             if o-1 not in offs_head] + [df_lemma.shape[0]]
            
    level_2_boundaries = [(level_2_offs[i],level_2_offs[i+1]) for i in range(len(level_2_offs)-1)]
            
    level_2_text = [[df_lemma.iloc[s].token_id,' '.join(df_lemma.iloc[s:e].token)] 
                                        for s,e in level_2_boundaries]
    
    hierarchy_dict[level_1_name] = level_2_text
            
    #self.status['lemmas_raw'].extend(level_2_text)

In [None]:
for k,vs in hierarchy_dict.items():
    print(k)
    print("-"*25)
    print()
    for v in vs:
        #print(v)
        print("\t\t"+ v[1])
        print()