In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from annoy import AnnoyIndex
from tqdm.notebook import tqdm
from tools.record_tools import insert_annotations
from pigeon import annotate
import pickle


In [3]:
jisc_meta = pd.read_excel('./newspaper_metadata/JISC1+2CombinedTitleList_v1_15052020.xlsx', sheet_name='Titles')
bl_meta = pd.read_csv('./newspaper_metadata/BL newspaper holdings British and Irish titles v2a.csv')


In [4]:
columns = ['Title ID','First geographical subject heading']
meta_merged = jisc_meta.merge(bl_meta[columns],left_on='System ID',right_on='Title ID',how= 'left')
meta_merged['First geographical subject heading'].fillna('',inplace=True)
meta_merged.shape

(143, 14)

In [5]:
iteration = 0
years = [1846,1847,1851,1856,1858,
        1860,1863,1865,1868,
        1870,1873,1875,1878,
        1880,1883,1885,1888,
        1890,1893,1895,1898,
        1900,1905,1910,1915,1920]


# Begining of annotation: yearly loop

In [582]:
target_year = years[iteration]
print(target_year)

1900


In [583]:
if iteration:
    npd_data = pd.read_csv('./output_data/MPD_export_1846_1920_JISC.csv',index_col=0,dtype={'JISC':str})
else:
    npd_data = pd.read_csv('./output_data/MPD_export_1846_1920.csv',index_col=0)
    npd_data.to_csv('./output_data/MPD_export_1846_1920_JISC.csv')
npd_data['S-TITLE'].fillna('',inplace=True)
npd_data['DISTRICT'].fillna('',inplace=True)
npd_data.shape

(43361, 20)

In [584]:

meta_merged_sel = meta_merged[(meta_merged.Start_year <= target_year) & (meta_merged.End_year >=  target_year)]

npd_data_red = npd_data[npd_data.year==target_year]
npd_data_red.reset_index(inplace=True)


In [585]:
if iteration:
    observed_newspaper_ids = [int(eval(w)) for w in npd_data_red['JISC'].unique() if isinstance(w,str)]
    #print(observed_newspaper_ids)
    print(f"Alread linked {meta_merged_sel[meta_merged_sel['System ID'].isin(observed_newspaper_ids)].shape[0]}")
    meta_merged_sel = meta_merged_sel[~meta_merged_sel['System ID'].isin(observed_newspaper_ids)]
    meta_merged_sel.reset_index(inplace=True)
    print(f'Annotating {meta_merged_sel.shape[0]} more entries')


Alread linked 38
Annotating 11 more entries


In [586]:
meta_merged_sel['Newspaper Title']

0            Weekly Standard and Express, The
1                                  Daily News
2                      Courier and Argus, The
3     Hampshire Telegraph and Naval Chronicle
4                             Ipswich Journal
5               Isle of Man Weekly Times, The
6                    Jackson's Oxford Journal
7                           Morning Post, The
8                               Standard, The
9                      Star, The   (Guernsey)
10                          Wrexam Advertiser
Name: Newspaper Title, dtype: object

In [587]:
vectorizer = TfidfVectorizer(min_df=10,max_df=.9,analyzer='char', ngram_range=(1,4))
X = vectorizer.fit_transform(npd_data_red['S-TITLE'])

index = AnnoyIndex(X.shape[1], metric='euclidean')

for i,v in tqdm(enumerate(X)):
    index.add_item(i, v.todense().T)
print('Building index...')
index.build(50)
print('Done.')

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Building index...
Done.


# Annotation: title loop

In [593]:

row_idx = 0
all_annotations = []
stop_at = meta_merged_sel.shape[0]

In [616]:

if row_idx >= stop_at:
    out_path = f'./link_dump/jisc_links/jisc_links_{target_year}.pickle'
    with open(out_path,'wb') as out_pickle:
        pickle.dump(all_annotations,out_pickle)
    df = insert_annotations(Path('./output_data/MPD_export_1846_1920_JISC.csv'),
                        Path(f'./link_dump/jisc_links/jisc_links_{target_year}.pickle'),col='JISC')
    iteration+=1
    print('Done.')
    print(f'Saving to {out_path}')
    print('Annotate next year')
else:
    print(f"at {row_idx+1} of {stop_at} newspapers")

    row = meta_merged_sel.iloc[row_idx]
    title = row["Newspaper Title"]
    norm_title = row["Normalised Title"]
    
    vector = vectorizer.transform([title+' '+norm_title]).todense().reshape(-1).T
    candidates = index.get_nns_by_vector(vector,5)
    annotations = annotate(
      [(c,row['System ID'],npd_data_red.iloc[c]['id'],npd_data_red.iloc[c]['chain_id']) for c in candidates],
        options=['same', 'different'],
        display_fn=lambda c :  print(row["Normalised Title"].upper(),'||',row['Newspaper Title'].upper(), "||" ,
                                        row['First geographical subject heading'],
                                        '\n--->\n', 
                                        npd_data_red.iloc[c[0]]['S-TITLE'], " || ",
                                        npd_data_red.iloc[c[0]]['DISTRICT'])
        )

Done.
Saving to ./link_dump/jisc_links/jisc_links_1900.pickle
Annotate next year


In [615]:
row_idx+=1
all_annotations.extend(annotations)
len(all_annotations)

55

# Check manually later


## Provincial

Freeman's Journal and Daily Commercial Advertiser: 1847 

BANER AC AMSERAU CYMRU: 1858, 1860, 1883,

WREXHAM ADVERTISER: 1858, 1860, 1863, 1865? 1900

IPSWICH JOURNAL: 1890, 1893?, 1895, 1898, 1900

HULL PACKET: 1883

NORTHERN STAR: 1846, 1847, 1851

GOLEUAD: 1883

YORK HERALD: 1888

## Other

GRAPHIC: 1873

THE ERA: 1846, 1851, 1856, 1858, 1860, 1865, 1880

ILLUSTRATED POLICE NEWS: 1868

MORNING CHRONICLE: 1863, 1865

MORNING POST: 1870, 1890, 1900

PALL MALL GAZETTE: 1865, 1890

ISLE OF MAN TIMES: 1870, 1873, 1875, 1878, 1880, 1885

GUERNSEY STAR: 1870, 1873, 1875, 1878, 1880, 1885, 1890, 1895, 1900

DAILY NEWS (London): 1890

PROTESTAN STANDARD: 1890






In [None]:
### Manually added
## Provincial

Freeman's Journal and Daily Commercial Advertiser: 1847 

BANER AC AMSERAU CYMRU: 1858, 1860, 1883,

WREXHAM ADVERTISER: 1858, 1860, 1863, 1865? 1900

IPSWICH JOURNAL: 1890, 1893?, 1895, 1898, 1900

HULL PACKET: ?1883

NORTHERN STAR: 1846, 1847, 1851

GOLEUAD: 1883

YORK HERALD: 1888

## Missing links

In [5]:
list(Path(f'./link_dump/jisc_links/').glob("*.pickle"))

[PosixPath('link_dump/jisc_links/jisc_links_1865.pickle'),
 PosixPath('link_dump/jisc_links/jisc_links_1860.pickle'),
 PosixPath('link_dump/jisc_links/jisc_links_1900.pickle'),
 PosixPath('link_dump/jisc_links/jisc_links_1856.pickle'),
 PosixPath('link_dump/jisc_links/jisc_links_1895.pickle'),
 PosixPath('link_dump/jisc_links/jisc_links_1847.pickle'),
 PosixPath('link_dump/jisc_links/jisc_links_1885.pickle'),
 PosixPath('link_dump/jisc_links/jisc_links_1870.pickle'),
 PosixPath('link_dump/jisc_links/jisc_links_1851.pickle'),
 PosixPath('link_dump/jisc_links/jisc_links_1880.pickle'),
 PosixPath('link_dump/jisc_links/jisc_links_1875.pickle'),
 PosixPath('link_dump/jisc_links/jisc_links_1890.pickle')]