In [None]:
#hide
from nbdev import *


Bad key "text.kerning_factor" on line 4 in
/anaconda/envs/nnanno/lib/python3.8/site-packages/matplotlib/mpl-data/stylelib/_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.1.3/matplotlibrc.template
or from the matplotlib source distribution


In [None]:
#default_exp sample

# Sample

> Create samples from Newspaper navigator dataset

\ # TODO Intro to module 

In [None]:
#hide
%load_ext autoreload
%autoreload 2

In [None]:
#export
from nnanno.core import *

In [None]:
#export
#TODO tidy imports
# sys
import io
import shutil
import pkg_resources
from pathlib import Path
from datetime import datetime

# other
from tqdm.auto import trange, tqdm
import requests
import ijson
import functools
import math
from cytoolz import dicttoolz, itertoolz
import random
import json
from PIL import Image

import concurrent.futures
import numpy as np
import itertools
from pandas import json_normalize
import pandas as pd
from functools import partial
import numpy as np
from fastcore.foundation import patch_to

In [None]:
#export

import PIL
from typing import (
    Any,
    Optional,
    Union,
    Dict,
    List,
    Tuple,
    Set,
    Iterable,
)

## Newspaper Navigator JSON files

In [None]:
#export
def get_json_url(year: Union[str,int], kind:str='photos') -> str:
    """Build the news-navigator url of the prepackaged json file for `year` and `kind`."""
    # File names follow the pattern `<year>_<kind>.json` under the prepackaged folder.
    filename = '{}_{}.json'.format(year, kind)
    return 'https://news-navigator.labs.loc.gov/prepackaged/' + filename

In [None]:
assert get_json_url(1860) == 'https://news-navigator.labs.loc.gov/prepackaged/1860_photos.json' 
assert get_json_url(1950) == 'https://news-navigator.labs.loc.gov/prepackaged/1950_photos.json' 
assert get_json_url(1950,'ads') == 'https://news-navigator.labs.loc.gov/prepackaged/1950_ads.json' 

In [None]:
#export
def load_json(url, timeout: float = 2) -> Union[List[Any], Dict[str, Any]]:
    """Returns json loaded from `url`

    The whole response body is downloaded and decoded in one go, so this is only
    suitable for files that comfortably fit in memory.

    Parameters
    ----------
    url
        address of the json document to fetch
    timeout
        seconds passed to `requests.get` as its `timeout`; defaults to the
        previously hard-coded value of 2

    Returns
    -------
    The decoded json document: a list or a dict depending on the document's
    top-level value (the prepackaged news-navigator files decode to a list of dicts).

    Raises
    ------
    requests.exceptions.HTTPError
        if the server answers with an error status (via `raise_for_status`)
    """
    with requests.get(url, timeout=timeout) as r:
        r.raise_for_status()
        return json.loads(r.content)

We can also test that this returns what we think inside the notebook. These tests are often hidden in the documentation but inside the notebook there will often be a cell below a function which includes some tests for the function which has just been defined. 

In [None]:
test_json = load_json('https://news-navigator.labs.loc.gov/prepackaged/1950_photos.json')
assert type(test_json[0]) == dict
assert type(test_json) == list

### Working with big JSON

This works well for a smallish file, but if we try this with the [1910_ads.json](https://news-navigator.labs.loc.gov/prepackaged/1910_ads.json) file, which is ~3.3GB, we will likely run out of memory. For example, running

```python
with requests.get('https://news-navigator.labs.loc.gov/prepackaged/1910_ads.json') as r:
    data = json.loads(r.content)
len(data)
```

on a Google Colab instance with 25GB of RAM causes a crash. 

## Streaming JSON
One way to get around this would be to throw more RAM at the problem. However, since we only want to sample the JSON and don't need to work with the whole dataset, this seems a bit wasteful. Instead we'll use `ijson`, a Python library for streaming JSON.

We can see how this works for a url from Newspaper Navigator. First we create a request via Requests using `stream=True`, which returns a streaming version of the response.

In [None]:
r = requests.get(get_json_url(1850, 'ads'), stream=True)

We can pass this response to `ijson`. In this case we just parse an item at a time. If the JSON is really big this might already be too much. `ijson` allows for much more granular parsing of JSON but for what we need, parsing by item is fine. We can see what the return of this looks like

In [None]:
objects = ijson.items(r.raw, "item")
objects

<_yajl2.items at 0x7fa844c2b8a0>

We get back something from `_yajl2`; this is the underlying parser `ijson` is using. See the `ijson` docs for more on available parsers.

We can call next on this object to start iterating over it, one item at a time. If we look at the keys of the first response you'll see that this is one entry from the original JSON data. 

In [None]:
first = next(objects)
first.keys()

dict_keys(['filepath', 'pub_date', 'page_seq_num', 'edition_seq_num', 'batch', 'lccn', 'box', 'score', 'ocr', 'place_of_publication', 'geographic_coverage', 'name', 'publisher', 'url', 'page_url'])

In [None]:
r.close()

### Counting the size of the data 


- what 
- how
- lru_cache

In [None]:
#export
@functools.lru_cache(256)
def count_json_iter(url: str, session=None) -> int:
    """Returns count of objects in the top-level array of the json file at `url`

    The items are parsed one at a time with `ijson` rather than decoding the whole
    document into Python objects at once.
    NOTE(review): `r.content` is the complete response body, so the raw bytes are
    still held in memory; only the parsing is incremental.

    Results are memoised by `lru_cache` on the `(url, session)` arguments.

    Parameters
    ----------
    url : str
        address of a json file whose top-level value is an array
    session
        object with a requests-style `get` method; when falsy a session from
        `create_cached_session()` is used

    Returns
    -------
    int
        number of items found

    Raises
    ------
    requests.exceptions.HTTPError
        if the response has an error status (via `raise_for_status`)
    """
    if not session:
        session = create_cached_session()
    with session.get(url, timeout=60) as r:
        # raise_for_status() raises for every 4xx/5xx answer, so a response that
        # gets past it is always truthy; the former `else: np.nan` branch could
        # never run and has been removed.
        r.raise_for_status()
        objects = ijson.items(r.content, "item")
        return itertoolz.count(iter(objects))

`count_json_iter` counts the length of a json file loaded via `url`. 

In [None]:
count_json_iter('https://news-navigator.labs.loc.gov/prepackaged/1850_photos.json')

22

In [None]:
url = 'https://news-navigator.labs.loc.gov/prepackaged/1850_photos.json'
assert type(count_json_iter(url)) == int
assert len(json.loads(requests.get(url).content)) == count_json_iter(url)

In [None]:
#export
@functools.lru_cache(256)
def get_year_size(year: Union[int,str], kind: str) -> dict:
    """Count the entries in the json dataset for one `year` and `kind`.

    Results are memoised by `lru_cache` on `(year, kind)`.

    Parameters
    ----------
    year : Union[int,str]
        year from newspaper navigator
    kind : str
        {'ads', 'photos', 'maps', 'illustrations', 'comics', 'cartoons', 'headlines'}

    Returns
    -------
    size : dict
        single-entry dict mapping `str(year)` to the count from `count_json_iter`
    """
    url = get_json_url(year,kind)
    # 'headlines' and 'ads' from 1870 onwards get a session from create_session();
    # for everything else None is passed and count_json_iter picks its own session.
    wants_plain_session = (kind == 'ads' and int(year) >= 1870) or kind == 'headlines'
    session = create_session() if wants_plain_session else None
    return {str(year): count_json_iter(url, session)}

In [None]:
get_year_size(1850, 'photos')

{'1850': 22}

In [None]:
#export
@functools.lru_cache(512)
def get_year_sizes(kind,start=1850, end=1950, step=5):
    """
    Count the entries of the `kind` json files for every `step`-th year from
    `start` to `end` (inclusive). The per-year counts are gathered with
    `get_year_size` on a thread pool while a progress bar is displayed.
    Results are memoised by `lru_cache` on the call arguments.

    Parameters:
    kind (str): kind of image from news-navigator:
    {'ads', 'photos', 'maps', 'illustrations', 'comics', 'cartoons', 'headlines'}
    start (int): first year to count
    end (int): last year to count (inclusive)
    step (int): gap between counted years

    Returns:
    Pandas.DataFrame: indexed by year (as a string) with one column named
    `{kind}_count`
    """
    years = range(start,end+1,step)
    pool_size = get_max_workers(years)
    jobs = []
    with tqdm(total=len(years)) as progress:

        def _advance(_finished):
            # tick the bar once for every completed year
            progress.update()

        with concurrent.futures.ThreadPoolExecutor(max_workers=pool_size) as executor:
            for year in years:
                job = executor.submit(get_year_size, year, kind)
                job.add_done_callback(_advance)
                jobs.append(job)
        # the executor has shut down here, so every job is finished
        year_dicts = [job.result() for job in jobs]
        sizes = {}
        for year_dict in year_dicts:
            sizes.update(year_dict)
    return pd.DataFrame.from_dict(
        sizes, orient='index', columns=[f'{kind}_count']
    )

Returns the year sizes for a given kind, taking a step size `step`. For example, to get the number of photos in the news-navigator dataset for every year between 1850 and 1855:

In [None]:
%%time
get_year_sizes('photos',1850, 1855, step=1)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=6.0), HTML(value='')))


CPU times: user 46.7 ms, sys: 20 ms, total: 66.7 ms
Wall time: 439 ms


Unnamed: 0,photos_count
1850,22
1851,20
1852,22
1853,45
1854,221
1855,17


In [None]:
assert len(get_year_sizes('photos',1850, 1860, step=1)) == 11
assert len(get_year_sizes('photos',1850,1860, step=2)) == 6

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=11.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=6.0), HTML(value='')))




In [None]:
#export
def get_all_year_sizes(start=1850, end=1950,step=1, save:bool=True):
    """
    Build one table of per-year counts covering every kind of Newspaper Navigator data.

    `get_year_sizes` is called for each kind with the given `start`, `end` and `step`;
    the resulting frames are joined column-wise and a `total` column holding the row
    sums is appended. When `save` is true the table is also written to
    `all_year_sizes.csv` in the current working directory.
    """
    kinds = ['ads', 'photos', 'maps', 'illustrations',
                    'comics', 'cartoons', 'headlines']
    per_kind = [
        get_year_sizes(kind, start=start, end=end, step=step)
        for kind in tqdm(kinds)
    ]
    counts = pd.concat(per_kind, axis=1)
    counts['total'] = counts.sum(axis=1)
    if save:
        counts.to_csv('all_year_sizes.csv')
    return counts

# Creating Samples

### Streaming sampling

Since we want a subset of the Newspaper Navigator datasets which we can either work with for [annotation](!TODO add link) or for inference, we want to create samples. Sampling in Python can be complicated depending on the type of population you are working with and the properties your sample needs to have, but usually we can do something fairly simple. For example, if we want to sample from a selection of books we could do:

In [None]:
import random
books = ['War and Peace', 'Frankenstein', 'If They Come in the Morning']
random.sample(books, 1)

['War and Peace']

However, we run into the same problem as above, when we tried to get the length of a JSON dataset which wouldn't fit into memory: we may want to sample $k$ examples from one of our JSON files which we can't load into memory. To get around this we can use [reservoir sampling](https://en.wikipedia.org/wiki/Reservoir_sampling):

> Reservoir sampling is a family of randomized algorithms for choosing a simple random sample without replacement of k items from a population of unknown size n in a single pass over the items. The size of the population n is not known to the algorithm and is typically too large to fit all n items into main memory. The population is revealed to the algorithm over time, and the algorithm cannot look back at previous items. 



In [None]:
#export
def sample_stream(stream, k:int):
    """
    Return a random sample of k elements drawn without replacement from stream.
    Designed to be used when the elements of stream cannot easily fit into memory.

    Parameters
    ----------
    stream
        any iterable (an iterator, generator, list, range, ...)
    k : int
        number of elements to keep

    Returns
    -------
    numpy.ndarray
        the sampled elements; if `stream` yields fewer than `k` elements all of
        them are returned
    """
    # Work on one iterator so the replacement loop continues after the first k
    # items. Without this a re-iterable input such as a list or `range` would be
    # walked again from the start and its first k items considered twice.
    items = iter(stream)
    # Keep the reservoir as a plain list: a NumPy array built from the first k
    # strings has a fixed character width and silently truncates longer strings
    # assigned into it later.
    reservoir = list(itertools.islice(items, k))
    for t, x in enumerate(items, k + 1):
        # i is uniform on 1..t, so item t replaces a slot with probability k/t
        i = np.random.randint(1, t + 1)
        if i <= k:
            reservoir[i - 1] = x
    return np.array(reservoir)

Now we can sample whilst only loading a small number of items into memory at one time. This does come at some cost, mainly speed. There are faster ways of sampling from a stream, but this isn't the main bottleneck for sampling in this case.

In [None]:
sample_stream(range(1,100000), 5)

array([23996, 47272, 51356, 96868, 50827])

In [None]:
names = ['Karl Marx', 'Rosa Luxenburg', 'Raya Dunayevskaya', 'CLR James']
sample_stream(iter(names), 2)

array(['Raya Dunayevsk', 'CLR James'], dtype='<U14')

In [None]:
#hide
assert len(sample_stream(range(1,100),5)) == 5

In [None]:
#export
@functools.lru_cache(1024)
def calc_frac_size(url,frac, session=None):
    "number of items that make up fraction `frac` of the json stream at `url`, rounded to an int"
    total_items = count_json_iter(url,session)
    return round(total_items * frac)

In [None]:
#hide
url = get_json_url(1850)
assert calc_frac_size(url, 0.5)== 11 #22*0.5

In [None]:
#export
def calc_year_from_total(total,start,end,step):
    """Calculate size of a year sample based on a total sample size

    `total` is spread evenly over the years `start, start+step, ...` up to and
    including `end`, and the per-year share is rounded to a whole number. The
    result is never smaller than 1 so every year contributes something.
    """
    span = (end - start) + 1
    # Ceiling division gives the number of years actually visited by
    # range(start, end + 1, step); dividing by span/step instead under-counts the
    # years whenever step does not divide the span and so over-sizes each sample.
    n_years = max(1, -(-span // step))
    return max(1, round(total / n_years))

In [None]:
# 10 samples spread over the 101 years 1850-1950 still gives one sample per year
calc_year_from_total(10, 1850, 1950, 1)

1

In [None]:
#hide 
assert calc_year_from_total(10, 1850,1950,1) >=1 # test that a value is always returned 

In [None]:
#export
def reduce_df_memory(df):
    """Return `df` with its Newspaper Navigator columns cast to fixed dtypes.

    `score` becomes float64, `page_seq_num` int32, `box` object and the remaining
    listed metadata columns become pandas categoricals. Every listed column has to
    be present in `df`.
    """
    categorical_columns = (
        "batch",
        "lccn",
        "page_url",
        "name",
        "publisher",
        "place_of_publication",
        "edition_seq_num",
    )
    dtypes = {"score": "float64", "page_seq_num": "int32", "box": "object"}
    dtypes.update(dict.fromkeys(categorical_columns, "category"))
    return df.astype(dtypes)

In [None]:
#export
class nnSampler:
    """
    Creates samples from the Newspaper Navigator data.

    `population` is a class-level DataFrame read from the `data/all_year_counts.csv`
    resource of the `nnanno` package when the class is defined.
    """
    population = pd.read_csv(
        pkg_resources.resource_stream('nnanno', 'data/all_year_counts.csv'),
        index_col=0,
    )

    def __repr__(self):
        # the repr is just the class name
        return type(self).__name__

In [None]:
#export
def sample_year(kind:str, sample_size:Union[int,float], year:int) -> Union[np.ndarray, float]:
    """Sample entries from the json file for one `year` and `kind`

    Parameters
    ----------
    kind : str
        kind of image from news-navigator, used to build the json url
    sample_size : Union[int,float]
        an int is used as the number of entries to sample; a float is treated as a
        fraction of the year's entries (via `calc_frac_size`) and always yields at
        least one entry
    year : int
        year from newspaper navigator

    Returns
    -------
    numpy.ndarray of sampled entries as produced by `sample_stream`, or `np.nan`
    when the response is not OK or a `RequestException` is raised while sampling
    """
    url = get_json_url(year, kind)

    def _new_session():
        # 'headlines' and 'ads' from 1870 onwards use create_session(),
        # everything else create_cached_session()
        if (kind == 'ads' and int(year) >= 1870) or kind == 'headlines':
            return create_session()
        return create_cached_session()

    session = _new_session()
    if isinstance(sample_size, float):
        sample_size = max(1, calc_frac_size(url, sample_size, session))
        # as before, the download itself is made with a fresh session
        session = _new_session()
    # Default result so a non-OK response returns NaN instead of failing with an
    # UnboundLocalError on the return statement.
    sample_data = np.nan
    with session.get(url) as r:
        if r:
            try:
                data = ijson.items(r.content, "item")
                sample_data = sample_stream(iter(data), sample_size)
            except requests.exceptions.RequestException:
                sample_data = np.nan
    return sample_data

In [None]:
sample_year('photos', 1, 1850)
assert len(sample_year('maps', 0.1, 1850)) == 1 # test we always have a sample size of at least one 

In [None]:
#df = sample_year('ads',0.1,1920)

In [None]:
#export
@patch_to(nnSampler)
def create_sample(self,
                  sample_size: Union[int, float],
                  kind: str = "photos",
                  start_year: int = 1850,
                  end_year: int = 1950,
                  step: int = 5,
                  year_sample=True,
                  save: bool = False,
                  reduce_memory=True):
    """
    Creates a sample of Newspaper Navigator data for a given set of years and a kind

    Parameters:
    sample_size: int, float
        with `year_sample=True` this is the size of each year's sample: either a
        fixed number or a fraction of that year's data. With `year_sample=False`
        it is the total sample size (int only), which is split across the years
        with `calc_year_from_total`
    kind (str): kind of image from news-navigator:
    {'ads', 'photos', 'maps', 'illustrations', 'comics', 'cartoons', 'headlines'}
    start_year (int): first year to sample
    end_year (int): last year to sample (inclusive)
    step (int): gap between sampled years
    year_sample (bool): whether `sample_size` applies per year (True) or in total (False)
    save (bool): if True the sample is also written to
        `{kind}_{start_year}_{end_year}_sample.json` in the working directory
    reduce_memory (bool): if True the sample's dtypes are converted with `reduce_df_memory`

    Returns:
    Pandas.DataFrame: one row per sampled entry; also stored on `self.sample`

    Raises:
    ValueError: if `year_sample` is False and `sample_size` is not an int
    """
    if not year_sample:
        if type(sample_size) is not int:
            raise ValueError(
                f"sample_size must be an int when year_sample=False, got "
                f"{type(sample_size).__name__} ({sample_size!r}). Fractions are only "
                f"supported for sampling by year"
            )
        sample_size = calc_year_from_total(sample_size, start_year, end_year, step)
    years = range(start_year, end_year + 1, step)
    _year_sample = partial(sample_year, kind, sample_size)
    futures = []
    with tqdm(total=len(years)) as progress:
        # NOTE(review): the pool is fixed at a single worker, so years are fetched
        # one at a time; the previously computed (and unused) worker count is gone.
        with concurrent.futures.ThreadPoolExecutor(1) as executor:
            for year in years:
                future = executor.submit(_year_sample, year)
                future.add_done_callback(lambda p: progress.update())
                futures.append(future)
    results = [future.result() for future in futures]
    # sample_year signals a failed year with np.nan (a float); skip those so that
    # only real samples are concatenated
    samples = [result for result in results if not isinstance(result, float)]
    df = pd.DataFrame(list(itertoolz.concat(samples)))

    # reduce_df_memory needs the sample columns to exist, which is not the case
    # for an empty frame
    if reduce_memory and not df.empty:
        df = reduce_df_memory(df)
    if save:
        df.to_json(f"{kind}_{start_year}_{end_year}_sample.json")
    self.sample = df
    return df

In [None]:
#export
@patch_to(nnSampler)
def download_sample(self,
                    out_dir: str,
                    json_name: Optional[str]=None,
                    df: Optional[pd.DataFrame] = None,
                    original: bool = True,
                    pct: Optional[int] = None,
                    size: Optional[tuple] = None,
                    preserve_asp_ratio: bool = True) -> None:
    """Download images associated with a sample
    The majority of parameters relate to the options available in a IIIF image request
    see `https://iiif.io/api/image/3.0/#4-image-requests` for further information

    A deep copy of the sample is stored on `self.download_df` with two extra
    columns, `iiif_url` and `download_image_path`, and is written to
    `{out_dir}/{json_name}.json` once the downloads have been submitted.

    Parameters
    ----------
    out_dir
        The save directory for the images; created if it does not exist
    json_name
        name (without the `.json` extension) of the json file written to `out_dir`;
        defaults to `{timestamp}_{number of rows}_sample`
    df
        optional DataFrame containing a sample; when omitted `self.sample`
        (set by `create_sample`) is used
    original
        if `True` will download original size images via IIIF
    pct
        optional value which scales the size of images requested by `pct`
    size
        a tuple representing `width` by `height`, will be passed to IIIF request
    preserve_asp_ratio
        whether to ask the IIIF request to preserve aspect ratio of image or not

    Returns
    -------
    None

    Raises
    ------
    AttributeError
        if no `df` is passed and no sample has been created yet
    """
    if df is not None:
        self.download_df = df.copy(deep=True)
    else:
        try:
            self.download_df = self.sample.copy(deep=True)
        except AttributeError as E:
            # Fail here with a clear message rather than printing and then either
            # crashing further down or reusing a stale `download_df`.
            raise AttributeError(
                "You need to create a sample before downloading, or pass in a "
                "previously created sample as `df`"
            ) from E
    self.download_df["iiif_url"] = self.download_df.apply(
        lambda x: iiif_df_apply(
            x,
            original=original,
            pct=pct,
            size=size,
            preserve_asp_ratio=preserve_asp_ratio,
        ),
        axis=1,
    )
    # flatten the nested file path into a single file name ('/' is a literal here)
    self.download_df["download_image_path"] = self.download_df['filepath'].str.replace(
        '/', '_', regex=False
    )

    Path(out_dir).mkdir(parents=True, exist_ok=True)

    def _download_image(row):
        return download_image(row.iiif_url, row.download_image_path, out_dir)

    with tqdm(total=len(self.download_df)) as progress:
        workers = get_max_workers(self.download_df)
        with concurrent.futures.ThreadPoolExecutor(workers) as executor:
            for tuple_row in self.download_df.itertuples():
                # NOTE(review): results are never inspected, so an exception raised
                # inside a download is not reported to the caller.
                future = executor.submit(_download_image, tuple_row)
                future.add_done_callback(lambda p: progress.update())
    if json_name is None:
        today = datetime.today()
        # NOTE(review): the stamp is year_day_month_hour_minute; confirm the
        # day-before-month order is intended before changing it.
        time_stamp = today.strftime("%Y_%d_%m_%H_%M")
        json_name = f"{time_stamp}_{len(self.download_df)}_sample"
    self.download_df.to_json(f'{out_dir}/{json_name}.json')

In [None]:
sampler = nnSampler()

In [None]:
sampler

nnSampler

In [None]:
sampler.population

Unnamed: 0,ads_count,photos_count,maps_count,illustrations_count,comics_count,cartoons_count,headlines_count,total
1850,8841,22,5,671,9,0,11243,20791
1851,10065,20,6,457,7,0,12262,22817
1852,8764,22,10,671,10,8,13524,23009
1853,11517,45,5,1106,88,1,13224,25986
1854,15050,221,15,732,11,3,15282,31314
...,...,...,...,...,...,...,...,...
1946,185139,5945,1857,1053,3280,861,68275,266410
1947,181223,4188,1750,1115,3630,797,57018,249721
1948,152987,4282,1359,1154,3031,624,43432,206869
1949,154510,6015,1888,1280,3356,634,42904,210587


In [None]:
df = sampler.create_sample(sample_size=10, kind='photos', start_year=1850,end_year=1855,reduce_memory=True)
df.head(5)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2.0), HTML(value='')))




Unnamed: 0,filepath,pub_date,page_seq_num,edition_seq_num,batch,lccn,box,score,ocr,place_of_publication,geographic_coverage,name,publisher,url,page_url
0,ohi_ingstad_ver01/data/sn85026051/00296027029/...,1850-07-27,37,1,ohi_ingstad_ver01,sn85026051,"[0.29913574490319106, 0.622813938380955, 0.430...",0.980025,"[ht, I, ', Wll., ., III, tl, T, ., ""', ""', "", ...","Fremont, Sandusky County, Ohio",[Ohio--Sandusky--Fremont],Fremont weekly freeman. [volume],J.S. Fouke,https://news-navigator.labs.loc.gov/data/ohi_i...,https://chroniclingamerica.loc.gov/data/batche...
1,ohi_ingstad_ver01/data/sn85026051/00296027029/...,1850-07-20,33,1,ohi_ingstad_ver01,sn85026051,"[0.3009427797781111, 0.6294158908847332, 0.433...",0.929614,"[L, -, COME, IN,, WE, CALL, YOU, !, .v';:]","Fremont, Sandusky County, Ohio",[Ohio--Sandusky--Fremont],Fremont weekly freeman. [volume],J.S. Fouke,https://news-navigator.labs.loc.gov/data/ohi_i...,https://chroniclingamerica.loc.gov/data/batche...
2,ncu_hawk_ver02/data/sn84026472/00416156360/185...,1850-05-22,289,1,ncu_hawk_ver02,sn84026472,"[0.6732673909317263, 0.042179068056539225, 0.8...",0.914908,[],"Hillsborough, N.C.",[North Carolina--Orange--Hillsboro],The Hillsborough recorder. [volume],Dennis Heartt,https://news-navigator.labs.loc.gov/data/ncu_h...,https://chroniclingamerica.loc.gov/data/batche...
3,ohi_ingstad_ver01/data/sn85026051/00296027029/...,1850-12-07,115,1,ohi_ingstad_ver01,sn85026051,"[0.30707743987524494, 0.6473851770787806, 0.44...",0.984739,"[COME, IN,, WE, CALL, YOU!]","Fremont, Sandusky County, Ohio",[Ohio--Sandusky--Fremont],Fremont weekly freeman. [volume],J.S. Fouke,https://news-navigator.labs.loc.gov/data/ohi_i...,https://chroniclingamerica.loc.gov/data/batche...
4,ohi_ingstad_ver01/data/sn85026051/00296027029/...,1850-08-17,49,1,ohi_ingstad_ver01,sn85026051,"[0.2943367379610656, 0.6305186744386874, 0.426...",0.956234,"[COME, IN,, WE, CALL, YOU, !, o]","Fremont, Sandusky County, Ohio",[Ohio--Sandusky--Fremont],Fremont weekly freeman. [volume],J.S. Fouke,https://news-navigator.labs.loc.gov/data/ohi_i...,https://chroniclingamerica.loc.gov/data/batche...


### Downloading a sample 

In [None]:
sampler.create_sample(sample_size=10, kind='ads', start_year=1850,end_year=1850,reduce_memory=True)
sampler.download_sample('test')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




In [None]:
#hide
files = [f for f in Path('test/').iterdir()]; json_file = list(Path('test').glob('*.json')); df = pd.read_json(json_file[0])
assert len(df) == 10
# check iiif urls from df return at least some images 
iiif_url_load_results = map(load_url_image, df['iiif_url'])
assert any(type(result) == PIL.Image.Image for result in iiif_url_load_results) == True

In [None]:
#hide
#tidyup
files = [f for f in Path('test/').iterdir()]; list(map(Path.unlink, files))
Path('test').rmdir()

In [None]:
from nbdev.export import notebook2script
notebook2script()

Converted 00_core.ipynb.
Converted 01_sample.ipynb.
Converted 02_annotate.ipynb.
Converted 03_inference.ipynb.
Converted index.ipynb.
