In [None]:
from nbdev import *

In [None]:
#default_exp sample

# Sample

> Create samples from Newspaper navigator dataset

\ # TODO Intro to module 

In [None]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
#export
from nnanno.core import *

In [None]:
#export
# TODO tidy imports

# sys 
import io
import shutil
import pkg_resources
from pathlib import Path
from datetime import datetime

# other
from tqdm.auto import trange, tqdm
import requests
import ijson
import functools
import math
from cytoolz import dicttoolz, itertoolz
import random
import json
from PIL import Image
import PIL
import concurrent.futures
import numpy as np
import itertools
from pandas import json_normalize
import pandas as pd
from functools import partial
import numpy as np

# typing
from typing import (
    Any,
    Optional,
    Union,
    Dict,
    List,
    Tuple,
    Set,
    Iterable,
)

In [None]:
#export
def get_json_url(year: Union[str,int], kind:str='photos') -> str:
    '''Build the news-navigator prepackaged json url for a given `year` and `kind`'''
    url_template = 'https://news-navigator.labs.loc.gov/prepackaged/{}_{}.json'
    return url_template.format(year, kind)

In [None]:
assert get_json_url(1860) == 'https://news-navigator.labs.loc.gov/prepackaged/1860_photos.json' 
assert get_json_url(1950) == 'https://news-navigator.labs.loc.gov/prepackaged/1950_photos.json' 
assert get_json_url(1950,'ads') == 'https://news-navigator.labs.loc.gov/prepackaged/1950_ads.json' 

In [None]:
#export
def load_json(url) -> Dict[str, Any]:
    """Fetch `url` and return its body parsed as json

    Parameters:
    url (str): URL for news-navigator json file

    Returns:
    the parsed json document found at `url`

    Raises:
    whatever `raise_for_status()` raises for an unsuccessful response
    """
    response = requests.get(url, timeout=2)
    with response:
        # Fail loudly on an error status instead of trying to parse an error page.
        response.raise_for_status()
        payload = response.content
        return json.loads(payload)

In [None]:
test_json = load_json('https://news-navigator.labs.loc.gov/prepackaged/1950_photos.json')
assert type(test_json[0]) == dict
assert type(test_json) == list

This works well for a smallish file, but if we try this with the [1910_ads.json](https://news-navigator.labs.loc.gov/prepackaged/1910_ads.json) file, which is ~3.3GB, we will likely run out of memory. For example, running

```python
with requests.get('https://news-navigator.labs.loc.gov/prepackaged/1910_ads.json') as r:
    data = json.loads(r.content)
len(data)
```

on a Google Colab instance with 25GB of RAM causes a crash. 

In [None]:
#export
@functools.lru_cache(256)
def count_json_iter(url: str, session=None) -> int:
    """
    Returns count of objects in url json file, counting them one at a time from an
    `ijson` iterator instead of building the full list of parsed objects.

    Results are memoised by `functools.lru_cache`, keyed on the call arguments.

    Parameters:
    url (str): URL for news-navigator json file
    session: object with a requests-style `get(url, timeout=...)` method; when
    falsy, a session from `create_cached_session()` is used

    Returns:
    int: count of json objects in url

    Raises:
    whatever `raise_for_status()` raises for an unsuccessful response
    """
    if not session:
        session = create_cached_session()
    with session.get(url, timeout=60) as r:
        # An unsuccessful response raises here, so everything below only runs
        # for a successful one and no fallback count is needed.
        r.raise_for_status()
        # NOTE(review): `r.content` is the complete response body, so the raw
        # bytes are still held in memory; only the parsed objects are streamed.
        objects = ijson.items(r.content, "item")
        return itertoolz.count(iter(objects))

In [None]:
count_json_iter('https://news-navigator.labs.loc.gov/prepackaged/1850_photos.json')

22

In [None]:
url = 'https://news-navigator.labs.loc.gov/prepackaged/1850_photos.json'
assert type(count_json_iter(url)) == int
assert len(json.loads(requests.get(url).content)) == count_json_iter(url)

In [None]:
#export
@functools.lru_cache(256)
def get_year_size(year,kind):
    """
    Returns a one-item dict mapping `str(year)` to the number of `kind` objects
    in that year's json file.

    Parameters:
    year (str or int): year of the json file to count
    kind (str): kind of image from news-navigator, e.g. photos, ads, headlines

    Returns:
    dict: `{str(year): count}` where count comes from `count_json_iter`
    """
    session = None
    url = get_json_url(year,kind)
    # `kind == ('ads' or 'headlines')` only ever compared against 'ads', because
    # `'ads' or 'headlines'` evaluates to 'ads'; test membership instead.
    if kind in ('ads', 'headlines') and int(year) >= 1870:
        # NOTE(review): presumably these files are too large to be worth caching,
        # hence the plain session — confirm against `create_session` in core.
        session = create_session()
    return {str(year): count_json_iter(url, session)}

In [None]:
get_year_size(1850, 'photos')

{'1850': 22}

In [None]:
#export
@functools.lru_cache(512)
def get_year_sizes(kind,start=1850, end=1950, step=5):
    """
    Count the json objects of `kind` for every `step`-th year from `start` to
    `end` (inclusive), fetching the per-year counts in a thread pool.

    Parameters:
    kind (str): kind of image from news-navigator options:
    photos, illustrations, maps, comics, cartoons, headlines, ads
    start (int): first year to count
    end (int): last year to count (included when the step lands on it)
    step (int): gap between counted years

    Returns:
    Pandas.DataFrame: indexed by year (as a string) with a single
    `{kind}_count` column
    """
    years = range(start, end + 1, step)
    n_workers = get_max_workers(years)
    jobs = []
    with tqdm(total=len(years)) as progress:

        def _tick(_finished):
            # advance the bar once for every completed year
            progress.update()

        with concurrent.futures.ThreadPoolExecutor(max_workers=n_workers) as executor:
            for year in years:
                job = executor.submit(get_year_size, year, kind)
                job.add_done_callback(_tick)
                jobs.append(job)
        # Leaving the executor block waits for the jobs, so the results are ready.
        dset_size = {}
        for job in jobs:
            dset_size.update(job.result())
    count_column = f'{kind}_count'
    return pd.DataFrame.from_dict(dset_size, orient='index', columns=[count_column])


Returns the year sizes for a given kind, taking a step size `step`. For example, to get the number of photos in the news-navigator dataset for every year between 1850 and 1855:

In [None]:
%%time
get_year_sizes('photos',1850, 1855, step=1)

100%|██████████| 6/6 [00:00<00:00, 146.76it/s]CPU times: user 26.7 ms, sys: 21.3 ms, total: 48 ms
Wall time: 44.1 ms



Unnamed: 0,photos_count
1850,22
1851,20
1852,22
1853,45
1854,221
1855,17


In [None]:
assert len(get_year_sizes('photos',1850, 1860, step=1)) == 11
assert len(get_year_sizes('photos',1850,1860, step=2)) == 6

100%|██████████| 11/11 [00:00<00:00, 249.65it/s]
100%|██████████| 6/6 [00:00<00:00, 6180.21it/s]


In [None]:
#export
def get_all_year_sizes(start=1850, end=1950,step=1, save:bool=True):
    """
    Returns a dataframe with one count column per kind for the years `start` to
    `end`, plus a `total` column holding the row-wise sum.

    When `save` is true the dataframe is also written to `all_year_sizes.csv`
    in the current working directory.
    """
    kinds = ['ads', 'photos', 'maps', 'illustrations', 'comics', 'cartoons','headlines']
    per_kind = [
        get_year_sizes(kind, start=start, end=end, step=step) for kind in tqdm(kinds)
    ]
    # one column per kind, aligned on the shared year index
    combined = pd.concat(per_kind, axis=1)
    combined['total'] = combined.sum(axis=1)
    if save:
        combined.to_csv('all_year_sizes.csv')
    return combined

## Sampling 

### Streaming sampling

Since we want a subset of the Newspaper Navigator datasets which we can either work with for [annotation](!TODO add link) or for inference, we want to create samples. Sampling in Python can be complicated depending on the type of population you are working with and the properties your sample needs to have, but usually we can do something fairly simple. For example, if we want to sample from a selection of books we could do:

In [None]:
import random
books = ['War and Peace', 'Frankenstein', 'If They Come in the Morning']
random.sample(books, 1)

['War and Peace']

However, we run into the same problem we had above when trying to get the length of a json dataset which wouldn't fit into memory: we may want to sample $k$ examples from one of our json files which we can't load into memory. To get around this we can use [reservoir sampling](https://en.wikipedia.org/wiki/Reservoir_sampling):

> Reservoir sampling is a family of randomized algorithms for choosing a simple random sample without replacement of k items from a population of unknown size n in a single pass over the items. The size of the population n is not known to the algorithm and is typically too large to fit all n items into main memory. The population is revealed to the algorithm over time, and the algorithm cannot look back at previous items. 



In [None]:
#export
def sample_stream(stream, k:int):
    """
    Return a random sample of k elements drawn without replacement from stream.
    Designed to be used when the elements of stream cannot easily fit into memory.

    Parameters:
    stream: any iterable or iterator; it is consumed in a single pass
    k (int): number of elements to sample

    Returns:
    numpy.array: the sampled elements; holds fewer than `k` elements when the
    stream itself yields fewer than `k`
    """
    # Work on ONE iterator. Passing a re-iterable (list, range, ...) straight to
    # both `islice` and `enumerate` would restart it from the first element and
    # feed the first k items through the reservoir a second time.
    stream = iter(stream)
    # Keep the reservoir as a plain list: writing into a typed numpy array would
    # coerce later items to the dtype of the first k (e.g. truncate longer strings).
    reservoir = list(itertools.islice(stream, k))
    for t, x in enumerate(stream, k + 1):
        # `x` is the t-th element seen; it replaces a reservoir slot with probability k/t
        i = np.random.randint(1, t + 1)

        if i <= k:
            reservoir[i - 1] = x
    return np.array(reservoir)

Now we can sample whilst only loading a small number of items into memory at one time. This does come at some cost, mainly speed. There are faster ways of sampling from a stream, but this isn't the main bottleneck for sampling in this case.

In [None]:
sample_stream(range(1,100000), 5)

array([ 6692, 27245, 30762, 81424, 96297])

In [None]:
names = ['Karl Marx', 'Rosa Luxenburg', 'Raya Dunayevskaya', 'CLR James']
sample_stream(iter(names), 2)

array(['Karl Marx', 'CLR James'], dtype='<U14')

In [None]:
#hide
assert len(sample_stream(range(1,100),5)) == 5

In [None]:
#export
@functools.lru_cache(128)
def calc_frac_size(url,frac):
    "number of items that make up fraction `frac` of the json stream at `url`, rounded"
    n_items = count_json_iter(url)
    return round(n_items * frac)

In [None]:
#hide
url = get_json_url(1850)
assert calc_frac_size(url, 0.5)== 11 #22*0.5

In [None]:
#export
def calc_year_from_total(total,start,end,step):
    "Per-year sample size for a `total` sample spread from `start` to `end`; never below 1"
    span = (end - start) + 1
    n_sampled_years = span / step
    per_year = round(total / n_sampled_years)
    # always sample at least one item per year
    return max(1, per_year)

In [None]:
calc_year_from_total(10, 1850, 1950, 1)

1

In [None]:
#hide 
assert calc_year_from_total(10, 1850,1950,1) >=1 # test that a value is always returned 

In [None]:
#export
def reduce_df_memory(df):
    "Return a copy of `df` with the sample columns cast to smaller or categorical dtypes"
    column_dtypes = dict(
        [
            ("score", "float64"),
            ("page_seq_num", "int32"),
            ("batch", "category"),
            ("box", "object"),
            ("lccn", "category"),
            ("page_url", "category"),
            ("name", "category"),
            ("publisher", "category"),
            ("place_of_publication", "category"),
            ("edition_seq_num", "category"),
        ]
    )
    return df.astype(column_dtypes)

In [None]:
# export
def sample_year(kind:str,sample_size:Union[int,float], year:int) ->np.array:
    """
    Returns a random sample of `kind` objects from the json file for `year`.

    Parameters:
    kind (str): kind of image from news-navigator, e.g. photos, ads, headlines
    sample_size (int or float): number of items to sample; a float is treated as
    a fraction of the year's items and converted with `calc_frac_size`
    year (int): year of the json file to sample from

    Returns:
    numpy.array: the sampled items from `sample_stream`, or an empty object array
    (after a warning) when the request fails
    """
    import warnings  # local import: only needed on the failure path

    url = get_json_url(year, kind)
    if isinstance(sample_size, float):
        sample_size = calc_frac_size(url, sample_size)
    # `kind == ('ads' or 'headlines')` only ever compared against 'ads', because
    # `'ads' or 'headlines'` evaluates to 'ads'; test membership instead.
    if kind in ('ads', 'headlines') and int(year) >= 1870:
        session = create_session()
    else:
        session = create_cached_session()
    try:
        # The request itself sits inside the `try` so that connection errors and
        # error statuses are actually handled by the `except` below.
        with session.get(url) as r:
            r.raise_for_status()
            data = ijson.items(r.content, "item")
            return sample_stream(iter(data), sample_size)
    except requests.exceptions.RequestException as e:
        warnings.warn(f"Could not sample {url}: {e}")
        # An empty array (not NaN) stays iterable, so callers that concatenate
        # per-year samples simply get no rows for this year.
        return np.array([], dtype=object)

In [None]:
sample_year('photos', 1, 1850)

array([{'filepath': 'ohi_ingstad_ver01/data/sn85026051/00296027029/1850090701/0061/002_0_90.jpg', 'pub_date': '1850-09-07', 'page_seq_num': 61, 'edition_seq_num': 1, 'batch': 'ohi_ingstad_ver01', 'lccn': 'sn85026051', 'box': [Decimal('0.5714432636546668'), Decimal('0.6112432110852859'), Decimal('0.7022799185194107'), Decimal('0.7229990545061264')], 'score': Decimal('0.9038882255554199'), 'ocr': [',', 'COME', 'IN,', 'WE', 'CALL', 'YOU', '!'], 'place_of_publication': 'Fremont, Sandusky County, Ohio', 'geographic_coverage': ['Ohio--Sandusky--Fremont'], 'name': 'Fremont weekly freeman. [volume]', 'publisher': 'J.S. Fouke', 'url': 'https://news-navigator.labs.loc.gov/data/ohi_ingstad_ver01/data/sn85026051/00296027029/1850090701/0061/002_0_90.jpg', 'page_url': 'https://chroniclingamerica.loc.gov/data/batches/ohi_ingstad_ver01/data/sn85026051/00296027029/1850090701/0061.jp2'}],
      dtype=object)

In [None]:
#export
class nnSampler:
    """
    Creates samples from the Newspaper Navigator dataset and downloads their images.

    Attributes:
    population: dataframe read from the packaged `data/all_year_counts.csv`
    sample: dataframe from the most recent `create_sample` call
    download_df: copy of the dataframe used by the most recent `download_sample`
    call, with `iiif_url` and `download_image_path` columns added
    """

    def __init__(self):
        self.population = pd.read_csv(pkg_resources.resource_stream('nnanno', 'data/all_year_counts.csv'), 
                                      index_col=0)
    
    def __repr__(self):
        return (f'{self.__class__.__name__}')
        

    def create_sample(
        self,
        sample_size: Union[int, float],
        kind: str = "photos",
        start_year: int = 1850,
        end_year: int = 1950,
        step: int = 5,
        year_sample=True,
        save: bool = False,
        reduce_memory=True,
    ):
        """
        Sample `kind` items for every `step`-th year from `start_year` to `end_year`.

        Parameters:
        sample_size (int or float): items per year when `year_sample` is true
        (a float is passed on to `sample_year` as a fraction); otherwise the
        total sample size, which must be an int
        kind (str): kind of image from news-navigator
        start_year (int), end_year (int), step (int): years to sample from
        year_sample (bool): whether `sample_size` applies per year or in total
        save (bool): also write the sample to `{kind}_{start_year}_{end_year}_sample.json`
        reduce_memory (bool): pass the result through `reduce_df_memory`

        Returns:
        Pandas.DataFrame: the sample, also stored on `self.sample`

        Raises:
        ValueError: if `year_sample` is false and `sample_size` is not an int
        """
        if not year_sample:
            if type(sample_size) is not int:
                raise ValueError(
                    f"sample_size {sample_size!r} is not an int. Fractions are only supported for sampling by year"
                )
            # turn the requested total into a per-year sample size
            sample_size = calc_year_from_total(sample_size, start_year, end_year, step)
        futures = []
        years = range(start_year, end_year + 1, step)
        _year_sample = partial(sample_year, kind, sample_size)
        with tqdm(total=len(years)) as progress:
            with concurrent.futures.ThreadPoolExecutor(2) as executor:
                for year in years:
                    future = executor.submit(_year_sample, year)
                    future.add_done_callback(lambda p: progress.update())
                    futures.append(future)
        results = [future.result() for future in futures]
        # flatten the per-year samples into one list of records
        df = pd.DataFrame.from_dict(list(itertoolz.concat(results)))

        if reduce_memory:
            df = reduce_df_memory(df)
        if save:
            df.to_json(f"{kind}_{start_year}_{end_year}_sample.json")
        self.sample = df
        return df

    def download_sample(
        self,
        out_dir,
        csv_name=None,
        df=None,
        original: bool = True,
        pct: Optional[int] = None,
        size: Optional[tuple] = None,
        preserve_asp_ratio: bool = True,
    ):
        """
        Download the images for a sample into `out_dir` and save the sample
        metadata next to them as `{csv_name}.json`.

        Parameters:
        out_dir: directory for the images and the json file; created if missing
        csv_name: stem of the json file; defaults to a timestamp plus sample length
        df: dataframe to download; defaults to the sample from `create_sample`
        original, pct, size, preserve_asp_ratio: passed through to `iif_df_apply`
        when building each row's `iiif_url`

        Raises:
        AttributeError: if `df` is not given and no sample has been created yet
        """
        if df is not None:
            self.download_df = df.copy(deep=True)
        else:
            try:
                self.download_df = self.sample.copy(deep=True)
            except AttributeError as E:
                # Stop here: carrying on would either fail on a missing
                # `download_df` or silently re-download a stale one.
                raise AttributeError(
                    "You need to create a sample before downloading, or pass in a previously created sample as `df`"
                ) from E
        self.download_df["iiif_url"] = self.download_df.apply(
            lambda x: iif_df_apply(
                x,
                original=original,
                pct=pct,
                size=size,
                preserve_asp_ratio=preserve_asp_ratio,
            ),
            axis=1,
        )
        # flatten the nested file path into a single file name
        self.download_df["download_image_path"] = self.download_df['filepath'].str.replace('/','_')
        
        # `exist_ok` avoids the check-then-create race of testing `exists()` first
        Path(out_dir).mkdir(parents=True, exist_ok=True)
        _download_image = lambda x: download_image(
            x.iiif_url, x.download_image_path, out_dir)
        with tqdm(total=len(self.download_df)) as progress:
            with concurrent.futures.ThreadPoolExecutor() as executor:
                for tuple_row in self.download_df.itertuples():
                    # The future's result is never read, so an exception raised
                    # inside a single download is not re-raised here.
                    future = executor.submit(_download_image, tuple_row)
                    future.add_done_callback(lambda p: progress.update())
        if csv_name is None:
            today = datetime.today()
            # note the field order: year, day, month, hour, minute
            time_stamp = today.strftime("%Y_%d_%m_%H_%M")
            csv_name = f"{time_stamp}_{len(self.download_df)}_sample"
        self.download_df.to_json(f'{out_dir}/{csv_name}.json') # TODO make sure to use json for saving outputs of samples

In [None]:
sampler = nnSampler()

In [None]:
sampler

nnSampler

In [None]:
sampler.population

Unnamed: 0,ads_count,photos_count,maps_count,illustrations_count,comics_count,cartoons_count,headlines_count,total
1850,8841,22,5,671,9,0,11243,20791
1851,10065,20,6,457,7,0,12262,22817
1852,8764,22,10,671,10,8,13524,23009
1853,11517,45,5,1106,88,1,13224,25986
1854,15050,221,15,732,11,3,15282,31314
...,...,...,...,...,...,...,...,...
1946,185139,5945,1857,1053,3280,861,68275,266410
1947,181223,4188,1750,1115,3630,797,57018,249721
1948,152987,4282,1359,1154,3031,624,43432,206869
1949,154510,6015,1888,1280,3356,634,42904,210587


In [None]:
df = sampler.create_sample(sample_size=10, kind='photos', start_year=1850,end_year=1855,reduce_memory=True)
df.head(5)

100%|██████████| 2/2 [00:00<00:00, 135.51it/s]


Unnamed: 0,filepath,pub_date,page_seq_num,edition_seq_num,batch,lccn,box,score,ocr,place_of_publication,geographic_coverage,name,publisher,url,page_url
0,msar_icydrop_ver05/data/sn83016872/00295878502...,1850-12-31,986,1,msar_icydrop_ver05,sn83016872,"[0.1624672249571918, 0.026467160391487225, 0.9...",0.963962,"[k'.'-vk'', t', :j, 1, !'V1, kkk, .0, :I, .Ii:...","Canton, Miss.",[Mississippi--Madison--Canton],The Mississippi Creole. [volume],M.N. Prewett,https://news-navigator.labs.loc.gov/data/msar_...,https://chroniclingamerica.loc.gov/data/batche...
1,ohi_ingstad_ver01/data/sn85026051/00296027029/...,1850-07-20,33,1,ohi_ingstad_ver01,sn85026051,"[0.3009427797781111, 0.6294158908847332, 0.433...",0.929614,"[L, -, COME, IN,, WE, CALL, YOU, !, .v';:]","Fremont, Sandusky County, Ohio",[Ohio--Sandusky--Fremont],Fremont weekly freeman. [volume],J.S. Fouke,https://news-navigator.labs.loc.gov/data/ohi_i...,https://chroniclingamerica.loc.gov/data/batche...
2,ohi_ingstad_ver01/data/sn85026051/00296027029/...,1850-08-24,53,1,ohi_ingstad_ver01,sn85026051,"[0.4334465613731971, 0.6177851558250381, 0.565...",0.979533,"[COME, IN,, WE, CALL, YOU!]","Fremont, Sandusky County, Ohio",[Ohio--Sandusky--Fremont],Fremont weekly freeman. [volume],J.S. Fouke,https://news-navigator.labs.loc.gov/data/ohi_i...,https://chroniclingamerica.loc.gov/data/batche...
3,ncu_hawk_ver02/data/sn84026472/00416156360/185...,1850-05-22,289,1,ncu_hawk_ver02,sn84026472,"[0.6732673909317263, 0.042179068056539225, 0.8...",0.914908,[],"Hillsborough, N.C.",[North Carolina--Orange--Hillsboro],The Hillsborough recorder. [volume],Dennis Heartt,https://news-navigator.labs.loc.gov/data/ncu_h...,https://chroniclingamerica.loc.gov/data/batche...
4,vtu_londonderry_ver01/data/sn84023252/00200296...,1850-03-09,247,1,vtu_londonderry_ver01,sn84023252,"[0.5134230714407026, 0.1490311215403323, 0.669...",0.922406,"[OU1, Dr., Jftcob, Townaond.]","St. Johnsbury, Vt.",[Vermont--Caledonia--Saint Johnsbury],The Caledonian. [volume],A.G. Chadwick,https://news-navigator.labs.loc.gov/data/vtu_l...,https://chroniclingamerica.loc.gov/data/batche...


### Downloading a sample 

In [None]:
sampler.create_sample(sample_size=10, kind='photos', start_year=1850,end_year=1860,reduce_memory=True)
sampler.download_sample('test_iif',pct=10)

100%|██████████| 3/3 [00:00<00:00, 160.74it/s]
100%|██████████| 30/30 [00:02<00:00, 10.51it/s]


In [None]:
#hide
sampler = nnSampler()
sampler.create_sample(1, step=50)
sampler.download_sample('test_iif',pct=5)
files = [f for f in Path('test_iif').iterdir() if f.suffix == '.jpg']
files
assert type(Image.open(files[0])) == PIL.JpegImagePlugin.JpegImageFile
shutil.rmtree('test_iif')

100%|██████████| 3/3 [00:00<00:00,  6.82it/s]
100%|██████████| 3/3 [00:01<00:00,  2.20it/s]


In [None]:
from nbdev.export import notebook2script
notebook2script()

Converted 00_core.ipynb.
Converted 01_sample.ipynb.
Converted 02_annotate.ipynb.
Converted index.ipynb.
