In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from nbdev import *

In [None]:
# TODO create inference dataframe for testing inference code

In [None]:
#default_exp inference

## Imports

In [None]:
#export
import concurrent.futures

import ijson
import pkg_resources
import pandas as pd
from cytoolz import itertoolz
from tqdm.notebook import tqdm

In [None]:
from nnanno.core import *
from nnanno.sample import *

In [None]:
#export
from typing import (
    Any,
    Optional,
    Union,
    Dict,
    List,
    Tuple,
    Set,
    Iterable,
)
from PIL import Image
import PIL

In [None]:
#export 
from fastai.vision.all import *

## Learner

In [None]:
# Build the dataloaders from the upsampled ads CSV; every image is squished to 64px.
dls = ImageDataLoaders.from_csv(
    '../ph/ads/',
    'ads_upsampled.csv',
    folder='images',
    fn_col='file',
    label_col='label',
    item_tfms=Resize(64, ResizeMethod.Squish),
)

In [None]:
dls.show_batch()

In [None]:
# Small resnet18 learner, tracked with F1, used below to exercise the inference code.
learn = cnn_learner(dls, resnet18, metrics=F1Score())
# A single `fine_tune(1)` call is enough here: the model only needs to produce predictions.
learn.fine_tune(1)

# nnPredict

## helpers

### Missing images
Because we are dealing with images requested via the web, we have to deal with the occasional hiccup. These hiccups include a requested image not being returned by an IIIF request, network issues, etc. The function we use to load images, `load_url_image`, is defined in `core`.

In [None]:
?load_url_image

`load_url_image` will sometimes return `None`. This causes a problem when running inference, because we want to create batches of images to speed up inference and we don't want to include `None`s in a batch of images to predict. To get around this we create a function which filters a batch of images and replaces each `None` with a placeholder image. The function also returns the indices of the items which were originally `None`, so that any predictions made for the placeholder images can later be replaced with `np.nan`.

In [None]:
# export
def _filter_replace_none_image(results:List[Optional[PIL.Image.Image]]):
    """Swap every `None` in `results` for a placeholder image.

    Returns a tuple `(images, none_image_index)`: the items with each `None` replaced by a
    250x250 RGB image filled with the value 244, and the indices at which a `None` was found.
    """
    placeholder = Image.fromarray(np.full((250, 250, 3), 244, dtype=np.uint8))
    images = L(results)
    # Positions of the images which failed to load
    missing = images.argwhere(lambda image: image is None)
    # Put the placeholder at each of those positions
    images[missing] = placeholder
    return images.items, missing

In [None]:
# hide
# TODO replace get_image_files with test images
im_files = get_image_files('../ph/ads/images')[:8]
# Surround eight real images with a `None` at either end
results = [None, *map(PILImage.create, im_files), None]
image_batch, none_image_index = _filter_replace_none_image(results)
assert len(image_batch) == len(results)
# the two `None`s sit at the start and the end of the list
assert none_image_index.items == [0, 9]

In [None]:
url = 'https://news-navigator.labs.loc.gov/data/dlc_fiji_ver01/data/sn83030214/00175040936/1900102801/0519/001_0_99.jpg'
im = load_url_image(url);im

In [None]:
# Push a small batch containing one missing image through the learner, then blank
# out the predictions which were made for the placeholder image.
im_files = get_image_files('../ph/ads/images')[:4]
images = [PILImage.create(fn) for fn in im_files]
results = [None] + images
images, index = _filter_replace_none_image(results)

image_batch = [np.array(image) for image in images]
test_data = learn.dls.test_dl(image_batch)
pred_tuple = learn.get_preds(dl=test_data, with_decoded=True)
pred_decoded = L(pred_tuple[2], use_list=True)
pred_tensor = L(pred_tuple[0], use_list=True)
pred_decoded[index] = np.nan
pred_tensor[index] = np.nan
pred_decoded.items, pred_tensor.items

In [None]:
# Compare the array shape of the placeholder with that of images fetched from the web.
url = 'https://news-navigator.labs.loc.gov/data/dlc_fiji_ver01/data/sn83030214/00175040936/1900102801/0519/001_0_99.jpg'
im = load_url_image(url)
results = [None, im, im, im]
images, index = _filter_replace_none_image(results)
image_batch = [np.array(image) for image in images]
[np.shape(arr) for arr in image_batch]

In [None]:
#export
def _create_pred_header(fname, dls=None):
    columns=[
            "filepath",
            "pub_date",
            "page_seq_num",
            "edition_seq_num",
            "batch",
            "lccn",
            "box",
            "score",
            "ocr",
            "place_of_publication",
            "geographic_coverage",
            "name",
            "publisher",
            "url",
            "page_url",
            "iiif_url",
            "pred_decoded"]
    if dls:
        columns = columns + (list(dls.vocab))
    return pd.DataFrame(columns=columns).to_csv(fname, index=None)

In [None]:
#hide
# Without `dls` the header holds only the fixed metadata columns plus `pred_decoded`.
_create_pred_header('test_header.csv')
df = pd.read_csv('test_header.csv')
assert list(df.columns) == [
    'filepath', 'pub_date', 'page_seq_num', 'edition_seq_num', 'batch',
    'lccn', 'box', 'score', 'ocr', 'place_of_publication',
    'geographic_coverage', 'name', 'publisher', 'url', 'page_url',
    'iiif_url', 'pred_decoded',
]
Path('test_header.csv').unlink()
# With `dls` there is one extra column for every class in the vocab.
_create_pred_header('test_header.csv', dls=dls)
df = pd.read_csv('test_header.csv')
assert len(df[dls.vocab].columns) == dls.c
Path('test_header.csv').unlink()

In [None]:
#export
def _create_year_csv(out_dir, year,kind,dls=None):
    """Create `<out_dir>/<year>_<kind>.csv` containing only the prediction header.

    out_dir: directory the CSV is written to (it must already exist).
    year, kind: used to build the file name.
    dls: forwarded to `_create_pred_header`, which appends one column per `dls.vocab` entry.
    Returns the `Path` of the file that was written.
    """
    # The cell is flagged for export because the exported `nnPredict.predict` calls this helper.
    fname = Path(out_dir) / f"{year}_{kind}.csv"
    _create_pred_header(fname, dls)
    return fname

In [None]:
def _create_year_json(out_dir, year,kind, batch):
    fname = Path(f"{out_dir}/{year}_{kind}_{batch}.json")
    return fname

In [None]:
#hide
# `exist_ok=True` lets the cell be re-run after an interrupted run left the directory behind.
Path('test_csv').mkdir(exist_ok=True)
_create_year_csv('test_csv',1850,'ads')
assert Path('test_csv/1850_ads.csv').exists()
# Clean up the scratch file and directory
Path('test_csv/1850_ads.csv').unlink()
Path('test_csv/').rmdir()

## Predict

In [None]:
# TODO how to save to json 

# TODO save to csv a bit more nicely 

In [None]:
# export
class nnPredict:
    """Batch inference with a fastai `Learner` over Newspaper Navigator images.

    Images are fetched from their IIIF urls with `load_url_image`. Images which fail to
    load are swapped for a placeholder so that a batch can still be formed, and the
    predictions made for those placeholders are overwritten with `np.nan`.
    """

    def __init__(self, learner, try_gpu=True):
        """Store `learner` and load the per-year item counts shipped with `nnanno`.

        learner: object exposing `model`, `dls`, `no_bar()` and `get_preds()`.
        try_gpu: when True the model is moved to CUDA if `torch.cuda.is_available()`.
        """
        self.learner = learner
        self.try_gpu = try_gpu
        self.population = pd.read_csv(pkg_resources.resource_stream('nnanno', 'data/all_year_counts.csv'),
                                      index_col=0)

    def _get_year_sample_size(self, kind,year):
        """Look up the `<kind>_count` value(s) of `self.population` for `year`."""
        return self.population[f"{kind}_count"][year]

    def _use_gpu(self):
        """Return True only when GPU use was requested and CUDA is actually available."""
        return bool(self.try_gpu and torch.cuda.is_available())

    @staticmethod
    def _needs_uncached_session(kind, year):
        """Return True for ads from 1870 onwards and for all headlines.

        Those requests use `create_session()`; everything else uses `create_cached_session()`.
        NOTE(review): presumably these files are too large to be worth caching - confirm.
        """
        return (kind == 'ads' and int(year) >= 1870) or kind == 'headlines'

    @staticmethod
    def _load_images(urls, workers=None):
        """Fetch all `urls` concurrently with `load_url_image`, keeping the input order."""
        urls = list(urls)
        if not urls:
            return []
        # One pool per batch: a pool per url would wait on each download in turn.
        with concurrent.futures.ThreadPoolExecutor(workers) as executor:
            return list(executor.map(load_url_image, urls))

    def _predict_batch(self, df, images, gpu):
        """Predict `images` and attach the results to `df`, one row per image.

        Adds a float `pred_decoded` column and one `<class>_prob` column per entry of
        `self.learner.dls.vocab`. Rows whose image was `None` get `np.nan` in all of them.
        Returns `df`.
        """
        dls = self.learner.dls
        image_list, none_index = _filter_replace_none_image(images)
        im_as_arrays = [np.array(image) for image in image_list]
        missing = list(none_index)
        if missing:
            tqdm.write(f"{none_index} skipped")
        test_data = dls.test_dl(im_as_arrays)
        if gpu:
            test_data.to('cuda')
        with self.learner.no_bar():
            pred_tuple = self.learner.get_preds(dl=test_data, with_decoded=True)
        # pred_tuple is (probabilities, targets, decoded predictions)
        probs = pred_tuple[0].cpu().numpy().astype(float)
        decoded = pred_tuple[2].cpu().numpy().astype(float)
        if missing:
            # predictions made for placeholder images are meaningless
            probs[missing] = np.nan
            decoded[missing] = np.nan
        df["pred_decoded"] = decoded
        # one probability column per class, in vocab order
        for i, c in enumerate(dls.vocab):
            df[f'{c}_prob'] = probs[:, i]
        return df

    def predict_from_sample_df(self, sample_df,bs=16):
        """Run inference over the rows of `sample_df` in batches of at most `bs`.

        An `iiif_url` column (250x250 request built by `iiif_df_apply`) is added to
        `sample_df` itself. Returns a list with one DataFrame per batch, each a copy of
        the batch rows extended with the prediction columns. An empty `sample_df`
        yields an empty list.

        Raises `ValueError` when `bs` is smaller than 1.
        """
        if bs < 1:
            raise ValueError(f'bs must be at least 1, got {bs}')
        self.sample_df = sample_df
        dfs = []
        n_rows = len(self.sample_df)
        if n_rows == 0:
            return dfs
        gpu = self._use_gpu()
        if gpu:
            self.learner.model = self.learner.model.cuda()
        self.sample_df['iiif_url'] = self.sample_df.apply(lambda x: iiif_df_apply(x,size=(250,250)),axis=1)
        for start in tqdm(range(0, n_rows, bs)):
            # copy so that adding columns does not write into a slice of `sample_df`
            df = self.sample_df.iloc[start:start + bs].copy()
            images = self._load_images(df['iiif_url'].to_list())
            dfs.append(self._predict_batch(df, images, gpu))
        return dfs

    def predict(
        self,
        kind: str,
        out_dir: str,
        bs: int = 32,
        sample_size: Optional[Union[int, float]] = None,
        start_year: int = 1850,
        end_year: int = 1950,
        step: int = 1,
        year_sample:bool=True,
    ):
        """Predict every `kind` item for each selected year, writing one CSV per year.

        kind: dataset kind, used for the count lookup, the JSON url and the file names.
        out_dir: output directory; created when missing, must be empty when it exists.
        bs: number of items downloaded and predicted per batch.
        sample_size, year_sample: currently unused (sampling is still a TODO).
        start_year, end_year, step: years to process, `end_year` inclusive.

        Raises `ValueError` when `bs` is smaller than 1 or `out_dir` already has content.
        """
        if bs < 1:
            raise ValueError(f'bs must be at least 1, got {bs}')
        out_path = Path(out_dir)
        if out_path.exists() and any(out_path.iterdir()):
            raise ValueError(f'{out_dir} already exists and is not empty')
        out_path.mkdir(exist_ok=True)
        gpu = self._use_gpu()
        if gpu:
            print('using gpu')
            self.learner.model = self.learner.model.cuda()
        dls = self.learner.dls
        years = range(start_year, end_year + 1, step)
        total = int(self._get_year_sample_size(kind, list(years)).sum())
        # the outer bar counts items, so it is advanced manually after every batch
        with tqdm(total=total) as pbar:
            for year in years:
                out_fn = _create_year_csv(out_dir,year,kind, dls)
                pbar.set_description(f"Predicting: {year}, total progress")
                if self._needs_uncached_session(kind, year):
                    s = create_session()
                else:
                    s = create_cached_session()
                with s.get(get_json_url(year, kind), timeout=60) as r:
                    # responses from a plain session have no `from_cache` attribute
                    if getattr(r, 'from_cache', False):
                        tqdm.write('using cache')
                    data = ijson.items(r.content, "item")
                    # TODO add sample approach
                    batches = itertoolz.partition_all(bs, data)
                    year_total = int(self._get_year_sample_size(kind,year))
                    n_batches = -(-year_total // bs) # ceiling division
                    for batch in tqdm(batches, total=n_batches, leave=False, desc='Batch Progress'):
                        df = pd.DataFrame(list(batch))
                        df["iiif_url"] = df.apply(lambda x: iiif_df_apply(x), axis=1)
                        images = self._load_images(df["iiif_url"].values, get_max_workers(df))
                        df = self._predict_batch(df, images, gpu)
                        # the header row was already written by `_create_year_csv`
                        df.to_csv(out_fn, header=False, index=False, mode="a")
                        pbar.update(len(df))

NameError: name 'Union' is not defined

In [None]:
sampler = nnSampler()

NameError: name 'nnSampler' is not defined

In [None]:
sample = sampler.create_sample(0.5, 'ads',1850,1870, step=1)

In [None]:
bytesto(sample.memory_usage(deep=True).sum(), 'g')

In [None]:
predictor = nnPredict(learn)

In [None]:
predictor.predict('ads','test',end_year=1850,step=1, bs=10)

In [None]:
from nbdev.export import notebook2script
notebook2script()

Converted 00_core.ipynb.
Converted 01_sample.ipynb.
Converted 02_annotate.ipynb.
Converted 03_inference.ipynb.
Converted index.ipynb.
