In [1]:
import pandas as pd
import requests
import numpy as np
import json
from io import BytesIO
from PIL import Image

import os
from pathlib import Path

In [2]:
# Endpoint of the inference service (loopback address, port 5000);
# get_prediction() posts images to it.
APP_URL = "http://127.0.0.1:5000/inference"
# Path of the CSV holding the image / mask URLs.
# NOTE(review): relative path - presumably resolved against the notebook's
# working directory; confirm when running from elsewhere.
DATA_PATH = "../data/data.csv"

In [3]:
def get_prediction(img: np.ndarray, timeout=None) -> np.ndarray:
    """Send an image to the inference service and return its prediction.

    Parameters
    ----------
    img : np.ndarray
        Image array. It is serialised with ``img.tolist()`` and posted as the
        ``"image"`` field of a JSON body to ``APP_URL``.
    timeout : float or tuple, optional
        Forwarded unchanged to ``requests.post``. The default ``None`` keeps
        the previous behaviour of waiting indefinitely for the service.

    Returns
    -------
    np.ndarray
        The ``"prediction"`` field of the JSON response wrapped in an array.

    Raises
    ------
    requests.HTTPError
        If the service answers with a 4xx / 5xx status code.
    """
    resp = requests.post(
        APP_URL,
        json={"image": img.tolist()},
        timeout=timeout,
    )
    # Fail loudly on an error status instead of surfacing a confusing
    # JSON-decode error or KeyError on the lines below.
    resp.raise_for_status()
    return np.array(resp.json()["prediction"])

In [4]:
def store_local(img, column, basename, base_dir='C:/Users/NPDan/Documents/GitHub/task_challenge/data',
                image_stem='images', mask_stem='masks', ftype='png'):
    """Save an image below ``base_dir`` and return the path it was written to.

    Parameters
    ----------
    img
        Object exposing ``save(path)``; the notebook passes a PIL image.
    column : str
        Name of the source column (e.g. ``'image_url'`` or ``'mask_url'``).
        It selects the destination sub-directory.
    basename : str
        File name without extension.
    base_dir : str or Path
        Root directory that holds the image and mask sub-directories.
    image_stem, mask_stem : str
        Sub-directory names for images and masks respectively.
    ftype : str
        File extension (without the dot).

    Returns
    -------
    str
        Full path of the stored file.
    """
    # image_stem minus its last character ('images' -> 'image') is matched
    # against the column name; anything that does not match is a mask.
    sub_dir = image_stem if image_stem[:-1] in column else mask_stem
    dst_dir = Path(base_dir) / sub_dir
    dst_path = dst_dir / '.'.join((basename, ftype))

    # Create missing parents too and tolerate an already existing directory;
    # a bare mkdir() raised when base_dir itself did not exist yet.
    dst_dir.mkdir(parents=True, exist_ok=True)
    img.save(str(dst_path))
    return str(dst_path)

In [5]:
# Set to False to skip the downloads and reuse the previously written json.
download = True
urls = pd.read_csv(DATA_PATH)

# set up new df and cols
# NOTE(review): machine-specific absolute path; it is kept because the stored
# json and the cells below compare against paths built from it.
base_dir = Path('C:/Users/NPDan/Documents/GitHub/task_challenge/data')
json_path = base_dir / 'data_local.json'
local_cols = ['local_image','local_mask']
# source URL column -> column that receives the path of the local copy
url_to_local = dict(zip(["image_url", "mask_url"], local_cols))

if download:
    # copy df, add the (still empty) local path columns
    urls_local = urls.assign(**{local_col: None for local_col in local_cols})

    # loop over every row and fetch both the image and its mask
    for ix, row in urls.iterrows():
        basename = f'img_{ix:03}'
        for url_col, local_col in url_to_local.items():
            # get response, load data
            resp = requests.get(row[url_col])
            # stop on a failed download rather than handing an error page to PIL
            resp.raise_for_status()
            img = Image.open(BytesIO(resp.content))

            # write local image, retrieve path
            dst_path = store_local(img, url_col, basename, base_dir=base_dir)

            # store path in updated df (label based, independent of column order)
            urls_local.loc[ix, local_col] = dst_path

    # save locals to json
    urls_local.to_json(json_path)
else:
    urls_local = pd.read_json(json_path)

In [6]:
# Preview the first rows, including the local_image / local_mask columns.
urls_local.head()

Unnamed: 0,mask_url,image_url,local_image,local_mask
0,https://d1h90vpqo1860x.cloudfront.net/ab53069c...,https://d1h90vpqo1860x.cloudfront.net/3e18e6da...,C:\Users\NPDan\Documents\GitHub\task_challenge...,C:\Users\NPDan\Documents\GitHub\task_challenge...
1,https://d1h90vpqo1860x.cloudfront.net/8728de7a...,https://d1h90vpqo1860x.cloudfront.net/5daefb3e...,C:\Users\NPDan\Documents\GitHub\task_challenge...,C:\Users\NPDan\Documents\GitHub\task_challenge...
2,https://d1h90vpqo1860x.cloudfront.net/baa75595...,https://d1h90vpqo1860x.cloudfront.net/74217c21...,C:\Users\NPDan\Documents\GitHub\task_challenge...,C:\Users\NPDan\Documents\GitHub\task_challenge...
3,https://d1h90vpqo1860x.cloudfront.net/2e1508f0...,https://d1h90vpqo1860x.cloudfront.net/eeb2ed67...,C:\Users\NPDan\Documents\GitHub\task_challenge...,C:\Users\NPDan\Documents\GitHub\task_challenge...
4,https://d1h90vpqo1860x.cloudfront.net/2b42cbd4...,https://d1h90vpqo1860x.cloudfront.net/ba7e2239...,C:\Users\NPDan\Documents\GitHub\task_challenge...,C:\Users\NPDan\Documents\GitHub\task_challenge...


In [7]:
# just to be sure, let's verify all files are there and accessible
for ix, row in urls_local.iterrows():
    for col in local_cols:
        fpath = Path(row[col])
        if not fpath.is_file():
            # report "<parent directory> - <file name>" of the missing file
            print('File not found: ' + ' - '.join(fpath.parts[-2:]))

#### Updates and data pruning
Data access looks good. Having looked at the images and masks, I noted a few observations:
1. Data to prune - There were 2 obvious instances where the segmented building was neither the central building nor entirely in frame. That seemed like a poor sample, so I tracked those two *image* names and we can purge them.
2. Contrast and brightness can vary significantly even within the same image due to shadows from low sun angles and adjacent trees/structures. This seems like a good case for contrast / brightness / noise augmentation.
3. There's a variety of orientations within the dataset - some houses are oblong rectangles and others are very square, oriented at various angles (not just horizontal or vertical). This is a good argument for using some image rotation augmentation.
4. Crop / Resize - There's not much variance in the scale of the images, and many objects are close to the border of the image. I'm not sure that cropping / resizing is necessary; perhaps we can explore two different augmentation strategies and compare, but we might be fine with skipping this.

So, next steps...
Let's copy and update the DF to exclude these two bad image examples and store that result as the **clean** dataset. This is what we'll use to generate training, testing and validation sets.

In [8]:
# File names of the two image samples flagged above for removal.
bad_img_names = ['img_025.png', 'img_144.png']

In [9]:
image_dir = 'images'
# Full local paths of the flagged images, built the same way as the values
# stored in the local_image column.
bad_img_paths = [str((base_dir / image_dir) / bad_img) for bad_img in bad_img_names]

# Keep every row whose image is not flagged. Filtering directly avoids
# dropna(0, 'any', ...): positional arguments to dropna are rejected by
# pandas >= 2.0, and dropping on "any" NA could also remove unrelated rows.
clean_urls_local = urls_local.loc[~urls_local['local_image'].isin(bad_img_paths)].copy()
len(clean_urls_local), len(urls_local)

(238, 240)

In [10]:
# The pruned frame checks out - persist it next to the raw json.
json_path = base_dir.joinpath('data_local_clean.json')
clean_urls_local.to_json(path_or_buf=json_path)

Great, data exploration and pruning are done. Next, let's build a dataloader pipeline in another notebook.

In [16]:
# pep8 reminder
def foo(val: int = 1, bar: bool = None) -> None:
    """Print 'foo' repeated ``val`` times, then 'bar' when ``bar`` is truthy."""
    lines = ['foo' * val]
    if bar:
        lines.append('bar')
    for line in lines:
        print(line)

In [21]:
# Call with val=2 and bar=True.
foo(2, True)

foofoo
bar
