# Counting Boats from Space (Part 1): Annotate Chips with Boat Counts and Superintendent

In [None]:
%reload_ext autoreload
%autoreload 2
%load_ext dotenv
%dotenv
%matplotlib inline

In [None]:
import os 
import sys
import glob
import json
from pathlib import Path
import pathlib

sys.path.insert(0,os.path.dirname('../src/'))
from preprocess import request_save_cubes
from annotation_utils import display_image_and_references

import pandas as pd
import superintendent

### 1.0 Define some variables

In [None]:
# Location of the AOI definition file and of the directory holding image chips.
aoi_json_filepath = "../data/aoi.json"
data_chips_dir = "/home/jovyan/data/chips/"

# Start/end date strings for the 2019 and 2020 request windows.
start_date_2019, end_date_2019 = "2019-01-01", "2019-06-30"
start_date_2020, end_date_2020 = "2020-01-01", "2020-06-30"

### 1.1 Request and download images for any new locations in aoi.json

In [None]:
# Read the AOI file, then for every location under its 'Misc' key request
# images for each year that has no matching PNG in the location's chip directory.
with open(aoi_json_filepath, "r") as f:
    aoi_file = json.load(f)

straits_dict = aoi_file['Misc']
for aoi_name, aoi_locs in straits_dict.items():
    for loc in aoi_locs:
        lat, lon = loc[0], loc[1]
        # Directory name encodes the coordinates with '.' replaced by '_'.
        subdir = 'lat_{}_lon_{}'.format(str(lat).replace('.','_'), str(lon).replace('.','_'))
        chip_dir = Path(data_chips_dir, subdir)

        # 2019: download unless the directory exists and already holds a 2019 PNG.
        if not (chip_dir.exists() and any(chip_dir.glob("*2019*.png"))):
            print(f"aoi name: {aoi_name}, subdir {subdir}, download 2019 images")
            request_save_cubes(start_date_2019, end_date_2019, lat, lon)

        # 2020: same check, re-evaluated after the 2019 request above.
        if not (chip_dir.exists() and any(chip_dir.glob("*2020*.png"))):
            print(f"aoi name: {aoi_name}, subdir {subdir}, download 2020 images")
            request_save_cubes(start_date_2020, end_date_2020, lat, lon)

### 1.2 Extract images to be labeled

In [None]:
# Collect every "img_ndwi*.png" chip found in the chip directories of the
# locations listed under the 'Misc' key of the AOI file.
stratis_images = []
with open(aoi_json_filepath, "r") as f:
    aoi_file = json.load(f)

straits_dict = aoi_file['Misc']
for aoi_name, aoi_locs in straits_dict.items():
    for loc in aoi_locs:
        lat, lon = loc[0], loc[1]
        subdir = 'lat_{}_lon_{}'.format(str(lat).replace('.','_'), str(lon).replace('.','_'))
        chip_dir = Path(data_chips_dir, subdir)
        # Locations whose chip directory does not exist contribute nothing.
        if not chip_dir.exists():
            continue
        stratis_images.extend(chip_dir.glob("img_ndwi*.png"))

print(f"In total we have {len(stratis_images)} images (img_ndwi) under {data_chips_dir} ")

In [None]:
# Load the existing annotations from the lean labels CSV and show the row count.
csv_file_path = "../data/labels_lean.csv"
labels_df = pd.read_csv(filepath_or_buffer=csv_file_path)
labels_df.shape[0]

In [None]:
# Define the images to be labeled: every chip that has no usable label yet.
# A chip counts as labeled only if labels_df holds a row with the same
# (lat_lon, timestamp) pair AND a non-null count; rows whose count is null are
# treated as still unlabeled.
#
# The previous version referenced an undefined name (`null_labels_df`) and
# re-filtered the whole DataFrame for every image. Building the set of labeled
# keys once makes each membership test O(1).
labeled_rows = labels_df[labels_df["count"].notnull()]
labeled_keys = set(zip(labeled_rows["lat_lon"], labeled_rows["timestamp"]))

tobe_labeled_images = [
    image_path
    for image_path in stratis_images
    # lat_lon is the chip's parent directory name; the timestamp is whatever
    # follows 't_' in the file stem.
    if (image_path.parts[-2], image_path.stem.split('t_')[1]) not in labeled_keys
]

In [None]:
# Number of images still waiting for a label.
len(tobe_labeled_images)

In [None]:
# Create the superintendent class-labelling widget over the unlabeled images.
# The selectable options are the integers -1 through 5.
count_options = list(range(-1, 6))
labeller = superintendent.ClassLabeller(
    features=tobe_labeled_images,
    options=count_options,
    display_func=display_image_and_references,
)

# Last expression of the cell: renders the widget.
labeller

In [None]:
# Quick inspection of the labelling state. The two bare expressions are not the
# last statement of the cell, so by default the notebook does not echo them;
# only the printed count below is shown.
tobe_labeled_images
labeller.new_labels
print(len(tobe_labeled_images))

In [None]:
# Merge the labels collected in the widget into labels_df.
# Each image is paired positionally with its entry in labeller.new_labels.
# Rows are built first and concatenated once: DataFrame.append was removed in
# pandas 2.0, and calling it per row copies the whole frame every iteration.
new_rows = [
    {
        # lat_lon is the chip's parent directory name; the timestamp is
        # whatever follows 't_' in the file stem.
        "lat_lon": image_path.parts[-2],
        "timestamp": image_path.stem.split('t_')[1],
        "count": count,
    }
    for image_path, count in zip(tobe_labeled_images, labeller.new_labels)
]
# Skip the concat when there is nothing to add so labels_df keeps its dtypes.
if new_rows:
    labels_df = pd.concat([labels_df, pd.DataFrame(new_rows)], ignore_index=True)
print(f"Now we have {len(labels_df)} labeled images (img_ndwi)")

In [None]:
# Keep only rows that actually have a count. The explicit copy makes the result
# an independent frame, so later column assignments on labels_df do not raise
# SettingWithCopyWarning.
labels_df = labels_df[labels_df["count"].notnull()].copy()

In [None]:
# Store the counts as floats.
labels_df['count'] = labels_df['count'].astype(float)

In [None]:
# Plot a histogram of the count values.
labels_df['count'].hist()

In [None]:
# See how imbalanced the data is: fraction of rows whose count is greater than 0.
# NOTE: raises ZeroDivisionError when labels_df is empty.
len(labels_df[labels_df['count'] > 0])/len(labels_df)

### 1.3 Write the updated labels back to labels_lean.csv

In [None]:
# Write the labels back to the CSV they were loaded from. index=False keeps the
# row index out of the file; otherwise the next pd.read_csv (which uses no
# index_col) would pick it up as an extra "Unnamed: 0" column on every
# save/load cycle.
labels_df.to_csv(csv_file_path, index=False)

### Deprecated from here on: converting labels.csv to labels_lean.csv

In [None]:
# Load the original labels file, indexed by chip file path, and derive the
# 'timestamp' and 'lat_lon' columns from each path.
labels_filename = "../data/labels.csv"
df_labels = pd.read_csv(labels_filename, index_col = 'file_path', dtype={'count': float})
for index in df_labels.index:
    chip_path = Path(index)
    # The timestamp is whatever follows 't_' in the file stem.
    df_labels.at[index, "timestamp"] = chip_path.stem.split('t_')[1]
    # lat_lon is the name of the chip's parent directory.
    df_labels.at[index, "lat_lon"] = chip_path.parts[-2]

### Sanity check: every lat_lon directory under data/chips is present in the DataFrame

In [None]:
# Verify that every non-hidden entry under the chips directory appears as a
# lat_lon value in df_labels. All missing entries are collected and reported
# together, instead of stopping at the first one with a message-less assert.
data_dir = "/home/jovyan/data/chips"
coords = os.listdir(data_dir)
coords_in_df = df_labels.reset_index().groupby('lat_lon').groups.keys()
missing_coords = sorted(
    coord
    for coord in coords
    # Entries starting with '.' are skipped.
    if not coord.startswith(".") and coord not in coords_in_df
)
assert not missing_coords, (
    f"chip directories with no row in df_labels: {missing_coords}"
)

In [None]:
# Display the labels frame with the derived columns.
df_labels

In [None]:
# Move the index (loaded from the 'file_path' column) back into a regular
# column, in place. Re-running this cell resets the index again.
df_labels.reset_index(inplace=True)

In [None]:
# Keep only the three columns that make up the lean labels table, then display it.
df_labels_lean = df_labels[['lat_lon', 'timestamp', 'count']]
df_labels_lean

In [None]:
# Write the lean labels table to disk. index=False keeps the row index out of
# the file, so reading it back with a plain pd.read_csv does not produce an
# extra "Unnamed: 0" column.
labels_lean_file = '../data/labels_lean.csv'
df_labels_lean.to_csv(labels_lean_file, index=False)