In [11]:
!pip install yolov5 reverse_geocoder

Collecting yolov5
  Downloading yolov5-6.1.8-py36.py37.py38-none-any.whl (855 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m855.6/855.6 kB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting reverse_geocoder
  Downloading reverse_geocoder-1.5.1.tar.gz (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m73.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting thop>=0.1.1
  Downloading thop-0.1.1.post2207130030-py3-none-any.whl (15 kB)
Collecting boto3>=1.19.1
  Downloading boto3-1.24.58-py3-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.5/132.5 kB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
Collecting tensorboard>=2.4.1
  Downloading tensorboard-2.10.0-py3-none-any.whl (5.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.9/5.9 MB[0m [31m96.2 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
Collecting ope

In [12]:
from google.cloud import firestore
from google.cloud import storage, aiplatform
import pandas as pd
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from yolov5 import detect
import sys
import yaml
import os
import shutil
import reverse_geocoder as rg
import random
from tqdm import tqdm
import collections
from datetime import datetime

# Initialize connections to cloud storage and database
# !gcloud auth login
# !gcloud config set account <>
# !gcloud config set project bsos-geog-harvest1
# !gcloud auth application-default login

  from .autonotebook import tqdm as notebook_tqdm


# Create Dataset Based On Crop And Region

**Author:** Madhava Paliyam (madhavapaliyam@gmail.com)

**Description:** Creates a dataset based on a crop or region that we want to improve in. This should be used after running notebook 4_analyze_predictions.ipynb so that a region/crop combinations that is performing poorly can be queried for additional images and labeled to improve the model. 



**Inputs**: Parameters for yolov5 detection, region to query, crop to query

**Outputs**: A dataset onto gs://street2sat-gcloud-labeling bucket with dataset and AI platform dataset. 

In [None]:
!dvc pull -q -f 

### 1. Sampling dataset

In [None]:
##### SET DATASET QUERY PARAMETERS HERE #####
query_for_crop = 'banana'   # choose from any of the available crops
country_code = 'KE'         # choose from country code : 'KE', 'UG', 'US'
images_to_search = 250
###############################

In [4]:
# read csv 
all_paths = pd.read_csv('gs://street2sat-database-csv/database-info.csv')
# we want images not already being labeled
all_paths = all_paths[all_paths['being_labeled'] == False]
# and also images that are not in test set 
all_paths = all_paths[all_paths['test_set'] == False]


  exec(code_obj, self.user_global_ns, self.user_ns)


In [5]:
all_paths['cc'].unique()

array(['KE', nan, 'UG', 'US'], dtype=object)

In [6]:
# randomly sample images within country
potential_images = all_paths[all_paths['cc'] == country_code].sample(images_to_search)
potential_images.head()

Unnamed: 0.1,Unnamed: 0,input_img,latitude,longitude,being_labeled,country,admin1,admin2,cc,location,test_set,time,focal_length,pixel_height
71640,71640,gs://street2sat-uploaded/KENYA/2021_07_12_T2/1...,0.981461,35.202374,False,KENYA,Trans Nzoia,,KE,Kitale,False,2021-07-12 14:31:56+00:00,3,2028
124798,124798,gs://street2sat-uploaded/KENYA/2021_07_29_T2/1...,0.307944,34.556804,False,KENYA,Kakamega,,KE,Mumias,False,2021-07-29 15:02:29+00:00,3,2028
52906,52906,gs://street2sat-uploaded/KENYA/2021-08-03-T1/G...,-1.089449,35.912792,False,KENYA,Narok,,KE,Narok,False,2021-08-03 15:32:53+00:00,3,2028
94524,94524,gs://street2sat-uploaded/KENYA/2021_07_16_T2/1...,-0.023287,35.186382,False,KENYA,Nandi,,KE,Nandi Hills,False,2021-07-16 12:47:07+00:00,3,2028
38146,38146,gs://street2sat-uploaded/KENYA/2021-07-27-T1/G...,-0.608462,34.513704,False,KENYA,Homa Bay,,KE,Homa Bay,False,2021-07-27 13:46:47+00:00,3,2028


In [7]:
# open connection to google cloud 
client = storage.Client()
gcloud_uploaded_bucket_str = 'street2sat-uploaded'
gcloud_uploaded_bucket = client.bucket(gcloud_uploaded_bucket_str)


classes_dict = {}
with open('../street2sat_utils/crop_info/classes.txt') as classes_file: 
    for i, line in enumerate(classes_file):
        classes_dict[line.strip()] = i

In [9]:
import os 
# create directory to save images to 
save_dir = os.path.join(f'crop_{query_for_crop}_region_{country_code}')
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
    
# download images 
for i,item in potential_images.iterrows(): 
    blob = gcloud_uploaded_bucket.blob(item['input_img'].replace('gs://street2sat-uploaded/', ''))
    blob.download_to_filename(os.path.join(save_dir, f"{i}.jpg"))
    

### 2. Run detect.py 

Configure conf_thres and iou_thres to adjust the amount/quality of bounding boxes returned. 

In [8]:
##### MODIFY THESE AS NEEDED #####
conf_thres = 0.05                  # confidence threshold
iou_thres = 0.05                   # NMS IOU threshold
weights= '/gpfs/data1/cmongp1/mpaliyam/street2sat/yolov5/runs/train/exp18/weights/best.pt'   # model.pt path(s)
imgsz= 640                         # inference size (height, width)
max_det = 1000                     # maximum detections per image
project = 'runs/detect'            # save results to project/name
name ='exp'                        # save results to project/name
######################################


# runs the detect script from yolov5 libary
to_parse = f"detect.py " \
            + f"--weights {weights} " \
            + f"--source {save_dir} " \
            + f"--imgsz {imgsz} " \
            + f"--conf-thres {conf_thres} " \
            + f"--iou-thres {iou_thres} " \
            + f"--classes {classes_dict[query_for_crop]} " \
            + f"--max-det {max_det} " \
            + f"--project {project} " \
            + f"--name {name} " \
            + f"--exist-ok " \
            + f"--save-crop " \
            + f"--save-txt " \
            + f"--save-conf " \

to_parse = to_parse.split()
sys.argv = to_parse 
detect.main()

[34m[1mdetect: [0mweights=['/gpfs/data1/cmongp1/mpaliyam/street2sat/yolov5/runs/train/exp18/weights/best.pt'], source=crop_banana_region_KE, imgsz=[640, 640], conf_thres=0.05, iou_thres=0.05, max_det=1000, device=, view_img=False, save_txt=True, save_conf=True, save_crop=True, nosave=False, classes=[2], agnostic_nms=False, augment=False, visualize=False, update=False, project=runs/detect, name=exp, exist_ok=True, line_thickness=3, hide_labels=False, hide_conf=False, half=False, dnn=False
INFO:yolov5.utils.torch_utils:YOLOv5 🚀 2022-2-16 torch 1.10.2+cu102 CUDA:0 (Tesla V100-PCIE-16GB, 16160.5MB)

INFO:models.yolo:Fusing layers... 
INFO:yolov5.utils.torch_utils:Model Summary: 369 layers, 20919810 parameters, 0 gradients, 48.2 GFLOPs


  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


image 1/250 /gpfs/data1/cmongp1/mpaliyam/street2sat/street2sat/notebooks/crop_banana_region_KE/100295.jpg: 480x640 Done. (0.012s)
image 2/250 /gpfs/data1/cmongp1/mpaliyam/street2sat/street2sat/notebooks/crop_banana_region_KE/100441.jpg: 480x640 Done. (0.010s)
image 3/250 /gpfs/data1/cmongp1/mpaliyam/street2sat/street2sat/notebooks/crop_banana_region_KE/101347.jpg: 480x640 Done. (0.010s)
image 4/250 /gpfs/data1/cmongp1/mpaliyam/street2sat/street2sat/notebooks/crop_banana_region_KE/101705.jpg: 480x640 Done. (0.010s)
image 5/250 /gpfs/data1/cmongp1/mpaliyam/street2sat/street2sat/notebooks/crop_banana_region_KE/101715.jpg: 480x640 Done. (0.010s)
image 6/250 /gpfs/data1/cmongp1/mpaliyam/street2sat/street2sat/notebooks/crop_banana_region_KE/102031.jpg: 480x640 Done. (0.010s)
image 7/250 /gpfs/data1/cmongp1/mpaliyam/street2sat/street2sat/notebooks/crop_banana_region_KE/102191.jpg: 480x640 Done. (0.010s)
image 8/250 /gpfs/data1/cmongp1/mpaliyam/street2sat/street2sat/notebooks/crop_banana_regio

### 3. Save Dataset to Google Cloud for labeling

In [9]:
# find images that have detections 
images_with_labels = [int(x.replace('.txt', '')) for x in os.listdir(os.path.join(project, name, 'labels'))]
potential_images.loc[images_with_labels]

Unnamed: 0.1,Unnamed: 0,input_img,latitude,longitude,being_labeled,...,location,test_set,time,focal_length,pixel_height
64082,64082,gs://street2sat-uploaded/KENYA/2021_07_10_T2/1...,0.785105,34.567473,False,...,Malikisi,False,2021-07-10 11:45:06+00:00,3,2028
26113,26113,gs://street2sat-uploaded/KENYA/2021-07-21-T1/G...,-0.756797,34.875908,False,...,Keroka,False,2021-07-21 14:01:34+00:00,3,2028
20345,20345,gs://street2sat-uploaded/KENYA/2021-07-17-T1/G...,-0.668167,34.750041,False,...,Kisii,False,2021-07-17 13:11:27+00:00,3,2028


In [8]:
# Generate CSV, find detected images in the directory 
# these contain the txt files of labels that detect.py found, if the class was not 
# present it will not be in this directory
gcloud_labeling_bucket_str = 'street2sat-gcloud-labeling'
images_with_labels = [int(x.replace('.txt', '')) for x in os.listdir(os.path.join(project, name, 'labels'))]

# filter the potential images to label by the ones which were detected 
images_of_interest = potential_images.loc[images_with_labels]

print(f"Found {len(images_of_interest)} images")

# create csv with each row having the image path from potential images 
amount_of_csvs_to_generate = 1
for i in tqdm(range(amount_of_csvs_to_generate), desc="CSV Generation"):
    images_to_label = images_of_interest
    csv_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S_") + f'crop_{query_for_crop}_region_{query_for_region}' + '.csv'
    print(f"Saving to {csv_name}")
    df = pd.DataFrame(images_to_label)
    df.to_csv(f"gs://{gcloud_labeling_bucket_str}/{csv_name}", sep="\n", index=False, header=False)

67


CSV Generation:   0%|          | 0/1 [00:00<?, ?it/s]

Saving to 2022-03-15_11-34-02_crop_banana_region_UG.csv


CSV Generation: 100%|██████████| 1/1 [00:00<00:00,  2.49it/s]


In [9]:
# location of csv in google cloud 
f"gs://{gcloud_labeling_bucket_str}/{csv_name}"

'gs://street2sat-gcloud-labeling/2022-03-15_11-34-02_crop_banana_region_UG.csv'

In [10]:
# create dataset 
ds = aiplatform.ImageDataset.create(
        display_name=csv_name.split('.')[0],
        gcs_source=f"gs://{gcloud_labeling_bucket_str}/{csv_name}",
        import_schema_uri=aiplatform.schema.dataset.ioformat.image.bounding_box,
        sync=False,
    )

INFO:google.cloud.aiplatform.datasets.dataset:Creating ImageDataset
INFO:google.cloud.aiplatform.datasets.dataset:Create ImageDataset backing LRO: projects/1012768714927/locations/us-central1/datasets/9121113057425096704/operations/4717991819122573312
