# Check S3 Data

In [1]:
# Required imports:
import os
import sys
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), "code"))

from aws_s3_client import *


# Name of the S3 bucket this notebook inspects.
BUCKET = "w210-snow-fate"

# "data" folder located in the parent of the current working directory.
DATA_DIR = os.path.join(os.path.dirname(os.getcwd()), "data")

# Client used by every cell below; built from the data directory and bucket.
s3 = S3Client(DATA_DIR, BUCKET)

## Get a report of the status of a gage's data in S3

In [2]:
# Gage to inspect and the date range to check
# (dates look like YYYY_MM_DD strings -- confirm against S3Client).
gage_name = "11266500"
date_from, date_to = "2010_01_01", "2010_12_31"

# Per-band report for this gage; the cells below read `report`.
report = s3.gage_data_report(gage_name, date_from, date_to)

In [3]:
# Show, for each satellite band in the report, its "missing_dates" entry:
{band: details["missing_dates"] for band, details in report.items()}

{'total_precipitation': set(), 'temperature_2m': set(), 'ET': set()}

## Get reports for all gages

In [4]:
# Date range checked for every gage.
date_from, date_to = "2010_01_01", "2020_12_31"

# Gage identifiers are kept as integers here and converted to strings
# when they are passed to the client and used as dictionary keys.
gages = [
    11185500,
    11189500,
    11202710,
    11208000,
    11266500,
    11318500,
    11402000,
]

# Maps gage name (str) -> the report returned for that gage.
reports = {}
for gage_name in gages:
    report = s3.gage_data_report(str(gage_name), date_from, date_to)
    reports[str(gage_name)] = report

## Identify missing data in S3

In [5]:
# Keep only the (gage, band) entries whose "missing_dates" value is non-empty;
# each value is the full report entry for that band.
missing_dates = {
    (gage, band): data
    for gage, gage_report in reports.items()
    for band, data in gage_report.items()
    if data["missing_dates"]
}

In [6]:
# Show the (gage, band) keys that have at least one missing date:
missing_dates.keys()

dict_keys([])

In [7]:
# For every (gage, band) with gaps, print the key followed by the smallest
# and the largest missing date, then a blank separator line.
for key, entry in missing_dates.items():
    print(key)
    print(min(entry["missing_dates"]))
    print(max(entry["missing_dates"]))
    print()

## Identify duplicate images in S3

In [8]:
# Collect every (gage, band) pair whose report entry flags duplicates.
has_dupes = [
    (gage, band)
    for gage, values in reports.items()
    for band, data in values.items()
    if data["has_dupes"]
]
has_dupes

[]

## Delete duplicate images in S3

In [9]:
# Kept so that `np` remains available to any later cell that relies on it.
import numpy as np

# Columns that together identify "the same image". Files that agree on all
# of them are copies of one another and only one copy should survive.
identity_cols = ["satellite", "band", "date", "scale"]

# WARNING: this cell deletes files from S3.
for gage, band in has_dupes:

    # Duplicate info from the report; its index is matched against the
    # "date" column of the file listing below.
    dupes = reports[gage][band]["dupes"]

    files = s3.list_gee_tif_files(gage)

    # Restrict the listing to this band and to the dates flagged as duplicated.
    dupe_files = files[
        (files["band"] == band) &
        (files["date"].isin(dupes.index))
    ]

    # Within each satellite/band/date, put the largest and then the most
    # recently modified file first: that is the copy that will be kept.
    dupe_files = dupe_files.sort_values(
        by=["satellite", "band", "date", "size", "last_modified_date"],
        ascending=[True, True, True, False, False],
    ).reset_index(drop=True)

    # Mark every copy after the first one in each identity group (1 = delete).
    # The previous adjacent-row comparison missed same-scale duplicates when a
    # file with a different scale sorted between them, because "scale" is not
    # one of the sort keys; `duplicated` does not depend on row adjacency.
    dupe_files["delete"] = dupe_files.duplicated(
        subset=identity_cols, keep="first"
    ).astype(int)

    # Sanity check before the destructive call: refuse to delete more than
    # half of the candidate files.
    assert dupe_files["delete"].sum() <= len(dupe_files) / 2, (
        "refusing to delete more than half of the candidate files"
    )
    to_delete = dupe_files[dupe_files["delete"] == 1]["filepath"].values

    # Nothing flagged (e.g. the copies only differ by scale): skip the call
    # rather than invoking delete_s3_file with no arguments.
    if len(to_delete) > 0:
        s3.delete_s3_file(*to_delete)