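The goal of this Jupyter notebook is to confirm whether all of the SDC files for a given city have been downloaded. The inputs are the latitude and longitude of the city we're interested in, the range of years to check (first and last year, both inclusive), and the path to the directory containing the data files we want to validate. Note that, before checking for missing dates, the notebook permanently deletes every file in that directory whose name contains "(1)", treating it as a duplicate download.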

In [31]:
#Load in the packages
import os
import pandas as pd
import time
from pyproj import Proj
from datetime import datetime, timedelta

In [32]:
#User inputs: location of the city of interest, in decimal degrees
latitude, longitude = 40.730610, -73.935242

#First and last year of the period to validate (both years are included)
years = [2016, 2017]

#Directory that holds the downloaded data files to validate
path = (
    "/Volumes/Seagate Portable Drive/Central_Park_Climate/"
    "Downloading_SDC_500/Data_Files"
)

In [33]:
#Let's first convert the latitude and longitude into MODIS tile values
#Upper-left corner (X, Y) of the MODIS sinusoidal grid, in metres
WORLD_ULC_X = -20015109.354
WORLD_ULC_Y = 10007554.677

#The grid is 36 tiles wide, so the tile size is the full world width divided by 36
#(about 1111950.52 m). Deriving it from the extent instead of using the truncated
#value 1111950 avoids an error that grows with the tile index (up to ~18 m at the
#far edge) and could put a point near a tile boundary into the wrong tile.
TILE_SIZE = -2 * WORLD_ULC_X / 36

#Define the MODIS sinusoidal projection and project the point (pyproj expects lon, lat order)
modis_proj = Proj("+proj=sinu +R=6371007.181 +lon_0=0")
x, y = modis_proj(longitude, latitude)

#Compute h and v: h counts tiles eastwards from the left edge of the grid,
#v counts tiles downwards from the top edge (hence WORLD_ULC_Y - y)
h = int((x - WORLD_ULC_X) / TILE_SIZE)
v = int((WORLD_ULC_Y - y) / TILE_SIZE)

#Tile identifier as it is matched against the file names, e.g. "h12v04"
tile_name = f"h{h:02d}v{v:02d}"

In [34]:
#We will first delete any duplicates in our files
#A file is treated as a duplicate download when its name contains "(1)"
for filename in os.listdir(path):
    #Skip everything that is not flagged as a duplicate
    if "(1)" not in filename:
        continue

    file_path = os.path.join(path, filename)

    #os.remove cannot delete directories, so a folder whose name happens to
    #contain "(1)" is left alone instead of aborting the whole cell
    if not os.path.isfile(file_path):
        continue

    os.remove(file_path)
    #Report only after the removal succeeded, so the log never lists a file that is still on disk
    print(f"Deleted: {file_path}")

In [35]:
#List the directory once and keep only the regular files (sub-directories are dropped)
dir_entries = os.listdir(path)
file_list = [name for name in dir_entries if os.path.isfile(os.path.join(path, name))]

#Narrow the list down to the files whose name mentions the tile of interest
tile_file_list = [name for name in file_list if tile_name in name]

#Band identifiers to validate: b01 through b07
bands = [f"b{band_number:02d}" for band_number in range(1, 8)]

#Build every expected date as a "YYYYDDD" string (year followed by the zero-padded day of year)
all_dates = []
for year in range(years[0], years[1]+1):
    #Number of days in this year (365, or 366 for a leap year)
    days_in_year = (datetime(year + 1, 1, 1) - datetime(year, 1, 1)).days

    #Days of the year run from 1 up to and including days_in_year
    all_dates.extend(f"{year}{doy:03d}" for doy in range(1, days_in_year + 1))

#The set of expected dates is the same for every band, so build it once
expected_dates = set(all_dates)

#Loop through the bands
for band in bands:
    #Get the list of the files for the current band (the band identifier must appear in the file name)
    band_file_list = [f for f in tile_file_list if band in f]

    #Now we will extract the dates from all of the file names
    #The date is the fourth underscore-separated field of the file name
    file_dates = set()
    for filename in band_file_list:
        parts = filename.split("_")
        #A name with fewer than four fields does not follow the expected pattern;
        #skip it instead of failing with an IndexError
        if len(parts) < 4:
            continue
        file_dates.add(parts[3])

    #Now we check for missing dates: expected dates for which no file was found
    missing_dates = expected_dates - file_dates

    print(f"We are missing the following SDC 500m files for tile {tile_name} between the years {years[0]} and {years[1]} for band {band}:")
    #Sets have no defined order; sort the YYYYDDD strings so the gaps are listed chronologically
    print(sorted(missing_dates))


    

We are missing the following SDC 500m files for tile h12v04 between the years 2016 and 2017 for band b01:
set()
We are missing the following SDC 500m files for tile h12v04 between the years 2016 and 2017 for band b02:
set()
We are missing the following SDC 500m files for tile h12v04 between the years 2016 and 2017 for band b03:
set()
We are missing the following SDC 500m files for tile h12v04 between the years 2016 and 2017 for band b04:
set()
We are missing the following SDC 500m files for tile h12v04 between the years 2016 and 2017 for band b05:
set()
We are missing the following SDC 500m files for tile h12v04 between the years 2016 and 2017 for band b06:
set()
We are missing the following SDC 500m files for tile h12v04 between the years 2016 and 2017 for band b07:
{'2017100', '2016288', '2016047', '2016129', '2017274', '2016360', '2017326', '2016330', '2016230', '2016115', '2017331', '2016260', '2016263', '2016093', '2016296', '2016113', '2017206', '2017349', '2016293', '2016302', '