In [1]:
import io
import os
from datetime import datetime, timedelta

import xarray as xr
import requests
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import matplotlib.pyplot as plt
import cmocean

import pandas as pd
import numpy as np

import tempfile
import wget

# Not used directly, but used via xarray
import cfgrib

In [2]:
def get_row(df, uid):
    """Return the first row of ``df`` whose ``uid`` column equals ``uid``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table exposing a ``uid`` column.
    uid
        Identifier to look up.

    Returns
    -------
    pandas.Series or None
        The first matching row, or ``None`` when no row matches.
    """
    matches = df[df.uid == uid]
    # Check for "no match" explicitly rather than wrapping the lookup in a
    # bare ``except``: genuine errors (e.g. a frame without a ``uid`` column)
    # and keyboard interrupts must not be silently turned into ``None``.
    if matches.empty:
        return None
    return matches.iloc[0]

In [3]:
# Load the per-sample metadata table (one row per uid) from the data folder.
metadata = pd.read_csv('../data/metadata.csv')

In [4]:
# Load the uids listed in the "failed" Sentinel file
# (presumably samples whose Sentinel download failed -- confirm with the producer of this file).
failed_points = pd.read_csv('../data/downloaded/failed/sentinel.csv')

In [5]:
metadata.head()

Unnamed: 0,uid,latitude,longitude,date,split
0,aabm,39.080319,-86.430867,2018-05-14,train
1,aabn,36.5597,-121.51,2016-08-31,test
2,aacd,35.875083,-78.878434,2020-11-19,train
3,aaee,35.487,-79.062133,2016-08-24,train
4,aaff,38.049471,-99.827001,2019-07-23,train


In [6]:
failed_points.head()

Unnamed: 0.1,Unnamed: 0,uid
0,0,aaig
1,1,aapj
2,2,aaqf
3,3,aaia
4,4,aaoj


In [7]:
# Anti-join: keep the metadata rows whose uid does not appear in failed_points.
# With indicator=True the merge labels rows found only in metadata as 'left_only'.
valid_points = (
    metadata
    .merge(failed_points['uid'], on='uid', how='left', indicator=True)
    .loc[lambda merged: merged['_merge'] == 'left_only']
    # The helper _merge column is only needed for the filter above.
    .drop(columns='_merge')
)

valid_points.head()

Unnamed: 0,uid,latitude,longitude,date,split
0,aabm,39.080319,-86.430867,2018-05-14,train
1,aabn,36.5597,-121.51,2016-08-31,test
2,aacd,35.875083,-78.878434,2020-11-19,train
3,aaee,35.487,-79.062133,2016-08-24,train
4,aaff,38.049471,-99.827001,2019-07-23,train


In [8]:
len(valid_points)

9476

In [9]:
# Pick a single known sample (uid 'aabm') to use as a worked example.
example_row = get_row(valid_points, 'aabm')

In [10]:
example_row

uid                aabm
latitude      39.080319
longitude    -86.430867
date         2018-05-14
split             train
Name: 0, dtype: object

In [11]:
# Date of every valid point (one entry per row, repeats included).
dates = valid_points.date

In [12]:
# Keep each distinct date only once.
dates = dates.drop_duplicates()

In [13]:
len(dates)

922

In [14]:
def get_url(date, cycle=1, forecast_hour=1):
    """Build the URL of the HRRR surface GRIB2 file for a given day.

    Parameters
    ----------
    date : str or datetime.date or datetime.datetime
        Day to fetch. Strings must be formatted as ``YYYY-MM-DD``; date-like
        objects are used as they are.
    cycle : int, optional
        Model cycle hour, rendered as two digits in the file name
        (default 1, i.e. ``t01z``).
    forecast_hour : int, optional
        Forecast offset from the cycle time, rendered as two digits
        (default 1).

    Returns
    -------
    str
        Full URL of the GRIB2 file inside the NOAA HRRR bucket.

    Raises
    ------
    ValueError
        If ``date`` is a string that does not match ``YYYY-MM-DD``.
    """
    # Constants for creating the full URL
    blob_container = 'https://noaa-hrrr-bdp-pds.s3.amazonaws.com'
    sector = "conus"
    product = "wrfsfcf"  # 2D surface levels

    # Use the caller's date. The previous version parsed the notebook-global
    # ``example_row.date`` here, so every call produced the same URL.
    if isinstance(date, str):
        date = datetime.strptime(date, "%Y-%m-%d").date()

    # NOTE(review): the original inline comment described cycle=1 as "noon",
    # but the value selects the ``t01z`` file. The default is kept unchanged;
    # pass ``cycle=12`` if the 12Z run is what was intended.

    # Put it all together
    file_path = f"hrrr.t{cycle:02}z.{product}{forecast_hour:02}.grib2"
    url = f"{blob_container}/hrrr.{date:%Y%m%d}/{sector}/{file_path}"
    return url

In [15]:
def get_file(date):
    """Download the file located at ``get_url(date)`` into the tmp folder.

    The target path embeds ``date`` in the file name. The return value is
    whatever ``wget.download`` reports as the saved file name.
    """
    destination = f'../data/downloaded/nrr/tmp/nrr_{date}.txt'
    return wget.download(get_url(date), out=destination)

In [16]:
def get_features(file_name, row):
    """Read the surface variables at the grid cell closest to a sample point.

    Parameters
    ----------
    file_name : str
        Path to a GRIB2 file readable by the ``cfgrib`` xarray engine.
    row : pandas.Series
        Record providing ``uid``, ``latitude`` and ``longitude``.

    Returns
    -------
    dict
        ``{'uid': <row uid>, <variable long_name>: <scalar value>, ...}`` with
        one entry per data variable of the filtered dataset.
    """
    lat = row.latitude
    # NOTE(review): assumes the dataset stores longitudes in the 0-360 range
    # and that sample longitudes are negative (west) -- confirm.
    lon = row.longitude + 360

    # Open the dataset in a ``with`` block so the file handle is released
    # even if the lookup below fails.
    with xr.open_dataset(
        file_name,
        engine='cfgrib',
        backend_kwargs={'indexpath': ''},
        filter_by_keys={'stepType': 'instant', 'typeOfLevel': 'surface'},
    ) as ds:
        abslat = np.abs(ds.latitude - lat)
        abslon = np.abs(ds.longitude - lon)
        # Per-cell distance: the larger of the latitude and longitude offsets.
        c = np.maximum(abslon, abslat)

        # First index is the y position, second the x position. ``argmin``
        # returns the first minimum, so tied cells no longer break the
        # single-element unpacking the previous ``np.where`` form relied on.
        y_idx, x_idx = np.unravel_index(np.argmin(c.values), c.shape)

        # Take the uid from the row being processed, not from the
        # notebook-global ``example_row``.
        meta_info = {'uid': row['uid']}
        for da in ds.sel(y=y_idx, x=x_idx).data_vars.values():
            meta_info[da.attrs['long_name']] = da.values.item()
    return meta_info

In [17]:
def gen_features(date, df, features):
    """Append the features of every row of ``df`` dated ``date`` to ``features``.

    Downloads the file for ``date`` once via ``get_file``, calls
    ``get_features`` for each row whose ``date`` column equals ``date``, and
    removes the downloaded file afterwards.

    Parameters
    ----------
    date
        Value compared against ``df['date']`` and passed to ``get_file``.
    df : pandas.DataFrame
        Table with a ``date`` column plus the fields ``get_features`` reads.
    features : list
        Output list, extended in place with one item per matching row.
    """
    file_name = get_file(date)
    try:
        rows = df[df['date'] == date]
        for _, row in rows.iterrows():
            features.append(get_features(file_name, row))
    finally:
        # Always delete the downloaded file, even when extraction fails,
        # so failed runs do not leave files behind in the tmp folder.
        os.remove(file_name)

In [18]:
%%time
# Trial run: collect features for every valid point that shares the example row's date.
feat = []
date = example_row.date

# gen_features fills `feat` in place.
gen_features(date, valid_points, feat)

100% [......................................................................] 109515101 / 109515101CPU times: user 1min 16s, sys: 9.61 s, total: 1min 26s
Wall time: 6min 26s


In [19]:
# Valid points that share the example row's date.
test = valid_points[valid_points['date'] == example_row.date]

In [20]:
len(test) == len(feat)

True