In [1]:
import io
import os
from datetime import datetime, timedelta

import xarray as xr
import requests
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import matplotlib.pyplot as plt
import cmocean

import pandas as pd
import numpy as np

import tempfile
import wget
import requests

# Not used directly, but used via xarray
import cfgrib

from multiprocessing import Pool, Manager
from tqdm import tqdm

In [2]:
def get_row(df, uid):
    """Return the first row of ``df`` whose ``uid`` column equals ``uid``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame exposing a ``uid`` column.
    uid
        Value compared against the ``uid`` column.

    Returns
    -------
    pandas.Series or None
        The first matching row, or ``None`` when no row matches.
    """
    matches = df[df.uid == uid]
    # Only "no match" maps to None. Other problems (for example a frame
    # without a ``uid`` column) now propagate instead of being hidden by a
    # bare ``except``.
    if matches.empty:
        return None
    return matches.iloc[0]

In [3]:
# Sampling points: one row per uid with latitude, longitude, date and split.
metadata = pd.read_csv('../data/metadata.csv')

In [4]:
# uids to exclude from processing; judging by the path, presumably the points
# whose Sentinel download failed -- TODO confirm.
failed_points = pd.read_csv('../data/downloaded/failed/sentinel.csv')

In [5]:
metadata.head()

Unnamed: 0,uid,latitude,longitude,date,split
0,aabm,39.080319,-86.430867,2018-05-14,train
1,aabn,36.5597,-121.51,2016-08-31,test
2,aacd,35.875083,-78.878434,2020-11-19,train
3,aaee,35.487,-79.062133,2016-08-24,train
4,aaff,38.049471,-99.827001,2019-07-23,train


In [6]:
failed_points.head()

Unnamed: 0.1,Unnamed: 0,uid
0,0,aaig
1,1,aapj
2,2,aaqf
3,3,aaia
4,4,aaoj


In [7]:
# Anti-join: keep the metadata rows whose uid does not appear in failed_points.
# The merge indicator marks those rows as 'left_only'; the helper `_merge`
# column is dropped once the filter has been applied.
valid_points = (
    metadata
    .merge(failed_points['uid'], on='uid', how='left', indicator=True)
    .loc[lambda merged: merged['_merge'] == 'left_only']
    .drop(columns='_merge')
)

valid_points.head()

Unnamed: 0,uid,latitude,longitude,date,split
0,aabm,39.080319,-86.430867,2018-05-14,train
1,aabn,36.5597,-121.51,2016-08-31,test
2,aacd,35.875083,-78.878434,2020-11-19,train
3,aaee,35.487,-79.062133,2016-08-24,train
4,aaff,38.049471,-99.827001,2019-07-23,train


In [8]:
len(valid_points)

9476

In [9]:
example_row = get_row(valid_points, 'aabm')

In [10]:
example_row

uid                aabm
latitude      39.080319
longitude    -86.430867
date         2018-05-14
split             train
Name: 0, dtype: object

In [11]:
dates = valid_points.date

In [12]:
dates = dates.drop_duplicates()

In [13]:
len(dates)

922

In [14]:
def get_url(date, cycle=1, forecast_hour=1, product="wrfsfcf", sector="conus"):
    """Build the download URL of one HRRR GRIB2 file in the NOAA S3 bucket.

    Parameters
    ----------
    date : str or date-like
        Day of the model run. Strings must be formatted as ``YYYY-MM-DD``;
        any other object must support the ``%Y%m%d`` format spec
        (e.g. ``datetime.date``, ``datetime.datetime``).
    cycle : int, default 1
        Model cycle hour, rendered as ``t{cycle:02}z`` in the file name.
        The default of 1 produces ``t01z`` (not noon, as the previous
        inline comment claimed).
    forecast_hour : int, default 1
        Offset from the cycle time, rendered as a two-digit suffix.
    product : str, default "wrfsfcf"
        Product prefix of the file name (2D surface levels).
    sector : str, default "conus"
        Sector directory inside the daily folder.

    Returns
    -------
    str
        Fully qualified URL of the GRIB2 file.
    """
    blob_container = 'https://noaa-hrrr-bdp-pds.s3.amazonaws.com'

    # Only strings need parsing; date-like objects are formatted directly.
    if isinstance(date, str):
        date = datetime.strptime(date, "%Y-%m-%d").date()

    file_path = f"hrrr.t{cycle:02}z.{product}{forecast_hour:02}.grib2"
    return f"{blob_container}/hrrr.{date:%Y%m%d}/{sector}/{file_path}"

In [15]:
def get_file(date):
    """Download the HRRR file for ``date`` and store it in the tmp folder.

    Parameters
    ----------
    date : str
        Date passed to ``get_url`` and embedded in the local file name.

    Returns
    -------
    str
        Path of the file that was written.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    url = get_url(date)
    file_path = f'../data/downloaded/nrr/tmp/nrr_{date}.txt'
    resp = requests.get(url, timeout=2)
    # Fail before touching the disk: otherwise an error body would be saved
    # and later handed to the GRIB reader as if it were valid data.
    resp.raise_for_status()
    # The context manager guarantees the handle is flushed and closed.
    with open(file_path, 'wb') as f:
        f.write(resp.content)

    return file_path

In [16]:
def get_ds(file_name):
    """Open ``file_name`` as an xarray dataset using the cfgrib engine.

    Only GRIB messages with ``stepType == 'instant'`` and
    ``typeOfLevel == 'surface'`` are selected.
    """
    message_filter = {'stepType': 'instant', 'typeOfLevel': 'surface'}
    # NOTE(review): an empty indexpath presumably stops cfgrib from writing
    # an index file next to the GRIB file -- confirm against the cfgrib docs.
    backend_options = {'indexpath': ''}

    dataset = xr.open_dataset(
        file_name,
        engine='cfgrib',
        backend_kwargs=backend_options,
        filter_by_keys=message_filter,
    )
    return dataset

In [17]:
def get_features(ds, row):
    """Extract every data variable of ``ds`` at the grid cell nearest to ``row``.

    "Nearest" is the cell minimising ``max(|dlat|, |dlon|)``. When several
    cells tie, the first one in row-major order is used (the previous
    implementation raised ``ValueError`` on ties).

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset exposing 2-D ``latitude``/``longitude`` fields over the
        ``y``/``x`` dimensions (assumed to be ordered ``(y, x)``, as in the
        original indexing). Longitudes are assumed to lie in ``[0, 360)``
        -- TODO confirm for non-HRRR inputs.
    row : pandas.Series
        Row providing ``uid``, ``latitude`` and ``longitude``.

    Returns
    -------
    dict
        ``{'uid': ..., <long_name>: <scalar value>, ...}`` with one entry
        per data variable.

    Raises
    ------
    ValueError
        If every distance is NaN (e.g. the row has a NaN coordinate).
    """
    lat = row.latitude
    # Map the longitude onto the grid's 0-360 convention. For negative
    # longitudes this equals the former ``+ 360``; it also stays correct
    # for longitudes that are already non-negative.
    lon = row.longitude % 360

    abslat = np.abs(ds.latitude - lat)
    abslon = np.abs(ds.longitude - lon)
    distance = np.asarray(np.maximum(abslon, abslat))

    # Flat argmin -> (y, x) position; always yields exactly one cell.
    y_idx, x_idx = np.unravel_index(np.nanargmin(distance), distance.shape)

    meta_info = {'uid': row['uid']}
    # The indices are positions, not labels, so select with ``isel``.
    point = ds.isel(y=int(y_idx), x=int(x_idx))
    for da in point.data_vars.values():
        meta_info[da.attrs['long_name']] = da.values.item()
    return meta_info

In [18]:
def gen_features(date, df, features):
    """Download the file for ``date`` and append features for its rows.

    Parameters
    ----------
    date : str
        Date used both to download the file and to select rows of ``df``.
    df : pandas.DataFrame
        Frame with a ``date`` column plus the columns ``get_features`` reads.
    features : list-like
        Output container (a plain list or a ``Manager().list()`` proxy).
        It is extended with one feature dict per matching row, and only
        after every row of the date has been processed successfully.
    """
    file_name = get_file(date)
    try:
        rows = df[df['date'] == date]
        # Close the dataset before the file is deleted.
        with get_ds(file_name) as ds:
            extracted = [get_features(ds, row) for _, row in rows.iterrows()]
        # One extend instead of many appends: a single round trip when
        # ``features`` is a multiprocessing proxy.
        features.extend(extracted)
    finally:
        # Remove the temporary download even when extraction fails.
        os.remove(file_name)

In [19]:
%%time
# Smoke test: run the per-date pipeline for the example row's date only,
# collecting the results in a plain Python list.
feat = []
date = example_row.date

gen_features(date, valid_points, feat)

CPU times: user 12.6 s, sys: 5.06 s, total: 17.7 s
Wall time: 4min 25s


In [20]:
test = valid_points[valid_points['date'] == example_row.date]

In [21]:
len(test) == len(feat)

True

In [22]:
%%time
def save_features_wrapper(args):
    """Unpack one ``(date, df, features)`` task tuple and run ``gen_features``.

    ``imap_unordered`` passes a single argument per task, hence the tuple.
    """
    date, df, features = args
    gen_features(date, df, features)

# Shared list that collects the feature dicts produced by the workers
manager = Manager()
features = manager.list()

# Number of distinct dates to process in this trial run (never more than exist)
head = min(2, len(dates))
selected_dates = dates.head(head)

# Total number of tasks, used for the progress bar
total_rows = len(selected_dates)

# One task per date. Each task carries only the rows of its own date, so the
# full frame is not pickled again for every task.
args = [
    (date, valid_points[valid_points['date'] == date], features)
    for date in selected_dates
]

# Never start more workers than there are tasks (Pool needs at least one)
with Pool(processes=min(32, max(total_rows, 1))) as pool:
    # tqdm renders the progress bar
    with tqdm(total=total_rows) as pbar:
        # Results arrive in completion order; only the count matters here
        for _ in pool.imap_unordered(save_features_wrapper, args):
            pbar.update(1)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [04:40<00:00, 140.28s/it]

CPU times: user 36.5 ms, sys: 154 ms, total: 190 ms
Wall time: 4min 40s





In [23]:
len(features)

94

In [24]:
features_df = pd.DataFrame.from_records(features)

In [25]:
features_df.head()

Unnamed: 0,uid,Visibility,Wind speed (gust),Surface pressure,Orography,Temperature,Plant canopy surface water,Water equivalent of accumulated snow depth (deprecated),Snow cover,Snow depth,...,Convective inhibition,Downward short-wave radiation flux,Downward long-wave radiation flux,Upward short-wave radiation flux,Upward long-wave radiation flux,Visible Beam Downward Solar Flux,Visible Diffuse Downward Solar Flux,Boundary layer height,Land-sea mask,Sea ice area fraction
0,aabn,23500.0,9.940825,100870.0,35.276505,296.571777,0.0,0.0,0.0,0.0,...,0.0,59.400002,346.200012,8.125,438.562164,249.0,33.900002,406.234253,1.0,0.0
1,agtu,14200.0,8.503325,100300.0,106.682755,297.821777,0.0,0.0,0.0,0.0,...,-47.0,0.0,397.600006,0.0,446.187164,0.0,0.0,158.296753,1.0,0.0
2,aozi,17000.0,9.190825,100610.0,75.120255,299.259277,0.0,0.0,0.0,0.0,...,-53.0,0.0,396.0,0.0,454.937164,0.0,0.0,420.234253,1.0,0.0
3,atvc,17000.0,9.190825,100610.0,75.120255,299.259277,0.0,0.0,0.0,0.0,...,-53.0,0.0,396.0,0.0,454.937164,0.0,0.0,420.234253,1.0,0.0
4,ayhg,16200.0,4.878325,100010.0,128.964005,298.571777,0.0,0.0,0.0,0.0,...,-35.0,0.0,391.700012,0.0,450.437164,0.0,0.0,400.921753,1.0,0.0
