In [1]:
import numpy as np
import pandas as pd
import xarray as xr
import os

from tqdm import tqdm
tqdm.pandas()
from tsfresh import extract_features
from tsfresh import extract_relevant_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import EfficientFCParameters 
# Feature-calculator settings handed to tsfresh below: extract only those
# time series features which are efficient to compute.
settings = EfficientFCParameters()

In [None]:
# Folder holding the data files used below
data_path = "../data/"

# Files of interest, all located inside data_path
cache_file = f"{data_path}time_series_features.csv"
path_weather_data = f"{data_path}era5_land_t2m_pev_tp.csv"
label_path = f"{data_path}labels.csv"

# The first three CSV columns form the index. Its third level is converted to
# datetime64[ns] before the table is turned into an xarray object.
df = pd.read_csv(path_weather_data, index_col=(0, 1, 2))
df.index = df.index.set_levels(
    df.index.levels[2].astype('datetime64[ns]'),
    level=2,
)
weather_data = df.to_xarray()

# Labels table with its 'date' column parsed into datetimes
labels = pd.read_csv(label_path).assign(
    date=lambda frame: pd.to_datetime(frame['date'], format='mixed')
)


In [None]:
def get_coords(img_name, labels, join_column):
    """Return the coordinates stored in `labels` for one image.

    Parameters
    ----------
    img_name
        Value to search for in ``labels[join_column]``.
    labels : pandas.DataFrame
        Table with at least the columns `join_column`, ``'lat'`` and ``'lon'``.
    join_column : str
        Name of the column that is compared against `img_name`.

    Returns
    -------
    tuple
        ``(lat, lon)`` taken from the first matching row.

    Raises
    ------
    IndexError
        If no row of `labels` matches `img_name`.
    """
    is_match = labels[join_column] == img_name
    # Only the first hit is used when several rows share the same name.
    first_hit = labels.loc[is_match].iloc[0]
    return (first_hit['lat'], first_hit['lon'])

def get_ts_of_coordinates(img_name, weather_data, labels, join_column):
    """Collect the weather time series preceding each image and write them to CSV.

    For every entry of `img_name`, the matching row of `labels` supplies a
    coordinate and a date. The grid cell of `weather_data` nearest to that
    coordinate is selected, restricted to the time window ending at the date,
    and tagged with an ``id`` column holding the entry's position in
    `img_name`. All windows are concatenated and written to
    ``data_path + "ts_coord.csv"`` (`data_path` is read from module scope).

    Parameters
    ----------
    img_name : iterable
        Values to look up in ``labels[join_column]``, one per time series.
    weather_data
        Object supporting ``.sel`` on ``latitude``, ``longitude`` and ``time``
        and ``.to_dataframe()`` (an xarray object in this notebook).
    labels : pandas.DataFrame
        Table with the columns `join_column`, ``'lat'``, ``'lon'`` and ``'date'``.
    join_column : str
        Column of `labels` that the entries of `img_name` are matched against.

    Returns
    -------
    None
        The result is only written to disk.

    Raises
    ------
    IndexError
        If an entry of `img_name` has no matching row in `labels`.
    ValueError
        From ``pd.concat`` if `img_name` is empty.
    """
    frames = []
    for series_id, name in enumerate(img_name):
        # get_coords returns (lat, lon). Unpacking it in the opposite order
        # would feed the longitude into `latitude=` (and vice versa), and
        # method='nearest' would then silently pick an unrelated grid cell.
        lat, lon = get_coords(name, labels, join_column)
        date = labels[labels[join_column] == name].iloc[0]['date']

        # NOTE(review): `second=1` (singular) replaces the seconds field of the
        # start timestamp instead of shifting it by one second; if a one-second
        # shift was intended this should be `seconds=1` -- confirm.
        window_start = date - pd.DateOffset(days=30, second=1)

        series = (
            weather_data
            .sel(latitude=lat, longitude=lon, method='nearest')
            .sel(time=slice(window_start, date))
            .to_dataframe()
            # The coordinate is constant within one series, so drop it.
            .drop(columns=['longitude', 'latitude'])
        )
        series['id'] = series_id
        frames.append(series)

    final = pd.concat(frames)
    final.to_csv(data_path + "ts_coord.csv")

In [None]:
# Write the per-image weather windows to ts_coord.csv, then load that file back.
get_ts_of_coordinates(
    img_name=labels['filename'],
    weather_data=weather_data,
    labels=labels,
    join_column='filename',
)
ts_coord = pd.read_csv(f"{data_path}ts_coord.csv")

In [None]:
# Feature extraction with tsfresh
# !!! Running this on the full dataframe will likely not work.
# !!! Extraction takes ~ 45 min for 4000 coordinates.

# One row of features per `id`, each series ordered by `time`; feature columns
# that contain any NaN are discarded.
ts_features = extract_features(
    ts_coord,
    column_id='id',
    column_sort='time',
    default_fc_parameters=settings,
).dropna(axis='columns')

ts_features.to_csv(f"{data_path}ts_features.csv")
