In [1]:
import numpy as np
import pandas as pd
import xarray as xr
import os

from tqdm import tqdm
tqdm.pandas()

import dask.dataframe as dd

In [21]:
from tsfresh import extract_features
from tsfresh import extract_relevant_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import EfficientFCParameters
settings = EfficientFCParameters()
from tsfresh.utilities.distribution import MultiprocessingDistributor
Distributor = MultiprocessingDistributor(n_workers=4,
                                         disable_progressbar=False,
                                         progressbar_title="Feature Extraction")

In [5]:
data_path = "C:/Users/janwe/Desktop/Uni/ETH/HS2023/Hack4Good/Data/"

# Files and Folders of interest
cache_file = data_path + 'time_series_features.csv'
path_weather_data = data_path + 'era5_land_t2m_pev_tp.csv'
img_dir = data_path + "images"
label_path = data_path + "labels.csv"

In [3]:


df = pd.read_csv(path_weather_data, index_col=(0, 1, 2))
df.index = df.index.set_levels(df.index.levels[2].astype('datetime64[ns]'), level=2)
weather_data = df.to_xarray()
labels = pd.read_csv(label_path)
labels['date'] = pd.to_datetime(labels['date'], format='mixed')

In [41]:
def get_coords(img_name, labels, join_column):
    row = labels[labels[join_column] == img_name]
    return (row.iloc[0]['lat'], row.iloc[0]['lon'])

def old_extract_features(img_name, weather_data, labels, join_column):
    # get relevant data
    frames=[]
    for v,i in enumerate(img_name):
        (lon, lat) = get_coords(i, labels, join_column)
        date = labels[labels[join_column] == i].iloc[0]['date']
        df = weather_data.sel(latitude= lat, longitude= lon, method='nearest').sel(time = slice(date - pd.DateOffset(days=30, second=1), date)).to_dataframe()
        df.drop('longitude',axis=1,inplace=True)
        df.drop('latitude',axis=1,inplace=True)
        df['id']=v
        frames.append(df)
    final = pd.concat(frames)
    final.to_csv(data_path+"done.csv")
    # extract features
    day = (np.sin(2 * np.pi * date.timetuple().tm_yday/365.0), np.cos(2 * np.pi * date.timetuple().tm_yday/365.0))

In [46]:
old_extract_features(labels['filename'],weather_data,labels,'filename')

In [6]:
test = pd.read_csv(data_path+'test_ts.csv')

In [14]:
sub1=test.loc[test['id']%5==0]
sub2=test.loc[test['id']%5==1]
sub3=test.loc[test['id']%5==2]
sub4=test.loc[test['id']%5==3]
sub5=test.loc[test['id']%5==4]

In [8]:

X1 = extract_features(sub1, column_id='id', column_sort='time',default_fc_parameters=settings)

Feature Extraction: 100%|██████████| 20/20 [16:17<00:00, 48.87s/it]  


In [12]:
X2 = extract_features(sub2, column_id='id', column_sort='time',default_fc_parameters=settings,distributor=Distributor)

Feature Extraction: 100%|██████████| 40/40 [15:59<00:00, 23.98s/it]  


In [22]:
X5 = extract_features(sub5, column_id='id', column_sort='time',default_fc_parameters=settings,distributor=Distributor)
X5.to_csv(data_path+'ts_features5.csv')

Feature Extraction: 100%|██████████| 20/20 [15:03<00:00, 45.16s/it]  


In [23]:
Frames = [X1,X2,X3,X4,X5]

In [24]:
tf_features_full = pd.concat(Frames)

In [26]:
tf_features_full_narm=tf_features_full.dropna(axis=1)

In [29]:
tf_features_full_narm.to_csv(data_path+'tf_features_full_narm.csv')