In [1]:
import pandas as pd
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor
from IPython.display import display
from functools import reduce

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def merge_df(dataframes):
    """
    Merges multiple pandas DataFrames on their index.

    Parameters:
    dataframes (list of pandas.DataFrame): The list of DataFrames to merge.
    how (str): Type of merge to perform:
        - 'left': use only keys from left frame (SQL: left outer join)
        - 'right': use only keys from right frame (SQL: right outer join)
        - 'outer': use union of keys from both frames (SQL: full outer join)
        - 'inner': use intersection of keys from both frames (SQL: inner join)
    remove_duplicates (bool): Whether to remove duplicated rows after merging.

    Returns:
    pandas.DataFrame: The merged DataFrame.
    """
    # Start with the first DataFrame in the list
    merged_df = dataframes[0]

    # Iteratively merge each DataFrame in the list
    for df in dataframes[1:]:
        merged_df = merged_df.join(df, how='left')

    return merged_df


In [3]:
def concat_df(dataframes, axis='index'):
    # Identify common columns by intersecting all DataFrame columns
    common_columns = reduce(lambda x, y: x.intersection(y.columns), dataframes, dataframes[0].columns)

    # Reindex all DataFrames to these common columns
    dfs_common = [df[common_columns] for df in dataframes]

    # Concatenate the reindexed DataFrames
    return pd.concat(dfs_common, axis=axis)

In [4]:
def read_parquet(filepath,index=None,loc=None):
    df = pd.read_parquet(filepath)
    if index:
        df.set_index(index, inplace=True)
        df.index.name = 'index'
        df.index = pd.to_datetime(df.index)
        # df['timestamp'] = pd.to_datetime(df.index).astype('datetime64[ns]')
    return df

In [5]:
def get_min_time_delta(df):

    data_sorted = df.sort_index()
    time_deltas = data_sorted.index.to_series().diff().dropna()
    non_zero_deltas = time_deltas[time_deltas != pd.Timedelta(0)]
    min_time_delta = non_zero_deltas.min().total_seconds()

    if min_time_delta < 3600:
        # Convert to minutes
        return f"{min_time_delta // 60:.0f}T"
    else:
        # Convert to hours
        return f"{min_time_delta // 3600:.0f}H"


# Start

In [6]:
Y_train           = read_parquet('/home/andres/ml/data/A/train_targets.parquet',     'time',         'A')
X_train_estimated = read_parquet('/home/andres/ml/data/A/X_train_estimated.parquet', 'date_forecast','A')
X_train_observed  = read_parquet('/home/andres/ml/data/A/X_train_observed.parquet',  'date_forecast','A')

In [7]:
X_train = concat_df([X_train_estimated, X_train_observed])
X_train.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 136245 entries, 2022-10-28 22:00:00 to 2022-10-21 01:00:00
Data columns (total 45 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   absolute_humidity_2m:gm3        136245 non-null  float32
 1   air_density_2m:kgm3             136245 non-null  float32
 2   ceiling_height_agl:m            110079 non-null  float32
 3   clear_sky_energy_1h:J           136245 non-null  float32
 4   clear_sky_rad:W                 136245 non-null  float32
 5   cloud_base_agl:m                126085 non-null  float32
 6   dew_or_rime:idx                 136245 non-null  float32
 7   dew_point_2m:K                  136245 non-null  float32
 8   diffuse_rad:W                   136245 non-null  float32
 9   diffuse_rad_1h:J                136245 non-null  float32
 10  direct_rad:W                    136245 non-null  float32
 11  direct_rad_1h:J                 136245 non-n

In [8]:
def interpolate(data):

    interpolation_methods = {
        'absolute_humidity_2m:gm3': 'index',
        'air_density_2m:kgm3': 'index',
        'ceiling_height_agl:m': 'index',
        'clear_sky_energy_1h:J': 'cubic',
        'clear_sky_rad:W': 'cubic',
        'cloud_base_agl:m': 'pchip',
        'dew_or_rime:idx': 'nearest',
        'dew_point_2m:K': 'linear',
        'diffuse_rad:W': 'cubic',
        'diffuse_rad_1h:J': 'cubic',
        'direct_rad:W': 'cubic',
        'direct_rad_1h:J': 'cubic',
        'effective_cloud_cover:p': 'index',
        'elevation:m': 'pad',
        'fresh_snow_12h:cm': 'zero',
        'fresh_snow_1h:cm': 'zero',
        'fresh_snow_24h:cm': 'zero',
        'fresh_snow_3h:cm': 'zero',
        'fresh_snow_6h:cm': 'zero',
        'is_day:idx': 'pad',
        'is_in_shadow:idx': 'pad',
        'msl_pressure:hPa': 'time',
        'precip_5min:mm': 'index',
        'precip_type_5min:idx': 'nearest',
        'pressure_100m:hPa': 'index',
        'pressure_50m:hPa': 'index',
        'prob_rime:p': 'index',
        'rain_water:kgm2': 'index',
        'relative_humidity_1000hPa:p': 'index',
        'sfc_pressure:hPa': 'time',
        'snow_density:kgm3': 'zero',
        'snow_depth:cm': 'nearest',
        'snow_drift:idx': 'pad',
        'snow_melt_10min:mm': 'index',
        'snow_water:kgm2': 'index',
        'sun_azimuth:d': 'cubic',
        'sun_elevation:d': 'cubic',
        'super_cooled_liquid_water:kgm2': 'index',
        't_1000hPa:K': 'index',
        'total_cloud_cover:p': 'index',
        'visibility:m': 'index',
        'wind_speed_10m:ms': 'index',
        'wind_speed_u_10m:ms': 'index',
        'wind_speed_v_10m:ms': 'index',
        'wind_speed_w_1000hPa:ms': 'index',
        'pv_measurement':'index'
    }
    for column in data.columns:
        data[column].interpolate(method=interpolation_methods.get(column, 'linear'), inplace=True)
        if data[column].isna().any():
            print(column + ' has NaN, replacing NaN with 0.')
            data[column] = data[column].fillna(0)
    return data

In [9]:
freq = get_min_time_delta(X_train)
X_train = X_train.resample(freq).asfreq(fill_value=None)
train_data = merge_df([interpolate(X_train),interpolate(Y_train)])
interpolate(train_data)
train_data['timestamp'] = train_data.index.astype('datetime64[ns]')
print(train_data['timestamp'])
train_data['location'] = 'A'
train_data = TimeSeriesDataFrame.from_data_frame(
    train_data,
    id_column="location",
    timestamp_column="timestamp",
)


snow_density:kgm3 has NaN, replacing NaN with 0.
index
2019-06-02 22:00:00   2019-06-02 22:00:00
2019-06-02 22:15:00   2019-06-02 22:15:00
2019-06-02 22:30:00   2019-06-02 22:30:00
2019-06-02 22:45:00   2019-06-02 22:45:00
2019-06-02 23:00:00   2019-06-02 23:00:00
                              ...        
2023-04-30 22:45:00   2023-04-30 22:45:00
2023-04-30 23:00:00   2023-04-30 23:00:00
2023-04-30 23:15:00   2023-04-30 23:15:00
2023-04-30 23:30:00   2023-04-30 23:30:00
2023-04-30 23:45:00   2023-04-30 23:45:00
Freq: 15T, Name: timestamp, Length: 137096, dtype: datetime64[ns]


  data[column].interpolate(method=interpolation_methods.get(column, 'linear'), inplace=True)
  data[column].interpolate(method=interpolation_methods.get(column, 'linear'), inplace=True)
  data[column].interpolate(method=interpolation_methods.get(column, 'linear'), inplace=True)
  data[column].interpolate(method=interpolation_methods.get(column, 'linear'), inplace=True)
  data[column].interpolate(method=interpolation_methods.get(column, 'linear'), inplace=True)
  data[column].interpolate(method=interpolation_methods.get(column, 'linear'), inplace=True)


In [19]:
test = pd.read_csv('/home/andres/ml/data/test.csv', index_col=['location', 'time'], parse_dates=['time'])
# test.index.rename(names={'time': 'index', 'location': 'location'}, inplace=True)
print(test)
sub_df = test.loc['A']
test = sub_df
# print(sub_df)
# freq = get_min_time_delta(sub_df)
# print(freq)
# test = sub_df.resample(freq).asfreq(fill_value=None)
num_predictions = test.shape[0]
print(num_predictions)

                                id  prediction
location time                                 
A        2023-05-01 00:00:00     0           0
         2023-05-01 01:00:00     1           0
         2023-05-01 02:00:00     2           0
         2023-05-01 03:00:00     3           0
         2023-05-01 04:00:00     4           0
...                            ...         ...
C        2023-07-03 19:00:00  2155           0
         2023-07-03 20:00:00  2156           0
         2023-07-03 21:00:00  2157           0
         2023-07-03 22:00:00  2158           0
         2023-07-03 23:00:00  2159           0

[2160 rows x 2 columns]
                      id  prediction
time                                
2023-05-01 00:00:00    0           0
2023-05-01 01:00:00    1           0
2023-05-01 02:00:00    2           0
2023-05-01 03:00:00    3           0
2023-05-01 04:00:00    4           0
...                  ...         ...
2023-07-03 19:00:00  715           0
2023-07-03 20:00:00  716       

In [11]:
(train_data_s, test_data_s) = train_data.train_test_split(prediction_length=num_predictions)

In [12]:
X_pred = read_parquet('/home/andres/ml/data/A/X_test_estimated.parquet',  'date_forecast','A')
freq = get_min_time_delta(X_pred)
X_pred = X_pred.resample(freq).asfreq(fill_value=None)
interpolate(X_pred)
X_pred['timestamp'] = X_pred.index.astype('datetime64[ns]')
print(X_pred['timestamp'])
X_pred['location'] = 'A'
X_pred = TimeSeriesDataFrame.from_data_frame(
    X_pred,
    id_column="location",
    timestamp_column="timestamp",
)


  data[column].interpolate(method=interpolation_methods.get(column, 'linear'), inplace=True)
  data[column].interpolate(method=interpolation_methods.get(column, 'linear'), inplace=True)
  data[column].interpolate(method=interpolation_methods.get(column, 'linear'), inplace=True)


snow_density:kgm3 has NaN, replacing NaN with 0.
index
2023-05-01 00:00:00   2023-05-01 00:00:00
2023-05-01 00:15:00   2023-05-01 00:15:00
2023-05-01 00:30:00   2023-05-01 00:30:00
2023-05-01 00:45:00   2023-05-01 00:45:00
2023-05-01 01:00:00   2023-05-01 01:00:00
                              ...        
2023-07-03 22:45:00   2023-07-03 22:45:00
2023-07-03 23:00:00   2023-07-03 23:00:00
2023-07-03 23:15:00   2023-07-03 23:15:00
2023-07-03 23:30:00   2023-07-03 23:30:00
2023-07-03 23:45:00   2023-07-03 23:45:00
Freq: 15T, Name: timestamp, Length: 6144, dtype: datetime64[ns]


# Model Selection & Training

In [13]:
print(num_predictions)

1536


In [14]:
predictor = TimeSeriesPredictor(
    prediction_length=num_predictions,
    path="autogluon",
    target="pv_measurement",
    eval_metric="MSE",
)

predictor.fit(
    train_data_s,
    presets="fast_training",
    time_limit=600,
)

TimeSeriesPredictor.fit() called
Setting presets to: fast_training
Fitting with arguments:
{'enable_ensemble': True,
 'evaluation_metric': 'MSE',
 'excluded_model_types': None,
 'hyperparameter_tune_kwargs': None,
 'hyperparameters': 'fast_training',
 'num_val_windows': 1,
 'prediction_length': 1536,
 'random_seed': None,
 'target': 'pv_measurement',
 'time_limit': 600,
 'verbosity': 2}
Provided training data set with 135560 rows, 1 items (item = single time series). Average time series length is 135560.0. Data frequency is '15T'.
AutoGluon will save models to autogluon/
AutoGluon will gauge predictive performance using evaluation metric: 'MSE'
	This metric's sign has been flipped to adhere to being 'higher is better'. The reported score can be multiplied by -1 to get the metric value.

Provided dataset contains following columns:
	target:           'pv_measurement'
	past covariates:  ['absolute_humidity_2m:gm3', 'air_density_2m:kgm3', 'ceiling_height_agl:m', 'clear_sky_energy_1h:J', '

<autogluon.timeseries.predictor.TimeSeriesPredictor at 0x7f0d8a3184f0>

# Model Evaluation

In [15]:
print(predictor.leaderboard(test_data_s, silent=True))

Additional data provided, testing on additional data. Resulting leaderboard will be sorted according to test score (`score_test`).


              model    score_test     score_val  pred_time_test  \
0             Theta -1.069123e+06 -1.539922e+06       13.160655   
1  WeightedEnsemble -1.278381e+06 -3.782835e+05       33.163072   
2  RecursiveTabular -1.303030e+06 -4.991115e+05       16.849710   
3     SeasonalNaive -1.352936e+06 -5.574025e+05        1.577496   
4             Naive -3.000111e+06 -4.262677e+06        1.572455   
5               ETS -1.327486e+07 -2.811896e+06        7.362496   

   pred_time_val  fit_time_marginal  fit_order  
0      13.187943           0.093085          3  
1      34.480164           0.830675          6  
2      17.976666           3.847470          5  
3       1.570396           0.095502          2  
4       1.745158           0.094488          1  
5       5.704984           0.093964          4  


In [16]:
predictor.evaluate(test_data_s)

Model not specified in predict, will default to the model with the best validation score: WeightedEnsemble


-1278380.9952781294

# Testing

In [17]:
predictions = predictor.predict(train_data, known_covariates=X_pred)
print(predictions.info())

Global seed set to 123
Model not specified in predict, will default to the model with the best validation score: WeightedEnsemble


<class 'autogluon.timeseries.dataset.ts_dataframe.TimeSeriesDataFrame'>
MultiIndex: 1536 entries, ('A', Timestamp('2023-05-01 00:00:00')) to ('A', Timestamp('2023-05-16 23:45:00'))
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   mean    1536 non-null   float64
 1   0.1     1536 non-null   float64
 2   0.2     1536 non-null   float64
 3   0.3     1536 non-null   float64
 4   0.4     1536 non-null   float64
 5   0.5     1536 non-null   float64
 6   0.6     1536 non-null   float64
 7   0.7     1536 non-null   float64
 8   0.8     1536 non-null   float64
 9   0.9     1536 non-null   float64
dtypes: float64(10)
memory usage: 136.7+ KB
None


In [18]:
print(num_predictions)

1536
