In [28]:
import pandas as pd
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor
from IPython.display import display
from functools import reduce
import os
import glob

In [29]:
def merge_df(dataframes, how='left'):
    """
    Merges multiple pandas DataFrames on their index.

    The first DataFrame is the base; every following DataFrame is joined onto
    the running result with ``DataFrame.join``, in list order.

    Parameters:
    dataframes (list of pandas.DataFrame): The DataFrames to merge. Must
        contain at least one element.
    how (str): Type of join applied at every step (default 'left'):
        - 'left': use only keys from left frame (SQL: left outer join)
        - 'right': use only keys from right frame (SQL: right outer join)
        - 'outer': use union of keys from both frames (SQL: full outer join)
        - 'inner': use intersection of keys from both frames (SQL: inner join)

    Returns:
    pandas.DataFrame: The merged DataFrame. When only one DataFrame is given
        it is returned as-is (the same object, not a copy).

    Raises:
    IndexError: If ``dataframes`` is empty.
    """
    # Start with the first DataFrame in the list
    merged_df = dataframes[0]

    # Iteratively join each remaining DataFrame onto the running result
    for df in dataframes[1:]:
        merged_df = merged_df.join(df, how=how)

    return merged_df


In [30]:
def concat_df(dataframes, axis='index'):
    """
    Concatenate DataFrames after restricting them to their shared columns.

    Parameters:
    dataframes (list of pandas.DataFrame): Frames to concatenate; must not be
        empty because the first frame seeds the column intersection.
    axis: Passed straight to ``pd.concat`` (default 'index').

    Returns:
    pandas.DataFrame: The concatenation of every frame, each reduced to the
        columns present in all of them.
    """
    # Narrow the first frame's columns down to those every frame contains.
    shared = dataframes[0].columns
    for frame in dataframes:
        shared = shared.intersection(frame.columns)

    # Select the shared columns from each frame and stack the results.
    return pd.concat([frame[shared] for frame in dataframes], axis=axis)

In [31]:
def read_parquet(filepath):
    """
    Read one parquet file and label its rows with data type and location.

    The first of the columns 'date_forecast' / 'time' that exists becomes the
    index. The index is then replaced by a three-level MultiIndex
    ('Data_Type', 'Location', 'Time') where:
      - Data_Type is the file name without its extension,
      - Location is the name of the directory containing the file,
      - Time is the index the frame had at that point.

    Parameters:
    filepath (str): Path to the parquet file.

    Returns:
    pandas.DataFrame: The labelled frame, or an empty DataFrame when reading
        or labelling fails (the error is printed, not raised).
    """
    try:
        # Read the file
        df = pd.read_parquet(filepath)

        # Use the first recognised datetime column as the index. If none is
        # present the existing index is kept and ends up as the 'Time' level.
        for column in ('date_forecast', 'time'):
            if column in df.columns:
                df = df.set_index(column)
                break
        else:
            print(f"Datetime column not found in {filepath}")

        # Get the location from the parent directory and the data type from
        # the file name. splitext strips only the final extension, whereas
        # splitting on '.parquet' would cut at every occurrence in the name.
        location = os.path.basename(os.path.dirname(filepath))
        data_type = os.path.splitext(os.path.basename(filepath))[0]

        # Create a MultiIndex; the two single-element lists repeat the labels
        # for every entry of the existing index.
        df.index = pd.MultiIndex.from_product(
            [[data_type], [location], df.index],
            names=['Data_Type', 'Location', 'Time'],
        )

        return df

    except Exception as e:
        # Deliberate best-effort: one unreadable file must not abort a bulk
        # load. The path is included so the failing file can be identified.
        print(f"An error occurred while reading {filepath}: {e}")
        return pd.DataFrame()

In [32]:
def get_min_time_delta(df):
    """
    Return the smallest non-zero spacing of ``df``'s index as a frequency string.

    The index is sorted, consecutive differences are taken, zero differences
    (duplicate index entries) are ignored, and the minimum of the rest is
    converted to a pandas frequency alias that can be passed to ``resample``.

    The alias is produced by pandas itself, so the spacing is represented
    exactly (90 minutes stays 90 minutes, sub-minute spacing becomes seconds)
    and uses the spelling the installed pandas version expects.

    Parameters:
    df (pandas.DataFrame): Frame whose index holds timestamps.

    Returns:
    str: Frequency alias for the minimum non-zero time delta.

    Raises:
    ValueError: If the index has no two distinct entries, so no spacing exists.
    """
    # Local import keeps this helper self-contained.
    from pandas.tseries.frequencies import to_offset

    # Only the index matters, so sort it directly instead of the whole frame.
    ordered_index = df.index.sort_values()
    time_deltas = ordered_index.to_series().diff().dropna()
    non_zero_deltas = time_deltas[time_deltas != pd.Timedelta(0)]

    if non_zero_deltas.empty:
        raise ValueError(
            "Cannot infer a time delta: the index needs at least two distinct timestamps"
        )

    return to_offset(non_zero_deltas.min()).freqstr


# Data Loading & Preprocessing

In [33]:

dataframes = []
directory = '/home/andres/ml/data/'
# glob returns matches in arbitrary order; sorting makes the row order of
# `data` the same on every run and every machine.
filepaths = sorted(glob.glob(os.path.join(directory, '**', '*.parquet'), recursive=True))

# read_parquet labels every frame with (Data_Type, Location, Time) and returns
# an empty frame for files it could not read.
for filepath in filepaths:
    df = read_parquet(filepath)
    dataframes.append(df)
# Stack all files; pd.concat raises ValueError if no parquet file was found.
data = pd.concat(dataframes)

In [34]:
# Slice the combined frame by its outermost index level (the file name set by
# read_parquet); each slice keeps the remaining index levels.
Y_train = data.xs('train_targets')
X_train_observed = data.xs('X_train_observed')
X_train_estimated = data.xs('X_train_estimated')
# Observed rows first, estimated rows appended below them.
X_train = pd.concat((X_train_observed, X_train_estimated), axis=0)

In [35]:
def interpolate(data):
    """
    Fill missing values in every column of ``data``, modifying it in place.

    Each column is interpolated with the method listed for it in the table
    below ('linear' for columns not listed). Values that are still missing
    afterwards (for example leading gaps) are replaced by 0 and a message is
    printed for that column.

    Results are written back with an explicit column assignment rather than a
    chained ``data[column].interpolate(..., inplace=True)``, which acts on a
    temporary Series and is not guaranteed to update ``data``.

    Parameters:
    data (pandas.DataFrame): Frame to fill. Methods 'index' and 'time' use the
        frame's index values; NOTE(review): 'cubic', 'pchip', 'nearest' and
        'zero' are scipy-backed in pandas — confirm scipy is installed.

    Returns:
    pandas.DataFrame: The same object that was passed in.
    """

    interpolation_methods = {
        'absolute_humidity_2m:gm3': 'index',
        'air_density_2m:kgm3': 'index',
        'ceiling_height_agl:m': 'index',
        'clear_sky_energy_1h:J': 'cubic',
        'clear_sky_rad:W': 'cubic',
        'cloud_base_agl:m': 'pchip',
        'dew_or_rime:idx': 'nearest',
        'dew_point_2m:K': 'linear',
        'diffuse_rad:W': 'cubic',
        'diffuse_rad_1h:J': 'cubic',
        'direct_rad:W': 'cubic',
        'direct_rad_1h:J': 'cubic',
        'effective_cloud_cover:p': 'index',
        'elevation:m': 'pad',
        'fresh_snow_12h:cm': 'zero',
        'fresh_snow_1h:cm': 'zero',
        'fresh_snow_24h:cm': 'zero',
        'fresh_snow_3h:cm': 'zero',
        'fresh_snow_6h:cm': 'zero',
        'is_day:idx': 'pad',
        'is_in_shadow:idx': 'pad',
        'msl_pressure:hPa': 'time',
        'precip_5min:mm': 'index',
        'precip_type_5min:idx': 'nearest',
        'pressure_100m:hPa': 'index',
        'pressure_50m:hPa': 'index',
        'prob_rime:p': 'index',
        'rain_water:kgm2': 'index',
        'relative_humidity_1000hPa:p': 'index',
        'sfc_pressure:hPa': 'time',
        'snow_density:kgm3': 'zero',
        'snow_depth:cm': 'nearest',
        'snow_drift:idx': 'pad',
        'snow_melt_10min:mm': 'index',
        'snow_water:kgm2': 'index',
        'sun_azimuth:d': 'cubic',
        'sun_elevation:d': 'cubic',
        'super_cooled_liquid_water:kgm2': 'index',
        't_1000hPa:K': 'index',
        'total_cloud_cover:p': 'index',
        'visibility:m': 'index',
        'wind_speed_10m:ms': 'index',
        'wind_speed_u_10m:ms': 'index',
        'wind_speed_v_10m:ms': 'index',
        'wind_speed_w_1000hPa:ms': 'index',
        'pv_measurement':'index'
    }
    for column in data.columns:
        method = interpolation_methods.get(column, 'linear')
        series = data[column]

        if method == 'pad':
            # Forward fill; interpolate(method='pad') is deprecated in pandas
            # in favour of ffill().
            series = series.ffill()
        elif pd.api.types.is_numeric_dtype(series):
            # Only numeric columns can be interpolated; label or datetime
            # columns are left as they are.
            series = series.interpolate(method=method)

        if series.isna().any():
            print(column + ' has NaN, replacing NaN with 0.')
            series = series.fillna(0)

        # Explicit write-back so the caller's frame is updated.
        data[column] = series
    return data

In [36]:
# Build one training frame per location (features joined with that location's
# targets) and stack them into `df`, indexed by (Location, Time).
sub_dfs = []
for index_combination, sub_df in X_train.groupby(level='Location'):
    # Features of this location on a regular 15-minute grid. The Location
    # level is dropped here and re-attached after the join; 'pv_measurement'
    # is removed from the features because the target is joined in below.
    sub_df = (
        sub_df.reset_index(level='Location', drop=True)
        .resample('15T')
        .asfreq(fill_value=None)
        .drop(['pv_measurement', 'date_calc'], axis=1)
    )

    # Targets of THIS location only, indexed by Time. Joining the Time-indexed
    # features against the full (Location, Time) target index would match on
    # the shared 'Time' level and attach every location's targets to each
    # location. Y_train itself is left untouched so the cell can be re-run.
    targets = (
        Y_train.xs(index_combination, level='Location')[['pv_measurement']]
        .sort_index()
        .copy()
    )

    # Fill gaps in both frames, then left-join the targets onto the features.
    train_data = merge_df([interpolate(sub_df), interpolate(targets)])

    # Re-attach the location as the outer index level: (Location, Time).
    train_data['Location'] = index_combination
    train_data = train_data.set_index('Location', append=True).swaplevel()
    sub_dfs.append(train_data)
df = pd.concat(sub_dfs)

In [10]:
# Turn the index levels into ordinary columns so they can be named as the id
# and timestamp columns below.
# NOTE(review): this rebinds `df`, so running the cell a second time calls
# reset_index on the already-flattened frame.
df = df.reset_index()
df = df.assign(Time=df['Time'].astype('datetime64[ns]'))

# One time series per Location, ordered by Time.
train_data = TimeSeriesDataFrame.from_data_frame(df, id_column="Location", timestamp_column="Time")

In [11]:
# Load the test index, keyed by (location, time) with 'time' parsed as dates.
test = pd.read_csv(
    '/home/andres/ml/data/test.csv',
    parse_dates=['time'],
    index_col=['location', 'time'],
)
# Keep only the rows of location 'A'.
sub_df = test.loc['A']
test = sub_df
# Number of test rows for location 'A'; used as the prediction length later.
num_predictions = len(test)


In [12]:
# Split the training series into a fitting part and an evaluation part using
# a horizon of `num_predictions` steps.
train_data_s, test_data_s = train_data.train_test_split(
    prediction_length=num_predictions,
)

In [13]:
# read_parquet accepts only the file path: it picks the datetime column and
# derives the location from the path itself. Passing extra positional
# arguments raises TypeError.
X_pred = read_parquet('/home/andres/ml/data/A/X_test_estimated.parquet')
# read_parquet returns a (Data_Type, Location, Time) MultiIndex; drop the two
# label levels so the frame has a plain time index for the steps below.
X_pred = X_pred.droplevel(['Data_Type', 'Location'])

# Put the forecasts on a regular grid at their smallest observed spacing and
# fill the gaps this creates (interpolate modifies X_pred in place).
freq = get_min_time_delta(X_pred)
X_pred = X_pred.resample(freq).asfreq(fill_value=None)
interpolate(X_pred)

# Expose timestamp and series id as columns for TimeSeriesDataFrame.
X_pred['timestamp'] = X_pred.index.astype('datetime64[ns]')
print(X_pred['timestamp'])
X_pred['location'] = 'A'
X_pred = TimeSeriesDataFrame.from_data_frame(
    X_pred,
    id_column="location",
    timestamp_column="timestamp",
)


# Model Selection & Training

In [14]:
# Show the forecast horizon: the number of test rows for location 'A'.
print(num_predictions)

720


In [15]:
# Forecaster for the 'pv_measurement' column, scored by mean squared error and
# predicting `num_predictions` steps ahead; models are saved under ./autogluon.
# NOTE(review): no known_covariates_names is given here, yet the prediction
# cell passes known_covariates — confirm those covariates are actually used.
predictor = TimeSeriesPredictor(
    target="pv_measurement",
    prediction_length=num_predictions,
    eval_metric="MSE",
    path="autogluon",
)

# Train on the fitting split with the fast preset, capped at 600 seconds.
predictor.fit(
    train_data_s,
    time_limit=600,
    presets="fast_training",
)



# Model Evaluation

In [16]:
# Render the leaderboard of the fitted predictor, computed on the evaluation split.
display(predictor.leaderboard(test_data_s, silent=True))

In [17]:
# Evaluate the predictor on the evaluation split; the result is the cell output.
predictor.evaluate(test_data_s)

# Testing

In [18]:
# Forecast from the full training data, passing the prepared X_pred frame as
# known covariates.
predictions = predictor.predict(train_data, known_covariates=X_pred)
# info() prints its summary itself and returns None, so wrapping it in print()
# would add a stray "None" line to the output.
predictions.info()