In [8]:
import pyarrow.parquet as pf
import os
import pandas as pd

def iterate_levels():
  """Yield one dataframe per monthly river-level parquet file that exists.

  Months are visited chronologically from January 2020 through December 2023;
  months without a file on disk are skipped silently.
  """
  periods = ((yr, mo) for yr in range(2020, 2024) for mo in range(1, 13))
  for yr, mo in periods:
    monthly_path = f'data/level/river_wear_level_{yr}_{mo}.parquet'
    if os.path.exists(monthly_path):
      yield pd.read_parquet(monthly_path)
            
def iterate_flow():
  """Yield one dataframe per monthly river-flow parquet file that exists.

  Months are visited chronologically from January 2020 through December 2023;
  months without a file on disk are skipped silently.
  """
  periods = ((yr, mo) for yr in range(2020, 2024) for mo in range(1, 13))
  for yr, mo in periods:
    monthly_path = f'data/flow/river_wear_flow_{yr}_{mo}.parquet'
    if os.path.exists(monthly_path):
      yield pd.read_parquet(monthly_path)

In [17]:
class MeasureRandomAccess:
  """Look up rows by timestamp across parquet files, caching loaded files.

  Subclasses implement `_timestamp_to_filepath` to map a timestamp to the
  parquet file that holds it. Loaded dataframes are cached per file path, so
  every timestamp that maps to the same file shares a single read. Once more
  than `max_cached_files` dataframes are held, the least recently used one is
  dropped.
  """

  def __init__(self, max_cached_files=5):
    """Create an empty cache.

    Args:
      max_cached_files: maximum number of loaded dataframes kept in memory.
    """
    # Maps file path -> dataframe. Dict insertion order doubles as recency
    # order (oldest first); `get_timestamp` relies on that for eviction.
    self.cache = {}
    self.max_cached_files = max_cached_files

  def _timestamp_to_filepath(self, timestamp: pd.Timestamp):
    """Return the path of the parquet file holding `timestamp` (subclass hook)."""
    raise NotImplementedError()

  def _check_cache(self, timestamp: pd.Timestamp):
    """Return the cached dataframe for the file holding `timestamp`, or None."""
    return self.cache.get(self._timestamp_to_filepath(timestamp))

  def get_timestamp(self, timestamp: pd.Timestamp):
    """Return the row stored at `timestamp`.

    Args:
      timestamp: index label to look up in the file chosen by
        `_timestamp_to_filepath`.

    Returns:
      The result of `dataframe.loc[timestamp]` for that file.

    Raises:
      KeyError: if `timestamp` is not in the file's index.
      Whatever `pd.read_parquet` raises if the file cannot be read.
    """
    filepath = self._timestamp_to_filepath(timestamp)
    # Compare against None explicitly: the truth value of a DataFrame is
    # ambiguous and raises ValueError, so `if cached:` would fail on every hit.
    dataframe = self.cache.pop(filepath, None)
    if dataframe is None:
      dataframe = pd.read_parquet(filepath)
    # (Re)insert at the end so this file becomes the most recently used entry.
    self.cache[filepath] = dataframe
    # Evict from the front (least recently used). `popitem()` would instead
    # remove the entry that was just inserted.
    while self.cache and len(self.cache) > self.max_cached_files:
      del self.cache[next(iter(self.cache))]

    return dataframe.loc[timestamp]

class FlowRandomAccess(MeasureRandomAccess):
  """Random access to river-flow readings stored in monthly parquet files."""

  def __init__(self, **kwargs):
    # Keyword-only pass-through to the base-class constructor.
    super().__init__(**kwargs)

  def _timestamp_to_filepath(self, timestamp: pd.Timestamp):
    """Return the path of the monthly flow file for `timestamp`'s year and month."""
    year, month = timestamp.year, timestamp.month
    return f'data/flow/river_wear_flow_{year}_{month}.parquet'
  
class LevelRandomAccess(MeasureRandomAccess):
  """Random access to river-level readings stored in monthly parquet files."""

  def __init__(self, **kwargs):
    # Keyword-only pass-through to the base-class constructor.
    super().__init__(**kwargs)

  def _timestamp_to_filepath(self, timestamp: pd.Timestamp):
    """Return the path of the monthly level file for `timestamp`'s year and month."""
    year, month = timestamp.year, timestamp.month
    return f'data/level/river_wear_level_{year}_{month}.parquet'
  
class RainfallRandomAccess(MeasureRandomAccess):
  """Random access to rainfall readings stored in monthly parquet files."""

  def __init__(self, **kwargs):
    # Keyword-only pass-through to the base-class constructor.
    super().__init__(**kwargs)

  def _timestamp_to_filepath(self, timestamp: pd.Timestamp):
    """Return the path of the monthly rainfall file for `timestamp`'s year and month."""
    year, month = timestamp.year, timestamp.month
    return f'data/rainfall0/rainfall_{year}_{month}.parquet'

In [33]:
import numpy as np

class DatasetBuilder:
  """Builds feature records from flow, level and rainfall readings.

  `X_shifts` maps a measure name ('flow', 'level', 'rainfall') to a list of
  offsets in minutes. For a given timestamp, `build_record` looks up each
  measure at `timestamp + offset` for every configured offset and joins the
  values into one flat array.

  NOTE(review): `y_shifts` is stored but nothing in this class reads it yet;
  records currently contain input features only.
  """

  def __init__(self, X_shifts=None, y_shifts=None):
    """Store the shift configuration and create one accessor per measure.

    Args:
      X_shifts: dict of measure name -> list of minute offsets for the
        features. Defaults to [-15, 0] for flow, level and rainfall.
      y_shifts: dict of measure name -> list of minute offsets for the
        targets. Defaults to {'level': [15]}.
    """
    # Defaults are built per instance: dict literals in the signature would be
    # a single mutable object shared by every DatasetBuilder.
    if X_shifts is None:
      X_shifts = {
        'flow': [-15, 0],
        'level': [-15, 0],
        'rainfall': [-15, 0],
      }
    if y_shifts is None:
      y_shifts = {
        'level': [15]
      }
    self.X_shifts = X_shifts
    self.y_shifts = y_shifts
    self.flow_random_access = FlowRandomAccess()
    self.level_random_access = LevelRandomAccess()
    self.rainfall_random_access = RainfallRandomAccess()

  def generate_data(self, start_timestamp: pd.Timestamp, end_timestamp: pd.Timestamp):
    """Yield records for random timestamps, without end.

    Each timestamp is drawn uniformly between `start_timestamp` (inclusive)
    and `end_timestamp` (exclusive) and then rounded to the nearest 15
    minutes, so the rounded value can equal `end_timestamp`.
    """
    while True:
      # generate a random timestamp to 15 mins
      timestamp = pd.Timestamp(np.random.randint(start_timestamp.value, end_timestamp.value, dtype=np.int64)).round('15min')
      yield self.build_record(timestamp)

  def _values_at_offsets(self, random_access, timestamp: pd.Timestamp, offsets):
    """Return one value array per offset, read from `random_access`."""
    return [
      random_access.get_timestamp(timestamp + pd.Timedelta(minutes=offset)).values
      for offset in offsets
    ]

  def build_record(self, timestamp: pd.Timestamp):
    """Return the flat feature array for `timestamp`.

    Values are ordered flow, then level, then rainfall; within a measure they
    follow the order of its offsets in `X_shifts`. A measure missing from
    `X_shifts` contributes nothing. Returns an empty array when no offsets
    are configured at all.
    """
    sources = (
      ('flow', self.flow_random_access),
      ('level', self.level_random_access),
      ('rainfall', self.rainfall_random_access),
    )
    pieces = []
    for measure, random_access in sources:
      # Keep the values for every offset; assigning inside the loop would
      # leave only the last offset in the record.
      pieces.extend(self._values_at_offsets(random_access, timestamp, self.X_shifts.get(measure, [])))
    if not pieces:
      # np.concatenate raises on an empty sequence.
      return np.array([])
    return np.concatenate(pieces)
    
    
# Smoke test: build one record for a random timestamp in January 2020.
dataset_builder = DatasetBuilder()

# The generator never ends, so take just its first record.
record = next(dataset_builder.generate_data(pd.Timestamp('2020-01-01'), pd.Timestamp('2020-02-01')))
print(record)

[19.8    4.58  13.5    9.36   0.629  0.505  0.552  0.564  0.635  0.
  0.       nan]
