# Data Processing

I've heard rumours that the API output doesn't include `data_block_id`, so in this notebook I'll need to find other ways of joining the data to keep it API-compatible.

In [13]:
import pandas as pd
import datetime as dt

## TODO: Find out the lag for each datetime

In [11]:
class TrainDataProcessor:
    """Processes Train data, using train data as a warm start, and prepares it for inference.

    Each constructor argument is a pandas DataFrame. Every table except
    ``revealed_targets`` and ``weather_mapping`` is returned with a parsed
    ``datetime`` (or ``date``) column and a ``data_block_id`` column derived
    from it. The input frames are never modified; each ``init_*`` method
    returns a new frame.

    The per-table lag adjustments described in the method docstrings are still
    TODO: ``data_block_id`` is currently computed from the unshifted timestamp.
    """

    def __init__(self, train, revealed_targets, client, historical_weather,
                 forecast_weather, electricity_prices, gas_prices, weather_mapping):
        self.train = self.init_train(train)
        self.revealed_targets = self.init_revealed_targets(revealed_targets)
        self.client = self.init_client(client)
        self.historical_weather = self.init_historical_weather(historical_weather)
        self.forecast_weather = self.init_forecast_weather(forecast_weather)
        self.electricity_prices = self.init_electricity(electricity_prices)
        self.gas_prices = self.init_gas_prices(gas_prices)
        self.weather_mapping = self.init_weather_mapping(weather_mapping)

    @staticmethod
    def get_data_block_id(df, datetime_col):
        """
        Find data_block_id from date.

        The id is the number of whole days between 2021-09-01 and the value in
        ``datetime_col``.

        Parameters
        ----------
        df : pandas.DataFrame
            Table containing ``datetime_col``.
        datetime_col : str
            Name of a column holding datetimes, dates, or parseable strings.

        Returns
        -------
        pandas.DataFrame
            A copy of ``df`` with an added ``data_block_id`` column.
        """
        base_date = dt.datetime(2021, 9, 1)

        # Coerce first: the column may hold strings or plain `date` objects
        # (as in the client table), neither of which supports `- base_date`.
        stamps = pd.to_datetime(df[datetime_col])

        # A tz-aware column cannot be subtracted from the naive base date;
        # drop the timezone while keeping the wall-clock time.
        if stamps.dt.tz is not None:
            stamps = stamps.dt.tz_localize(None)

        return df.assign(data_block_id=(stamps - base_date).dt.days)

    def init_train(self, df):
        """
        Prepares the training data for model training.

        Parses ``datetime``, adds a ``date`` column of Python ``date`` objects
        and adds ``data_block_id``.
        """
        stamps = pd.to_datetime(df['datetime'])
        df = df.assign(datetime=stamps, date=stamps.dt.date)
        return self.get_data_block_id(df, 'datetime')

    def init_electricity(self, df):
        """
        Processes the electricity prices data.

        Copies the parsed ``forecast_date`` into ``datetime`` and adds
        ``data_block_id``.

        TODO (lag = 1 day): move the forecast datetime ahead by 1 day before
        joining on ``datetime``. Not applied yet.
        """
        df = df.assign(datetime=pd.to_datetime(df['forecast_date']))
        return self.get_data_block_id(df, 'datetime')

    def init_historical_weather(self, df):
        """
        Processes the historical weather data.

        Parses ``datetime`` and adds ``data_block_id``.

        TODO (lag: from 11:00 AM 2 days ago to 10:00 AM 1 day ago): decide
        whether to use the most recent observation or an average over the
        last day. Not applied yet.
        """
        df = df.assign(datetime=pd.to_datetime(df['datetime']))
        return self.get_data_block_id(df, 'datetime')

    def init_forecast_weather(self, df):
        """
        Processes the forecast weather data.

        Copies the parsed forecast timestamp into ``datetime`` and adds
        ``data_block_id``. The timestamp is read from ``forecast_datetime``
        when that column exists, otherwise from ``forecast_date``.

        TODO (lag: don't adjust): the forecast is from yesterday but covers
        today, which is 22 hours ahead. Drop rows where hours_ahead < 22 or
        hours_ahead > 45, then join on ``datetime``. Not applied yet.
        """
        if 'forecast_datetime' in df.columns:
            source_col = 'forecast_datetime'
        else:
            source_col = 'forecast_date'
        df = df.assign(datetime=pd.to_datetime(df[source_col]))
        return self.get_data_block_id(df, 'datetime')

    def init_gas_prices(self, df):
        """
        Processes the gas prices data.

        Copies the parsed ``forecast_date`` into ``datetime`` and adds
        ``data_block_id``.

        TODO (lag = 1 day): predictions are made 2 days ago for yesterday, so
        add one day to ``forecast_date`` and join on date. Not applied yet.
        """
        df = df.assign(datetime=pd.to_datetime(df['forecast_date']))
        return self.get_data_block_id(df, 'datetime')

    def init_revealed_targets(self, df):
        """Returns the revealed targets unchanged (no processing yet)."""
        return df

    def init_client(self, df):
        """
        Processes the client data.

        Converts ``date`` to Python ``date`` objects and adds
        ``data_block_id``.

        TODO (lag = 2 days): add 2 days to ``date`` before joining on date.
        Not applied yet.
        """
        df = df.assign(date=pd.to_datetime(df['date']).dt.date)
        return self.get_data_block_id(df, 'date')

    def init_weather_mapping(self, df):
        """
        Builds the county -> (latitude, longitude) lookup table.

        The ``df`` argument is ignored; the table is built from a hard-coded
        mapping and returned with columns ``county``, ``latitude`` and
        ``longitude``.
        """
        # https://www.kaggle.com/code/tsunotsuno/enefit-eda-baseline/notebook#Baseline
        # NOTE(review): counties 5 and 6 have identical coordinates below —
        # possibly a copy-paste slip; verify against the linked notebook.
        county_point_map = {
            0: (59.4, 24.7),   # "HARJUMAA"
            1: (58.8, 22.7),   # "HIIUMAA"
            2: (59.1, 27.2),   # "IDA-VIRUMAA"
            3: (58.8, 25.7),   # "JÄRVAMAA"
            4: (58.8, 26.2),   # "JÕGEVAMAA"
            5: (59.1, 23.7),   # "LÄÄNE-VIRUMAA"
            6: (59.1, 23.7),   # "LÄÄNEMAA"
            7: (58.5, 24.7),   # "PÄRNUMAA"
            8: (58.2, 27.2),   # "PÕLVAMAA"
            9: (58.8, 24.7),   # "RAPLAMAA"
            10: (58.5, 22.7),  # "SAAREMAA"
            11: (58.5, 26.7),  # "TARTUMAA"
            12: (58.5, 25.2),  # "UNKNOWN" (center of the map)
            13: (57.9, 26.2),  # "VALGAMAA"
            14: (58.2, 25.7),  # "VILJANDIMAA"
            15: (57.9, 27.2),  # "VÕRUMAA"
        }
        # Flatten {county: (lat, lon)} into (county, lat, lon) rows.
        rows = [(county_code, lat, lon)
                for county_code, (lat, lon) in county_point_map.items()]

        return pd.DataFrame(rows, columns=['county', 'latitude', 'longitude'])

In [3]:
# Iterate over the batches yielded by `iter_test`, unpacking the eight tables in each batch.
# NOTE(review): `iter_test` is not defined in the visible part of this notebook — presumably it
# comes from the competition API; confirm and create it in an earlier cell so Run All works.
# The loop body is only a placeholder string, so nothing is processed or predicted per batch.
for (test, revealed_targets, client, historical_weather,
        forecast_weather, electricity_prices, gas_prices, sample_submission) in iter_test:
    """I want a data reading class that saves all the test data and loads it then processes it"""

You must call `predict()` successfully before you can continue with `iter_test()`


TypeError: cannot unpack non-iterable NoneType object

In [4]:
import os

In [6]:
# `os.curdir` is the constant string the OS uses to refer to the current directory
# (see the '.' output), not the resolved path of the working directory.
os.curdir

'.'

In [7]:
# With no argument, lists the entries of the current working directory.
os.listdir()

['.git',
 '.gitattributes',
 '.gitignore',
 '.ipynb_checkpoints',
 '1. Data Processing.ipynb',
 'data',
 'Ideas.md',
 'README.md',
 'WIP Data Exploration.ipynb',
 'WIP Data Loading.ipynb']