In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
from load_data import *

In [3]:
# Load the four pre-built raw frames from the local 'dataframes' directory,
# in the same order as before: train, validation, test_a1, all.
# NOTE: read_pickle executes arbitrary code on load; only use trusted files.
t_raw, v_raw, test_a1, a_raw = (
    pd.read_pickle(os.path.join('dataframes', pickle_name))
    for pickle_name in (
        'train_raw.pkl',
        'validation_raw.pkl',
        'test_a1.pkl',
        'all_raw.pkl',
    )
)

In [4]:
def fill_nan_with_m(df: pd.DataFrame) -> pd.DataFrame:
    """
    Use M data to fill all the NaNs in observation data.

    Each ``*_obs`` column is paired positionally with an ``*_M`` column
    (note the pressure pair is ``psur_obs`` / ``psfc_M``). Wherever the
    observation is NaN, the value from the paired M column at the same row
    is used instead; if that is NaN too, the cell stays NaN.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing all nine observation columns and all nine M columns.

    Returns
    -------
    pd.DataFrame
        A filled copy; ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If any of the expected observation or M columns is missing.
    """
    obs_columns = ['psur_obs', 't2m_obs', 'q2m_obs', 'rh2m_obs', 'w10m_obs',
                   'd10m_obs', 'u10m_obs', 'v10m_obs', 'RAIN_obs']
    m_columns = ['psfc_M', 't2m_M', 'q2m_M', 'rh2m_M', 'w10m_M', 'd10m_M',
                 'u10m_M', 'v10m_M', 'RAIN_M']
    result = pd.DataFrame(df, copy=True)
    for obs_column, m_column in zip(obs_columns, m_columns):
        # Assign the filled column back explicitly. Calling
        # fillna(inplace=True) on a column selection is a chained in-place
        # update, which pandas deprecates and which does not write through
        # to the frame under Copy-on-Write.
        result[obs_column] = result[obs_column].fillna(result[m_column])
    return result

In [5]:
def generate_serial(station_id, end_date, df, prediction=False):
    """
    Build a NaN-free run of rows for one station that ends at ``end_date``.

    The rows selected by ``df.loc[station_id, end_date]`` are the starting
    point. Earlier days are then prepended one at a time (the first 24 rows
    of each day). After every prepend the whole frame is passed through
    ``check_invalid``, linearly interpolated (at most 2 consecutive NaNs, in
    both directions) and then ``fill_nan_with_m``. Walking back stops when

    * the checked window still contains NaNs -- the day just prepended and
      the 13 leading rows are dropped, or
    * a ``KeyError`` is raised inside the loop (e.g. ``df`` has no entry for
      the next earlier day) -- only the 13 leading rows are dropped.

    Parameters
    ----------
    station_id
        Station label used as the first key in ``df.loc[station_id, date]``.
    end_date
        Anything accepted by ``pd.Timestamp``; the last day of the run.
    df : pd.DataFrame
        Source frame; must support ``df.loc[station_id, date]`` returning
        that day's rows (presumably a (station, date, ...) MultiIndex --
        confirm against the loader).
    prediction : bool, default False
        When True only the first 4 rows of the end day must be complete, and
        the trailing 33 rows are excluded from the NaN check while walking
        back.

    Returns
    -------
    pd.DataFrame
        ``normalize`` applied to the kept rows, re-indexed ``0..n-1`` and
        transposed (so the columns are row positions).

    Raises
    ------
    AssertionError
        If the end day itself still has NaNs in the checked rows.
    KeyError
        If ``df`` has no entry for ``(station_id, end_date)``.
    """
    # NOTE(review): the meaning of these counts is not visible in this file;
    # they are named only to avoid repeating bare literals.
    head_rows = 13      # leading rows excluded from the check and dropped at the end
    rows_per_day = 24   # rows taken from each earlier day
    tolerate = 33       # trailing rows allowed to hold NaNs in prediction mode

    end_date = pd.Timestamp(end_date)
    result = df.loc[station_id, end_date]
    result = result.interpolate(method='linear', limit=2, limit_direction='both')
    result = fill_nan_with_m(result)

    checked = result.iloc[:4] if prediction else result
    # Explicit raise instead of `assert` so the check survives `python -O`.
    if checked.isnull().values.any():
        raise AssertionError('end day still contains NaN after filling')

    day = 1
    try:
        while True:
            to_append = df.loc[station_id, end_date - pd.DateOffset(days=day)].iloc[:rows_per_day]
            result = pd.concat([to_append, result])
            result = check_invalid(result)
            result = result.interpolate(method='linear', limit=2, limit_direction='both')
            # Module-level helper, not a DataFrame method.
            result = fill_nan_with_m(result)
            if prediction:
                checked = result.iloc[head_rows:-tolerate]
            else:
                checked = result.iloc[head_rows:]
            if checked.isnull().values.any():
                raise AssertionError('NaN found while walking back')
            day += 1
    except AssertionError:
        print('Reaching days with nan at', end_date - pd.DateOffset(days=day))
        # Discard the day that introduced the NaNs along with the leading rows.
        drop = head_rows + rows_per_day
    except KeyError:
        print('Return without meeting any empty day.')
        drop = head_rows

    result = normalize(result.iloc[drop:])
    result.index = range(result.shape[0])
    return result.T

In [6]:
def generate_serial_predict(df, late_date=None):
    """
    Build prediction-mode serials for stations 90001..90010 and stack them.

    For every station ``generate_serial(station, last_date, df, True)`` is
    called and its last 108 columns are kept; the per-station frames are
    concatenated row-wise.

    Parameters
    ----------
    df : pd.DataFrame
        Source frame passed through to ``generate_serial``.
    late_date : optional
        End date of the serial (anything ``pd.Timestamp`` accepts). The
        parameter name is kept as-is for backward compatibility. When None,
        the second element of the last row's index label
        (``df.iloc[-1].name[1]``) is used.

    Returns
    -------
    pd.DataFrame
        Concatenation of the ten per-station frames.
    """
    # The body previously read an undefined local `last_date`, which raised
    # UnboundLocalError on every call; derive it from the actual parameter.
    if late_date is None:
        last_date = df.iloc[-1].name[1]
    else:
        last_date = pd.Timestamp(late_date)

    result_list = [
        # Keep the trailing 108 columns directly instead of transposing twice.
        generate_serial(station, last_date, df, True).iloc[:, -108:]
        for station in range(90001, 90011)
    ]
    return pd.concat(result_list)

In [7]:
def check_invalid(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``df`` in which every out-of-scope value is np.nan.

    For each column the allowed interval is read from
    ``explain.loc[column]['scope']``: the first and last characters of that
    string are dropped, the rest is split on commas, and the first two
    pieces are parsed as the lower and upper bound. Values strictly below
    the lower bound or strictly above the upper bound are replaced with NaN.
    The input frame is left untouched.
    """
    cleaned = pd.DataFrame(df, copy=True)
    for column in df.columns:
        bounds = explain.loc[column]['scope'][1:-1].split(',')
        lower = float(bounds[0].strip())
        upper = float(bounds[1].strip())

        values = cleaned[column]
        # Comparisons against NaN are False, so one combined mask selects
        # the same cells as two consecutive passes would.
        out_of_scope = (values < lower) | (values > upper)
        cleaned.loc[out_of_scope, column] = np.nan
    return cleaned