In [235]:
import load_data
import os
from netCDF4 import Dataset

In [3]:
import pandas as pd
import numpy as np

In [326]:
# Observation columns (suffix "_obs"); generate_one_set() selects these from the history window.
obs_names = ['psur_obs', 't2m_obs', 'q2m_obs', 'w10m_obs', 'd10m_obs', 'rh2m_obs', 'u10m_obs', 'v10m_obs', 'RAIN_obs']
# Model columns (suffix "_M"); generate_one_set() selects these over the whole window.
# NOTE(review): the displayed frames also contain 'psfc_M' and 't2m_M', and the saved output of
# print(m_names) lists them, but they are not part of this list - confirm which version is intended.
m_names = ['q2m_M', 'rh2m_M', 'w10m_M', 'd10m_M', 'u10m_M', 'v10m_M', 'SWD_M', 'GLW_M', 'HFX_M',
           'LH_M', 'RAIN_M', 'PBLH_M', 'TC975_M', 'TC925_M', 'TC850_M', 'TC700_M', 'TC500_M', 'wspd975_M', 'wspd925_M',
           'wspd850_M', 'wspd700_M', 'wspd500_M', 'Q975_M', 'Q925_M', 'Q850_M', 'Q700_M', 'Q500_M']

In [304]:
# Per-variable metadata table indexed by its first CSV column; normalize()/denormalize()
# look rows up by column name and parse the 'scope' entry. Columns with any missing value are dropped.
explain = pd.read_csv(os.path.join('data', 'explain.csv'), index_col=0).dropna(how='any', axis=1)

In [246]:
def get_serial_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Fetch data with serial datetime.

    Keeps the first 24 rows of every (station_id, date) group and replaces the
    last two index levels with one continuous hourly timestamp per row.

    :param df: frame whose three-level index has levels named 'station_id'
        and 'date' as its first two levels.
    :return: frame indexed by station_id plus an unnamed hourly datetime level.
    :raises ValueError: if ``df`` is empty, or if the generated hourly range
        does not have exactly one entry per kept row (the range runs from a
        station's first date to its last date + 23 hours, so it only lines up
        when no day is missing in between and every day has 24 rows).
    """
    # Collect the per-day slices and concatenate once; concatenating inside the
    # loop re-copies the accumulated frame on every iteration.
    day_frames = [group.iloc[:24] for _, group in df.groupby(['station_id', 'date'])]
    df_series = pd.concat(day_frames)

    station_stamps = []
    for station_id, group in df_series.groupby('station_id'):
        # After selecting the station the index entries are (date, foretime) tuples.
        station_days = group.loc[station_id].index
        first_day = station_days[0][0]
        last_day = station_days[-1][0]
        stamps = pd.date_range(first_day, last_day + pd.DateOffset(hours=23),
                               freq=pd.Timedelta(hours=1))
        station_stamps.append(stamps.to_series())
    date_index = pd.concat(station_stamps)

    # Drop 'date' and the third level, then append the hourly stamps as the new second level.
    df_series.index = df_series.index.droplevel([1, 2])
    df_series = df_series.set_index(date_index, append=True)
    return df_series

In [238]:
def load_raw_file(file_dir: str) -> pd.DataFrame:
    """
    Load data from original netCDF4 file.

    :param file_dir: path of the netCDF4 file to read.
    :return: one pd.DataFrame containing all data in file, indexed by
        (station_id, date, foretime) with one column per feature variable.
    """
    print("Loading netCDF4 file", file_dir)
    # The context manager closes the file handle once everything is in memory.
    with Dataset(file_dir) as data:
        # pd.Timestamp.strptime is not implemented in current pandas;
        # pd.to_datetime parses the same '%Y%m%d%H' stamps.
        date_list = [pd.to_datetime(str(int(this_date)), format='%Y%m%d%H')
                     for this_date in data['date'][:]]
        station_list = list(data['station'][:])
        foretimes = list(data['foretimes'][:])
        # Assumes the first three variables are the non-feature ones
        # (presumably date / foretimes / station) - TODO confirm against the file.
        feature_list = list(data.variables.keys())[3:]

        # Row layout per station: every date repeated once per foretime.
        date_index = [date for date in date_list for _ in foretimes]
        foretime_index = foretimes * len(date_list)

        df_list = []
        for i, station in enumerate(station_list):
            print("Loading station", station)
            # Build all columns first and concatenate once per station.
            columns = [pd.DataFrame({'date': date_index, 'foretime': foretime_index})]
            columns += [
                pd.DataFrame(data[feature][:, :, i].reshape(-1, 1), columns=[feature])
                for feature in feature_list
            ]
            df = pd.concat(columns, axis=1)
            df['station_id'] = station
            df_list.append(df)

    result = pd.concat(df_list)
    result = result.set_index(['station_id', 'date', 'foretime'])
    return result

In [239]:
# Read the training-set file into a (station_id, date, foretime)-indexed frame.
tr_raw = load_raw_file(os.path.join('data', 'ai_challenger_wf2018_trainingset_20150301-20180531.nc'))

Loading netCDF4 file data/ai_challenger_wf2018_trainingset_20150301-20180531.nc
Loading station 90001
Loading station 90002
Loading station 90003
Loading station 90004
Loading station 90005
Loading station 90006
Loading station 90007
Loading station 90008
Loading station 90009
Loading station 90010


In [240]:
# Read the validation file into a (station_id, date, foretime)-indexed frame.
va_raw = load_raw_file(os.path.join('data', 'ai_challenger_wf2018_validation_20180601-20180828_20180905.nc'))

Loading netCDF4 file data/ai_challenger_wf2018_validation_20180601-20180828_20180905.nc
Loading station 90001
Loading station 90002
Loading station 90003
Loading station 90004
Loading station 90005
Loading station 90006
Loading station 90007
Loading station 90008
Loading station 90009
Loading station 90010


In [15]:
def fill_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Fill data using basic linear insertion.

    Interpolation runs separately inside every group formed by the first two
    index levels, so values are never interpolated across group boundaries.
    At most 3 consecutive missing values are filled from each direction.

    :param df: frame with at least two index levels.
    :return: frame with the same index as ``df`` and short gaps filled.
    """
    # group_keys=False keeps the original index; with the default, pandas >= 2.0
    # prepends the group keys again and returns a frame with duplicated index levels.
    return df.groupby(level=[0, 1], group_keys=False).apply(
        lambda group: group.interpolate(method='linear', limit=3, limit_direction='both'))

In [6]:
def drop_empty_day(df: pd.DataFrame) -> pd.DataFrame:
    """
    Drop days where there is empty data.

    A "day" is one (station_id, date) combination; if any row of that day has a
    missing value in any column, all rows of the day are removed.

    :param df: frame whose index has levels named 'station_id' and 'date'.
    :return: ``df`` itself when nothing is missing, otherwise a filtered frame.
    """
    # One vectorised pass instead of an element-wise np.isnan per column.
    row_has_nan = df.isna().any(axis=1).values
    if not row_has_nan.any():
        return df

    day_keys = pd.MultiIndex.from_arrays([
        df.index.get_level_values('station_id'),
        df.index.get_level_values('date'),
    ])
    incomplete_days = day_keys[row_has_nan].unique()
    return df.loc[~day_keys.isin(incomplete_days)]

In [308]:
def normalize(df: pd.DataFrame, scope_table: pd.DataFrame = None) -> pd.DataFrame:
    """
    Normalize data.

    Every column is min-max scaled with the bounds stored for it in the scope
    table: ``(value - min) / (max - min)``.

    :param df: frame to scale; it is left unmodified.
    :param scope_table: table indexed by column name with a 'scope' column
        holding a bracketed "min, max" string (the first and last characters
        are stripped). Defaults to the module-level ``explain`` table.
    :return: a new frame with the same shape as ``df``.
    :raises KeyError: if a column of ``df`` has no row in the scope table.
    """
    table = explain if scope_table is None else scope_table
    # pd.DataFrame(df) would share data with the input, so column assignment
    # could overwrite the caller's frame; take a real copy instead.
    result = df.copy()
    for name in df.columns:
        scope = table.loc[name]['scope'][1:-1].split(',')
        min_value = float(scope[0].strip())
        max_value = float(scope[1].strip())

        result[name] = (df[name] - min_value) / (max_value - min_value)
    return result

In [7]:
# Interpolate short gaps within each (station, date) group of the training data.
tr_fill = fill_data(tr_raw)

In [241]:
# Interpolate short gaps within each (station, date) group of the validation data.
va_fill = fill_data(va_raw)

In [8]:
# Remove every training day that still has a missing value after interpolation.
tr_drop = drop_empty_day(tr_fill)

In [242]:
# Remove every validation day that still has a missing value after interpolation.
va_drop = drop_empty_day(va_fill)

In [398]:
# Min-max scale the cleaned training data using the bounds in `explain`.
tr_n = normalize(tr_drop)

In [321]:
# Inspect the interpolated training frame (values are still in their original units).
tr_fill

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,psfc_M,t2m_M,q2m_M,rh2m_M,w10m_M,d10m_M,u10m_M,v10m_M,SWD_M,GLW_M,...,Q500_M,psur_obs,t2m_obs,q2m_obs,w10m_obs,d10m_obs,rh2m_obs,u10m_obs,v10m_obs,RAIN_obs
station_id,date,foretime,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
90001,2015-03-01 03:00:00,0,,,,,,,,,,,...,,1018.500000,4.000000,2.143617,0.80,89.0,43.0,-0.799878,-0.013962,0.0
90001,2015-03-01 03:00:00,1,,,,,,,,,,,...,,1017.599976,5.800000,2.264909,0.40,130.0,40.0,-0.306418,0.257115,0.0
90001,2015-03-01 03:00:00,2,,,,,,,,,,,...,,1015.799988,8.000000,2.043311,1.50,251.0,31.0,1.418278,0.488352,0.0
90001,2015-03-01 03:00:00,3,,,,,,,,,,,...,,1014.400024,10.000000,1.594945,3.70,264.0,21.0,3.679731,0.386757,0.0
90001,2015-03-01 03:00:00,4,,,,,,,,,,,...,,1013.400024,10.700000,1.511350,2.90,237.0,19.0,2.432145,1.579453,0.0
90001,2015-03-01 03:00:00,5,,,,,,,,,,,...,,1012.900024,11.000000,1.461535,3.70,247.0,18.0,3.405868,1.445706,0.0
90001,2015-03-01 03:00:00,6,,,,,,,,,,,...,,1012.500000,10.400000,1.406877,3.10,237.0,18.0,2.599879,1.688381,0.0
90001,2015-03-01 03:00:00,7,,,,,,,,,,,...,,1012.700012,8.600000,1.449401,1.60,182.0,21.0,0.055839,1.599025,0.0
90001,2015-03-01 03:00:00,8,,,,,,,,,,,...,,1013.099976,3.700000,2.155057,1.60,61.0,44.0,-1.399392,-0.775696,0.0
90001,2015-03-01 03:00:00,9,,,,,,,,,,,...,,1013.500000,3.300000,2.096401,1.10,74.0,44.0,-1.057388,-0.303201,0.0


In [310]:
# Inspect the normalized training frame.
tr_n

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,psfc_M,t2m_M,q2m_M,rh2m_M,w10m_M,d10m_M,u10m_M,v10m_M,SWD_M,GLW_M,...,Q500_M,psur_obs,t2m_obs,q2m_obs,w10m_obs,d10m_obs,rh2m_obs,u10m_obs,v10m_obs,RAIN_obs
station_id,date,foretime,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
90001,2015-03-01 03:00:00,0,,,,,,,,,,,...,,0.6740,0.463158,0.071454,0.026667,0.247222,0.43,0.486669,0.499767,0.0
90001,2015-03-01 03:00:00,1,,,,,,,,,,,...,,0.6704,0.482105,0.075497,0.013333,0.361111,0.40,0.494893,0.504285,0.0
90001,2015-03-01 03:00:00,2,,,,,,,,,,,...,,0.6632,0.505263,0.068110,0.050000,0.697222,0.31,0.523638,0.508139,0.0
90001,2015-03-01 03:00:00,3,,,,,,,,,,,...,,0.6576,0.526316,0.053165,0.123333,0.733333,0.21,0.561329,0.506446,0.0
90001,2015-03-01 03:00:00,4,,,,,,,,,,,...,,0.6536,0.533684,0.050378,0.096667,0.658333,0.19,0.540536,0.526324,0.0
90001,2015-03-01 03:00:00,5,,,,,,,,,,,...,,0.6516,0.536842,0.048718,0.123333,0.686111,0.18,0.556764,0.524095,0.0
90001,2015-03-01 03:00:00,6,,,,,,,,,,,...,,0.6500,0.530526,0.046896,0.103333,0.658333,0.18,0.543331,0.528140,0.0
90001,2015-03-01 03:00:00,7,,,,,,,,,,,...,,0.6508,0.511579,0.048313,0.053333,0.505556,0.21,0.500931,0.526650,0.0
90001,2015-03-01 03:00:00,8,,,,,,,,,,,...,,0.6524,0.460000,0.071835,0.053333,0.169444,0.44,0.476677,0.487072,0.0
90001,2015-03-01 03:00:00,9,,,,,,,,,,,...,,0.6540,0.455789,0.069880,0.036667,0.205556,0.44,0.482377,0.494947,0.0


In [9]:
def check_empty_day(df: pd.DataFrame) -> pd.DataFrame:
    """
    List the days that contain missing data.

    :param df: frame whose index has levels named 'station_id' and 'date'.
    :return: frame indexed by 'station_id' with a single 'date' column, one row
        per (station_id, date) combination that has a missing value in any
        column, sorted by station and date.
    """
    # One vectorised pass instead of an element-wise np.isnan per column.
    row_has_nan = df.isna().any(axis=1).values
    result = pd.DataFrame({
        'station_id': df.index.get_level_values('station_id')[row_has_nan],
        'date': df.index.get_level_values('date')[row_has_nan],
    }).drop_duplicates()
    result = result.sort_values(by=['station_id', 'date'])
    result = result.set_index('station_id')
    return result

In [10]:
# Collect every (station_id, date) of the interpolated training data that still has a missing value.
all_empty = check_empty_day(tr_fill)

In [11]:
# Persist the list of incomplete days, both as CSV and as a pickle, in the working directory.
all_empty.to_csv('empty_dates.csv')
all_empty.to_pickle('empty_dates.pkl')

In [12]:
# Cache the interpolated training frame in the working directory.
tr_fill.to_pickle('tr_fill.pkl')

In [13]:
# Cache the cleaned (incomplete days removed) training frame in the working directory.
tr_drop.to_pickle('tr_drop.pkl')

In [49]:
# NOTE(review): the saved output below lists 'psfc_M' and 't2m_M', which the m_names
# definition in this notebook does not contain - the output looks stale; re-run to confirm.
print(m_names)

['psfc_M', 't2m_M', 'q2m_M', 'rh2m_M', 'w10m_M', 'd10m_M', 'u10m_M', 'v10m_M', 'SWD_M', 'GLW_M', 'HFX_M', 'LH_M', 'RAIN_M', 'PBLH_M', 'TC975_M', 'TC925_M', 'TC850_M', 'TC700_M', 'TC500_M', 'wspd975_M', 'wspd925_M', 'wspd850_M', 'wspd700_M', 'wspd500_M', 'Q975_M', 'Q925_M', 'Q850_M', 'Q700_M', 'Q500_M']


In [212]:
def generate_one_set(station_id: int, forecast_date: pd.Timestamp, df: pd.DataFrame, previous_days=1):
    """
    Build one sample from the forecast date's rows and the preceding days' rows.

    The window is the first 24 rows of each of the ``previous_days`` days before
    ``forecast_date`` (oldest first) followed by all rows of ``forecast_date``.

    :param station_id: first-level index key of the station.
    :param forecast_date: second-level index key of the forecast run.
    :param df: frame indexed by (station_id, date, ...).
    :param previous_days: number of earlier days to prepend.
    :return: tuple ``(history_obs, history_m, prediction)``:
        the ``obs_names`` columns of the first ``24 * previous_days + 4`` rows,
        the ``m_names`` columns of the whole window, and the
        't2m_obs' / 'rh2m_obs' / 'w10m_obs' columns of the remaining rows.
    :raises KeyError: if one of the requested days is not present in ``df``.
    """
    window_frames = [
        df.loc[station_id, forecast_date - pd.DateOffset(days=offset)].iloc[:24]
        for offset in range(previous_days, 0, -1)
    ]
    window_frames.append(df.loc[station_id, forecast_date])
    history = pd.concat(window_frames)

    # Rows before this position supply observations; rows from it on are the targets.
    split_at = 24 * previous_days + 4

    history_obs = history.iloc[:split_at][obs_names]
    history_m = history[m_names]
    prediction = history.iloc[split_at:][['t2m_obs', 'rh2m_obs', 'w10m_obs']]

    return history_obs, history_m, prediction

In [219]:
# Try building one sample from the cleaned training data; a KeyError from the
# lookups inside generate_one_set() is reported instead of raised.
try:
    a, b, c = generate_one_set(90001, pd.Timestamp(2015, 3, 27, 3), tr_drop, previous_days=2)
except KeyError:
    print("Contains empty data.")

In [224]:
def generate_history_obs_data(obs_df: pd.DataFrame) -> pd.Series:
    """
    Flatten the observation history frame produced by generate_one_set()
    into a single feature series.

    :param obs_df: observation history frame.
    :return: result of generate_series_data() with ``len(obs_names)`` as the column count.
    """
    obs_column_count = len(obs_names)
    return generate_series_data(obs_df, obs_column_count)

In [190]:
def generate_history_m_data(m_df: pd.DataFrame) -> pd.Series:
    """
    Flatten the Ruitu history frame generated by generate_one_set()
    into a single feature series.

    :param m_df: "_M" column history frame.
    :return: result of generate_series_data() with ``len(m_names)`` as the column count.
    """
    m_column_count = len(m_names)
    return generate_series_data(m_df, m_column_count)

In [206]:
def generate_series_data(df: pd.DataFrame, column_length: int) -> pd.Series:
    """
    Flatten a frame into one labelled series and append per-column statistics.

    Each cell is labelled ``<column>_<row position>``; the statistics are
    labelled ``<column>_max``, ``<column>_min``, ``<column>_mean`` and
    ``<column>_var``.

    :param df: frame with a single-level row index.
    :param column_length: number of stacked values per row, used to derive
        the row position of every stacked value.
    :return: the flattened values followed by max, min, mean and variance.
    """
    flat = df.stack()
    row_count = int(len(flat.index) / column_length)
    # One position tag per stacked value: '0' repeated column_length times, then '1', ...
    row_tags = np.repeat(np.arange(row_count), column_length).astype(str)
    flat.index = flat.index.get_level_values(1) + '_' + row_tags

    pieces = [flat]
    for suffix, reducer in (('_max', df.max), ('_min', df.min), ('_mean', df.mean), ('_var', df.var)):
        column_stat = reducer()
        column_stat.index = column_stat.index + suffix
        pieces.append(column_stat)

    return pd.concat(pieces)

In [111]:
# Dates that generate_date_feature() flags as holiday regardless of weekday.
# NOTE(review): only 2017 and 2018 are listed, while the training file name starts at
# 2015-03-01; earlier dates fall back to the plain weekend rule - confirm this is intended.
holiday = ['2017-01-01', '2017-01-02', '2017-01-27', '2017-01-28', '2017-01-29',
           '2017-01-30', '2017-01-31', '2017-02-01', '2017-02-02', '2017-04-02',
           '2017-04-03', '2017-04-04', '2017-04-29', '2017-04-30', '2017-05-01',
           '2017-05-28', '2017-05-29', '2017-05-30', '2017-10-01', '2017-10-02',
           '2017-10-03', '2017-10-04', '2017-10-05', '2017-10-06', '2017-10-07',
           '2017-10-08', '2017-12-30', '2017-12-31',
           '2018-01-01', '2018-02-15', '2018-02-16', '2018-02-17', '2018-02-18',
           '2018-02-19', '2018-02-20', '2018-02-21', '2018-04-05', '2018-04-06',
           '2018-04-07', '2018-04-29', '2018-04-30', '2018-05-01', '2018-06-16',
           '2018-06-17', '2018-06-18']
# Weekend dates that generate_date_feature() does NOT flag as holiday.
work = ['2017-01-22', '2017-02-04', '2017-04-01', '2017-05-27', '2017-09-30',
        '2018-02-11', '2018-02-24', '2018-04-08', '2018-04-28']

In [367]:
def generate_date_feature(date: pd.Timestamp):
    """
    Generate date features.

    :param date: timestamp to describe.
    :return: float series with 'timestamp' (POSIX seconds from ``date.timestamp()``),
        'holiday' (1 when the day is listed in ``holiday``, or is a Saturday/Sunday
        not listed in ``work``; otherwise 0) and the one-hot encoded weekday
        'weekday_0' .. 'weekday_6' (Monday is 0).
    """
    dt_string = date.strftime('%Y-%m-%d')
    is_weekend = date.weekday() in (5, 6)
    is_holiday = (dt_string in holiday) or (is_weekend and dt_string not in work)

    # Build the series in one step with an explicit dtype; the dtype-less empty
    # pd.Series() constructor is deprecated and its default dtype changed across versions.
    base = pd.Series({'timestamp': date.timestamp(), 'holiday': int(is_holiday)}, dtype=float)

    return pd.concat([base, get_onehot(date.weekday(), 0, 6, name='weekday')])

In [251]:
def generate_stat_feature(station_id: int, forecast_date: pd.Timestamp, df: pd.DataFrame, days):
    """
    Compute per-column statistics over a trailing window of history.

    The window is the first 24 rows of each of the ``days`` days before
    ``forecast_date`` that exist in ``df`` (missing days are skipped) plus all
    rows of ``forecast_date`` itself.

    NOTE(review): every row and column of the forecast date is included, so if
    ``df`` holds observation columns for the hours being predicted, those values
    feed into these statistics - confirm this is intended.

    :param station_id: first-level index key of the station.
    :param forecast_date: second-level index key of the forecast run.
    :param df: frame indexed by (station_id, date, ...).
    :param days: number of earlier days to look back; also used in the labels.
    :return: series labelled ``<column>_<days>days_max`` / ``_min`` / ``_mean`` / ``_var``.
    :raises KeyError: if ``forecast_date`` itself is not present for the station.
    """
    window_frames = []
    for offset in range(days, 0, -1):
        past_day = forecast_date - pd.DateOffset(days=offset)
        try:
            window_frames.append(df.loc[station_id, past_day].iloc[:24])
        except KeyError:
            # Day not available: use whatever history exists.
            continue
    window_frames.append(df.loc[station_id, forecast_date])
    window = pd.concat(window_frames)

    stat_parts = []
    for stat_name in ('max', 'min', 'mean', 'var'):
        column_stat = getattr(window, stat_name)()
        column_stat.index = column_stat.index + '_{}days_{}'.format(days, stat_name)
        stat_parts.append(column_stat)

    return pd.concat(stat_parts)

In [373]:
def get_onehot(value, min_value, max_value, step=1, name="") -> pd.Series:
    """
    One-hot encode ``value`` on the grid ``min_value, min_value + step, ..., max_value``.

    The slot is ``int((value - min_value) / step)``, i.e. values between two
    grid points are truncated towards the lower one.

    :param value: value to encode.
    :param min_value: value mapped to slot 0.
    :param max_value: value mapped to the last slot.
    :param step: distance between neighbouring slots.
    :param name: prefix of the labels ``<name>_0`` .. ``<name>_<length - 1>``.
    :return: float series of zeros with a single 1 at the slot of ``value``.
    :raises IndexError: if ``value`` maps to a slot outside the grid.
    """
    length = int((max_value - min_value) / step) + 1
    index = int((value - min_value) / step)
    # A negative slot would otherwise wrap around and silently mark the wrong entry.
    if not 0 <= index < length:
        raise IndexError(
            "value {} is outside the one-hot range [{}, {}]".format(value, min_value, max_value))

    result = np.zeros([length])
    result[index] = 1

    name_list = [name + "_{}".format(i) for i in range(length)]
    return pd.Series(result, index=name_list)

In [387]:
def generate_x(station_id: int, forecast_date: pd.Timestamp, df, days=1):
    """
    Assemble the full feature vector and the target frame for one sample.

    The feature vector concatenates, in this order: flattened observation
    history, flattened "_M" history, date features, 7-day statistics and
    30-day statistics.

    :param station_id: first-level index key of the station.
    :param forecast_date: second-level index key of the forecast run.
    :param df: frame indexed by (station_id, date, ...).
    :param days: number of previous days passed to generate_one_set().
    :return: tuple ``(features, prediction)`` where ``prediction`` is the
        target frame returned by generate_one_set().
    """
    history_obs, history_m, prediction = generate_one_set(station_id, forecast_date, df, days)

    feature_parts = [
        generate_history_obs_data(history_obs),
        generate_history_m_data(history_m),
        generate_date_feature(forecast_date),
    ]
    # Trailing-window statistics over one week and one month.
    for window_days in (7, 30):
        feature_parts.append(generate_stat_feature(station_id, forecast_date, df, window_days))

    return pd.concat(feature_parts), prediction

In [318]:
def denormalize(df: pd.DataFrame, scope_table: pd.DataFrame = None):
    """
    Undo normalize(): map scaled values back to their original range.

    Every column is transformed with ``value * (max - min) + min`` using the
    bounds stored for it in the scope table.

    :param df: frame of scaled values; it is left unmodified.
    :param scope_table: table indexed by column name with a 'scope' column
        holding a bracketed "min, max" string (the first and last characters
        are stripped). Defaults to the module-level ``explain`` table.
    :return: a new frame with the same shape as ``df``.
    :raises KeyError: if a column of ``df`` has no row in the scope table.
    """
    table = explain if scope_table is None else scope_table
    # pd.DataFrame(df) would share data with the input, so column assignment
    # could overwrite the caller's frame; take a real copy instead.
    result = df.copy()
    for name in df.columns:
        scope = table.loc[name]['scope'][1:-1].split(',')
        min_value = float(scope[0].strip())
        max_value = float(scope[1].strip())

        result[name] = df[name] * (max_value - min_value) + min_value
    return result

In [399]:
# Build two consecutive-day samples for station 90001 from the normalized training data,
# each using 2 previous days of history.
X_1, Y_1 = generate_x(90001, pd.Timestamp(2015, 3, 27, 3), tr_n, 2)
X_2, Y_2 = generate_x(90001, pd.Timestamp(2015, 3, 28, 3), tr_n, 2)

In [400]:
# Put the two feature vectors side by side and transpose so that each sample is one row.
pd.concat([X_1, X_2], axis=1).transpose()

Unnamed: 0,psur_obs_0,t2m_obs_0,q2m_obs_0,w10m_obs_0,d10m_obs_0,rh2m_obs_0,u10m_obs_0,v10m_obs_0,RAIN_obs_0,psur_obs_1,...,Q500_M_30days_var,psur_obs_30days_var,t2m_obs_30days_var,q2m_obs_30days_var,w10m_obs_30days_var,d10m_obs_30days_var,rh2m_obs_30days_var,u10m_obs_30days_var,v10m_obs_30days_var,RAIN_obs_30days_var
0,0.714,0.549474,0.092118,0.06,0.622222,0.32,0.52084,0.52158,0.0,0.7116,...,0.000117,0.000708,0.004668,0.001659,0.003001,0.086213,0.039746,0.000913,0.001363,0.0
1,0.7064,0.592632,0.117076,0.056667,0.725,0.31,0.527984,0.504432,0.0,0.7044,...,0.000132,0.000714,0.004873,0.001762,0.002978,0.084333,0.038904,0.00093,0.001357,0.0
