In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import settings

In [3]:
df_train = pd.read_csv("./data/no1_train.csv")
df_train.describe()

Unnamed: 0,hydro,micro,thermal,wind,river,total,y,sys_reg,flow
count,225088.0,225088.0,225088.0,225088.0,225088.0,225088.0,225088.0,225088.0,225088.0
mean,1888.334127,224.189826,21.803497,55.703266,0.0,2190.031963,8.907921,-8.284305,-1999.055081
std,401.000305,58.559855,3.579229,47.053033,0.0,444.07773,320.262539,43.748315,1311.682286
min,683.438,80.371,0.0,0.0,0.0,849.732,-1579.680903,-828.0,-5541.2
25%,1625.029,185.017,22.1,15.704,0.0,1916.523,-142.2267,0.0,-2996.7
50%,1933.916,225.938,22.1,41.848,0.0,2248.874,-11.282351,0.0,-1953.7
75%,2175.595,269.558,24.1,88.723,0.0,2497.792,127.454943,0.0,-906.9
max,2995.524,349.271,25.7,176.0,0.0,3351.974,2956.333317,474.0,723.4


# Data preprocessing

### Clamp

In [4]:
from preprocessing import filter_column_based_on_quantile

df_train = filter_column_based_on_quantile(df_train, 0.001, settings.COLUMNS_TO_CLAMP)
df_train.describe()

Unnamed: 0,hydro,micro,thermal,wind,river,total,y,sys_reg,flow
count,224636.0,224636.0,224636.0,224636.0,224636.0,224636.0,224636.0,224636.0,224636.0
mean,1888.924507,224.25934,21.801231,55.613476,0.0,2190.599804,7.496014,-8.279254,-2000.194678
std,400.195368,58.478465,3.581429,47.029993,0.0,443.252455,308.588482,43.75936,1311.949926
min,683.438,80.371,0.0,0.0,0.0,849.732,-997.412582,-828.0,-5541.2
25%,1626.367,185.091,22.1,15.6635,0.0,1917.963,-141.931476,0.0,-2998.9
50%,1934.012,226.002,22.1,41.728,0.0,2249.096,-11.282351,0.0,-1954.5
75%,2175.595,269.558,24.1,88.49,0.0,2497.792,127.095807,0.0,-907.52
max,2995.524,349.271,25.7,176.0,0.0,3351.974,2243.579521,474.0,643.6


### Normalizing (MinMax)

In [5]:
from preprocessing import normalize_columns

df_train = normalize_columns(df_train, settings.COLUMNS_TO_NORMALIZE)
df_train.describe()

Unnamed: 0,hydro,micro,thermal,wind,river,total,y,sys_reg,flow
count,224636.0,224636.0,224636.0,224636.0,224636.0,224636.0,224636.0,224636.0,224636.0
mean,0.521385,0.5351,0.848297,0.315986,0.0,0.535867,0.310062,0.629586,0.572534
std,0.173088,0.217473,0.139355,0.267216,0.0,0.177142,0.095214,0.033609,0.212125
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.407826,0.389438,0.859922,0.088997,0.0,0.42691,0.263957,0.635945,0.411056
50%,0.540886,0.541581,0.859922,0.237091,0.0,0.559244,0.304268,0.635945,0.579922
75%,0.645373,0.703559,0.937743,0.502784,0.0,0.658633,0.346964,0.635945,0.749205
max,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0


### 5.2.2 Avoiding Structural Imbalances Dataset

In [6]:
from preprocessing import avoid_structural_imbalance
import seaborn as sns


if settings.AVOID_STRUCTURAL_IMBALANCE:
    df_train = avoid_structural_imbalance(df_train)
    sns.lineplot(data=df_train[0:500], x=df_train.index[0:500], y='interpolation', label="interpolation")
    sns.lineplot(data=df_train[0:500], x=df_train.index[0:500], y='sum', label="sum")
    sns.lineplot(data=df_train[0:500], x=df_train.index[0:500], y='diff', label="diff")

### Add time features

In [7]:
from preprocessing import add_date_time_features

df_train = add_date_time_features(df_train)
df_train

Unnamed: 0,start_time,hydro,micro,thermal,wind,river,total,y,sys_reg,flow,...,time_of_year_2.0,time_of_year_3.0,time_of_year_4.0,time_of_year_5.0,time_of_year_6.0,time_of_year_7.0,time_of_year_8.0,time_of_year_9.0,time_of_year_10.0,time_of_year_11.0
0,2019-01-09 14:10:00,0.392892,0.228922,0.447471,0.422886,0.0,0.387638,0.413066,0.643625,0.181704,...,0,0,0,0,0,0,0,0,0,0
1,2019-01-09 14:15:00,0.392892,0.228922,0.447471,0.422886,0.0,0.387638,0.409605,0.643625,0.181704,...,0,0,0,0,0,0,0,0,0,0
2,2019-01-09 14:20:00,0.392892,0.228922,0.447471,0.422886,0.0,0.387638,0.407681,0.643625,0.181704,...,0,0,0,0,0,0,0,0,0,0
3,2019-01-09 14:25:00,0.392892,0.228922,0.447471,0.422886,0.0,0.387638,0.399215,0.643625,0.181704,...,0,0,0,0,0,0,0,0,0,0
4,2019-01-09 14:30:00,0.392892,0.228922,0.447471,0.422886,0.0,0.387638,0.390778,0.643625,0.181704,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225083,2021-03-01 03:05:00,0.544551,0.414689,0.782101,0.604722,0.0,0.566039,0.351445,0.635945,0.531674,...,1,0,0,0,0,0,0,0,0,0
225084,2021-03-01 03:10:00,0.544551,0.414689,0.782101,0.604722,0.0,0.566039,0.347565,0.635945,0.531674,...,1,0,0,0,0,0,0,0,0,0
225085,2021-03-01 03:15:00,0.544551,0.414689,0.782101,0.604722,0.0,0.566039,0.337033,0.635945,0.531674,...,1,0,0,0,0,0,0,0,0,0
225086,2021-03-01 03:20:00,0.544551,0.414689,0.782101,0.604722,0.0,0.566039,0.339403,0.635945,0.531674,...,1,0,0,0,0,0,0,0,0,0


### Lag features

In [8]:
# Previous y lag feature
df_train["y_prev"] = df_train["y"].shift(1)

# Add power imbalance from 24 hours ago
df_train["y_prev_24h"] = df_train["y"].shift(24*60//5)

# Mean power imbalance yesterday
df_train = pd.merge_asof(
    df_train,
    df_train.resample('D', on="start_time")["y"].mean().shift(1),
    right_index=True,
    left_on="start_time",
)
df_train = df_train.rename(columns={"y_x": "y", "y_y": "y_yesterday"})

df_train.describe()

Unnamed: 0,hydro,micro,thermal,wind,river,total,y,sys_reg,flow,time_of_day_0,...,time_of_year_5.0,time_of_year_6.0,time_of_year_7.0,time_of_year_8.0,time_of_year_9.0,time_of_year_10.0,time_of_year_11.0,y_prev,y_prev_24h,y_yesterday
count,224636.0,224636.0,224636.0,224636.0,224636.0,224636.0,224636.0,224636.0,224636.0,224636.0,...,224636.0,224636.0,224636.0,224636.0,224636.0,224636.0,224636.0,224635.0,224348.0,224518.0
mean,0.521385,0.5351,0.848297,0.315986,0.0,0.535867,0.310062,0.629586,0.572534,0.041627,...,0.076715,0.079489,0.079395,0.076301,0.079195,0.076867,0.079489,0.310062,0.309971,0.310272
std,0.173088,0.217473,0.139355,0.267216,0.0,0.177142,0.095214,0.033609,0.212125,0.199737,...,0.26614,0.2705,0.270355,0.26548,0.270043,0.26638,0.2705,0.095214,0.095225,0.083089
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076906
25%,0.407826,0.389438,0.859922,0.088997,0.0,0.42691,0.263957,0.635945,0.411056,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.263956,0.263902,0.27239
50%,0.540886,0.541581,0.859922,0.237091,0.0,0.559244,0.304268,0.635945,0.579922,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.304268,0.304173,0.304192
75%,0.645373,0.703559,0.937743,0.502784,0.0,0.658633,0.346964,0.635945,0.749205,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.346964,0.346825,0.341379
max,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.895248


In [9]:
# Drop the NAs
df_train = df_train.dropna()
df_train.describe()


Unnamed: 0,hydro,micro,thermal,wind,river,total,y,sys_reg,flow,time_of_day_0,...,time_of_year_5.0,time_of_year_6.0,time_of_year_7.0,time_of_year_8.0,time_of_year_9.0,time_of_year_10.0,time_of_year_11.0,y_prev,y_prev_24h,y_yesterday
count,224348.0,224348.0,224348.0,224348.0,224348.0,224348.0,224348.0,224348.0,224348.0,224348.0,...,224348.0,224348.0,224348.0,224348.0,224348.0,224348.0,224348.0,224348.0,224348.0,224348.0
mean,0.521496,0.535501,0.848811,0.3161,0.0,0.536026,0.309972,0.629581,0.572972,0.041627,...,0.076814,0.079591,0.079497,0.076399,0.079296,0.076965,0.079591,0.309972,0.309971,0.310234
std,0.173103,0.217324,0.138702,0.26734,0.0,0.177139,0.095236,0.033619,0.211899,0.199736,...,0.266296,0.270659,0.270514,0.265636,0.270201,0.266537,0.270659,0.095237,0.095225,0.083109
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076906
25%,0.408029,0.389974,0.859922,0.088906,0.0,0.427321,0.263902,0.635945,0.411816,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.263902,0.263902,0.27239
50%,0.541056,0.54203,0.859922,0.237091,0.0,0.559356,0.304156,0.635945,0.58037,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.304156,0.304173,0.304192
75%,0.645602,0.703886,0.937743,0.503375,0.0,0.658826,0.346789,0.635945,0.749424,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.34679,0.346825,0.341379
max,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.895248


### Drop columns we don't want to use

In [10]:
df_train = df_train.drop(columns=settings.COLUMNS_TO_DROP)
df_train

Unnamed: 0,hydro,micro,thermal,wind,total,y,sys_reg,flow,time_of_day_0,time_of_day_1,...,time_of_year_5.0,time_of_year_6.0,time_of_year_7.0,time_of_year_8.0,time_of_year_9.0,time_of_year_10.0,time_of_year_11.0,y_prev,y_prev_24h,y_yesterday
288,0.595521,0.216017,0.447471,0.359159,0.568999,0.423420,0.635945,0.210969,0,0,...,0,0,0,0,0,0,0,0.423878,0.413066,0.360384
289,0.595521,0.216017,0.447471,0.359159,0.568999,0.421468,0.635945,0.210969,0,0,...,0,0,0,0,0,0,0,0.423420,0.409605,0.360384
290,0.595521,0.216017,0.447471,0.359159,0.568999,0.420343,0.635945,0.210969,0,0,...,0,0,0,0,0,0,0,0.421468,0.407681,0.360384
291,0.595521,0.216017,0.447471,0.359159,0.568999,0.412028,0.635945,0.210969,0,0,...,0,0,0,0,0,0,0,0.420343,0.399215,0.360384
292,0.595521,0.216017,0.447471,0.359159,0.568999,0.408378,0.635945,0.210969,0,0,...,0,0,0,0,0,0,0,0.412028,0.390778,0.360384
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225083,0.544551,0.414689,0.782101,0.604722,0.566039,0.351445,0.635945,0.531674,0,0,...,0,0,0,0,0,0,0,0.351262,0.325414,0.383033
225084,0.544551,0.414689,0.782101,0.604722,0.566039,0.347565,0.635945,0.531674,0,0,...,0,0,0,0,0,0,0,0.351445,0.330666,0.383033
225085,0.544551,0.414689,0.782101,0.604722,0.566039,0.337033,0.635945,0.531674,0,0,...,0,0,0,0,0,0,0,0.347565,0.328170,0.383033
225086,0.544551,0.414689,0.782101,0.604722,0.566039,0.339403,0.635945,0.531674,0,0,...,0,0,0,0,0,0,0,0.337033,0.327855,0.383033


Move y_prev to the last column, this will be useful later.

In [11]:
df_train = df_train[[c for c in df_train if c not in ['y_prev']] 
       + ['y_prev']]

### Save to .csv

In [12]:

df_train.to_csv("./data/train_dataset.csv", index=False)