In [169]:
import os
import numpy as np
import pandas as pd
from collections import Counter
import time
import sys
import warnings

#tuning hyperparameters
from bayes_opt import BayesianOptimization
from skopt  import BayesSearchCV 

#graph, plots
import matplotlib.pyplot as plt
import seaborn as sns

#building models
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import roc_auc_score, roc_curve


warnings.simplefilter(action='ignore', category=FutureWarning)

In [259]:
"""НЕ ЗАБУДЬТЕ:
- отсортировать df по дате.
- перевести значения колонки в int or float.
"""


def occ(x: list):
    """Return the most frequent element of ``x``.

    When several elements share the highest count, the one that was first
    seen latest in ``x`` wins: keys are ordered by ascending count with a
    stable sort and the final key is returned.

    :param x: iterable of hashable values
    :return: the element with the highest count
    :raises IndexError: if ``x`` is empty
    """
    frequencies = Counter(x)
    # sorted() is stable, so equally frequent keys keep the Counter's
    # first-seen order.
    by_frequency = sorted(frequencies, key=frequencies.get)
    return by_frequency[-1]


def days_agg(dataframe: pd.DataFrame, column_name: str, agg_func: str, days: int):
    """Aggregate one column over a trailing window (a user-friendly rolling window).

    For the row at position ``i`` the aggregate is taken over positions
    ``max(0, i - days)`` through ``i`` inclusive. A full window therefore
    holds ``days + 1`` values, and the windows of the first rows are shorter.

    :param dataframe: source dataframe
    :param column_name: name of the column to aggregate
    :param agg_func: one of ``'mean'``, ``'sum'``, ``'std'``, ``'amplitude'``, ``'occ'``
        - amplitude: ``max(value) - min(value)``
        - occ: the most frequent value, e.g. ``[2, 3, 4, 2, 2, 2, 3]`` -> ``2``
    :param days: how many preceding rows are combined with the current row
    :return: list with one aggregated value per row of ``dataframe``
    :raises KeyError: if ``agg_func`` is not one of the supported names
    """
    aggregators = {
        'mean': np.mean,
        'sum': np.sum,
        'std': np.std,
        'amplitude': lambda window: np.max(window) - np.min(window),
        'occ': occ
    }
    aggregate = aggregators[agg_func]
    series = np.array(dataframe[column_name])

    # The window start is clamped at 0 so the first rows use whatever history exists.
    return [
        aggregate(series[max(position - days, 0):position + 1])
        for position in range(len(series))
    ]


In [145]:
def _ts_to_table(idx, time_series, window_size):
    """ Method convert time series to lagged form.
    :param idx: the indices of the time series to convert
    :param time_series: source time series
    :param window_size: size of sliding window, which defines lag
    :return updated_idx: clipped indices of time series
    :return features_columns: lagged time series feature table
    """

    # Convert data to lagged form
    lagged_dataframe = pd.DataFrame({'t_id': time_series})
    vals = lagged_dataframe['t_id']
    for i in range(1, window_size + 1):
        frames = [lagged_dataframe, vals.shift(i)]
        lagged_dataframe = pd.concat(frames, axis=1)

    # Remove incomplete rows
    lagged_dataframe.dropna(inplace=True)

    transformed = np.array(lagged_dataframe)

    # Generate dataset with features
    features_columns = transformed[:, 1:]
    features_columns = np.fliplr(features_columns)

    return idx, features_columns

def convert_wated_codes(value, codes=None):
    """Sum the lookup values of a comma-separated list of water codes.

    :param value: string of integer water codes separated by commas
        (surrounding whitespace is ignored), e.g. ``'1, 12'``
    :param codes: lookup table with a ``'water_code'`` column and the value to
        sum in its second column; defaults to the module-level ``water_codes``
    :return: sum, over all codes in ``value``, of the second-column entry of
        the first row whose ``water_code`` equals the code
    :raises ValueError: if a token is not an integer
    :raises IndexError: if a code has no row in the lookup table
    """
    table = water_codes if codes is None else codes
    parsed_codes = [int(token.strip()) for token in value.split(',')]

    total = 0
    for code in parsed_codes:
        matches = table[table['water_code'] == code]
        # Purely positional access (first match, second column). The previous
        # `row[1]` on a label-indexed Series depended on the integer-key
        # positional fallback, which recent pandas versions deprecate.
        total += matches.iloc[0, 1]

    return total
    

In [317]:
# Wind / precipitation table; the station_id column is discarded right after loading.
meteo_prep = (
    pd.read_csv('../data/meteo_data/no_gap_meteo_3hour_int_3029_wind.csv')
    .drop(columns=['station_id'])
)
meteo_prep

Unnamed: 0,wind_direction,wind_speed_aver,precipitation,date
0,16.0,0.2,0.11,1985-01-01
1,0.0,0.0,0.00,1985-01-02
2,30.0,0.1,0.00,1985-01-03
3,27.0,0.1,0.02,1985-01-04
4,61.0,0.2,0.00,1985-01-05
...,...,...,...,...
12778,114.0,0.5,0.08,2019-12-27
12779,47.0,0.3,0.04,2019-12-28
12780,126.0,1.3,0.04,2019-12-29
12781,160.9,0.9,0.05,2019-12-30


In [311]:
# Snow table; station_id is discarded and the remaining columns are renamed by position.
# NOTE(review): this file carries the suffix 3019 while the other inputs loaded in
# this notebook carry 3029 - confirm the station is the intended one.
meteo = (
    pd.read_csv('../data/meteo_data/no_gap_meteo_1day_int_3019.csv')
    .drop(columns=['station_id'])
)
meteo.columns = ['date', 'snow_coverage_station', 'snow_height']
meteo

Unnamed: 0,date,snow_coverage_station,snow_height
0,1985-01-01,10.0,41.0
1,1985-01-02,10.0,41.0
2,1985-01-03,10.0,41.0
3,1985-01-04,10.0,41.5
4,1985-01-05,10.0,41.5
...,...,...,...
12717,2019-10-27,10.0,1.5
12718,2019-10-28,10.0,1.5
12719,2019-10-29,10.0,5.5
12720,2019-10-30,5.0,0.0


In [312]:
# Lookup table read by convert_wated_codes: rows are matched on the
# 'water_code' column and the value of the second column is summed.
water_codes = pd.read_csv('../data/misc/ref_code_hazard.csv')
water_codes.head()

Unnamed: 0,water_code,hazard
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0


In [313]:
# NOTE(review): presumably station ids - 3019 and 3029 match the suffixes of the
# data files loaded in this notebook. The list is not referenced by any visible cell.
ids = [3019, 3027, 3028, 3030, 3035, 3041, 3045, 3230, 3050, 3029]

In [344]:
# Training series, indexed by its 'date' column.
train = (
    pd.read_csv('../data/2nd_checkpoint/sub_datasets_no_gaps/no_gaps/no_gap_train_3029.csv')
    .set_index('date')
)

# Missing water codes are replaced by the string '1' before each code list is
# converted to a summed lookup value.
train['water_hazard'] = train['water_code'].fillna('1').apply(convert_wated_codes)

# Discard every column that is not used further on.
train = train.drop(
    columns=['stage_min', 'stage_max', 'temp', 'water_code', 'station_id', 'ice_thickness',
             'snow_height', 'place', 'year', 'month', 'day', 'delta_stage_max']
)
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12722 entries, 1985-01-01 to 2019-10-31
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   stage_avg     12722 non-null  float64
 1   discharge     12722 non-null  float64
 2   water_hazard  12722 non-null  int64  
dtypes: float64(2), int64(1)
memory usage: 397.6+ KB


In [345]:
target_column = 'stage_avg'
window_size = 8

# One column per offset inside the window: '0_day' ... '<window_size - 1>_day'.
# Derived from window_size so the two can never disagree (a mismatch would make
# the DataFrame constructor below fail).
columns = [f'{offset}_day' for offset in range(window_size)]

a = train[target_column]
idx = np.arange(train.shape[0] - window_size) + 1
idx, b = _ts_to_table(idx, a, window_size)

# Row k of `b` holds a[k] ... a[k + window_size - 1], so it is labelled with the
# k-th date; the last `window_size` dates have no row and are cut from `train`.
# NOTE(review): `train` is overwritten here, so re-running this cell trims it again.
train = train.iloc[:-window_size, :]

new_df = pd.DataFrame(data=b, columns=columns, index=a.index[:-window_size])
new_df = pd.concat([train, new_df], axis=1, join='inner')
# '0_day' is the value of the target column on the row's own date, so it is redundant.
new_df = new_df.drop(columns=['0_day'])

new_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12714 entries, 1985-01-01 to 2019-10-23
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   stage_avg     12714 non-null  float64
 1   discharge     12714 non-null  float64
 2   water_hazard  12714 non-null  int64  
 3   1_day         12714 non-null  float64
 4   2_day         12714 non-null  float64
 5   3_day         12714 non-null  float64
 6   4_day         12714 non-null  float64
 7   5_day         12714 non-null  float64
 8   6_day         12714 non-null  float64
 9   7_day         12714 non-null  float64
dtypes: float64(9), int64(1)
memory usage: 1.1+ MB


In [346]:
# Attach the weather tables by date with inner joins: the wind/precipitation
# table is joined to the lag table first, then the snow table is joined on the
# left of that result. The two wind columns are discarded afterwards.
new_df = (
    meteo
    .merge(meteo_prep.merge(new_df, how='inner', on=['date']), how='inner', on=['date'])
    .drop(columns=['wind_direction', 'wind_speed_aver'])
)
new_df

Unnamed: 0,date,snow_coverage_station,snow_height,precipitation,stage_avg,discharge,water_hazard,1_day,2_day,3_day,4_day,5_day,6_day,7_day
0,1985-01-01,10.0,41.0,0.11,51.0,999.0,0,49.0,47.0,45.0,43.0,41.0,39.0,37.0
1,1985-01-02,10.0,41.0,0.00,49.0,996.0,0,47.0,45.0,43.0,41.0,39.0,37.0,36.0
2,1985-01-03,10.0,41.0,0.00,47.0,994.0,0,45.0,43.0,41.0,39.0,37.0,36.0,35.0
3,1985-01-04,10.0,41.5,0.02,45.0,992.0,0,43.0,41.0,39.0,37.0,36.0,35.0,33.0
4,1985-01-05,10.0,41.5,0.00,43.0,990.0,0,41.0,39.0,37.0,36.0,35.0,33.0,31.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12709,2019-10-19,0.0,0.0,0.00,84.0,2530.0,1,79.0,76.0,72.0,68.0,67.0,67.0,53.0
12710,2019-10-20,10.0,0.5,1.71,79.0,2470.0,1,76.0,72.0,68.0,67.0,67.0,53.0,42.0
12711,2019-10-21,5.0,0.0,0.23,76.0,2430.0,1,72.0,68.0,67.0,67.0,53.0,42.0,37.0
12712,2019-10-22,10.0,0.0,0.03,72.0,2260.0,0,68.0,67.0,67.0,53.0,42.0,37.0,40.0


In [349]:
def feature_aggregation(new_df):
    """Replace the raw measurement columns with trailing-window aggregates.

    The input frame is left untouched; a new frame is returned. (The earlier
    version added and dropped columns on its argument in place, which modifies
    the caller's object and triggers pandas chained-assignment warnings when
    the argument is a filtered slice.)

    :param new_df: frame containing the columns 'snow_coverage_station',
        'snow_height', 'precipitation', 'stage_avg', 'discharge' and
        'water_hazard'; any other columns are carried over unchanged
    :return: copy of ``new_df`` with the eight aggregate columns appended and
        the six raw columns listed above removed
    """
    # (output column, source column, aggregation name, `days` argument of days_agg)
    aggregate_specs = [
        ('discharge_mean', 'discharge', 'mean', 4),
        ('stage_avg_amplitude', 'stage_avg', 'amplitude', 7),
        ('stage_avg_mean', 'stage_avg', 'mean', 4),
        ('snow_coverage_station_amplitude', 'snow_coverage_station', 'amplitude', 7),
        ('snow_height_mean', 'snow_height', 'mean', 4),
        ('snow_height_amplitude', 'snow_height', 'amplitude', 7),
        ('precipitation_sum', 'precipitation', 'sum', 30),
        ('water_hazard_sum', 'water_hazard', 'sum', 2),
    ]
    raw_columns = ['snow_coverage_station', 'snow_height', 'precipitation',
                   'stage_avg', 'discharge', 'water_hazard']

    aggregated = new_df.copy()
    for output_column, source_column, agg_name, days in aggregate_specs:
        aggregated[output_column] = days_agg(new_df, source_column, agg_name, days)

    return aggregated.drop(columns=raw_columns)


In [350]:
# Submission template; the forecasting loop reads its 'date' column.
# NOTE(review): the variable is called sample_sub_1 but the file is sample_sub_2.csv - confirm.
sample_sub_1 = pd.read_csv('../submissions/sample_submissions/sample_sub_2.csv')
sample_sub_1

Unnamed: 0,year,station_id,month,day,date,delta_stage_max
0,1993,3019,4,111,1993-04-21,0
1,1993,3019,4,112,1993-04-22,0
2,1993,3019,4,113,1993-04-23,0
3,1993,3019,4,114,1993-04-24,0
4,1993,3019,4,115,1993-04-25,0
...,...,...,...,...,...,...
1234,2013,3230,5,134,2013-05-14,0
1235,2013,3230,5,135,2013-05-15,0
1236,2013,3230,5,136,2013-05-16,0
1237,2013,3230,5,137,2013-05-17,0


In [379]:
# Imported here because the notebook's own Fedot import sits in a later cell,
# which breaks a top-to-bottom run.
from fedot.api.main import Fedot

predictions = []
columns = ['1_day', '2_day', '3_day', '4_day', '5_day', '6_day', '7_day']

# Composer settings do not change between iterations, so they are defined once;
# each model receives its own copy.
composer_params = {'max_depth': 5,
                   'max_arity': 7,
                   'pop_size': 5,
                   'num_of_generations': 20,
                   'learning_time': 10}

# The submission template is walked in steps of 7 rows; the date of the first
# row of each step is the cut-off for the training data.
for index in range(0, len(sample_sub_1['date']), 7):
    # Rows dated strictly before the cut-off. The explicit copy keeps all later
    # column work away from a slice of new_df.
    mini_df = new_df[new_df['date'] < sample_sub_1['date'][index]].copy()
    mini_df = feature_aggregation(mini_df).drop(columns=['date'])

    feature_frame = mini_df.drop(columns, axis=1)
    features = np.array(feature_frame)
    # Keep the single test sample 2-D, shape (1, n_features). The previous
    # `.iloc[-1, :]` produced a flat vector.
    # NOTE(review): confirm Fedot.predict expects (n_samples, n_features) here.
    features_test = np.array(feature_frame.iloc[[-1], :])
    target = np.array(mini_df[columns])

    # NOTE(review): `target` has seven columns and the recorded run logged
    # "y_true and y_pred have different number of output (7!=1)" - this multi-output
    # setup still needs resolving (e.g. one model per *_day column).
    # NOTE(review): the '*_day' targets of the last rows before the cut-off appear to
    # come from dates on/after the cut-off, and the last row is used both for
    # training and as the test sample - check for look-ahead leakage.
    model = Fedot(problem='regression', preset='light_tun', learning_time=13,
                  composer_params=dict(composer_params), seed=42)
    model.fit(features=features, target=target)
    pred = model.predict(features_test)
    # Store the `.predict` attribute when the result carries one (as the original
    # code assumed), otherwise the returned value itself.
    predictions.append(getattr(pred, 'predict', pred))

light_tun preset is used. Parameters tuning: True. Set of candidate models: ['linear', 'lasso', 'ridge', 'xgbreg', 'adareg', 'gbr', 'knnreg', 'dtreg', 'treg', 'rfr', 'svr', 'sgdr', 'direct_data_model', 'pca_data_model']. Composing time limit: 0:13:00
Model composition started
Metric evaluation error: y_true and y_pred have different number of output (7!=1)


KeyboardInterrupt: 

## MODELING, ENSEMBLING

### FEDOT

In [187]:
from fedot.api.main import Fedot

In [None]:
# Draft FEDOT cell; its prompt is "In [None]", i.e. it has no recorded execution.
predictions = []
# Names of the aggregate columns that feature_aggregation creates.
columns = ['discharge_mean', 'stage_avg_amplitude', 'stage_avg_mean', 'snow_coverage_station_amplitude',
           'snow_height_mean', 'snow_height_amplitude', 'precipitation_sum', 'water_hazard_sum']


# Composer settings forwarded to Fedot.
composer_params = {'max_depth': 5,
                   'max_arity': 7,
                   'pop_size': 5,
                   'num_of_generations': 20,
                   'learning_time': 10,
                   'with_tuning': True}

# NOTE(review): this drops the aggregate columns from the features and passes the
# list of their names as `target`, whereas the forecasting loop cell uses the
# aggregates as features and the '*_day' columns as target - confirm the intent.
# NOTE(review): new_df as built by the visible cells does not appear to contain
# these aggregate columns (feature_aggregation is only applied to mini_df), so
# the drop would presumably raise KeyError - verify.
model = Fedot(problem='regression', preset='light', learning_time=13, composer_params=composer_params, seed=42)
model.fit(features=new_df.drop(columns, axis=1), target=columns)
# NOTE(review): predict() is called without features here, unlike the loop cell.
pred = model.predict()
predictions.append(pred.predict)

### LGBM

### LightAutoML

In [180]:
from lightautoml.automl.base import AutoML
from lightautoml.ml_algo.boost_lgbm import BoostLGBM
from lightautoml.ml_algo.tuning.optuna import OptunaTuner
from lightautoml.automl.blend import WeightedBlender
from lightautoml.ml_algo.boost_cb import BoostCB
from lightautoml.ml_algo.linear_sklearn import LinearLBFGS

from lightautoml.pipelines.ml.base import MLPipeline
from lightautoml.reader.base import PandasToPandasReader
from lightautoml.tasks import Task

In [181]:
# PipelineTimer is used below but is missing from the notebook's LightAutoML
# import cell, so it is imported here to avoid a NameError.
from lightautoml.utils.timer import PipelineTimer

# Regression task read from a pandas frame with 5-fold cross-validation.
task = Task('reg')
reader = PandasToPandasReader(task, cv=5, random_state=1)

# Level 1: two BoostLGBM models (the second tuned by Optuna) and two BoostCB
# models (the first tuned by Optuna with fit_on_holdout=True).
model1 = BoostLGBM(default_params={'learning_rate': 0.1, 'num_leaves': 128, 'seed': 1, 'num_threads': 5})
params_tuner2 = OptunaTuner(n_trials=100, timeout=100)
model2 = BoostLGBM(default_params={'learning_rate': 0.05, 'num_leaves': 64, 'seed': 2, 'num_threads': 5})
gbm_0 = BoostCB()
gbm_1 = BoostCB()
tuner_0 = OptunaTuner(n_trials=100, timeout=100, fit_on_holdout=True)


pipeline_lvl1 = MLPipeline([model1, (model2, params_tuner2), (gbm_0, tuner_0), gbm_1])
# Level 2: a single LinearLBFGS model.
reg_2 = LinearLBFGS()
pipeline_lvl2 = MLPipeline([reg_2])

predictions = []


timer = PipelineTimer(600, mode=2)
automl = AutoML(reader, [
    [pipeline_lvl1],
    [pipeline_lvl2],
], skip_conn=False, blender=WeightedBlender(), timer=timer)
# NOTE(review): `columns` is whatever list an earlier cell left behind. It is
# dropped from the training frame and also passed (as a list) as the target
# role, so the target columns are absent from the data given to fit_predict -
# confirm the intended features/target split and whether a list is accepted here.
pred = automl.fit_predict(new_df.drop(columns, axis=1), roles={'target': columns})
predictions.append(pred)


MultiOutputRegressor(estimator=<lightautoml.automl.base.AutoML object at 0x7f56a4d1a4c0>)