In [None]:
from lightgbm import LGBMRegressor
import lightgbm
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error
from sklearn.linear_model import Ridge, LinearRegression
import time
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import os
from bayes_opt import BayesianOptimization
import pickle
from bayes_opt.logger import JSONLogger
from bayes_opt.event import Events
from bayes_opt.util import load_logs
import json
from scipy.stats import pearsonr
import warnings
import seaborn as sns
warnings.filterwarnings('ignore')

In [None]:
path_train = "data/train_data.csv"
path_test = "data/test_data.csv"
target_ = "contest-tmp2m-14d__tmp2m"

In [5]:
df_train = pd.read_csv(path_train)
df_test = pd.read_csv(path_test)

date_col = "startdate"

df_train[date_col] = pd.to_datetime(df_train[date_col])
df_test[date_col] = pd.to_datetime(df_test[date_col])

In [6]:
list_new_col = []

In [7]:
drop_col = pd.read_csv("data/correlations_with_target_greater_0.7.csv")
drop_col = drop_col["col"].values
drop_col = [each for each in drop_col if "contest" not in each and "wind" not in each]

bin_col = ['nmme-tmp2m-34w__nmmemean', 'nmme-tmp2m-56w__nmmemean', 'nmme0-tmp2m-34w__nmme0mean', 'nmme0mean']

def bin_feature_tpm2m(x):
    if x < -5:
        return 'A'
    elif x < 0:
        return 'B'
    elif x < 5:
        return 'C'
    elif x < 10:
        return 'D'
    elif x < 15:
        return 'E'
    elif x < 20:
        return 'F'
    elif x < 25:
        return 'G'
    else:
        return 'H'


In [8]:
for each in bin_col:
    df_train[each + "_bin"] = np.vectorize(bin_feature_tpm2m)(df_train[each])
    df_test[each + "_bin"] = np.vectorize(bin_feature_tpm2m)(df_test[each])
    list_new_col.append(each + "_bin")

In [9]:
col = [each for each in df_train.columns if "contest" in each and each != target_]
def get_idx(lat, lon):
    return str(round(lat, 4)) + "_" + str(round(lon, 4))
df_train['idx'] = np.vectorize(get_idx)(df_train['lat'], df_train['lon'])
df_test['idx'] = np.vectorize(get_idx)(df_test['lat'], df_test['lon'])

In [10]:
for each in col:
    df_train[each + "_lag_1"] = df_train.groupby("idx")[each].shift(1).bfill()
    df_test[each + "_lag_1"] = df_test.groupby("idx")[each].shift(1).bfill()
    df_train[each + "_lag_2"] = df_train.groupby("idx")[each].shift(2).bfill()
    df_test[each + "_lag_2"] = df_test.groupby("idx")[each].shift(2).bfill()
    list_new_col.append(each + "_lag_1")
    list_new_col.append(each + "_lag_2")
    # df_train[each + "_shift_1"] = df_train.groupby("idx")[each].shift(-1).ffill()
    # df_test[each + "_shitf_1"] = df_test.groupby("idx")[each].shift(-1).ffill()

In [11]:
col_test = [each for each in df_train.columns if "contest" not in each and ("wind" in each or "sst" in each)]
col_drift = ["wind-vwnd-250-2010-2", "wind-vwnd-250-2010-16", "wind-vwnd-250-2010-19", "wind-uwnd-250-2010-9", "wind-uwnd-250-2010-11",
             "wind-uwnd-250-2010-17", "wind-hgt-850-2010-2", "wind-hgt-850-2010-5", "wind-hgt-850-2010-7", "sst-2010-2", "sst-2010-10",
             "wind-hgt-500-2010-5", "wind-hgt-500-2010-7", "wind-hgt-500-2010-8", "wind-uwnd-925-2010-2", "wind-uwnd-925-2010-7",
             "wind-uwnd-925-2010-20"]
dict_col_drift = {each:True for each in col_drift}

for each in col_test:
    if each in dict_col_drift:
        continue
    df_train[each + "_lag_1"] = df_train.groupby("idx")[each].shift(1).bfill()
    df_test[each + "_lag_1"] = df_test.groupby("idx")[each].shift(1).bfill()
    df_train[each + "_diff"] = df_train.groupby("idx")[each].diff().bfill()
    df_test[each + "_diff"] = df_test.groupby("idx")[each].diff().bfill()
    list_new_col.append(each + "_diff")
    list_new_col.append(each + "_lag_1")
    
    

In [12]:
def create_mean_group(df, cols_group, cols_mean, col_date="startdate"):
    list_new_col = []
    df[col_date] = df[col_date].astype("str")
    for col_mean in cols_mean:
        for col_group in cols_group:
            new_col = col_mean + "_{}".format(col_group) + "_mean"
            dict_map = df.groupby([col_group, col_date])[col_mean].mean().to_dict()
            # print(dict_map)
            def map_dict(gr, date):
                return dict_map[gr, date]
            
            df[new_col] = np.vectorize(map_dict)(df[col_group], df[col_date])
            list_new_col.append(new_col)
            
    df[col_date] = pd.to_datetime(df[col_date])
    return df, list_new_col

cols_group = ["lat", "lon", "climateregions__climateregion"]
df_train, list_col = create_mean_group(df_train, cols_group, col)
df_test, _ = create_mean_group(df_test, cols_group, col)
list_new_col = [*list_new_col, *list_col]

In [13]:
def get_name_cols(suffix='', cols=[]):
    name_cols = [col + f'{suffix}' for col in cols if col != 'contest-tmp2m-14d__tmp2m']

    return name_cols

def feature_engineer(df, cols):

    
    # mean_contest_cols = df.groupby(['idx', 'month', 'year'])[cols].transform(lambda x: x.mean()) \
    #     .rename(columns=dict(zip(cols, get_name_cols('_mean', cols))))
    # std_contest_cols = df.groupby(['idx', 'month', 'year'])[cols].transform(lambda x: x.std()) \
    #     .rename(columns=dict(zip(cols, get_name_cols('_std', cols))))
    rolling_mean_contest_cols = df.groupby('idx')[cols].transform(lambda x: x.rolling(window=3, min_periods=2).mean()) \
        .rename(columns=dict(zip(cols, get_name_cols('_rolling_mean', cols))))
    rolling_std_contest_cols = df.groupby('idx')[cols].transform(lambda x: x.rolling(window=3, min_periods=2).std()) \
        .rename(columns=dict(zip(cols, get_name_cols('_rolling_std', cols))))
    # expanding_mean_contest_cols = df.groupby('idx')[cols].transform(lambda x: x.expanding(2).mean()) \
    #     .rename(columns=dict(zip(cols, get_name_cols('_expanding_mean', cols))))
    # expanding_std_contest_cols = df.groupby('idx')[cols].transform(lambda x: x.expanding(2).std()) \
    #     .rename(columns=dict(zip(cols, get_name_cols('_expanding_std', cols))))
    # diff_contest_cols = df.groupby('idx')[contest_cols].transform(lambda x: x.diff()) \
    #     .rename(columns=dict(zip(cols, get_name_cols('_diff'))))
    exponentially_weighted_average = df.groupby('idx')[cols].transform(lambda x: x.shift(1).ewm(alpha=0.95).mean()) \
        .rename(columns=dict(zip(cols, get_name_cols('_ewa', cols))))

    df = pd.concat([df, rolling_mean_contest_cols, rolling_std_contest_cols, exponentially_weighted_average], axis=1)
    df = df.fillna(method='bfill')
    
    return df

In [14]:
imp_contest_cols = [
    'contest-pevpr-sfc-gauss-14d__pevpr',
    'contest-pres-sfc-gauss-14d__pres',
    'contest-prwtr-eatm-14d__prwtr',
    'contest-slp-14d__slp',
    'contest-wind-h10-14d__wind-hgt-10',
    'contest-wind-h100-14d__wind-hgt-100',
    'contest-wind-h500-14d__wind-hgt-500',
    'contest-wind-uwnd-250-14d__wind-uwnd-250',
    'contest-wind-vwnd-925-14d__wind-vwnd-925'
]

df_train = feature_engineer(df_train, imp_contest_cols)
df_test = feature_engineer(df_test, imp_contest_cols)

In [15]:
col_ = [each for each in df_train.columns if "_rolling" in each or "_ewa" in each]
list_new_col = [*list_new_col, *col_]

In [16]:
for each in list_new_col:
    try: 
        corr, _ = pearsonr(df_train[each], df_train[target_])
        print(f"{each}: {round(corr, 5)}")
    except:
        print(f"Cate col: {each}")

Cate col: nmme-tmp2m-34w__nmmemean_bin
Cate col: nmme-tmp2m-56w__nmmemean_bin
Cate col: nmme0-tmp2m-34w__nmme0mean_bin
Cate col: nmme0mean_bin
contest-pevpr-sfc-gauss-14d__pevpr_lag_1: 0.80444
contest-pevpr-sfc-gauss-14d__pevpr_lag_2: 0.80296
contest-wind-h10-14d__wind-hgt-10_lag_1: 0.76435
contest-wind-h10-14d__wind-hgt-10_lag_2: 0.76479
contest-rhum-sig995-14d__rhum_lag_1: -0.56272
contest-rhum-sig995-14d__rhum_lag_2: -0.55949
contest-wind-h100-14d__wind-hgt-100_lag_1: 0.89578
contest-wind-h100-14d__wind-hgt-100_lag_2: 0.89251
contest-slp-14d__slp_lag_1: -0.70183
contest-slp-14d__slp_lag_2: -0.695
contest-wind-vwnd-925-14d__wind-vwnd-925_lag_1: 0.27221
contest-wind-vwnd-925-14d__wind-vwnd-925_lag_2: 0.26921
contest-pres-sfc-gauss-14d__pres_lag_1: 0.24456
contest-pres-sfc-gauss-14d__pres_lag_2: 0.2448
contest-wind-uwnd-250-14d__wind-uwnd-250_lag_1: -0.33044
contest-wind-uwnd-250-14d__wind-uwnd-250_lag_2: -0.33319
contest-prwtr-eatm-14d__prwtr_lag_1: 0.77026
contest-prwtr-eatm-14d__prw

In [17]:
df_train = df_train.drop(columns=["idx"])
df_test = df_test.drop(columns=["idx"])

# Plot dis

In [66]:
df_train_copy = df_train.copy()
df_test_copy = df_test.copy()
df_train_copy["type_data"] = ["train" for i in range(len(df_train))]
df_test_copy["type_data"] = ["test" for i in range(len(df_test))]

df_total = pd.concat([df_train_copy, df_test_copy]).reset_index(drop=True)

In [None]:
n_cols = 4
n_rows = len(list_new_col) // n_cols + (len(list_new_col) % n_cols != 0)
f, axes = plt.subplots(n_rows, n_cols, figsize=(20, 250))
count_cols, count_rows = 0, 0

# df_sample = df_total[df_total["is_driff"] == 0].reset_index(drop=True)
# df_test_non_driff = df_test[df_test["is_driff"] == 0].reset_index(drop=True)
for each in list_new_col:
    try:
        sns.kdeplot(x=each, data=df_total, hue='type_data', common_norm=False, ax=axes[count_rows][count_cols])
        count_cols += 1
        count_rows += 1 if count_cols % n_cols == 0 else 0
        count_cols = count_cols % n_cols
    except:
        pass
f.tight_layout(h_pad=6)

# Tuning hyperparameters

In [48]:
def bayes_tunning():
    def CB_opt(max_depth, num_leaves, colsample_bytree, reg_alpha, reg_lambda, subsample, feature_fraction_bynode):
        
        bin_col = ['nmme-tmp2m-34w__nmmemean', 'nmme-tmp2m-56w__nmmemean', 'nmme0-tmp2m-34w__nmme0mean', 'nmme0mean']
        bin_col = [each + "_bin" for each in bin_col]
        cat_ft = [*bin_col, *["climateregions__climateregion", "idx", "mjo1d__phase"]]
        # cat_ft = ["climateregions__climateregion", "idx", "mjo1d__phase"]
        
        lgb = LGBMRegressor(metric="rmse", boosting_type="dart", 
                            max_depth = round(max_depth), n_estimators = 7999, 
                            subsample=subsample, colsample_bytree=colsample_bytree, 
                            reg_alpha=reg_alpha, reg_lambda=reg_lambda,
                            feature_fraction_bynode=feature_fraction_bynode,
                            num_leaves=round(num_leaves), verbose=0)
        
        lgb.fit(X_train, y_train, eval_metric="rmse", categorical_feature=cat_ft)
        ypred_valid = lgb.predict(X_valid)

        return 1 / mean_squared_error(y_valid, ypred_valid, squared=False)

    pbounds = {
        "max_depth": (9, 17),
        'num_leaves': (14, 30),
        'colsample_bytree': (0.45, 0.75),
        'reg_alpha': (0.1, 5.2), 
        'reg_lambda': (0.1, 3.2),
        'subsample': (0.65, 0.85),
        'feature_fraction_bynode': (0.65, 0.9)
    }
    
    path_tunning = "Params_tunning/Log/logs_dart_6_new_fe_ver1.json".format(index)
    optimizer = BayesianOptimization(f = CB_opt, pbounds = pbounds, verbose = 2)
    # if os.path.exists(path_tunning) is False:
    #     logger = JSONLogger(path=path_tunning, reset=False)
    #     optimizer.subscribe(Events.OPTIMIZATION_STEP, logger)
    #     optimizer.maximize(init_points = 1, n_iter = 4)
    # else:
    #     load_logs(optimizer, logs=[path_tunning])
    logger = JSONLogger(path=path_tunning, reset=False)
    optimizer.subscribe(Events.OPTIMIZATION_STEP, logger)
    optimizer.maximize(init_points = 6, n_iter = 27)    
    print(optimizer.max['target'])

    max_bo_params = optimizer.max['params']

    return max_bo_params

In [49]:
dict_result = [6]

In [None]:
year_valid = [2014, 2014, 2015, 2015, 2015, 2015, 2015, 2015, 2016, 2016, 2016, 2016]
month_valid = [9, 11, 1, 3, 5, 7, 9, 11, 1, 3, 5, 7]
gap = 1
for index in range(len(year_valid)):
    if index not in dict_result:
        continue
    #thay code   
    m_s, m_e = month_valid[index], month_valid[index] + gap
    y_s, y_e = year_valid[index], year_valid[index]
    y_e = y_e + 1 if m_e > 12 else y_e
    m_e = m_e if m_e <= 12 else ((m_e - 12) % 12)
    if y_e != y_s:
        X_valid = df_train[((df_train[date_col].dt.year == y_s) & (df_train[date_col].dt.month >= m_s)) | ((df_train[date_col].dt.month <= m_e) & (df_train[date_col].dt.month == y_e))]
        X_train = df_train[~(((df_train[date_col].dt.year == y_s) & (df_train[date_col].dt.month >= m_s)) | ((df_train[date_col].dt.month <= m_e) & (df_train[date_col].dt.month == y_e)))]
    else:
        X_valid = df_train[(df_train[date_col].dt.year == y_s) & (df_train[date_col].dt.month >= m_s) & (df_train[date_col].dt.month <= m_e)]
        X_train = df_train[~((df_train[date_col].dt.year == y_s) & (df_train[date_col].dt.month >= m_s) & (df_train[date_col].dt.month <= m_e))]
    #drop hết cột mới
    
    y_train = X_train[target_]
    X_train = X_train.drop(columns=["index", target_])
    y_valid = X_valid[target_]
    X_valid = X_valid.drop(columns=["index", target_])
    
    test_index = df_test["index"].values
    X_test = df_test.drop(columns=["index"]).copy()

    cat_cols = [i for i in X_train.select_dtypes(include='object').columns if i != date_col]
    X_train, X, listEncoder = handle_data.handle_feature_train_data(X_train, date_col, cat_cols.copy(), norm="none")
    X_valid = handle_data.handle_feature_test_data(X_valid, listEncoder, columns_cat=cat_cols.copy()) 
    X_test = handle_data.handle_feature_test_data(X_test, listEncoder, columns_cat=cat_cols.copy())
    
    drop_mei = [each for each in X_train.columns if "mei" in each]
    drop_ = [*drop_col, *["month", "day_of_year", "day_of_year_sin", "day_of_year_cos", "month_sin", "month_cos"], *drop_mei, *col_drift]
    
    X_train = X_train.drop(columns=drop_)
    X_valid = X_valid.drop(columns=drop_)
    X_test = X_test.drop(columns=drop_)
    
    print("Training - {}".format(index))
    t= time.time()
    max_bo_params = bayes_tunning()
    with open("Params_tunning/dict_dart_{}_new_fe.pickle".format(index), "wb") as file:
        pickle.dump(max_bo_params, file)
     
    print("Take time: ", time.time() - t)
    print("-------------")

Training - 6
You can set `force_col_wise=true` to remove the overhead.
You can set `force_col_wise=true` to remove the overhead.
You can set `force_col_wise=true` to remove the overhead.
You can set `force_col_wise=true` to remove the overhead.
You can set `force_col_wise=true` to remove the overhead.
You can set `force_col_wise=true` to remove the overhead.
You can set `force_col_wise=true` to remove the overhead.
You can set `force_col_wise=true` to remove the overhead.
You can set `force_col_wise=true` to remove the overhead.
You can set `force_col_wise=true` to remove the overhead.
You can set `force_col_wise=true` to remove the overhead.
You can set `force_col_wise=true` to remove the overhead.
You can set `force_col_wise=true` to remove the overhead.
You can set `force_col_wise=true` to remove the overhead.
You can set `force_col_wise=true` to remove the overhead.
You can set `force_col_wise=true` to remove the overhead.
You can set `force_col_wise=true` to remove the overhead.
Y

In [27]:
dict_result = {}

# Run model

In [19]:
params = {"target": 1.6328136155725321, "params": {"colsample_bytree": 0.75, "feature_fraction_bynode": 0.9, "max_depth": 11.186366350741467, "num_leaves": 14.0, "reg_alpha": 3.2, "reg_lambda": 1.2, "subsample": 0.85}, "datetime": {"datetime": "2023-02-21 04:22:21", "elapsed": 20150.435182, "delta": 1645.855422}}
params = params["params"]

In [20]:
params

{'colsample_bytree': 0.75,
 'feature_fraction_bynode': 0.65,
 'max_depth': 9.0,
 'num_leaves': 14.0,
 'reg_alpha': 5.2,
 'reg_lambda': 0.1,
 'subsample': 0.85}

In [23]:
year_valid = [2014, 2014, 2015, 2015, 2015, 2015, 2015, 2015, 2016, 2016, 2016, 2016]
month_valid = [9, 11, 1, 3, 5, 7, 9, 11, 1, 3, 5, 7]
gap = 1
for index in range(len(year_valid)):
    if index in dict_result:
        continue
    m_s, m_e = month_valid[index], month_valid[index] + gap
    y_s, y_e = year_valid[index], year_valid[index]
    y_e = y_e + 1 if m_e > 12 else y_e
    m_e = m_e if m_e <= 12 else ((m_e - 12) % 12)
    if y_e != y_s:
        X_valid = df_train[((df_train[date_col].dt.year == y_s) & (df_train[date_col].dt.month >= m_s)) | ((df_train[date_col].dt.month <= m_e) & (df_train[date_col].dt.month == y_e))]
        X_train = df_train[~(((df_train[date_col].dt.year == y_s) & (df_train[date_col].dt.month >= m_s)) | ((df_train[date_col].dt.month <= m_e) & (df_train[date_col].dt.month == y_e)))]
    else:
        X_valid = df_train[(df_train[date_col].dt.year == y_s) & (df_train[date_col].dt.month >= m_s) & (df_train[date_col].dt.month <= m_e)]
        X_train = df_train[~((df_train[date_col].dt.year == y_s) & (df_train[date_col].dt.month >= m_s) & (df_train[date_col].dt.month <= m_e))]
    
    y_train = X_train[target_]
    X_train = X_train.drop(columns=["index", target_])
    y_valid = X_valid[target_]
    X_valid = X_valid.drop(columns=["index", target_])
    
    test_index = df_test["index"].values
    X_test = df_test.drop(columns=["index"]).copy()

    cat_cols = [i for i in X_train.select_dtypes(include='object').columns if i != date_col]
    X_train, X, listEncoder = handle_data.handle_feature_train_data(X_train, date_col, cat_cols.copy(), norm="none")
    X_valid = handle_data.handle_feature_test_data(X_valid, listEncoder, columns_cat=cat_cols.copy()) 
    X_test = handle_data.handle_feature_test_data(X_test, listEncoder, columns_cat=cat_cols.copy())
    # X_sm = handle_data.handle_feature_test_data(X_oversampling_smote.copy(), listEncoder, df_mean, columns_cat=cat_cols.copy(), column_date=None)
    
    # y_sm = X_sm[target_]
    # X_sm = X_sm.drop(columns=[*["mei__nip", target_], *bin_col])
    
    drop_mei = [each for each in X_train.columns if "mei" in each]
    drop_ = [*drop_col, *["month", "day_of_year", "day_of_year_sin", "day_of_year_cos", "month_sin", "month_cos"], *drop_mei, *col_drift]
    
    # X_train_1 = X_train.drop(columns=drop_1)
    # X_valid_1 = X_valid.drop(columns=drop_1)
    # X_test_1 = X_test.drop(columns=drop_1)
    
    X_train = X_train.drop(columns=drop_)
    X_valid = X_valid.drop(columns=drop_)
    X_test = X_test.drop(columns=drop_)
    
    # X_train = pd.concat([X_train, X_sm])
    # y_train = [*y_train, *y_sm]
    cat_ft = [*cat_cols, *['idx', "mjo1d__phase"]]

        
    print("Training - {}".format(index))
    t= time.time()
    # lgb = LGBMRegressor(metric="rmse", boosting_type="dart", max_depth = 16, n_estimators = 8999, subsample=0.75, colsample_bytree=0.75, 
    #                     reg_alpha=0.5, reg_lambda=0.5, verbose=0)
    # lgb.fit(X_train, y_train, eval_metric="rmse", categorical_feature=["climateregions__climateregion", "idx"])
    # lgb = LGBMRegressor(max_depth = 15, n_estimators = 4999, subsample=0.75, colsample_bytree=0.75, verbose=0)
     
    # with open("Params_tunning/dict_dart_{}_col.pickle".format(index), "rb") as file:
    #     params = pickle.load(file)
   
    if index == 0:
        print(cat_ft)
    lgb = LGBMRegressor(metric="rmse", boosting_type="dart", max_depth = round(params["max_depth"]), n_estimators = 7999, 
                        subsample=params["subsample"], colsample_bytree=params["colsample_bytree"], num_leaves=round(params["num_leaves"]),
                        reg_alpha=params["reg_alpha"], reg_lambda=params["reg_lambda"], feature_fraction_bynode=params["feature_fraction_bynode"],
                        verbose=0)
    # lgb_1 = LGBMRegressor(max_depth = 15, n_estimators = 1999, subsample=0.75, colsample_bytree=0.75, verbose=0) 
    
    # lgb = LGBMRegressor(metric="rmse", boosting_type="dart", max_depth = 16, n_estimators = 7999, subsample=0.7, colsample_bytree=0.7, 
    #                      reg_alpha=0.3, reg_lambda=0.1, verbose=0)
    lgb.fit(X_train, y_train, eval_metric="rmse", categorical_feature=cat_ft)
    lgb.booster_.save_model('Model/model_dart_{}_new_col.txt'.format(index))
    # lgb_1.fit(X_train_1, y_train, eval_metric="rmse", categorical_feature=["climateregions__climateregion", "idx"])
    
    result_train = mean_squared_error(y_train, lgb.predict(X_train), squared=False)
    result_valid = mean_squared_error(y_valid, lgb.predict(X_valid), squared=False)    
    
    # result_train_1 = mean_squared_error(y_train, lgb_1.predict(X_train_1), squared=False)
    # result_valid_1 = mean_squared_error(y_valid, lgb_1.predict(X_valid_1), squared=False)  
    
    ypred_test = lgb.predict(X_test) 
    # ypred_test_1 = lgb_1.predict(X_test_1)
    
    dict_result[index] = ypred_test
    # dict_result_1[index] = ypred_test_1
    
    print("Take time: ", time.time() - t)
    print("Train_score: {}  Valid_score: {}".format(result_train, result_valid))
    # print("Train_score_: {}  Valid_score_: {}".format(result_train_1, result_valid_1))
    print("-------------")

Training - 7
You can set `force_col_wise=true` to remove the overhead.
Take time:  1703.636391878128
Train_score: 0.20278565018760022  Valid_score: 1.3858856163857631
-------------
Training - 8
You can set `force_col_wise=true` to remove the overhead.
Take time:  1762.5024526119232
Train_score: 0.20049359313636209  Valid_score: 1.3339288349648584
-------------
Training - 9
You can set `force_col_wise=true` to remove the overhead.
Take time:  1724.1370990276337
Train_score: 0.2042259694886196  Valid_score: 1.0678762131464805
-------------
Training - 10
You can set `force_col_wise=true` to remove the overhead.
Take time:  1739.1508567333221
Train_score: 0.20709063237992448  Valid_score: 0.6714003834891756
-------------
Training - 11
You can set `force_col_wise=true` to remove the overhead.
Take time:  1905.7250380516052
Train_score: 0.2054750504896315  Valid_score: 0.6104590195280357
-------------


In [42]:
result_valid

0.6683940744711073

In [24]:
dict_result

{6: True,
 0: array([29.28658974, 29.31585449, 29.34350867, ...,  6.11133959,
         6.24849004,  6.4601141 ]),
 1: array([29.28316238, 29.25429377, 29.35201372, ...,  5.97778178,
         5.9121879 ,  6.26908729]),
 2: array([29.35235617, 29.34094262, 29.25909558, ...,  5.47735864,
         5.63094354,  5.84737485]),
 3: array([29.36262103, 29.33616504, 29.35179165, ...,  5.73280944,
         5.59667642,  5.83948443]),
 4: array([29.36797508, 29.3537477 , 29.32892286, ...,  5.31669253,
         5.2358517 ,  5.76573137]),
 5: array([29.24411431, 29.27578715, 29.28016752, ...,  5.59031142,
         5.77926369,  5.95384459]),
 7: array([29.41356579, 29.48854148, 29.43402263, ...,  5.41660592,
         5.60725419,  5.89594262]),
 8: array([29.30903829, 29.31070231, 29.31781805, ...,  5.48192629,
         5.62803973,  5.99663347]),
 9: array([29.39679769, 29.4249583 , 29.45106257, ...,  6.19095759,
         6.25731831,  6.4750638 ]),
 10: array([29.38822738, 29.34749864, 29.31376684, ...

In [26]:
ypred_test = np.mean([value for key, value in dict_result.items() if key == 11], axis=0)
# ypred_test_early = np.mean([value for key, value in dict_result_early.items()], axis=0)
pd.DataFrame(data = {"{}".format(target_): ypred_test, "index": test_index}).to_csv("Data/submit_tunning.csv", index=False)

In [19]:
mean_squared_error(y_valid, lgb.predict(X_valid), squared=False)

0.6302122355070707

In [50]:
dict(zip(lgb.feature_name_, lgb.feature_importances_))

{'lat': 561,
 'lon': 521,
 'contest-pevpr-sfc-gauss-14d__pevpr': 4463,
 'contest-wind-h10-14d__wind-hgt-10': 212,
 'contest-rhum-sig995-14d__rhum': 340,
 'nmme-prate-34w__cancm3': 154,
 'nmme-prate-34w__cancm4': 127,
 'nmme-prate-34w__ccsm3': 195,
 'nmme-prate-34w__ccsm4': 162,
 'nmme-prate-34w__cfsv2': 174,
 'nmme-prate-34w__gfdl': 543,
 'nmme-prate-34w__gfdlflora': 134,
 'nmme-prate-34w__gfdlflorb': 241,
 'nmme-prate-34w__nasa': 152,
 'nmme-prate-34w__nmmemean': 65,
 'contest-wind-h100-14d__wind-hgt-100': 2652,
 'nmme0-prate-56w__cancm30': 155,
 'nmme0-prate-56w__cancm40': 173,
 'nmme0-prate-56w__ccsm30': 400,
 'nmme0-prate-56w__ccsm40': 99,
 'nmme0-prate-56w__cfsv20': 136,
 'nmme0-prate-56w__gfdlflora0': 105,
 'nmme0-prate-56w__gfdlflorb0': 120,
 'nmme0-prate-56w__gfdl0': 103,
 'nmme0-prate-56w__nasa0': 98,
 'nmme0-prate-56w__nmme0mean': 92,
 'nmme0-prate-34w__cancm30': 349,
 'nmme0-prate-34w__cancm40': 234,
 'nmme0-prate-34w__ccsm30': 294,
 'nmme0-prate-34w__ccsm40': 87,
 'nmme0-pr

In [51]:
# lgb_ = lightgbm.Booster(model_file="Model/model_dart_fe_6.txt")
dict_ = dict(zip(lgb.feature_name_, lgb.feature_importances_))
sorted(dict_.items(), key=lambda x: x[1], reverse=True), len(dict_)

([('idx', 35137),
  ('contest-wind-h500-14d__wind-hgt-500', 8391),
  ('contest-slp-14d__slp', 5067),
  ('nmme-tmp2m-56w__nmmemean_bin', 4611),
  ('contest-pevpr-sfc-gauss-14d__pevpr', 4463),
  ('contest-prwtr-eatm-14d__prwtr', 3993),
  ('nmme-tmp2m-34w__nmmemean_bin', 3747),
  ('contest-pevpr-sfc-gauss-14d__pevpr_lat_mean', 3642),
  ('contest-wind-h100-14d__wind-hgt-100', 2652),
  ('climateregions__climateregion', 2455),
  ('contest-prwtr-eatm-14d__prwtr_rolling_mean', 2112),
  ('contest-pevpr-sfc-gauss-14d__pevpr_climateregions__climateregion_mean',
   1814),
  ('contest-prwtr-eatm-14d__prwtr_lat_mean', 1710),
  ('contest-wind-h500-14d__wind-hgt-500_rolling_mean', 1582),
  ('elevation__elevation', 1448),
  ('contest-prwtr-eatm-14d__prwtr_lag_2', 1335),
  ('contest-prwtr-eatm-14d__prwtr_climateregions__climateregion_mean', 1292),
  ('contest-pevpr-sfc-gauss-14d__pevpr_rolling_mean', 1199),
  ('contest-wind-h500-14d__wind-hgt-500_climateregions__climateregion_mean',
   1189),
  ('contes

In [1]:
check = \
{'colsample_bytree': 0.85,
 'max_depth': 10,
 'min_split_gain': 0.001,
 'num_leaves': 21.46,
 'reg_alpha': 2.08,
 'reg_lambda': 0.1,
 'subsample': 0.85}

In [None]:
with open("Params_tunning/dict_8.pickle".format(index), "wb") as file:
    check = pickle.load(file)

In [13]:
lgb = lightgbm.Booster(model_file='Model/model_4.txt')

In [76]:
lgb.feature_importance_

AttributeError: 'LGBMRegressor' object has no attribute 'feature_importance_'

In [17]:
lgb.feature_name()

['lat',
 'lon',
 'contest-pevpr-sfc-gauss-14d__pevpr',
 'contest-wind-h10-14d__wind-hgt-10',
 'contest-rhum-sig995-14d__rhum',
 'nmme-prate-34w__cancm3',
 'nmme-prate-34w__cancm4',
 'nmme-prate-34w__ccsm3',
 'nmme-prate-34w__ccsm4',
 'nmme-prate-34w__cfsv2',
 'nmme-prate-34w__gfdl',
 'nmme-prate-34w__gfdlflora',
 'nmme-prate-34w__gfdlflorb',
 'nmme-prate-34w__nasa',
 'nmme-prate-34w__nmmemean',
 'contest-wind-h100-14d__wind-hgt-100',
 'nmme0-prate-56w__cancm30',
 'nmme0-prate-56w__cancm40',
 'nmme0-prate-56w__ccsm30',
 'nmme0-prate-56w__ccsm40',
 'nmme0-prate-56w__cfsv20',
 'nmme0-prate-56w__gfdlflora0',
 'nmme0-prate-56w__gfdlflorb0',
 'nmme0-prate-56w__gfdl0',
 'nmme0-prate-56w__nasa0',
 'nmme0-prate-56w__nmme0mean',
 'nmme0-prate-34w__cancm30',
 'nmme0-prate-34w__cancm40',
 'nmme0-prate-34w__ccsm30',
 'nmme0-prate-34w__ccsm40',
 'nmme0-prate-34w__cfsv20',
 'nmme0-prate-34w__gfdlflora0',
 'nmme0-prate-34w__gfdlflorb0',
 'nmme0-prate-34w__gfdl0',
 'nmme0-prate-34w__nasa0',
 'nmme0-pra

In [40]:
a = pd.DataFrame(data=np.stack((lgb.feature_name(), lgb.feature_importance().tolist()), axis=1), columns=['feature_name', 'score'])
a['score'] = a['score'].astype('int64')
a.sort_values(by=['score'], ascending=False).head(30)

Unnamed: 0,feature_name,score
202,idx,77689
55,contest-wind-h500-14d__wind-hgt-500,18474
50,contest-prwtr-eatm-14d__prwtr,15279
36,contest-slp-14d__slp,13168
15,contest-wind-h100-14d__wind-hgt-100,12305
2,contest-pevpr-sfc-gauss-14d__pevpr,11419
48,contest-pres-sfc-gauss-14d__pres,7654
56,climateregions__climateregion,6329
57,elevation__elevation,4921
162,wind-hgt-10-2010-1,4106


In [31]:
np.stack((lgb.feature_name(), lgb.feature_importance().tolist()), axis=1).shape

(203, 2)

In [118]:
dict_stat = {}
for each in range(0, 10):
    save_model = lightgbm.Booster(model_file="Model/model_{}.txt".format(each))
    dict_ = dict(zip(save_model.feature_name(), save_model.feature_importance()))
    for key, value in dict_.items():
        if key in col_test and value > 400:
            if key not in dict_stat:
                dict_stat[key] = 0
            dict_stat[key] += 1

In [119]:
len(dict_stat)

50

In [120]:
dict_stat

{'wind-vwnd-250-2010-1': 5,
 'wind-vwnd-250-2010-11': 10,
 'wind-vwnd-250-2010-13': 10,
 'wind-uwnd-250-2010-2': 8,
 'wind-uwnd-250-2010-3': 10,
 'wind-uwnd-250-2010-4': 10,
 'wind-uwnd-250-2010-10': 1,
 'wind-hgt-850-2010-1': 10,
 'sst-2010-1': 10,
 'sst-2010-2': 10,
 'sst-2010-4': 1,
 'sst-2010-5': 3,
 'wind-hgt-500-2010-1': 10,
 'icec-2010-1': 9,
 'icec-2010-2': 7,
 'icec-2010-4': 9,
 'icec-2010-7': 7,
 'icec-2010-8': 10,
 'wind-uwnd-925-2010-3': 10,
 'wind-uwnd-925-2010-14': 9,
 'wind-uwnd-925-2010-15': 1,
 'wind-hgt-10-2010-1': 10,
 'wind-hgt-10-2010-2': 10,
 'wind-hgt-10-2010-5': 4,
 'wind-hgt-100-2010-1': 10,
 'wind-hgt-100-2010-2': 10,
 'wind-hgt-100-2010-9': 2,
 'wind-vwnd-925-2010-1': 10,
 'wind-vwnd-925-2010-3': 8,
 'wind-vwnd-925-2010-6': 9,
 'wind-vwnd-925-2010-7': 10,
 'wind-uwnd-250-2010-15': 7,
 'wind-uwnd-250-2010-16': 6,
 'sst-2010-3': 3,
 'sst-2010-8': 5,
 'sst-2010-9': 9,
 'sst-2010-10': 3,
 'wind-uwnd-925-2010-13': 8,
 'wind-hgt-100-2010-7': 2,
 'wind-vwnd-925-2010