In [1]:
from lightgbm import LGBMRegressor
# from catboost import CatBoostRegressor
import lightgbm
# from boruta import BorutaPy
import pandas as pd
import numpy as np
# from fastai.tabular.core import df_shrink
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.feature_selection import SequentialFeatureSelector
import time
import matplotlib.pyplot as plt
import os

In [2]:
path_train = "data/train_data.csv"
path_test = "data/test_data.csv"
target_ = "contest-tmp2m-14d__tmp2m"

In [3]:
def sin_transformer(period):
    return FunctionTransformer(lambda x: np.sin(x / period * 2 * np.pi))

def cos_transformer(period):
    return FunctionTransformer(lambda x: np.cos(x / period * 2 * np.pi))

def handle_idx_date(df, column_date, idx_name):
    def get_idx(lat, lon):
        return str(round(lat, 4)) + "_" + str(round(lon, 4))
    df[idx_name] = np.vectorize(get_idx)(df['lat'], df['lon'])
    df[column_date] = pd.to_datetime(df[column_date])
    df['day_of_year'] = df[column_date].dt.day_of_year
    df['month'] = df[column_date].dt.month
      # encode the day with a period of 365
    df['day_of_year_sin'] = sin_transformer(365).fit_transform(df['day_of_year'])
    df['day_of_year_cos'] = cos_transformer(365).fit_transform(df['day_of_year'])

    # encode the month with a period of 12
    df['month_sin'] = sin_transformer(12).fit_transform(df['month'])
    df['month_cos'] = cos_transformer(12).fit_transform(df['month'])
    return df

def handle_feature_train_data(df, column_date="startdate", columns_cat = [], idx_name="idx"):
    df = handle_idx_date(df, column_date, idx_name)
    df = df.drop(columns = [column_date])
    columns_cat.append(idx_name)
    list_lbEncoder = []

    for each in columns_cat:
        lbE = LabelEncoder().fit(df[each])
        df[each] = lbE.transform(df[each])
        list_lbEncoder.append(lbE)

    # fill nulls
    mean_df = df.mean()
    df = df.fillna(mean_df)
    df[columns_cat] = df[columns_cat].astype("category")

    return df, list_lbEncoder, mean_df

def handle_feature_test_data(df, lbEncoder, mean_df, column_date="startdate", columns_cat = [], idx_name="idx"):
    df = handle_idx_date(df, column_date, idx_name)
    df = df.drop(columns = [column_date])
    columns_cat.append(idx_name)
    list_lbEncoder = []

    for index, each in enumerate(columns_cat):
        df[each] = lbEncoder[index].transform(df[each])

    df = df.fillna(mean_df)
    df[columns_cat] = df[columns_cat].astype("category")
    return df

In [4]:
def calculate_corr_target(X_train, X_val):
    data_col = []
    data_corr_train = []
    data_corr_val = []

    for col in X_train.columns:
        corr_train = X_train['contest-tmp2m-14d__tmp2m'].corr(X_train[col])
        corr_val = X_val['contest-tmp2m-14d__tmp2m'].corr(X_val[col])

        data_col.append(col)
        data_corr_train.append(corr_train)
        data_corr_val.append(corr_val)

    corr = pd.DataFrame(data={'col': data_col, 'corr_train': data_corr_train, 'corr_val': data_corr_val})

    return corr

In [5]:
df_train = pd.read_csv(path_train)
df_test = pd.read_csv(path_test)

date_col = "startdate"

df_train[date_col] = pd.to_datetime(df_train[date_col])
df_test[date_col] = pd.to_datetime(df_test[date_col])

In [6]:
corr_df = pd.read_csv("data/correlations_with_target.csv").drop(columns='Unnamed: 0')
drop_col = corr_df["col"].values
drop_col = [each for each in drop_col if "contest" not in each and "wind" not in each]

In [12]:
year_valid = [2015, 2015, 2015, 2015, 2015, 2015, 2016, 2016, 2016, 2016]
month_valid = [1, 3, 5, 7, 9, 11, 1, 3, 5, 7]
dict_result = {}
for index in range(len(year_valid)):
    if index in dict_result:
        continue
    y, m_s, m_e = year_valid[index], month_valid[index], month_valid[index] + 1
    X_valid = df_train[(df_train[date_col].dt.year == y) & (df_train[date_col].dt.month >= m_s) & (df_train[date_col].dt.month <= m_e)]
    X_train = df_train[~((df_train[date_col].dt.year == y) & (df_train[date_col].dt.month >= m_s) & (df_train[date_col].dt.month <= m_e))]
    len_train = len(X_train)

    y_train = X_train[target_]
    X_train = X_train.drop(columns=[target_])
    y_valid = X_valid[target_]
    X_valid = X_valid.drop(columns=[target_])

    cat_cols = [i for i in X_train.select_dtypes(include='object').columns if i != date_col]
    X_train, listEncoder, df_mean = handle_feature_train_data(X_train, date_col, cat_cols.copy())
    X_valid = handle_feature_test_data(X_valid, listEncoder, df_mean, columns_cat=cat_cols.copy())
    # X_test = handle_feature_test_data(X_test, listEncoder, df_mean, columns_cat=cat_cols.copy())

    drop_ = [*drop_col, *["month", "day_of_year", "day_of_year_sin", "day_of_year_cos", "month_sin", "month_cos"]]
    # X_train = X_train.drop(columns=drop_)
    # X_valid = X_valid.drop(columns=drop_)
    # X_test = X_test.drop(columns=drop_)

    X_final = pd.concat([X_train, X_valid]).reset_index(drop=True)
    y_final = np.concatenate((y_train, y_valid))
    train_idx = X_final.loc[:len_train].index.tolist()
    valid_idx = X_final.loc[len_train:].index.tolist()

    print("Training - {}".format(index))
    t= time.time()
    lgb = LGBMRegressor(metric="rmse", max_depth = 15, n_estimators = 2999, subsample=0.7, colsample_bytree=0.7, verbose=0)
    # selector = RFECV(lgb, step=5, cv=[(train_idx, valid_idx)], min_features_to_select=150, scoring="neg_root_mean_squared_error")
    selector = SequentialFeatureSelector(lgb, cv=[(train_idx, valid_idx)], direction="backward", n_features_to_select=150, scoring="neg_root_mean_squared_error")
    selector = selector.fit(X_final, y_final)
    dict_result[index] = selector
    print("Take time: ", time.time() - t)
    print("-------------")
    break

Training - 0
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set

KeyboardInterrupt: 

(375734, 136)

In [87]:
# dict_result = {}
#
# year_valid = [2015, 2015, 2015, 2015, 2015, 2015, 2016, 2016, 2016, 2016]
# month_valid = [1, 3, 5, 7, 9, 11, 1, 3, 5, 7]
#
# for index in range(len(year_valid)):
#     if index in dict_result:
#         continue
#     y, m_s, m_e = year_valid[index], month_valid[index], month_valid[index] + 1
#
#     print("Training - {}".format(index))
#     print(f'Test on month {m_s} and {m_e}')
#
#     X_valid = df_train[(df_train[date_col].dt.year == y) & (df_train[date_col].dt.month >= m_s) & (df_train[date_col].dt.month <= m_e)]
#     X_train = df_train[~((df_train[date_col].dt.year == y) & (df_train[date_col].dt.month >= m_s) & (df_train[date_col].dt.month <= m_e))]
#
#     y_train = X_train[target_]
#     # X_train = X_train.drop(columns=target_)
#     y_valid = X_valid[target_]
#     # X_valid = X_valid.drop(columns=target_)
#
#     test_index = df_test["index"].values
#     X_test = df_test.copy()
#
#     cat_cols = [i for i in X_train.select_dtypes(include='object').columns if i != date_col]
#     X_train, listEncoder, df_mean = handle_feature_train_data(X_train, date_col, cat_cols.copy())
#     X_valid = handle_feature_test_data(X_valid, listEncoder, df_mean, columns_cat=cat_cols.copy())
#     X_test = handle_feature_test_data(X_test, listEncoder, df_mean, columns_cat=cat_cols.copy())
#
#     drop_ = [*drop_col, *["month", "day_of_year", "day_of_year_sin", "day_of_year_cos", "month_sin", "month_cos"]]
#     # X_train = X_train.drop(columns=drop_)
#     # X_valid = X_valid.drop(columns=drop_)
#     # X_test = X_test.drop(columns=drop_)
#
#     print(f'{time.time()}: Calculating correlations...')
#     corr_with_target = calculate_corr_target(X_train, X_valid)
#     print(f'{time.time()}: Calculated!')
#
#     X_train = X_train.drop(columns=target_)
#     X_valid = X_valid.drop(columns=target_)
#
#     t = time.time()
#     lgb = LGBMRegressor(max_depth=15, n_estimators=4999, subsample=0.75, colsample_bytree=0.75, verbose=0, n_jobs=-1)
#     lgb.fit(X_train, y_train, eval_metric="rmse")
#     # lgb = CatBoostRegressor(verbose=200, cat_features=["climateregions__climateregion", "idx"])
#     # lgb.fit(X_train, y_train)
#
#     # feature importance
#     importances = lgb.feature_importances_
#     data = {'col': X_train.columns, 'imp': importances}
#     ft_imp_df = pd.DataFrame(data)
#     ft_imp_corr_df = pd.merge(ft_imp_df, corr_with_target, on='col', how='left')
#     ft_imp_corr_df.to_csv(f'result_2/ft_imp_corr_{index}.csv', index=False)
#     print(f'{time.time()}: Feature importance and correlations results are saved!')
#
#     result_train = mean_squared_error(y_train, lgb.predict(X_train), squared=False)
#     result_valid = mean_squared_error(y_valid, lgb.predict(X_valid), squared=False)
#     ypred_test = lgb.predict(X_test)
#     dict_result[index] = ypred_test
#     print("Take time: ", time.time() - t)
#     print("Train_score: {}  Valid_score: {}".format(result_train, result_valid))
#     print("-------------")

Training - 0
Test on month 1 and 2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[idx_name] = np.vectorize(get_idx)(df['lat'], df['lon'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_date] = pd.to_datetime(df[column_date])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['day_of_year'] = df[column_date].dt.day_of_year
A value is trying to be set on a copy of a sl

1674635451.622077: Calculating correlations...
1674635452.4085288: Calculated!
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
1674635606.5863261: Feature importance and correlations results are saved!
Take time:  185.25930881500244
Train_score: 0.11451156242676554  Valid_score: 2.112103853404661
-------------
Training - 1
Test on month 3 and 4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[idx_name] = np.vectorize(get_idx)(df['lat'], df['lon'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_date] = pd.to_datetime(df[column_date])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['day_of_year'] = df[column_date].dt.day_of_year
A value is trying to be set on a copy of a sl

1674635653.8154922: Calculating correlations...
1674635654.63269: Calculated!
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
1674635808.6141021: Feature importance and correlations results are saved!
Take time:  185.35418009757996
Train_score: 0.11894537870937816  Valid_score: 1.7367044447260191
-------------
Training - 2
Test on month 5 and 6


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[idx_name] = np.vectorize(get_idx)(df['lat'], df['lon'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_date] = pd.to_datetime(df[column_date])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['day_of_year'] = df[column_date].dt.day_of_year
A value is trying to be set on a copy of a sl

1674635859.013603: Calculating correlations...
1674635859.759011: Calculated!
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
1674636013.026348: Feature importance and correlations results are saved!
Take time:  184.4722249507904
Train_score: 0.11846630556316702  Valid_score: 1.4356677749129527
-------------
Training - 3
Test on month 7 and 8


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[idx_name] = np.vectorize(get_idx)(df['lat'], df['lon'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_date] = pd.to_datetime(df[column_date])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['day_of_year'] = df[column_date].dt.day_of_year
A value is trying to be set on a copy of a sl

1674636062.244896: Calculating correlations...
1674636062.937873: Calculated!
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
1674636227.347255: Feature importance and correlations results are saved!
Take time:  199.74548721313477
Train_score: 0.12149117392163393  Valid_score: 0.9817175380050631
-------------
Training - 4
Test on month 9 and 10


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[idx_name] = np.vectorize(get_idx)(df['lat'], df['lon'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_date] = pd.to_datetime(df[column_date])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['day_of_year'] = df[column_date].dt.day_of_year
A value is trying to be set on a copy of a sl

1674636281.6059031: Calculating correlations...
1674636282.36499: Calculated!
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
1674636441.359693: Feature importance and correlations results are saved!
Take time:  194.7338900566101
Train_score: 0.12053826381281423  Valid_score: 1.1226059532856862
-------------
Training - 5
Test on month 11 and 12


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[idx_name] = np.vectorize(get_idx)(df['lat'], df['lon'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_date] = pd.to_datetime(df[column_date])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['day_of_year'] = df[column_date].dt.day_of_year
A value is trying to be set on a copy of a sl

1674636496.737704: Calculating correlations...
1674636497.547727: Calculated!
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
1674636661.0191948: Feature importance and correlations results are saved!
Take time:  196.76520085334778
Train_score: 0.1133938925655624  Valid_score: 2.0570541442106043
-------------
Training - 6
Test on month 1 and 2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[idx_name] = np.vectorize(get_idx)(df['lat'], df['lon'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_date] = pd.to_datetime(df[column_date])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['day_of_year'] = df[column_date].dt.day_of_year
A value is trying to be set on a copy of a sl

1674636713.198588: Calculating correlations...
1674636714.005283: Calculated!
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
1674636867.813909: Feature importance and correlations results are saved!
Take time:  184.40539574623108
Train_score: 0.11473893492446309  Valid_score: 1.9286350197573239
-------------
Training - 7
Test on month 3 and 4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[idx_name] = np.vectorize(get_idx)(df['lat'], df['lon'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_date] = pd.to_datetime(df[column_date])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['day_of_year'] = df[column_date].dt.day_of_year
A value is trying to be set on a copy of a sl

1674636913.3970938: Calculating correlations...
1674636914.142941: Calculated!
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
1674637060.858525: Feature importance and correlations results are saved!
Take time:  176.14069604873657
Train_score: 0.11950392748036408  Valid_score: 1.1589686753663397
-------------
Training - 8
Test on month 5 and 6


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[idx_name] = np.vectorize(get_idx)(df['lat'], df['lon'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_date] = pd.to_datetime(df[column_date])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['day_of_year'] = df[column_date].dt.day_of_year
A value is trying to be set on a copy of a sl

1674637106.96256: Calculating correlations...
1674637107.595371: Calculated!
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
1674637252.881: Feature importance and correlations results are saved!
Take time:  175.34633922576904
Train_score: 0.12050284056416805  Valid_score: 0.8891652273415266
-------------
Training - 9
Test on month 7 and 8


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[idx_name] = np.vectorize(get_idx)(df['lat'], df['lon'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_date] = pd.to_datetime(df[column_date])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['day_of_year'] = df[column_date].dt.day_of_year
A value is trying to be set on a copy of a sl

1674637296.6783218: Calculating correlations...
1674637297.434967: Calculated!
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
1674637443.632876: Feature importance and correlations results are saved!
Take time:  176.94623112678528
Train_score: 0.11973204074015498  Valid_score: 1.0813696997795912
-------------


In [11]:
ypred_test = np.mean([value for key, value in dict_result.items()], axis=0)

In [12]:
# pd.DataFrame(data = {"{}".format(target_): ypred_test, "index": test_index}).to_csv("submit.csv", index=False)

# Combine ft_imp_corr of mutiple models

In [88]:
result_2_path = 'result_2'

combined_ft_imp_corr_df = pd.DataFrame()

for idx, file_path in enumerate(os.listdir(result_2_path)):
    print('--------------')
    print(f'{idx}. Processing...')
    df_tmp = pd.read_csv(os.path.join(result_2_path, file_path))
    df_tmp['fold'] = idx

    combined_ft_imp_corr_df = pd.concat([combined_ft_imp_corr_df, df_tmp])

    print('Done')

--------------
0. Processing...
Done
--------------
1. Processing...
Done
--------------
2. Processing...
Done
--------------
3. Processing...
Done
--------------
4. Processing...
Done
--------------
5. Processing...
Done
--------------
6. Processing...
Done
--------------
7. Processing...
Done
--------------
8. Processing...
Done
--------------
9. Processing...
Done
--------------
10. Processing...
Done


In [89]:
combined_ft_imp_corr_df.to_csv(os.path.join(result_2_path, 'total_ft_imp_corr.csv'))