In [1]:
from lightgbm import LGBMRegressor
# from catboost import CatBoostRegressor
import lightgbm
# from boruta import BorutaPy
import pandas as pd
import numpy as np
# from fastai.tabular.core import df_shrink
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error
from sklearn.linear_model import Ridge, LinearRegression
import time
from datetime import datetime
import matplotlib.pyplot as plt
import os

In [2]:
path_train = "data/train_data.csv"
path_test = "data/test_data.csv"
target_ = "contest-tmp2m-14d__tmp2m"

In [3]:
df_train = pd.read_csv(path_train)
df_test = pd.read_csv(path_test)

date_col = "startdate"

df_train[date_col] = pd.to_datetime(df_train[date_col])
df_test[date_col] = pd.to_datetime(df_test[date_col])

In [27]:
def sin_transformer(period):
    return FunctionTransformer(lambda x: np.sin(x / period * 2 * np.pi))

def cos_transformer(period):
    return FunctionTransformer(lambda x: np.cos(x / period * 2 * np.pi))

def fourier_series(dates, period, series_order):
    """Provides Fourier series components with the specified frequency
    and order.
    Parameters
    ----------
    dates: pd.Series containing timestamps.
    period: Number of days of the period.
    series_order: Number of components.
    Returns
    -------
    Matrix with seasonality features.
    """
    # convert to days since epoch
    t = np.array(
        (dates - datetime(1970, 1, 1))
            .dt.total_seconds()
            .astype(float)
    ) / (3600 * 24.)

    return np.column_stack([
        fun((2.0 * (i + 1) * np.pi * t / period))
        for i in range(series_order)
        for fun in (np.sin, np.cos)
    ])

def make_seasonality_features(dates, period, series_order, prefix):
    """Data frame with seasonality features.
    Parameters
    ----------
    cls: Prophet class.
    dates: pd.Series containing timestamps.
    period: Number of days of the period.
    series_order: Number of components.
    prefix: Column name prefix.
    Returns
    -------
    pd.DataFrame with seasonality features.
    """
    features = fourier_series(dates, period, series_order)
    columns = [
        '{}_delim_{}'.format(prefix, i + 1)
        for i in range(features.shape[1])
    ]
    return pd.DataFrame(features, columns=columns)

def fourier_transform(df):

    def reconvert(df):
        if not os.path.exists('delete_later'):
            os.mkdir('delete_later')

        df.to_csv('delete_later/df.csv')
        df = pd.read_csv('delete_later/df.csv')

        return df

    if not os.path.exists('fourier'):
        os.mkdir('fourier')

    seasonalities = {
        'daily': {
            'period': 1,
            'fourier_order': 4,
        }, 'weekly': {
            'period': 7,
            'fourier_order': 3,
        }, 'monthly': {
            'period': 30.5,
            'fourier_order': 10,
        }, 'yearly': {
            'period': 365.5,
            'fourier_order': 10,
        }}

    for name, props in seasonalities.items():

        df['startdate']  = pd.to_datetime(df['startdate'])

        t = np.array(
            (df['startdate'] - datetime(1970, 1, 1))
                .dt.total_seconds()
                .astype(float)
        ) / (3600 * 24.)

        series_order = props['fourier_order']
        period = props['period']

        s = np.column_stack([
                fun((2.0 * (i + 1) * np.pi * t / period))
                for i in range(series_order)
                for fun in (np.sin, np.cos)
            ])

        columns = [
            '{}_delim_{}'.format(name, i + 1)
            for i in range(s.shape[1])
        ]

        features = pd.DataFrame(s, columns=columns)

        df = pd.concat([reconvert(df), reconvert(features)], axis=1)

    return df

def handle_idx_date(df, column_date, idx_name):
    def get_idx(lat, lon):
        return str(round(lat, 4)) + "_" + str(round(lon, 4))

    df.loc[:, idx_name] = np.vectorize(get_idx)(df['lat'], df['lon'])
    df.loc[:, column_date] = pd.to_datetime(df[column_date])
    df.loc[:, 'day_of_year'] = df[column_date].dt.day_of_year
    df.loc[:, 'month'] = df[column_date].dt.month
      # encode the day with a period of 365
    df.loc[:, 'day_of_year_sin'] = sin_transformer(365).fit_transform(df['day_of_year'])
    df.loc[:, 'day_of_year_cos'] = cos_transformer(365).fit_transform(df['day_of_year'])

    # encode the month with a period of 12
    df.loc[:, 'month_sin'] = sin_transformer(12).fit_transform(df['month'])
    df.loc[:, 'month_cos'] = cos_transformer(12).fit_transform(df['month'])

    df = fourier_transform(df)

    return df

def handle_feature_train_data(df, column_date="startdate", columns_cat = [], idx_name="idx"):
    df = handle_idx_date(df, column_date, idx_name)
    df = df.drop(columns = [column_date])
    columns_cat.append(idx_name)
    list_lbEncoder = []

    for each in columns_cat:
        lbE = LabelEncoder().fit(df[each])
        df[each] = lbE.transform(df[each])
        list_lbEncoder.append(lbE)

    # fill nulls
    mean_df = df.mean()
    df = df.fillna(mean_df)
    df[columns_cat] = df[columns_cat].astype("category")

    return df, list_lbEncoder, mean_df

def handle_feature_test_data(df, lbEncoder, mean_df, column_date="startdate", columns_cat = [], idx_name="idx"):
    df = handle_idx_date(df, column_date, idx_name)
    df = df.drop(columns = [column_date])
    columns_cat.append(idx_name)
    list_lbEncoder = []

    for index, each in enumerate(columns_cat):
        df[each] = lbEncoder[index].transform(df[each])

    df = df.fillna(mean_df)
    df[columns_cat] = df[columns_cat].astype("category")

    return df

# Test

In [13]:
X_train = df_train[~((df_train[date_col].dt.year == y) & (df_train[date_col].dt.month >= m_s) & (df_train[date_col].dt.month <= m_e))]
t = np.array(
        (X_train['startdate'] - datetime(1970, 1, 1))
            .dt.total_seconds()
            .astype(float)
    ) / (3600 * 24.)

In [15]:
seasonalities = {
    'daily': {
        'period': 1,
        'fourier_order': 4,
    }, 'weekly': {
        'period': 7,
        'fourier_order': 3,
    }, 'monthly': {
        'period': 30.5,
        'fourier_order': 15,
    }, 'yearly': {
        'period': 365.5,
        'fourier_order': 10,
    }
}

date_col = 'startdate'
y = 2015
m_s = 1
m_e = 2

X_valid = df_train[(df_train[date_col].dt.year == y) & (df_train[date_col].dt.month >= m_s) & (df_train[date_col].dt.month <= m_e)]
X_train = df_train[~((df_train[date_col].dt.year == y) & (df_train[date_col].dt.month >= m_s) & (df_train[date_col].dt.month <= m_e))]

In [17]:
def reconvert(df):
    if not os.path.exists('delete_later'):
        os.mkdir('delete_later')

    df.to_csv('delete_later/df.csv')
    df = pd.read_csv('delete_later/df.csv')

    return df

In [25]:
def calculate_corr_target(X_train, X_val):
    data_col = []
    data_corr_train = []
    data_corr_val = []

    for col in X_train.columns:
        corr_train = X_train['contest-tmp2m-14d__tmp2m'].corr(X_train[col])
        corr_val = X_val['contest-tmp2m-14d__tmp2m'].corr(X_val[col])

        data_col.append(col)
        data_corr_train.append(corr_train)
        data_corr_val.append(corr_val)

    corr = pd.DataFrame(data={'col': data_col, 'corr_train': data_corr_train, 'corr_val': data_corr_val})

    return corr

In [39]:
corr_df = pd.read_csv("data/correlations_with_target.csv") \
            .drop(columns='Unnamed: 0')
corr_df = corr_df[(corr_df['corr'] >= 0.7)  | (corr_df['corr'] <= -0.7)]
drop_col = corr_df["col"].values
drop_col = [each for each in drop_col if "contest" not in each and "wind" not in each]

In [41]:
dict_result = {}

year_valid = [2015, 2015, 2015, 2015, 2015, 2015, 2016, 2016, 2016, 2016]
month_valid = [1, 3, 5, 7, 9, 11, 1, 3, 5, 7]

for index in range(len(year_valid)):
    if index in dict_result:
        continue
    y, m_s, m_e = year_valid[index], month_valid[index], month_valid[index] + 1

    print("Training - {}".format(index))
    print(f'Test on month {m_s} and {m_e}')

    X_valid = df_train[(df_train[date_col].dt.year == y) & (df_train[date_col].dt.month >= m_s) & (df_train[date_col].dt.month <= m_e)]
    X_train = df_train[~((df_train[date_col].dt.year == y) & (df_train[date_col].dt.month >= m_s) & (df_train[date_col].dt.month <= m_e))]

    y_train = X_train[target_]
    # X_train = X_train.drop(columns=target_)
    y_valid = X_valid[target_]
    # X_valid = X_valid.drop(columns=target_)

    test_index = df_test["index"].values
    X_test = df_test.copy()

    cat_cols = [i for i in X_train.select_dtypes(include='object').columns if i != date_col]
    X_train, listEncoder, df_mean = handle_feature_train_data(X_train, date_col, cat_cols.copy())
    X_valid = handle_feature_test_data(X_valid, listEncoder, df_mean, columns_cat=cat_cols.copy())
    X_test = handle_feature_test_data(X_test, listEncoder, df_mean, columns_cat=cat_cols.copy())

    drop_ = [*drop_col, *["month", "day_of_year", "day_of_year_sin", "day_of_year_cos", "month_sin", "month_cos"],
             *[col for col in X_train.columns if 'Unnamed' in col]]
    X_train = X_train.drop(columns=drop_)
    X_valid = X_valid.drop(columns=drop_)
    X_test = X_test.drop(columns=drop_)

    # print(f'{time.time()}: Calculating correlations...')
    # corr_with_target = calculate_corr_target(X_train, X_valid)
    # print(f'{time.time()}: Calculated!')

    X_train = X_train.drop(columns=target_)
    X_valid = X_valid.drop(columns=target_)

    t = time.time()
    lgb = LGBMRegressor(max_depth=15, n_estimators=4999, subsample=0.75, colsample_bytree=0.75, verbose=0, n_jobs=4)
    lgb.fit(X_train, y_train, eval_metric="rmse")

    # # feature importance
    # importances = lgb.feature_importances_
    # data = {'col': X_train.columns, 'imp': importances}
    # ft_imp_df = pd.DataFrame(data)
    # ft_imp_corr_df = pd.merge(ft_imp_df, corr_with_target, on='col', how='left')
    # ft_imp_corr_df.to_csv(f'result_2/ft_imp_corr_{index}.csv', index=False)
    # print(f'{time.time()}: Feature importance and correlations results are saved!')

    result_train = mean_squared_error(y_train, lgb.predict(X_train), squared=False)
    result_valid = mean_squared_error(y_valid, lgb.predict(X_valid), squared=False)
    ypred_test = lgb.predict(X_test)
    dict_result[index] = ypred_test
    print("Take time: ", time.time() - t)
    print("Train_score: {}  Valid_score: {}".format(result_train, result_valid))
    print("-------------")

Training - 0
Test on month 1 and 2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, idx_name] = np.vectorize(get_idx)(df['lat'], df['lon'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, column_date] = pd.to_datetime(df[column_date])
  df.loc[:, column_date] = pd.to_datetime(df[column_date])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'day_of_year'

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Take time:  198.26982021331787
Train_score: 0.12537691781999827  Valid_score: 1.730665004830701
-------------
Training - 1
Test on month 3 and 4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, idx_name] = np.vectorize(get_idx)(df['lat'], df['lon'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, column_date] = pd.to_datetime(df[column_date])
  df.loc[:, column_date] = pd.to_datetime(df[column_date])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'day_of_year'

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Take time:  193.1871190071106
Train_score: 0.12567755923751495  Valid_score: 1.2872121775093246
-------------
Training - 2
Test on month 5 and 6


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, idx_name] = np.vectorize(get_idx)(df['lat'], df['lon'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, column_date] = pd.to_datetime(df[column_date])
  df.loc[:, column_date] = pd.to_datetime(df[column_date])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'day_of_year'

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Take time:  200.1215271949768
Train_score: 0.1273184976239662  Valid_score: 0.8575639765132019
-------------
Training - 3
Test on month 7 and 8


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, idx_name] = np.vectorize(get_idx)(df['lat'], df['lon'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, column_date] = pd.to_datetime(df[column_date])
  df.loc[:, column_date] = pd.to_datetime(df[column_date])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'day_of_year'

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Take time:  209.8626687526703
Train_score: 0.12942281185565632  Valid_score: 0.7254012549374097
-------------
Training - 4
Test on month 9 and 10


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, idx_name] = np.vectorize(get_idx)(df['lat'], df['lon'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, column_date] = pd.to_datetime(df[column_date])
  df.loc[:, column_date] = pd.to_datetime(df[column_date])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'day_of_year'

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Take time:  201.16621208190918
Train_score: 0.12764954868083653  Valid_score: 0.8723171738319786
-------------
Training - 5
Test on month 11 and 12


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, idx_name] = np.vectorize(get_idx)(df['lat'], df['lon'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, column_date] = pd.to_datetime(df[column_date])
  df.loc[:, column_date] = pd.to_datetime(df[column_date])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'day_of_year'

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Take time:  199.46020984649658
Train_score: 0.12348121734274817  Valid_score: 1.7769127596599223
-------------
Training - 6
Test on month 1 and 2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, idx_name] = np.vectorize(get_idx)(df['lat'], df['lon'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, column_date] = pd.to_datetime(df[column_date])
  df.loc[:, column_date] = pd.to_datetime(df[column_date])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'day_of_year'

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Take time:  197.09385085105896
Train_score: 0.12383874717340487  Valid_score: 1.6657940837036274
-------------
Training - 7
Test on month 3 and 4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, idx_name] = np.vectorize(get_idx)(df['lat'], df['lon'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, column_date] = pd.to_datetime(df[column_date])
  df.loc[:, column_date] = pd.to_datetime(df[column_date])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'day_of_year'

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Take time:  198.70471501350403
Train_score: 0.12702027218256765  Valid_score: 1.2484727322408335
-------------
Training - 8
Test on month 5 and 6


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, idx_name] = np.vectorize(get_idx)(df['lat'], df['lon'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, column_date] = pd.to_datetime(df[column_date])
  df.loc[:, column_date] = pd.to_datetime(df[column_date])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'day_of_year'

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Take time:  201.82489585876465
Train_score: 0.12663211840034166  Valid_score: 0.8418620363065009
-------------
Training - 9
Test on month 7 and 8


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, idx_name] = np.vectorize(get_idx)(df['lat'], df['lon'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, column_date] = pd.to_datetime(df[column_date])
  df.loc[:, column_date] = pd.to_datetime(df[column_date])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'day_of_year'

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Take time:  201.74129223823547
Train_score: 0.12794722877563  Valid_score: 0.9245358214670082
-------------


In [34]:
ypred_test = np.mean([value for key, value in dict_result.items()], axis=0)

In [35]:
pd.DataFrame(data = {"{}".format(target_): ypred_test, "index": test_index}).to_csv("submit_with_fourier.csv", index=False)

In [36]:
importances = lgb.feature_importances_

data = {'col': X_train.columns, 'imp': importances}

ft_imp_df = pd.DataFrame(data)
ft_imp_df.sort_values('imp', ascending=False)

Unnamed: 0,col,imp
204,idx,32064
51,contest-prwtr-eatm-14d__prwtr,4031
56,contest-wind-h500-14d__wind-hgt-500,3954
3,contest-pevpr-sfc-gauss-14d__pevpr,3848
16,contest-wind-h100-14d__wind-hgt-100,3779
...,...,...
212,daily_delim_8,0
210,daily_delim_6,0
208,daily_delim_4,0
206,daily_delim_2,0


In [38]:
corr_df = pd.read_csv("data/correlations_with_target.csv")
corr_df.drop(columns='Unnamed: 0', inplace=True)

In [39]:
pd.merge(ft_imp_df, corr_df, on='col', how='left')

Unnamed: 0,col,imp,corr
0,lat,739,-0.398388
1,lon,619,0.092923
2,contest-pevpr-sfc-gauss-14d__pevpr,3973,0.805301
3,contest-wind-h10-14d__wind-hgt-10,1398,0.763524
4,contest-rhum-sig995-14d__rhum,1528,-0.565127
...,...,...,...
199,wind-vwnd-925-2010-17,440,0.022358
200,wind-vwnd-925-2010-18,485,0.284506
201,wind-vwnd-925-2010-19,458,-0.072492
202,wind-vwnd-925-2010-20,544,-0.127422
