In [47]:
from lightgbm import LGBMRegressor
# from catboost import CatBoostRegressor
import lightgbm
# from boruta import BorutaPy
import pandas as pd
import numpy as np
# from fastai.tabular.core import df_shrink
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error
from sklearn.linear_model import Ridge, LinearRegression
import time
import matplotlib.pyplot as plt

In [48]:
path_train = "data/train_data.csv"
path_test = "data/test_data.csv"
target_ = "contest-tmp2m-14d__tmp2m"

In [49]:
def sin_transformer(period):
    return FunctionTransformer(lambda x: np.sin(x / period * 2 * np.pi))

def cos_transformer(period):
    return FunctionTransformer(lambda x: np.cos(x / period * 2 * np.pi))

def handle_idx_date(df, column_date, idx_name):
    def get_idx(lat, lon):
        return str(round(lat, 4)) + "_" + str(round(lon, 4))
    df[idx_name] = np.vectorize(get_idx)(df['lat'], df['lon'])
    df[column_date] = pd.to_datetime(df[column_date])
    df['day_of_year'] = df[column_date].dt.day_of_year
    df['month'] = df[column_date].dt.month
      # encode the day with a period of 365
    df['day_of_year_sin'] = sin_transformer(365).fit_transform(df['day_of_year'])
    df['day_of_year_cos'] = cos_transformer(365).fit_transform(df['day_of_year'])

    # encode the month with a period of 12
    df['month_sin'] = sin_transformer(12).fit_transform(df['month'])
    df['month_cos'] = cos_transformer(12).fit_transform(df['month'])
    return df

def handle_feature_train_data(df, column_date="startdate", columns_cat = [], idx_name="idx"):
    df = handle_idx_date(df, column_date, idx_name)
    df = df.drop(columns = [column_date])
    columns_cat.append(idx_name)
    list_lbEncoder = []

    for each in columns_cat:
        lbE = LabelEncoder().fit(df[each])
        df[each] = lbE.transform(df[each])
        list_lbEncoder.append(lbE)

    # fill nulls
    mean_df = df.mean()
    df = df.fillna(mean_df)
    df[columns_cat] = df[columns_cat].astype("category")

    return df, list_lbEncoder, mean_df

def handle_feature_test_data(df, lbEncoder, mean_df, column_date="startdate", columns_cat = [], idx_name="idx"):
    df = handle_idx_date(df, column_date, idx_name)
    df = df.drop(columns = [column_date])
    columns_cat.append(idx_name)
    list_lbEncoder = []

    for index, each in enumerate(columns_cat):
        df[each] = lbEncoder[index].transform(df[each])

    df = df.fillna(mean_df)
    df[columns_cat] = df[columns_cat].astype("category")
    return df

In [70]:
def calculate_corr_target(X_train, X_val):
    data_col = []
    data_corr_train = []
    data_corr_val = []

    for col in X_train.columns:
        corr_train = X_train['contest-tmp2m-14d__tmp2m'].corr(X_train[col])
        corr_val = X_val['contest-tmp2m-14d__tmp2m'].corr(X_val[col])

        data_col.append(col)
        data_corr_train.append(corr_train)
        data_corr_val.append(corr_val)

    corr = pd.DataFrame(data={'col': data_col, 'corr_train': data_corr_train, 'corr_val': data_corr_val})

    return corr

In [50]:
df_train = pd.read_csv(path_train)
df_test = pd.read_csv(path_test)

date_col = "startdate"

df_train[date_col] = pd.to_datetime(df_train[date_col])
df_test[date_col] = pd.to_datetime(df_test[date_col])

In [51]:
corr_df = pd.read_csv("data/correlations_with_target.csv").drop(columns='Unnamed: 0')
drop_col = corr_df["col"].values
drop_col = [each for each in drop_col if "contest" not in each and "wind" not in each]

In [None]:
dict_result = {}

year_valid = [2015, 2015, 2015, 2015, 2015, 2015, 2016, 2016, 2016, 2016]
month_valid = [1, 3, 5, 7, 9, 11, 1, 3, 5, 7]

for index in range(len(year_valid)):
    if index in dict_result:
        continue
    y, m_s, m_e = year_valid[index], month_valid[index], month_valid[index] + 1

    print("Training - {}".format(index))
    print(f'Test on month {m_s} and {m_e}')

    X_valid = df_train[(df_train[date_col].dt.year == y) & (df_train[date_col].dt.month >= m_s) & (df_train[date_col].dt.month <= m_e)]
    X_train = df_train[~((df_train[date_col].dt.year == y) & (df_train[date_col].dt.month >= m_s) & (df_train[date_col].dt.month <= m_e))]

    y_train = X_train[target_]
    # X_train = X_train.drop(columns=target_)
    y_valid = X_valid[target_]
    # X_valid = X_valid.drop(columns=target_)

    test_index = df_test["index"].values
    X_test = df_test.copy()

    cat_cols = [i for i in X_train.select_dtypes(include='object').columns if i != date_col]
    X_train, listEncoder, df_mean = handle_feature_train_data(X_train, date_col, cat_cols.copy())
    X_valid = handle_feature_test_data(X_valid, listEncoder, df_mean, columns_cat=cat_cols.copy())
    X_test = handle_feature_test_data(X_test, listEncoder, df_mean, columns_cat=cat_cols.copy())

    drop_ = [*drop_col, *["month", "day_of_year", "day_of_year_sin", "day_of_year_cos", "month_sin", "month_cos"]]
    X_train = X_train.drop(columns=drop_)
    X_valid = X_valid.drop(columns=drop_)
    X_test = X_test.drop(columns=drop_)

    print(f'{time.time()}: Calculating correlations...')
    corr_with_target = calculate_corr_target(X_train, X_valid)
    print(f'{time.time()}: Calculated!')

    X_train = X_train.drop(columns=target_)
    X_valid = X_valid.drop(columns=target_)

    t = time.time()
    lgb = LGBMRegressor(max_depth=15, n_estimators=4999, subsample=0.75, colsample_bytree=0.75, verbose=0, n_jobs=-1)
    lgb.fit(X_train, y_train, eval_metric="rmse")
    # lgb = CatBoostRegressor(verbose=200, cat_features=["climateregions__climateregion", "idx"])
    # lgb.fit(X_train, y_train)

    # feature importance
    importances = lgb.feature_importances_
    data = {'col': X_train.columns, 'imp': importances}
    ft_imp_df = pd.DataFrame(data)
    ft_imp_corr_df = pd.merge(ft_imp_df, corr_with_target, on='col', how='left')
    ft_imp_corr_df.to_csv(f'result_2/ft_imp_corr_{index}.csv', index=False)
    print(f'{time.time()}: Feature importance and correlations results are saved!')

    result_train = mean_squared_error(y_train, lgb.predict(X_train), squared=False)
    result_valid = mean_squared_error(y_valid, lgb.predict(X_valid), squared=False)
    ypred_test = lgb.predict(X_test)
    dict_result[index] = ypred_test
    print("Take time: ", time.time() - t)
    print("Train_score: {}  Valid_score: {}".format(result_train, result_valid))
    print("-------------")

Training - 0
Test on month 1 and 2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[idx_name] = np.vectorize(get_idx)(df['lat'], df['lon'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_date] = pd.to_datetime(df[column_date])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['day_of_year'] = df[column_date].dt.day_of_year
A value is trying to be set on a copy of a sl

1674485941.898182: Calculating correlations...
1674485942.269355: Calculated!
You can set `force_col_wise=true` to remove the overhead.


In [66]:
X_valid.shape

(31354, 246)

In [11]:
ypred_test = np.mean([value for key, value in dict_result.items()], axis=0)

In [12]:
# pd.DataFrame(data = {"{}".format(target_): ypred_test, "index": test_index}).to_csv("submit.csv", index=False)

In [24]:
importances = lgb.feature_importances_

data = {'col': X_train.columns, 'imp': importances}

ft_imp_df = pd.DataFrame(data)
ft_imp_df.sort_values('imp', ascending=False)

Unnamed: 0,col,imp
203,idx,31571
50,contest-prwtr-eatm-14d__prwtr,4824
55,contest-wind-h500-14d__wind-hgt-500,4322
2,contest-pevpr-sfc-gauss-14d__pevpr,4177
36,contest-slp-14d__slp,3701
...,...,...
137,icec-2010-5,121
98,mjo1d__phase,107
100,mei__mei,65
101,mei__meirank,11


In [38]:
corr_df = pd.read_csv("data/correlations_with_target.csv")
corr_df.drop(columns='Unnamed: 0', inplace=True)

In [39]:
pd.merge(ft_imp_df, corr_df, on='col', how='left')

Unnamed: 0,col,imp,corr
0,lat,739,-0.398388
1,lon,619,0.092923
2,contest-pevpr-sfc-gauss-14d__pevpr,3973,0.805301
3,contest-wind-h10-14d__wind-hgt-10,1398,0.763524
4,contest-rhum-sig995-14d__rhum,1528,-0.565127
...,...,...,...
199,wind-vwnd-925-2010-17,440,0.022358
200,wind-vwnd-925-2010-18,485,0.284506
201,wind-vwnd-925-2010-19,458,-0.072492
202,wind-vwnd-925-2010-20,544,-0.127422
