In [None]:
from lightgbm import LGBMRegressor
# from catboost import CatBoostRegressor
import lightgbm
# from boruta import BorutaPy
import pandas as pd
import numpy as np
# from fastai.tabular.core import df_shrink
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error
from sklearn.linear_model import Ridge, LinearRegression
import time
import json

In [2]:
# Locations of the training and test CSV files (relative paths).
path_train = "data/train_data.csv"
path_test = "data/test_data.csv"
# Name of the column the notebook uses as the regression target.
target_ = "contest-tmp2m-14d__tmp2m"

In [3]:
def sin_transformer(period):
    """Return a FunctionTransformer mapping values to the sine of their phase in a cycle of length ``period``."""
    def _as_sine(values):
        return np.sin(values / period * 2 * np.pi)

    return FunctionTransformer(_as_sine)

def cos_transformer(period):
    """Return a FunctionTransformer mapping values to the cosine of their phase in a cycle of length ``period``."""
    def _as_cosine(values):
        return np.cos(values / period * 2 * np.pi)

    return FunctionTransformer(_as_cosine)

def handle_idx_date(df, column_date, idx_name):
    """Add a location key and cyclical calendar features to a copy of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``lat``, ``lon`` and ``column_date`` columns.
    column_date : str
        Name of the date column; it is parsed with ``pd.to_datetime``.
    idx_name : str
        Name of the new column holding the ``"<lat>_<lon>"`` key, with both
        coordinates rounded to 4 decimals.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left untouched) with ``column_date``
        converted to datetime and these columns added: ``idx_name``,
        ``day_of_year``, ``month``, ``day_of_year_sin``, ``day_of_year_cos``,
        ``month_sin``, ``month_cos``.
    """
    # Work on a copy: callers frequently pass slices of a larger frame, and
    # writing columns into a slice triggers SettingWithCopyWarning.
    df = df.copy()

    # Build the key element by element with Python's round()/str() so the text
    # is formatted the same way for every frame. A comprehension also copes
    # with an empty frame, which np.vectorize cannot.
    df[idx_name] = [
        str(round(lat, 4)) + "_" + str(round(lon, 4))
        for lat, lon in zip(df['lat'], df['lon'])
    ]

    df[column_date] = pd.to_datetime(df[column_date])
    df['day_of_year'] = df[column_date].dt.day_of_year
    df['month'] = df[column_date].dt.month

    # Encode the day with a period of 365.
    day_angle = df['day_of_year'] / 365 * 2 * np.pi
    df['day_of_year_sin'] = np.sin(day_angle)
    df['day_of_year_cos'] = np.cos(day_angle)

    # Encode the month with a period of 12.
    month_angle = df['month'] / 12 * 2 * np.pi
    df['month_sin'] = np.sin(month_angle)
    df['month_cos'] = np.cos(month_angle)
    return df

def handle_feature_train_data(df, column_date="startdate", columns_cat=None, idx_name="idx"):
    """Prepare the training frame and fit the preprocessing state reused at test time.

    Steps: add the location key and calendar features via ``handle_idx_date``,
    drop the raw date column, label-encode every categorical column (the
    location key ``idx_name`` is always encoded last), fill missing values
    with the per-column mean, and cast the categorical columns to the
    ``category`` dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw training features.
    column_date : str
        Name of the date column.
    columns_cat : iterable of str, optional
        Categorical columns to label-encode. The argument is never modified.
    idx_name : str
        Name of the generated location-key column.

    Returns
    -------
    tuple
        ``(df, list_lbEncoder, mean_df)``: the processed frame, the fitted
        ``LabelEncoder`` objects in the order ``[*columns_cat, idx_name]``,
        and the column means used for imputation.
    """
    df = handle_idx_date(df, column_date, idx_name)
    df = df.drop(columns=[column_date])

    # Build a private list instead of appending to the argument: a shared
    # default list would grow on every call and the caller's list would be
    # silently modified.
    columns_cat = [*(columns_cat if columns_cat is not None else []), idx_name]

    list_lbEncoder = []
    for each in columns_cat:
        lbE = LabelEncoder().fit(df[each])
        df[each] = lbE.transform(df[each])
        list_lbEncoder.append(lbE)

    # Fill nulls with the training means. numeric_only=True keeps this working
    # on recent pandas versions if a non-numeric column is still present.
    mean_df = df.mean(numeric_only=True)
    df = df.fillna(mean_df)
    df[columns_cat] = df[columns_cat].astype("category")

    return df, list_lbEncoder, mean_df

def handle_feature_test_data(df, lbEncoder, mean_df, column_date="startdate", columns_cat=None, idx_name="idx"):
    """Apply the preprocessing fitted on the training data to another frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw validation or test features.
    lbEncoder : sequence
        Fitted label encoders, positionally matching
        ``[*columns_cat, idx_name]``.
    mean_df : pandas.Series
        Column means used to fill missing values.
    column_date : str
        Name of the date column.
    columns_cat : iterable of str, optional
        Categorical columns to encode. The argument is never modified.
    idx_name : str
        Name of the generated location-key column.

    Returns
    -------
    pandas.DataFrame
        The processed frame, with the categorical columns cast to ``category``.
    """
    df = handle_idx_date(df, column_date, idx_name)
    df = df.drop(columns=[column_date])

    # Build a private list instead of appending to the argument: a shared
    # default list would grow on every call and the caller's list would be
    # silently modified.
    columns_cat = [*(columns_cat if columns_cat is not None else []), idx_name]

    # Encoders are matched to columns by position.
    for index, each in enumerate(columns_cat):
        df[each] = lbEncoder[index].transform(df[each])

    df = df.fillna(mean_df)
    df[columns_cat] = df[columns_cat].astype("category")
    return df

In [None]:
# Read both CSV files, then parse the date column of each frame to datetime.
date_col = "startdate"
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (path_train, path_test))
for frame in (df_train, df_test):
    frame[date_col] = pd.to_datetime(frame[date_col])

In [None]:
# Column names listed in the correlation file, excluding any whose name
# contains "contest" or "wind"; the remaining names are dropped before training.
# NOTE(review): this path starts with "Data/" while the train/test paths use
# "data/" - confirm the directory name on case-sensitive file systems.
corr_table = pd.read_csv("Data/correlations_with_target_greater_0.7.csv")
drop_col = [
    name
    for name in corr_table["col"].values
    if not ("contest" in name or "wind" in name)
]

In [None]:
# Test-set predictions keyed by fold number. The training loop skips fold
# numbers already present here, so re-running this cell discards them.
dict_result = {}

In [None]:
# One (year, first month) pair per validation fold; each fold holds out two
# consecutive months of the training data. Both lists must be the same length.
year_valid = [2015, 2015, 2016, 2016, 2016, 2016]
month_valid = [9, 11, 1, 3, 5, 7]
assert len(year_valid) == len(month_valid), "every fold needs a year and a start month"

# Fold-independent values, computed once.
test_index = df_test["index"].values
train_dates = df_train[date_col].dt
date_feature_cols = ["month", "day_of_year", "day_of_year_sin", "day_of_year_cos", "month_sin", "month_cos"]

for index, (y, m_s) in enumerate(zip(year_valid, month_valid)):
    # Folds already stored in dict_result are not retrained.
    if index in dict_result:
        continue
    m_e = m_s + 1

    # Rows in the held-out window form the validation set; all others train.
    in_fold = (train_dates.year == y) & (train_dates.month >= m_s) & (train_dates.month <= m_e)
    X_valid = df_train[in_fold]
    X_train = df_train[~in_fold]

    y_train = X_train[target_]
    X_train = X_train.drop(columns=["index", target_])
    y_valid = X_valid[target_]
    X_valid = X_valid.drop(columns=["index", target_])

    X_test = df_test.drop(columns=["index"]).copy()

    # Encoders and imputation means are fitted on the training part only and
    # then applied unchanged to the validation and test frames.
    cat_cols = [i for i in X_train.select_dtypes(include='object').columns if i != date_col]
    X_train, listEncoder, df_mean = handle_feature_train_data(X_train, date_col, cat_cols.copy())
    X_valid = handle_feature_test_data(X_valid, listEncoder, df_mean, columns_cat=cat_cols.copy())
    X_test = handle_feature_test_data(X_test, listEncoder, df_mean, columns_cat=cat_cols.copy())

    # Drop the listed correlation columns and the date-derived features.
    drop_ = [*drop_col, *date_feature_cols]
    X_train = X_train.drop(columns=drop_)
    X_valid = X_valid.drop(columns=drop_)
    X_test = X_test.drop(columns=drop_)

    print("Training - {}".format(index))
    t = time.time()
    # NOTE(review): LightGBM only applies row subsampling when subsample_freq > 0,
    # so subsample=0.75 may currently have no effect - confirm the intent.
    lgb = LGBMRegressor(max_depth=15, n_estimators=4999, subsample=0.75, colsample_bytree=0.75, verbose=0)
    lgb.fit(X_train, y_train, eval_metric="rmse")

    # RMSE as sqrt(MSE): works on every scikit-learn version, unlike the
    # `squared=False` keyword, which newer releases no longer accept.
    result_train = np.sqrt(mean_squared_error(y_train, lgb.predict(X_train)))
    result_valid = np.sqrt(mean_squared_error(y_valid, lgb.predict(X_valid)))
    ypred_test = lgb.predict(X_test)
    dict_result[index] = ypred_test
    print("Take time: ", time.time() - t)
    print("Train_score: {}  Valid_score: {}".format(result_train, result_valid))
    print("-------------")

In [None]:
# Average the per-fold test predictions element-wise.
ypred_test = np.mean(list(dict_result.values()), axis=0)

In [None]:
# Write the averaged predictions next to their test ids as the submission file.
submission = pd.DataFrame({str(target_): ypred_test, "index": test_index})
submission.to_csv("submit.csv", index=False)