In [1]:
from lightgbm import LGBMRegressor
# from catboost import CatBoostRegressor
import lightgbm
# from boruta import BorutaPy
import pandas as pd
import numpy as np
# from fastai.tabular.core import df_shrink
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, LabelEncoder, RobustScaler, StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error
from sklearn.linear_model import Ridge, Lasso, LassoCV
import time
from datetime import datetime
import matplotlib.pyplot as plt
import os

In [2]:
path_train = "data/train_data.csv"
path_test = "data/test_data.csv"
target_ = "contest-tmp2m-14d__tmp2m"

In [3]:
df_train = pd.read_csv(path_train)
df_test = pd.read_csv(path_test)

date_col = "startdate"

df_train[date_col] = pd.to_datetime(df_train[date_col])
df_test[date_col] = pd.to_datetime(df_test[date_col])

# Pre-processing

In [4]:
def sin_transformer(period):
    return FunctionTransformer(lambda x: np.sin(x / period * 2 * np.pi))

def cos_transformer(period):
    return FunctionTransformer(lambda x: np.cos(x / period * 2 * np.pi))

def fourier_series(dates, period, series_order):
    t = np.array(
        (dates - datetime(1970, 1, 1))
            .dt.total_seconds()
            .astype(float)
    ) / (3600 * 24.)

    return np.column_stack([
        fun((2.0 * (i + 1) * np.pi * t / period))
        for i in range(series_order)
        for fun in (np.sin, np.cos)
    ])

def make_seasonality_features(dates, period, series_order, prefix):
    features = fourier_series(dates, period, series_order)
    columns = [
        '{}_delim_{}'.format(prefix, i + 1)
        for i in range(features.shape[1])
    ]
    return pd.DataFrame(features, columns=columns)

def fourier_transform(df):
    def reconvert(df):
        if not os.path.exists('delete_later'):
            os.mkdir('delete_later')

        df.to_csv('delete_later/df.csv')
        df = pd.read_csv('delete_later/df.csv')

        return df

    if not os.path.exists('fourier'):
        os.mkdir('fourier')

    seasonalities = {
        'monthly': {
            'period': 30.5,
            'fourier_order': 10,
        }
    }

    for name, props in seasonalities.items():

        df['startdate']  = pd.to_datetime(df['startdate'])

        t = np.array(
            (df['startdate'] - datetime(1970, 1, 1))
                .dt.total_seconds()
                .astype(float)
        ) / (3600 * 24.)

        series_order = props['fourier_order']
        period = props['period']

        s = np.column_stack([
                fun((2.0 * (i + 1) * np.pi * t / period))
                for i in range(series_order)
                for fun in (np.sin, np.cos)
            ])

        columns = [
            '{}_delim_{}'.format(name, i + 1)
            for i in range(s.shape[1])
        ]

        features = pd.DataFrame(s, columns=columns)

        df = pd.concat([reconvert(df), reconvert(features)], axis=1)

    return df

def handle_idx_date(df, column_date, idx_name):
    def get_idx(lat, lon):
        return str(round(lat, 4)) + "_" + str(round(lon, 4))

    df.loc[:, idx_name] = np.vectorize(get_idx)(df['lat'], df['lon'])
    df.loc[:, column_date] = pd.to_datetime(df[column_date])
    # df.loc[:, 'day_of_year'] = df[column_date].dt.day_of_year
    # df.loc[:, 'month'] = df[column_date].dt.month
    #   # encode the day with a period of 365
    # df.loc[:, 'day_of_year_sin'] = sin_transformer(365).fit_transform(df['day_of_year'])
    # df.loc[:, 'day_of_year_cos'] = cos_transformer(365).fit_transform(df['day_of_year'])
    #
    # # encode the month with a period of 12
    # df.loc[:, 'month_sin'] = sin_transformer(12).fit_transform(df['month'])
    # df.loc[:, 'month_cos'] = cos_transformer(12).fit_transform(df['month'])

    df = fourier_transform(df)

    return df

def handle_feature_train_data(df, column_date="startdate", columns_cat = [], idx_name="idx"):
    df = handle_idx_date(df, column_date, idx_name)
    df = df.drop(columns = [column_date])
    columns_cat.append(idx_name)
    list_lbEncoder = []

    for each in columns_cat:
        lbE = LabelEncoder().fit(df[each])
        df[each] = lbE.transform(df[each])
        list_lbEncoder.append(lbE)

    # fill nulls
    mean_df = df.mean()
    df = df.fillna(mean_df)
    df[columns_cat] = df[columns_cat].astype("category")

    return df, list_lbEncoder, mean_df

def handle_feature_test_data(df, lbEncoder, mean_df, column_date="startdate", columns_cat = [], idx_name="idx"):
    df = handle_idx_date(df, column_date, idx_name)
    df = df.drop(columns = [column_date])
    columns_cat.append(idx_name)
    list_lbEncoder = []

    for index, each in enumerate(columns_cat):
        df[each] = lbEncoder[index].transform(df[each])

    df = df.fillna(mean_df)
    df[columns_cat] = df[columns_cat].astype("category")

    return df

In [5]:
def reconvert(df):
    if not os.path.exists('delete_later'):
        os.mkdir('delete_later')

    df.to_csv('delete_later/df.csv')
    df = pd.read_csv('delete_later/df.csv')

    return df

In [6]:
def calculate_corr_target(X_train, X_val):
    data_col = []
    data_corr_train = []
    data_corr_val = []

    for col in X_train.columns:
        corr_train = X_train['contest-tmp2m-14d__tmp2m'].corr(X_train[col])
        corr_val = X_val['contest-tmp2m-14d__tmp2m'].corr(X_val[col])

        data_col.append(col)
        data_corr_train.append(corr_train)
        data_corr_val.append(corr_val)

    corr = pd.DataFrame(data={'col': data_col, 'corr_train': data_corr_train, 'corr_val': data_corr_val})

    return corr

In [7]:
corr_df = pd.read_csv("data/correlations_with_target.csv") \
            .drop(columns='Unnamed: 0')
corr_df = corr_df[(corr_df['corr'] >= 0.7)  | (corr_df['corr'] <= -0.7)]
drop_col = corr_df["col"].values
drop_col = [each for each in drop_col if "contest" not in each and "wind" not in each]

In [8]:
dict_result = {}

year_valid = [2015, 2015, 2015, 2015, 2015, 2015, 2016, 2016, 2016, 2016]
month_valid = [1, 3, 5, 7, 9, 11, 1, 3, 5, 7]

for index in range(len(year_valid)):
    t = time.time()

    if index in dict_result:
        continue
    y, m_s, m_e = year_valid[index], month_valid[index], month_valid[index] + 1

    print("Training - {}".format(index))
    print(f'Test on month {m_s} and {m_e}')

    X_valid = df_train[(df_train[date_col].dt.year == y) & (df_train[date_col].dt.month >= m_s) & (df_train[date_col].dt.month <= m_e)]
    X_train = df_train[~((df_train[date_col].dt.year == y) & (df_train[date_col].dt.month >= m_s) & (df_train[date_col].dt.month <= m_e))]

    y_train = X_train[target_]
    y_valid = X_valid[target_]


    test_index = df_test["index"].values
    X_test = df_test.copy()

    cat_cols = [i for i in X_train.select_dtypes(include='object').columns if i != date_col]
    X_train, listEncoder, df_mean = handle_feature_train_data(X_train, date_col, cat_cols.copy())
    X_valid = handle_feature_test_data(X_valid, listEncoder, df_mean, columns_cat=cat_cols.copy())
    X_test = handle_feature_test_data(X_test, listEncoder, df_mean, columns_cat=cat_cols.copy())

    drop_ = [*drop_col, *[col for col in X_train.columns if 'Unnamed' in col]]
    X_train = X_train.drop(columns=drop_)
    X_valid = X_valid.drop(columns=drop_)
    X_test = X_test.drop(columns=drop_)

    X_train = X_train.drop(columns=target_)
    X_valid = X_valid.drop(columns=target_)

    # scaler
    transformer = RobustScaler()
    transformer.fit_transform(X_train)
    transformer.transform(X_valid)
    transformer.transform(X_test)

    reg = LassoCV(cv=5, max_iter=3000, random_state=0)
    reg.fit(X_train, y_train)

    ytrain_pred = reg.predict(X_train)
    result_train = mean_squared_error(y_train, ytrain_pred)
    ypred_test = reg.predict(X_valid)
    result_valid = mean_squared_error(y_valid, ypred_test, squared=False)

    ypred_test = reg.predict(X_test)
    dict_result[index] = ypred_test

    # dict_result[index] = ypred_test
    print("Take time: ", time.time() - t)
    print("Train_score: {}  Valid_score: {}".format(result_train, result_valid))
    print("-------------")

Training - 0
Test on month 1 and 2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, idx_name] = np.vectorize(get_idx)(df['lat'], df['lon'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, column_date] = pd.to_datetime(df[column_date])
  df.loc[:, column_date] = pd.to_datetime(df[column_date])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['startdate']  = pd.t

Take time:  143.03781080245972
Train_score: 10.116051410772352  Valid_score: 4.950156849750542
-------------
Training - 1
Test on month 3 and 4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, idx_name] = np.vectorize(get_idx)(df['lat'], df['lon'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, column_date] = pd.to_datetime(df[column_date])
  df.loc[:, column_date] = pd.to_datetime(df[column_date])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['startdate']  = pd.t

Take time:  121.04343509674072
Train_score: 11.493945375547169  Valid_score: 3.314150712414819
-------------
Training - 2
Test on month 5 and 6


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, idx_name] = np.vectorize(get_idx)(df['lat'], df['lon'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, column_date] = pd.to_datetime(df[column_date])
  df.loc[:, column_date] = pd.to_datetime(df[column_date])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['startdate']  = pd.t

Take time:  121.27885007858276
Train_score: 11.349303613932504  Valid_score: 3.3172216453739813
-------------
Training - 3
Test on month 7 and 8


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, idx_name] = np.vectorize(get_idx)(df['lat'], df['lon'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, column_date] = pd.to_datetime(df[column_date])
  df.loc[:, column_date] = pd.to_datetime(df[column_date])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['startdate']  = pd.t

Take time:  187.11014413833618
Train_score: 11.524647182297587  Valid_score: 2.7798192012728915
-------------
Training - 4
Test on month 9 and 10


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, idx_name] = np.vectorize(get_idx)(df['lat'], df['lon'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, column_date] = pd.to_datetime(df[column_date])
  df.loc[:, column_date] = pd.to_datetime(df[column_date])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['startdate']  = pd.t

Take time:  145.65025925636292
Train_score: 11.794010658609345  Valid_score: 2.315858859912482
-------------
Training - 5
Test on month 11 and 12


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, idx_name] = np.vectorize(get_idx)(df['lat'], df['lon'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, column_date] = pd.to_datetime(df[column_date])
  df.loc[:, column_date] = pd.to_datetime(df[column_date])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['startdate']  = pd.t

Take time:  188.73757910728455
Train_score: 11.079706483542184  Valid_score: 3.7435538997300095
-------------
Training - 6
Test on month 1 and 2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, idx_name] = np.vectorize(get_idx)(df['lat'], df['lon'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, column_date] = pd.to_datetime(df[column_date])
  df.loc[:, column_date] = pd.to_datetime(df[column_date])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['startdate']  = pd.t

Take time:  150.0498330593109
Train_score: 10.918692428223933  Valid_score: 3.8738813367195863
-------------
Training - 7
Test on month 3 and 4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, idx_name] = np.vectorize(get_idx)(df['lat'], df['lon'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, column_date] = pd.to_datetime(df[column_date])
  df.loc[:, column_date] = pd.to_datetime(df[column_date])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['startdate']  = pd.t

Take time:  107.73490786552429
Train_score: 11.664956809404872  Valid_score: 2.6961473277267984
-------------
Training - 8
Test on month 5 and 6


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, idx_name] = np.vectorize(get_idx)(df['lat'], df['lon'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, column_date] = pd.to_datetime(df[column_date])
  df.loc[:, column_date] = pd.to_datetime(df[column_date])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['startdate']  = pd.t

Take time:  101.77973699569702
Train_score: 11.466084525078346  Valid_score: 2.937220279899853
-------------
Training - 9
Test on month 7 and 8


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, idx_name] = np.vectorize(get_idx)(df['lat'], df['lon'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, column_date] = pd.to_datetime(df[column_date])
  df.loc[:, column_date] = pd.to_datetime(df[column_date])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['startdate']  = pd.t

Take time:  101.19999098777771
Train_score: 11.374100367777842  Valid_score: 3.003230468900311
-------------


In [9]:
ypred_test = np.mean([value for key, value in dict_result.items()], axis=0)

In [10]:
pd.DataFrame(data = {"{}".format(target_): ypred_test, "index": test_index}).to_csv("submit_lasso.csv", index=False)