In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm

path = "../data/"
! ls -l $path

In [None]:
%%time

import warnings
warnings.filterwarnings("ignore")

N = 100

train = pd.read_csv(f"{path}/train.csv")
testB1 = pd.read_csv(f"{path}/predict_B1.csv", sep="\t")
testB2 = pd.read_csv(f"{path}/predict_B2.csv", sep="\t")
testB3 = pd.read_csv(f"{path}/predict_B3.csv", sep="\t")

Xcols = []
Ycols = ["cv1", "cv2"]
for icol in ["dv1", "dv2", "mv1"]:
    for n in range(-N, N+1):
        for df in [train, testB1, testB2, testB3]:
            df[f"base_{icol}_{n}"] = df[f"{icol}"].shift(n)
            df[f"gaps_{icol}_{n}"] = [_1 - _2 for _1, _2 in zip(df[icol], df[f"base_{icol}_{n}"])]
            df[f"sums_{icol}_{n}"] = [(_1-_2)/2 for _1, _2 in zip(df[icol], df[f"base_{icol}_{n}"])]
        
        Xcols.extend([
            f"base_{icol}_{n}", 
            f"gaps_{icol}_{n}",
            f"sums_{icol}_{n}",
        ])

train_data = train[Xcols + Ycols].copy()


In [None]:
# Display the assembled training frame: the Xcols feature columns followed by the Ycols targets.
train_data

In [None]:
%%time

from tqdm import tqdm
from sklearn.metrics import r2_score
from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import train_test_split


r = {}
for ycol in tqdm(Ycols):
    r[ycol] = []
    
    X, Y = train_data[Xcols], train_data[ycol]
    print(X.shape, Y.shape)
    
    x_train, x_tests, y_train, y_tests = train_test_split(
        X, Y, 
        test_size=0.2, 
        random_state=0
    )
    model = XGBRegressor(
        max_depth=6,
        learning_rate=0.1,
        n_estimators=100,
        objective='reg:squarederror',
        booster='gbtree',
        gamma=0,
        min_child_weight=1,
        subsample=1,
        colsample_bytree=1,
        reg_alpha=0,
        reg_lambda=1,
        random_state=0,
    )
    %time model.fit(x_train, y_train)
    
    print(f"{ycol} Train R2 {r2_score(y_true=y_train, y_pred=model.predict(x_train)):.6f}")
    print(f"{ycol} Tests R2 {r2_score(y_true=y_tests, y_pred=model.predict(x_tests)):.6f}")
    
    # 
    r[ycol].extend(model.predict(testB1[Xcols]))
    r[ycol].extend(model.predict(testB2[Xcols]))
    r[ycol].extend(model.predict(testB3[Xcols]))


In [None]:
# Collect the per-target prediction lists into one frame, one column per target.
result = pd.DataFrame({name: r[name] for name in ("cv1", "cv2")})
result.head()

In [None]:
result.to_csv("submit.csv", index=False)
! md5sum -c submit.md5


In [None]:
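# Uncomment and run once to (re)generate the reference checksum that
# `md5sum -c submit.md5` verifies:
# ! md5sum submit.csv > submit.md5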