In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm

# Directory holding the competition CSV files, relative to this notebook.
# The value ends with a slash; the loading cell appends another "/" when it
# builds file names, which still resolves on POSIX systems.
path = "../data/"
# List the data directory so the available input files are visible in the output.
! ls -l $path

total 3504
-rw-r--r--  1 ivan  staff   199229  2 27 17:03 predict.csv
-rw-r--r--  1 ivan  staff   180417  2 27 17:03 predict_B.csv
-rw-r--r--  1 ivan  staff    59383  2 27 17:26 predict_B1.csv
-rw-r--r--  1 ivan  staff    59214  2 27 17:28 predict_B2.csv
-rw-r--r--  1 ivan  staff    59357  2 27 17:28 predict_B3.csv
-rw-r--r--  1 ivan  staff    40016  2 27 17:03 sample_submission.csv
-rw-r--r--  1 ivan  staff    66119  2 27 17:03 sample_submission_B.csv
-rw-r--r--  1 ivan  staff  1112269  2 27 17:03 train.csv


In [2]:
%%time

import warnings
warnings.filterwarnings("ignore")

N = 100

train = pd.read_csv(f"{path}/train.csv")
testB1 = pd.read_csv(f"{path}/predict_B1.csv", sep="\t")
testB2 = pd.read_csv(f"{path}/predict_B2.csv", sep="\t")
testB3 = pd.read_csv(f"{path}/predict_B3.csv", sep="\t")

Xcols = []
Ycols = ["cv1", "cv2"]
for icol in ["dv1", "dv2", "mv1"]:
    for n in range(-N, N+1):
        for df in [train, testB1, testB2, testB3]:
            df[f"b0_{icol}_{n}"] = df[f"{icol}"].shift(n)
            df[f"f1_{icol}_{n}"] = [_1 - _2 for _1, _2 in zip(df[icol], df[f"b0_{icol}_{n}"])]
            df[f"f2_{icol}_{n}"] = [(_1 + _2)/2 for _1, _2 in zip(df[icol], df[f"b0_{icol}_{n}"])]
            df[f"f3_{icol}_{n}"] = [_1/_2 for _1, _2 in zip(df[icol], df[f"b0_{icol}_{n}"])]
        
        Xcols.extend([
            f"b0_{icol}_{n}", 
            f"f1_{icol}_{n}",
            f"f2_{icol}_{n}",
            f"f3_{icol}_{n}",
        ])

train_data = train[Xcols + Ycols].copy()


CPU times: user 3.28 s, sys: 138 ms, total: 3.41 s
Wall time: 3.44 s


In [7]:
%%time

from tqdm import tqdm
from sklearn.metrics import r2_score
from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import train_test_split


r = {}
for ycol in tqdm(Ycols):
    r[ycol] = []
    
    X, Y = train_data[Xcols], train_data[ycol]
    print(X.shape, Y.shape)
    
    x_train, x_tests, y_train, y_tests = train_test_split(
        X, Y, 
        test_size=0.1, 
        random_state=0
    )
    model = XGBRegressor(
        max_depth=6,
        learning_rate=0.1,
        n_estimators=100,
        objective='reg:squarederror',
        booster='gbtree',
        gamma=0,
        min_child_weight=1,
        subsample=1,
        colsample_bytree=1,
        reg_alpha=0,
        reg_lambda=1,
        random_state=0,
    )
    %time model.fit(x_train, y_train)
    
    print(f"{ycol} Train R2 {r2_score(y_true=y_train, y_pred=model.predict(x_train)):.6f}")
    print(f"{ycol} Tests R2 {r2_score(y_true=y_tests, y_pred=model.predict(x_tests)):.6f}")
    
    # 
    r[ycol].extend(model.predict(testB1[Xcols]))
    r[ycol].extend(model.predict(testB2[Xcols]))
    r[ycol].extend(model.predict(testB3[Xcols]))


  0%|          | 0/2 [00:00<?, ?it/s]

(20010, 972) (20010,)


 50%|█████     | 1/2 [00:40<00:40, 40.80s/it]

CPU times: user 4min 46s, sys: 4.73 s, total: 4min 51s
Wall time: 40.5 s
cv1 Train R2 0.992771
cv1 Tests R2 0.980071
(20010, 972) (20010,)


100%|██████████| 2/2 [01:24<00:00, 42.26s/it]

CPU times: user 5min 17s, sys: 5.89 s, total: 5min 22s
Wall time: 43.4 s
cv2 Train R2 0.978769
cv2 Tests R2 0.952419
CPU times: user 10min 5s, sys: 11.9 s, total: 10min 17s
Wall time: 1min 24s





In [8]:
# Test segments in the same order the predictions were appended to r.
test_frames = [testB1, testB2, testB3]

result = pd.DataFrame()

# For each target: "<target>_1" holds the model predictions and "<target>_2"
# the values already present in the test files (NaN where nothing is given).
for target_name in ("cv1", "cv2"):
    result[f"{target_name}_1"] = r[target_name]
    result[f"{target_name}_2"] = pd.concat(
        [frame[target_name] for frame in test_frames]
    ).reset_index(drop=True)

# Final value: keep the value from the test file when it exists, otherwise
# fall back to the prediction.
for target_name in ("cv1", "cv2"):
    result[target_name] = result[f"{target_name}_2"].fillna(result[f"{target_name}_1"])

result.head()


Unnamed: 0,cv1_1,cv1_2,cv2_1,cv2_2,cv1,cv2
0,324.973022,333.99,0.851923,0.67,333.99,0.67
1,325.765564,,0.872284,,325.765564,0.872284
2,326.275543,,0.854024,,326.275543,0.854024
3,328.982941,,0.877486,,328.982941,0.877486
4,330.937927,,0.850504,,330.937927,0.850504


In [9]:
# Write only the target columns listed in Ycols, without the index column.
result[Ycols].to_csv("submit.csv", index=False)
# Check the fresh file against the checksum stored in submit.md5.
# NOTE(review): the saved output shows this check FAILED, i.e. submit.csv
# differed from the file the stored checksum was taken from -- confirm whether
# that change was intended before regenerating the checksum.
! md5sum -c submit.md5

submit.csv: FAILED


In [10]:
# Overwrite submit.md5 with the checksum of the current submit.csv. After this
# runs, the `md5sum -c submit.md5` check passes for whatever file was just
# written, so only run it once the new submission has been reviewed.
! md5sum submit.csv > submit.md5