# This is the start of the actual research notebook

### Current goals

- Build a working baseline predictor for `Y1` and `Y2` using LightGBM
- Identify areas for improvement

In [1]:
import datetime
import os

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

### Load and prepare data


In [2]:
# Read the training and test CSVs.
train_data = pd.read_csv('./data/train.csv')
test_data = pd.read_csv('./data/test.csv')

# Features are every training column except the two regression targets.
target_cols = ["Y1", "Y2"]
x = train_data.drop(columns=target_cols)
y1, y2 = train_data["Y1"], train_data["Y2"]

# Split features and both targets in a single call so all three stay row-aligned.
# shuffle=False keeps the original row order: the first 75% of rows are used
# for training and the trailing 25% for validation.
(
    x_train, x_val,
    y1_train, y1_val,
    y2_train, y2_val,
) = train_test_split(x, y1, y2, test_size=0.25, shuffle=False)

### Train model

In [9]:
# LightGBM hyperparameters shared by both target models.
params = {
    "objective": "regression",
    "metric": "rmse",
    "boosting_type": "gbdt",
    "learning_rate": 0.05,
    "num_leaves": 64,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "verbose": -1
}

# Training-loop settings, named so they can be tuned in one place.
NUM_BOOST_ROUND = 2000        # upper bound on boosting iterations
EARLY_STOPPING_ROUNDS = 50    # stop when validation scores have not improved for this many rounds
LOG_EVERY_N_ROUNDS = 200      # period passed to lgb.log_evaluation


def fit_booster(train_params, train_set, val_set):
    """Train one LightGBM booster with early stopping.

    Parameters
    ----------
    train_params : dict
        LightGBM parameters passed straight to ``lgb.train``.
    train_set : lgb.Dataset
        Data the booster is fitted on. It is also listed in ``valid_sets`` so
        its metric is reported next to the validation metric.
    val_set : lgb.Dataset
        Held-out data evaluated after each boosting round.

    Returns
    -------
    The value returned by ``lgb.train`` (the trained booster).
    """
    return lgb.train(
        train_params,
        train_set,
        valid_sets=[train_set, val_set],
        num_boost_round=NUM_BOOST_ROUND,
        callbacks=[
            lgb.early_stopping(EARLY_STOPPING_ROUNDS),
            lgb.log_evaluation(LOG_EVERY_N_ROUNDS),
        ],
    )


# model for Y1
y1_train_data = lgb.Dataset(x_train, label=y1_train)
y1_val_data = lgb.Dataset(x_val, label=y1_val, reference=y1_train_data)
model_y1 = fit_booster(params, y1_train_data, y1_val_data)

# model for Y2 (same features, different label column)
y2_train_data = lgb.Dataset(x_train, label=y2_train)
y2_val_data = lgb.Dataset(x_val, label=y2_val, reference=y2_train_data)
model_y2 = fit_booster(params, y2_train_data, y2_val_data)

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[61]	training's rmse: 0.388537	valid_1's rmse: 0.634087
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[143]	training's rmse: 0.306524	valid_1's rmse: 0.676204


### Evaluation and Prediction

In [10]:
# Predict the held-out validation rows with each target's model.
val_pred_y1, val_pred_y2 = model_y1.predict(x_val), model_y2.predict(x_val)

# Coefficient of determination per target on the validation split.
r2_y1 = r2_score(y1_val, val_pred_y1)
print("R2 for Y1: ", r2_y1)
r2_y2 = r2_score(y2_val, val_pred_y2)
print("R2 for Y2: ", r2_y2)

R2 for Y1:  0.7083220716763224
R2 for Y2:  0.6419106628032614


In [11]:
# Build the test feature matrix once and reuse it for both models.
# Columns are selected by name in the training order so the feature layout
# matches what the models were fitted on even if test.csv orders its columns
# differently (Booster.predict does not validate feature names by default).
# NOTE(review): assumes test.csv carries every training feature column; a
# missing one now fails fast with a KeyError.
test_features = test_data.drop(columns=["id"])[list(x_train.columns)]

test_pred_y1 = model_y1.predict(test_features)
test_pred_y2 = model_y2.predict(test_features)

### Submission data

In [25]:
# Assemble the submission: one row per test id with both predicted targets.
submission = pd.DataFrame({
    "id": test_data["id"],
    "Y1": test_pred_y1,
    "Y2": test_pred_y2
})

# Timestamp (one-second resolution) in the filename so separate runs write
# to distinct files instead of overwriting each other.
now = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Create the output folder on first use: DataFrame.to_csv does not create
# missing directories and would otherwise raise on a fresh checkout.
submission_dir = "submissions"
os.makedirs(submission_dir, exist_ok=True)

path = os.path.join(submission_dir, f"submission{now}.csv")
submission.to_csv(path, index=False)
print("saved to csv file")

saved to csv file
