In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge, ElasticNet, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

from tree_based_models import kfold_general_with_residuals, ResidualModel
from data_engineering import feature_engineering as fe

## Load Data

In [2]:
# Read the three CSV inputs, in order: the training frame, then the validation
# features and the validation targets. Paths are relative to the working
# directory the notebook is run from.
train, X_val, y_val = (
    pd.read_csv(csv_path)
    for csv_path in ("data/train.csv", "data/X_val.csv", "data/y_val.csv")
)

## Feature Names

In [3]:
# Numbered column names: RET_1..RET_20 and SIGNED_VOLUME_1..SIGNED_VOLUME_20.
RET_features, SIGNED_VOLUME_features = (
    [f"{prefix}_{idx}" for idx in range(1, 21)]
    for prefix in ("RET", "SIGNED_VOLUME")
)
TURNOVER_features = ["AVG_DAILY_TURNOVER"]

# Window sizes forwarded to fe.add_average_perf_features by feature_engineering.
window_sizes = [3, 5, 10, 15, 20]

## Feature Engineering

In [4]:
def feature_engineering(
    X: pd.DataFrame,
) -> pd.DataFrame:
    """Run the notebook's single feature-engineering step on ``X``.

    Delegates to ``fe.add_average_perf_features`` using the notebook-level
    ``RET_features`` and ``window_sizes`` and ``"TS"`` as the grouping column.

    Args:
        X: Frame to transform.
            NOTE(review): presumably must contain the RET_* columns and a
            "TS" column -- confirm against data_engineering.feature_engineering.

    Returns:
        The value returned by ``fe.add_average_perf_features``.
        NOTE(review): whether ``X`` itself is modified in place depends on
        that helper and is not visible from this notebook.
    """
    return fe.add_average_perf_features(
        X,
        RET_features=RET_features,
        window_sizes=window_sizes,
        group_col="TS",
    )


# Engineered version of the training frame; its column names are what the
# feature list is derived from.
X_feat = train.pipe(feature_engineering)

## Define Features for model

In [5]:
# Columns that are never used as model inputs: ROW_ID, TS, the target, and
# every SIGNED_VOLUME_* column.
non_feature_cols = {"ROW_ID", "TS", "target", *SIGNED_VOLUME_features}

# Everything else in the engineered frame is a feature, in column order.
features = [name for name in X_feat.columns if name not in non_feature_cols]

# The residual model is given the same list (same object, not a copy).
features_res = features

## Define Model Parameters

In [6]:
# Name of the column to predict.
target_name = "target"

# Candidate hyper-parameter sets. Only the two ridge sets are selected by the
# model-selection lines that follow; the others are kept as alternatives.
# NOTE(review): presumably linear_params / xgb_params / rf_params are meant for
# LinearRegression / XGBRegressor / RandomForestRegressor -- confirm before use.
linear_params = dict(fit_intercept=True, positive=True)

# Ridge with alpha = 0.01.
ridge_params = dict(alpha=1e-2, fit_intercept=True, random_state=42)

# Same settings as ridge_params except alpha = 100.
ridge_params_2 = {**ridge_params, "alpha": 100}

xgb_params = dict(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.01,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)

rf_params = dict(
    n_estimators=50,
    max_depth=5,
    min_samples_split=5,
    min_samples_leaf=3,
    max_features="sqrt",
    random_state=42,
    n_jobs=-1,  # value passed through to the estimator unchanged
)

# Estimator class and parameter set for each stage: both stages use Ridge,
# the general stage with ridge_params and the residual stage with ridge_params_2.
general_model_cls, general_params = Ridge, ridge_params
residual_model_cls, residual_params = Ridge, ridge_params_2

## Model Evaluation Using Cross-Validation

In [7]:
# Estimator classes and hyper-parameters for the two stages.
cv_model_spec = {
    "general_model_cls": general_model_cls,
    "general_params": general_params,
    "residual_model_cls": residual_model_cls,
    "residual_params": residual_params,
}

# 5-split cross-validation on the training frame. The feature-engineering
# function is handed over as a callable rather than applied here.
# NOTE(review): presumably splits are formed on the "TS" identifier and
# feature engineering is run inside each fold -- confirm in tree_based_models.
metrics = kfold_general_with_residuals(
    data=train,
    target=target_name,
    unique_id="TS",
    n_splits=5,
    features=features,
    features_res=features_res,
    feat_engineering=feature_engineering,
    **cv_model_spec,
)

Fold 1 - Acc: 51.97%
Fold 2 - Acc: 52.39%
Fold 3 - Acc: 52.53%
Fold 4 - Acc: 52.28%
Fold 5 - Acc: 53.05%

Accuracy: 52.44% (± 0.36%) [Min: 51.97% ; Max: 53.05%]


## Model Training and prediction

In [8]:
# Feature engineering is applied unconditionally: `feature_engineering` is a
# function object, so a truthiness check on it can never be False.
#
# NOTE: this rebinds `train` and `X_val` to their engineered versions.
# Re-running this cell, or re-running the cross-validation cell afterwards,
# would feed already-engineered frames through feature engineering again.
# Restart the kernel and run all cells instead of re-running this one.
train = feature_engineering(train)
X_val = feature_engineering(X_val)

# Convenience views of the training inputs and target. The fit call below
# receives the full `train` frame plus column names, not these two objects.
X_train = train[features]
y_train = train[target_name]

# Two-stage model: built from the selected general and residual estimator
# classes and their parameter sets, then fitted on the engineered training frame.
res_model = ResidualModel(
    general_model_cls=general_model_cls,
    general_params=general_params,
    residual_model_cls=residual_model_cls,
    residual_params=residual_params,
)
res_model.fit(train, target_name, features, features_res)

In [9]:
# Predict on the validation frame with the fitted residual model.
y_pred_val = res_model.predict(X_val, features, features_res)

# Score on the sign only: a value counts as 1 when strictly positive, else 0,
# for both the predictions and the true targets.
y_pred_bin = (y_pred_val > 0).astype(int)
y_true_bin = y_val[target_name].gt(0).astype(int)

val_accuracy = accuracy_score(y_true_bin, y_pred_bin)
print("Residual Model accuracy:", val_accuracy)

Residual Model accuracy: 0.5308245711123409


## Model Prediction on the Test Set

In [10]:
X_test = pd.read_csv("data/X_test.csv")

# Impute missing values column by column: each NaN is replaced by the mean of
# the same row's values in the two adjacent columns (by position in the file).
# The first and last columns are skipped because they have only one neighbour.
# A cell remains NaN if either neighbour is NaN in that row.
for i in range(1, X_test.shape[1] - 1):
    col = X_test.columns[i]
    if not X_test[col].isna().any():
        continue
    left = X_test.columns[i - 1]
    right = X_test.columns[i + 1]
    print(left, right)  # names of the two neighbour columns used for the fill
    X_test[col] = X_test[col].fillna((X_test[left] + X_test[right]) / 2)

# Applied unconditionally: `feature_engineering` is a function object, so a
# truthiness check on it can never be False.
X_test = feature_engineering(X_test)

# Predict with the fitted residual model and index the result by ROW_ID.
preds_sub = res_model.predict(X_test, features, features_res)
preds_sub = pd.DataFrame(preds_sub, index=X_test["ROW_ID"], columns=[target_name])

# Make sure the output directory exists first: to_csv does not create missing
# parent directories, and failing here would discard the finished predictions.
submission_path = Path("predictions/preds_res_model_last.csv")
submission_path.parent.mkdir(parents=True, exist_ok=True)

# The submission holds the sign of the prediction: 1 if strictly positive, else 0.
(preds_sub > 0).astype(int).to_csv(submission_path)
print("Positive rate:", (preds_sub > 0).mean().values[0])

RET_6 RET_4
SIGNED_VOLUME_7 SIGNED_VOLUME_5
Positive rate: 0.5211376858435682
