In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

In [None]:
%%time
# Load the competition training set, plus the sample submission and revealed
# targets found in the `example_test_files` folder.
# NOTE(review): `sample_df` and `reveal_df` are not referenced again in the
# visible part of this notebook.
trn_df = pd.read_csv('/kaggle/input/optiver-trading-at-the-close/train.csv')
sample_df = pd.read_csv('/kaggle/input/optiver-trading-at-the-close/example_test_files/sample_submission.csv')
reveal_df = pd.read_csv('/kaggle/input/optiver-trading-at-the-close/example_test_files/revealed_targets.csv')

In [None]:
%%time
# Print a concise summary of the training frame (column names, dtypes and
# memory usage) to get a first idea of its variables and size.
trn_df.info()

In [None]:
# Show the first rows of the training frame to inspect the raw columns.
trn_df.head()

In [None]:
%%time
# Identify missing data...
trn_df.isnull().sum()

In [None]:
%%time
# Drop the rows that have missing data for the target variable
trn_df = trn_df.dropna(subset=['target'])

In [None]:
# Number of missing values in every row (`aux`), followed by a printed tally:
# one line per distinct count, showing the count and how many rows have it.
aux = trn_df.isna().sum(axis=1).to_numpy()
nna_levels, nna_freqs = np.unique(aux, return_counts=True)
for level, freq in zip(nna_levels, nna_freqs):
    print(level, freq)

In [None]:
# Median target over the rows that have exactly 8 missing values.
# `predict` uses this constant as the forecast for test rows with 8 missing values.
rows_with_8nas = aux == 8
pred_with_8nas = np.median(trn_df['target'].to_numpy()[rows_with_8nas])
pred_with_8nas

In [None]:
%%time
# Drop the rows that have 8 missing values
print(trn_df.shape)
trn_df = trn_df.drop(trn_df.index[aux == 8])
print(trn_df.shape)

In [None]:
# Filter the per-row missing-value counts the same way the frame was filtered
# (entries equal to 8 are discarded) and store them as the new feature `nna`.
# This relies on `trn_df` no longer containing the rows with 8 missing values,
# so that the remaining counts line up with the remaining rows.
auxb = aux[np.not_equal(aux, 8)]
trn_df['nna'] = auxb

In [None]:
# Descriptive statistics of the training frame after the row filtering above.
trn_df.describe()

In [None]:
%%time
# Create a dictionary with the value to be inputed in the places with missing values
# (for each feature)
# In the train set only the 'far_price' and 'near_price' features have missings, 
# but it is better to be prepared for different situation in the test set
# The inputation values are around the mean for all the features, except for 
# 'far_price' and 'near_price', where a value outside the range is chosen to be used as 
# missing flag

values = {'imbalance_size': 0, 'imbalance_buy_sell_flag': 0, 
        'reference_price': 1, 
        'matched_size': np.nanmean(trn_df.matched_size.values),
        'far_price': 0, 'near_price': 0, 
        'bid_price': 1, 
        'bid_size': np.nanmean(trn_df.bid_size.values), 
        'ask_price': 1,
        'ask_size': np.nanmean(trn_df.ask_size.values), 
        'wap': 1}

trn_df = trn_df.fillna(value=values)

In [None]:
#plt.plot(trn_df[['date_id']], trn_df[['imbalance_size']].expanding(1).mean());
#plt.plot(trn_df[['date_id']], trn_df[['imbalance_buy_sell_flag']].expanding(1).mean());
#plt.plot(trn_df[['date_id']], trn_df[['reference_price']].expanding(1).mean());
#plt.plot(trn_df[['date_id']], trn_df[['matched_size']].expanding(1).mean());
#plt.plot(trn_df[['date_id']], trn_df[['far_price']].expanding(1).mean());
#plt.plot(trn_df[['date_id']], trn_df[['near_price']].expanding(1).mean());
#plt.plot(trn_df[['date_id']], trn_df[['bid_price']].expanding(1).mean());
#plt.plot(trn_df[['date_id']], trn_df[['bid_size']].expanding(1).mean());
#plt.plot(trn_df[['date_id']], trn_df[['ask_price']].expanding(1).mean());
#plt.plot(trn_df[['date_id']], trn_df[['ask_size']].expanding(1).mean());
#plt.plot(trn_df[['date_id']], trn_df[['wap']].expanding(1).mean());
#plt.plot(trn_df[['time_id']][:100000], trn_df[['far_price']].isnull().expanding(1).mean()[:100000]);
#plt.plot(trn_df[['time_id']][:100000], trn_df[['near_price']].isnull().expanding(1).mean()[:100000]);

#plt.figure();
#plt.plot(trn_df[['time_id']][:100000], trn_df[['far_price']].isnull().expanding(1).std()[:100000]);
#plt.plot(trn_df[['time_id']][:100000], trn_df[['near_price']].isnull().expanding(1).std()[:100000]);
#plt.figure()
#plt.plot(trn_df[['time_id']][:100000], trn_df[['far_price']].isnull().expanding(1).mean()[:100000]);
#plt.plot(trn_df[['time_id']][:100000], trn_df[['near_price']].isnull().expanding(1).mean()[:100000]);

#plt.plot(trn_df[['time_id']][:10000], trn_df[['far_price']].isnull()[:10000]);
#plt.plot(trn_df[['time_id']][:10000], trn_df[['near_price']].isnull()[:10000]);

#plt.figure();
#plt.plot(trn_df[['time_id']][:10000]);
#plt.hist(trn_df[['time_id']][:10000], 200);

In [None]:
#trn_df[['time_id']].values[trn_df[['stock_id']].values == 0]

In [None]:
# Clip the target to its 1%-99% quantile range so extreme values do not
# dominate the fit. `qlim` holds the two bounds and is reused by `predict`
# to clip the model output.
target_raw = trn_df['target'].to_numpy()
qlim = np.quantile(target_raw, [0.01, 0.99])
y = np.clip(target_raw, qlim[0], qlim[1])
qlim

In [None]:
# Positional index 0..n-1 of every training sample.
# NOTE(review): `itrain` is not referenced in the visible part of the notebook.
itrain = np.arange(y.shape[0])

In [None]:
%%time
k = 2 * np.pi * 4 / 540
X = np.array([np.ones((len(y))), trn_df[['wap']].values.flatten(), # 0, 1
          trn_df[['ask_size']].values.flatten(), trn_df[['bid_size']].values.flatten(), # 2, 3
          trn_df[['ask_price']].values.flatten(), trn_df[['bid_price']].values.flatten(), # 4, 5
          trn_df[['near_price']].values.flatten(), trn_df[['far_price']].values.flatten(), # 6, 7
          trn_df[['matched_size']].values.flatten(), trn_df[['reference_price']].values.flatten(), # 8, 9
          trn_df[['imbalance_size']].values.flatten() * trn_df[['imbalance_buy_sell_flag']].values.flatten(), # 10
          np.cos(trn_df[['seconds_in_bucket']].values.flatten() * k), np.sin(trn_df[['seconds_in_bucket']].values.flatten() * k), # 11, 12
          np.cos(trn_df[['seconds_in_bucket']].values.flatten() * k / 2), np.sin(trn_df[['seconds_in_bucket']].values.flatten() * k / 2), # 13, 14
          trn_df[['nna']].values.flatten(), # 15
          (trn_df[['far_price']].values.flatten() == 0) * 1.0, # 16
          (trn_df[['near_price']].values.flatten() == 0) * 1.0 ] # 17
         ).T 
X.shape

In [None]:
# Standardise feature columns 1-10 in place (zero mean, unit standard deviation).
# `mX` and `sX` are kept because `predict` applies the same shift and scale
# to the test features.
# NOTE(review): the statistics are computed on all rows, before the
# train/test split. Re-running this cell on an already standardised `X`
# would overwrite `mX`/`sX` with values close to 0 and 1.
scaled_cols = slice(1, 11)
mX = X[:, scaled_cols].mean(axis=0)
sX = X[:, scaled_cols].std(axis=0)
X[:, scaled_cols] = (X[:, scaled_cols] - mX) / sX



In [None]:
# Split the data into training and testing sets:
# 20% of the rows are held out, and the fixed `random_state` makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Ridge Regression: pick the regularisation strength `alpha` with a coarse
# search followed by a finer one around the coarse winner, then fit the final
# model with the selected value.

# Candidate feature-index subsets.
# NOTE(review): none of these lists is used in the visible part of the
# notebook; every fit below uses all columns of X.
ivars1 = [0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 16, 17]
ivars2 = [1, 2, 3, 4, 5, 8]
ivars3 = [1, 2, 3, 4, 5, 8, 9, 10]
ivars_int0 = [1, 4, 10, 11, 12]
ivars_int1 = [1, 4, 5, 10]

alphas = [0.01, 0.1, 1, 10, 100]  # Example values, adjust as needed


def _search_alpha(candidates):
    """Return ``(alpha, mae)`` for the candidate with the lowest hold-out MAE.

    One Ridge model per candidate is fitted on ``X_train``/``y_train`` and
    scored with the mean absolute error on ``X_test``/``y_test``.
    """
    winner, winner_mae = None, float('inf')
    for alpha in candidates:
        model = Ridge(alpha=alpha)
        model.fit(X_train, y_train)
        mae = mean_absolute_error(y_test, model.predict(X_test))
        if mae < winner_mae:
            winner, winner_mae = alpha, mae
    return winner, winner_mae


# Coarse search over the log-spaced grid.
best_alpha, best_mae = _search_alpha(alphas)
print("Best Alpha:", best_alpha)
print("Best MAE:", best_mae)

best_alpha_index = alphas.index(best_alpha)

# Fine search: steps of best_alpha / 3 around the coarse winner. Candidates
# that are not strictly positive are discarded: the previous clipping to 0
# evaluated alpha=0 three times, and Ridge with alpha=0 is not advised for
# numerical reasons. The coarse winner itself (i == 0) is always included.
alpha_range = sorted(
    {best_alpha + i * best_alpha / 3 for i in range(-5, 6)
     if best_alpha + i * best_alpha / 3 > 0}
)
print(alpha_range)

best_alpha, best_mae = _search_alpha(alpha_range)
print("Best Alpha close-search:", best_alpha)

# Retrain Ridge Regression with the alpha selected by the search
# (previously a hard-coded alpha=1 was used, ignoring the search result).
ridge_model = Ridge(alpha=best_alpha)
ridge_model.fit(X_train, y_train)

In [None]:
# Evaluate the fitted Ridge model: mean absolute error on the training split
# and on the held-out split.
yhat_train_ridge = ridge_model.predict(X_train)
ridge_train_evaluation_result = mean_absolute_error(y_train, yhat_train_ridge)

yhat_test_ridge = ridge_model.predict(X_test)
ridge_test_evaluation_result = mean_absolute_error(y_test, yhat_test_ridge)

for label, score in (
    ("Ridge Regression Train Evaluation Result:", ridge_train_evaluation_result),
    ("Ridge Regression Test Evaluation Result:", ridge_test_evaluation_result),
):
    print(label, score)

In [None]:
def predict(test, revealed_targets, sample_prediction, ridge_model, mX, sX, qlim, pred_with_8nas, values):
    """Build the feature matrix for one test batch and return clipped predictions.

    The features are the same 18 columns, in the same order, as the training
    design matrix. The caller's ``test`` frame is not modified.

    Parameters
    ----------
    test : pandas.DataFrame
        Batch to score. Must contain 'wap', 'ask_size', 'bid_size',
        'ask_price', 'bid_price', 'near_price', 'far_price', 'matched_size',
        'reference_price', 'imbalance_size', 'imbalance_buy_sell_flag' and
        'seconds_in_bucket'.
    revealed_targets, sample_prediction
        Not used by this function; accepted so the signature matches the
        tuple produced by the test iterator.
    ridge_model
        Fitted model exposing ``predict(X)``.
    mX, sX : array-like of length 10
        Shift and scale applied to feature columns 1-10.
    qlim : sequence of two floats
        Lower and upper bound used to clip the predictions.
    pred_with_8nas : float
        Constant prediction assigned to rows with exactly 8 missing values.
    values : dict
        Column name -> fill value, passed to ``DataFrame.fillna``.

    Returns
    -------
    numpy.ndarray
        1-D array with one prediction per row of ``test``.
    """
    # Count the missing values per row before imputation. The counts are kept
    # in a local array instead of being written into `test`, so the frame
    # owned by the caller keeps its original columns.
    nna = test.isnull().sum(axis=1).to_numpy()
    # `fillna` returns a new frame; `test` itself still holds its NaNs.
    filled = test.fillna(value=values)

    def col(name):
        # One imputed column as a 1-D array.
        return filled[name].to_numpy()

    k = 2 * np.pi * 4 / 540
    secs = col('seconds_in_bucket')
    X_test = np.array([
        np.ones(filled.shape[0]), col('wap'),  # 0, 1
        col('ask_size'), col('bid_size'),  # 2, 3
        col('ask_price'), col('bid_price'),  # 4, 5
        col('near_price'), col('far_price'),  # 6, 7
        col('matched_size'), col('reference_price'),  # 8, 9
        col('imbalance_size') * col('imbalance_buy_sell_flag'),  # 10
        np.cos(secs * k), np.sin(secs * k),  # 11, 12
        np.cos(secs * k / 2), np.sin(secs * k / 2),  # 13, 14
        nna,  # 15
        (col('far_price') == 0) * 1.0,  # 16
        (col('near_price') == 0) * 1.0,  # 17
    ]).T
    # Same standardisation as applied to the training features.
    X_test[:, 1:11] = (X_test[:, 1:11] - mX) / sX

    # Predict using the Ridge model, then clip to the target range seen in training.
    yhat_test = ridge_model.predict(X_test)
    yhat_test = np.minimum(np.maximum(yhat_test, qlim[0]), qlim[1])
    # Rows with 8 missing values get the constant fallback prediction.
    yhat_test[nna == 8] = pred_with_8nas
    # Any remaining NaN prediction is replaced by 0.
    yhat_test[np.isnan(yhat_test)] = 0
    return yhat_test



In [None]:
# Competition inference API (module `optiver2023`).
# NOTE(review): presumably this module is only importable inside the Kaggle
# competition environment, which would explain why it is imported here rather
# than with the other imports — confirm.
import optiver2023
env = optiver2023.make_env()
# Iterator consumed by the submission loop, where each item is unpacked as
# (test, revealed_targets, sample_prediction).
iter_test = env.iter_test()

In [None]:
# Submission loop: score every batch delivered by the test iterator and hand
# the filled-in prediction frame back to the environment.
counter = 0
for test, revealed_targets, sample_prediction in iter_test:
    is_first_batch = counter == 0
    if is_first_batch:
        # Show the first rows of the three frames once, on the first batch.
        for frame in (test, revealed_targets, sample_prediction):
            print(frame.head(3))

    batch_pred = predict(test, revealed_targets, sample_prediction, ridge_model, mX, sX, qlim, pred_with_8nas, values)
    sample_prediction['target'] = batch_pred

    print(counter, sample_prediction.head(3))
    env.predict(sample_prediction)
    counter = counter + 1