In [32]:
import pandas as pd

df = pd.read_csv('./data/data2019.csv')

In [33]:
NEGATIVE_IMBALANCE_PENALTY = 1.03
POSITIVE_IMBALANCE_PENALTY = 0.97


def calculate_negative_imbalance_price(smp, mcp):
    return max(smp, mcp) * NEGATIVE_IMBALANCE_PENALTY


def calculate_positive_imbalance_price(smp, mcp):
    return min(smp, mcp) * POSITIVE_IMBALANCE_PENALTY

In [34]:
df.dropna(inplace=True)

In [35]:
df["smpUsd"] = df["smp"] / df["exchangeRate"]
df["idmUsd"] = df["idm"] / df["exchangeRate"]

In [36]:
df["positive_imbalance_price"] = df.apply(
    lambda row: calculate_positive_imbalance_price(row["smpUsd"], row["mcpUsd"]), axis=1
)
df["negative_imbalance_price"] = df.apply(
    lambda row: calculate_negative_imbalance_price(row["smpUsd"], row["mcpUsd"]), axis=1
)

In [37]:
def calculate_shortselling_profit(idm, negative_imbalance_price):
    return max(idm - negative_imbalance_price, 0)


def calculate_long_profit(idm, positive_imbalance_price):
    return max(positive_imbalance_price - idm, 0)

In [38]:
def calculate_shortselling_loss(idm, negative_imbalance_price):
    return max(negative_imbalance_price - idm, 0)

def calculate_long_loss(idm, positive_imbalance_price):
    return max(idm - positive_imbalance_price, 0)

In [39]:
df["shortselling_profit"] = df.apply(
    lambda row: calculate_shortselling_profit(
        row["idmUsd"], row["negative_imbalance_price"]
    ),
    axis=1,
)
df["long_profit"] = df.apply(
    lambda row: calculate_long_profit(row["idmUsd"], row["positive_imbalance_price"]),
    axis=1,
)

df["total_profit"] = df["shortselling_profit"] + df["long_profit"]

In [40]:
df["shortselling_loss"] = df.apply(
    lambda row: calculate_shortselling_loss(
        row["idmUsd"], row["negative_imbalance_price"]
    ),
    axis=1,
)

df["long_loss"] = df.apply(
    lambda row: calculate_long_loss(row["idmUsd"], row["positive_imbalance_price"]),
    axis=1,
)

df["total_loss"] = df["shortselling_loss"] + df["long_loss"]

In [41]:
df['is_shortselling_profitable'] = df['shortselling_profit'] > 0
df['is_long_profitable'] = df['long_profit'] > 0

In [42]:
# Calculate short selling opportunities count
shortselling_opportunities_count = df["is_shortselling_profitable"].sum()
long_opportunities_count = df["is_long_profitable"].sum()
print(f"Short selling opportunities count: {shortselling_opportunities_count}")
print(f"Long opportunities count: {long_opportunities_count}")
print(f"Total opportunities count: {shortselling_opportunities_count + long_opportunities_count}")

#print ratio of profitable short selling opportunities
print(f"Ratio of profitable short selling opportunities: {shortselling_opportunities_count / len(df)}")
print(f"Ratio of profitable long opportunities: {long_opportunities_count /len(df)}")
print(f"Ratio of profitable opportunities: {(shortselling_opportunities_count + long_opportunities_count) / len(df)}")

Short selling opportunities count: 372
Long opportunities count: 2043
Total opportunities count: 2415
Ratio of profitable short selling opportunities: 0.042562929061784896
Ratio of profitable long opportunities: 0.23375286041189933
Ratio of profitable opportunities: 0.27631578947368424


In [43]:
df['dayOfWeek'] = pd.to_datetime(df['date']).dt.dayofweek

In [44]:
X = df[['mcpUsd','netImbalanceVolume', 'idmUsd', 'hour', 'dayOfWeek']]
y = df['is_long_profitable']

In [45]:
X = pd.get_dummies(X, columns=['hour', 'dayOfWeek'], drop_first=True)

In [46]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, shuffle=True)
X_train_norm = (X_train - X_train.mean()) / X_train.std()
X_test_norm = (X_test - X_train.mean()) / X_train.std()

In [47]:
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score


# Classifiers
classifiers = [
    KNeighborsClassifier(30),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    LogisticRegression(),
    GaussianNB(),
    GradientBoostingClassifier(),
    LogisticRegression(),
    MLPClassifier(),
]


k = 5
preds = pd.DataFrame(index=[*range(k)])

for cls in classifiers:
    scores = cross_val_score(cls, X_train, y_train, cv=k, scoring="accuracy")
    preds[type(cls).__name__] = scores

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [48]:
print(preds.mean())

KNeighborsClassifier          0.767879
DecisionTreeClassifier        0.934068
RandomForestClassifier        0.901461
AdaBoostClassifier            0.883154
LogisticRegression            0.878719
GaussianNB                    0.650604
GradientBoostingClassifier    0.939501
MLPClassifier                 0.946365
dtype: float64


In [49]:
from autogluon.tabular import TabularDataset, TabularPredictor

train_data = TabularDataset(X_train.join(y_train))

In [50]:
predictor = TabularPredictor(label='is_long_profitable', eval_metric='f1', log_file_path='./autogluonTabular', path='./AutogluonModels/HourAndDayMediumUsd').fit(train_data, presets='medium_quality')

Presets specified: ['medium_quality']
Beginning AutoGluon training ...
AutoGluon will save models to "./AutogluonModels/HourAndDayMediumUsd"
AutoGluon Version:  1.1.0
Python Version:     3.10.13
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 23.1.0: Mon Oct  9 21:28:31 PDT 2023; root:xnu-10002.41.9~6/RELEASE_ARM64_T8112
CPU Count:          8
Memory Avail:       7.38 GB / 16.00 GB (46.2%)
Disk Space Avail:   144.54 GB / 460.43 GB (31.4%)
Train Data Rows:    6992
Train Data Columns: 32
Label Column:       is_long_profitable
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [False, True]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Problem Type:       binary
Preprocessing data ...
Selected class <--> label mappin

[1000]	valid_set's binary_logloss: 0.193667	valid_set's f1: 0.821656


	0.8269	 = Validation score   (f1)
	5.97s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: LightGBM ...
	0.9028	 = Validation score   (f1)
	2.15s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: RandomForestGini ...
	0.7785	 = Validation score   (f1)
	0.56s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: RandomForestEntr ...
	0.7534	 = Validation score   (f1)
	0.57s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: CatBoost ...
	0.8834	 = Validation score   (f1)
	4.15s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: ExtraTreesGini ...
	0.512	 = Validation score   (f1)
	0.53s	 = Training   runtime
	0.06s	 = Validation runtime
Fitting model: ExtraTreesEntr ...
	0.5259	 = Validation score   (f1)
	0.56s	 = Training   runtime
	0.07s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	0.9216	 = Validation score   (f1)
	6.93s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: XGBoost ...
	

In [53]:
predictor = TabularPredictor.load('./AutogluonModels/HourAndDayMediumUsd')

In [54]:
test_data = TabularDataset(X_test.join(y_test))

In [55]:
y_pred = predictor.predict(test_data)
y_pred_proba = predictor.predict_proba(test_data)

In [56]:
predictor.evaluate(test_data)

{'f1': 0.9301075268817204,
 'accuracy': 0.9702517162471396,
 'balanced_accuracy': 0.9520888902673532,
 'mcc': 0.9112999700331502,
 'roc_auc': 0.9916839836238447,
 'precision': 0.9402173913043478,
 'recall': 0.9202127659574468}

In [57]:
confusion_matrix(y_test, y_pred)

array([[1350,   22],
       [  30,  346]])

In [397]:
threshold = 0.9
m_y_pred = y_pred_proba.iloc[:, 1] > threshold
m_y_pred = m_y_pred.astype(int)
confusion_matrix(y_test, m_y_pred)

array([[1367,    5],
       [ 308,   68]])

In [58]:
from sklearn.metrics import precision_score
thresholds = [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]
for threshold in thresholds:
    m_y_pred = y_pred_proba.iloc[:, 1] > threshold
    m_y_pred = m_y_pred.astype(int)
    print(f"Threshold: {threshold}", "Precision:", precision_score(y_test, m_y_pred, zero_division=0))
    print("Predicted positives ratio:", (m_y_pred * y_test).sum() / y_test.sum())
    print("-----------------")


Threshold: 0.5 Precision: 0.9402173913043478
Predicted positives ratio: 0.9202127659574468
-----------------
Threshold: 0.55 Precision: 0.9495798319327731
Predicted positives ratio: 0.901595744680851
-----------------
Threshold: 0.6 Precision: 0.953757225433526
Predicted positives ratio: 0.8776595744680851
-----------------
Threshold: 0.65 Precision: 0.9618768328445748
Predicted positives ratio: 0.8723404255319149
-----------------
Threshold: 0.7 Precision: 0.9727272727272728
Predicted positives ratio: 0.8537234042553191
-----------------
Threshold: 0.75 Precision: 0.9782608695652174
Predicted positives ratio: 0.8377659574468085
-----------------
Threshold: 0.8 Precision: 0.9801324503311258
Predicted positives ratio: 0.7872340425531915
-----------------
Threshold: 0.85 Precision: 0.9893238434163701
Predicted positives ratio: 0.7393617021276596
-----------------
Threshold: 0.9 Precision: 1.0
Predicted positives ratio: 0.699468085106383
-----------------
Threshold: 0.95 Precision: 1.0
Pr

In [59]:
POSITIVE_IMBALANCE_PENALTY = 0.97
NEGATIVE_IMBALANCE_PENALTY = 1.03
def calculate_pnl(mcp, smp, idm, strategy):
    positive_imbalance_price = min(mcp, smp) * POSITIVE_IMBALANCE_PENALTY
    negative_imbalance_price = max(mcp, smp) * NEGATIVE_IMBALANCE_PENALTY
    if strategy == 'buy':
        return positive_imbalance_price - idm
    elif strategy == 'sell':
        return idm - negative_imbalance_price
    else:
        return 0


In [250]:
# predictor = TabularPredictor.load("AutogluonModels/ag-20240515_155843") # Time consumed model

In [215]:
# predictor = TabularPredictor.load("AutogluonModels/ag-20240515_153257") # One later model

In [26]:
predictor.evaluate(test_data)

{'f1': 0.7710219922380336,
 'accuracy': 0.898741418764302,
 'balanced_accuracy': 0.8601978785435147,
 'mcc': 0.7065114369226215,
 'roc_auc': 0.9473260808882822,
 'precision': 0.7506297229219143,
 'recall': 0.7925531914893617}

In [61]:
y_pred = predictor.predict(test_data)
y_pred_proba = predictor.predict_proba(test_data)
thresholds = [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]
for threshold in thresholds:
    m_y_pred = y_pred_proba.iloc[:, 1] > threshold
    m_y_pred = m_y_pred.astype(int)
    print(
        f"Threshold: {threshold}",
        "Precision:",
        precision_score(y_test, m_y_pred, zero_division=0),
    )
    print("Predicted positives ratio:", (m_y_pred * y_test).sum() / y_test.sum())
    print("-----------------")

Threshold: 0.5 Precision: 0.9402173913043478
Predicted positives ratio: 0.9202127659574468
-----------------
Threshold: 0.55 Precision: 0.9495798319327731
Predicted positives ratio: 0.901595744680851
-----------------
Threshold: 0.6 Precision: 0.953757225433526
Predicted positives ratio: 0.8776595744680851
-----------------
Threshold: 0.65 Precision: 0.9618768328445748
Predicted positives ratio: 0.8723404255319149
-----------------
Threshold: 0.7 Precision: 0.9727272727272728
Predicted positives ratio: 0.8537234042553191
-----------------
Threshold: 0.75 Precision: 0.9782608695652174
Predicted positives ratio: 0.8377659574468085
-----------------
Threshold: 0.8 Precision: 0.9801324503311258
Predicted positives ratio: 0.7872340425531915
-----------------
Threshold: 0.85 Precision: 0.9893238434163701
Predicted positives ratio: 0.7393617021276596
-----------------
Threshold: 0.9 Precision: 1.0
Predicted positives ratio: 0.699468085106383
-----------------
Threshold: 0.95 Precision: 1.0
Pr

In [60]:
# Get X_test data from df using indexes
X_test_df = df.loc[X_test.index]

thresholds = [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]
y_pred = predictor.predict(test_data)
y_pred_proba = predictor.predict_proba(test_data)
for threshold in thresholds:
    m_y_pred = (y_pred_proba.iloc[:, 1] > threshold).astype(int)
    X_test_df["strategy"] = m_y_pred.apply(lambda x: 'buy' if x == 1  else 'hold')
    X_test_df["pnl"] = X_test_df.apply(
        lambda row: calculate_pnl(row["mcpUsd"], row["smpUsd"], row["idmUsd"], row["strategy"]),
        axis=1,
    )
    print(f"Threshold: {threshold}", "Total PnL:", X_test_df["pnl"].sum(), "Profitable Trades ratio: ", (X_test_df["pnl"] > 0).sum() / (X_test_df['strategy'] == 'buy').sum())

Threshold: 0.5 Total PnL: 703.5624014565788 Profitable Trades ratio:  0.9402173913043478
Threshold: 0.55 Total PnL: 712.7349977760935 Profitable Trades ratio:  0.9495798319327731
Threshold: 0.6 Total PnL: 718.0453762828289 Profitable Trades ratio:  0.953757225433526
Threshold: 0.65 Total PnL: 727.5013000567833 Profitable Trades ratio:  0.9618768328445748
Threshold: 0.7 Total PnL: 739.3731339854041 Profitable Trades ratio:  0.9727272727272728
Threshold: 0.75 Total PnL: 738.0336719387449 Profitable Trades ratio:  0.9782608695652174
Threshold: 0.8 Total PnL: 731.4232470049888 Profitable Trades ratio:  0.9801324503311258
Threshold: 0.85 Total PnL: 730.6017686395046 Profitable Trades ratio:  0.9893238434163701
Threshold: 0.9 Total PnL: 727.4249866944967 Profitable Trades ratio:  1.0
Threshold: 0.95 Total PnL: 680.2666255992062 Profitable Trades ratio:  1.0


In [254]:
threshold = 0.8
m_y_pred = (y_pred_proba.iloc[:, 1] > threshold).astype(int)
X_test_df["strategy"] = m_y_pred.apply(lambda x: 'buy' if x == 1  else 'hold')
X_test_df["pnl"] = X_test_df.apply(
    lambda row: calculate_pnl(row["mcpUsd"], row["smpUsd"], row["idmUsd"], row["strategy"]),
    axis=1,
)