In [2]:
!pip install optuna

Collecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.2-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)
Collecting sqlalchemy>=1.3.0 (from optuna)
  Downloading SQLAlchemy-2.0.31-cp311-cp311-win_amd64.whl.metadata (9.9 kB)
Collecting PyYAML (from optuna)
  Downloading PyYAML-6.0.1-cp311-cp311-win_amd64.whl.metadata (2.1 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.5-py3-none-any.whl.metadata (2.9 kB)
Collecting greenlet!=0.4.17 (from sqlalchemy>=1.3.0->optuna)
  Downloading greenlet-3.0.3-cp311-cp311-win_amd64.whl.metadata (3.9 kB)
Collecting MarkupSafe>=0.9.2 (from Mako->alembic>=1.5.0->optuna)
  Downloading MarkupSafe-2.1.5-cp311-cp311-win_amd64.whl.metadata (3.1 kB)
Downloading optuna-3.6.1-py3-none-any.whl (380 kB)
   ---------------------------------------- 0.0/380.1 kB ? eta -:--


[notice] A new release of pip is available: 24.0 -> 24.1.1
[notice] To update, run: C:\Users\student\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [3]:
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
data = []
directory = "stock_data_all/Finance/0"
for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        filepath = os.path.join(directory, filename)
        print(f"Reading file: {filepath}")
        dt = pd.read_csv(filepath)
        data.append(dt)

# Concatenate all DataFrames in the list into a single DataFrame
data = pd.concat(data, ignore_index=True)


Reading file: stock_data_all/Finance/0\AGM.csv
Reading file: stock_data_all/Finance/0\APAM.csv
Reading file: stock_data_all/Finance/0\BANFP.csv
Reading file: stock_data_all/Finance/0\BFZ.csv
Reading file: stock_data_all/Finance/0\BLE.csv
Reading file: stock_data_all/Finance/0\BTCS.csv
Reading file: stock_data_all/Finance/0\BX.csv
Reading file: stock_data_all/Finance/0\BYM.csv
Reading file: stock_data_all/Finance/0\CNS.csv
Reading file: stock_data_all/Finance/0\CXE.csv
Reading file: stock_data_all/Finance/0\EQBK.csv
Reading file: stock_data_all/Finance/0\EVM.csv
Reading file: stock_data_all/Finance/0\EVN.csv
Reading file: stock_data_all/Finance/0\FMY.csv
Reading file: stock_data_all/Finance/0\FNB.csv
Reading file: stock_data_all/Finance/0\FNF.csv
Reading file: stock_data_all/Finance/0\GABC.csv
Reading file: stock_data_all/Finance/0\GECC.csv
Reading file: stock_data_all/Finance/0\GECCO.csv
Reading file: stock_data_all/Finance/0\GJO.csv
Reading file: stock_data_all/Finance/0\GTAC.csv
Read

In [5]:
if isinstance(data, pd.DataFrame):
    scaler = StandardScaler()
    data_ = scaler.fit_transform(data.iloc[:, 1:].values)
    ten_day_change = data["Close"].pct_change(periods=10) * 100
    data["10_day_change_fixed_discrete"] = pd.cut(
        ten_day_change, bins=[-float("inf"), -10, -2, 2, 10, float("inf")], labels=[0, 1, 2, 3, 4]
    )
else:
    print("The variable 'data' is not a DataFrame")


In [6]:
def split_data(stock, lookback):
    data_raw = np.array(stock)
    n_time = len(data_raw)
    data, targets = [], []
    for index in range(0, n_time - lookback, 10):
        data.append(data_raw[index : index + lookback, 1:-1])
        targets.append(stock["10_day_change_fixed_discrete"].iloc[index + lookback])

    data = np.array(data)
    targets = np.array(targets)
    print("Total data:", data.shape)

    x_train, x_test, y_train, y_test = train_test_split(
        data, targets, test_size=0.2, shuffle=True, random_state=42
    )

    return x_train, y_train, x_test, y_test

# Split data
lookback = 60
x_train, y_train, x_test, y_test = split_data(data, lookback)
print("x_train.shape =", x_train.shape)
print("y_train.shape =", y_train.shape)
print("x_test.shape =", x_test.shape)
print("y_test.shape =", y_test.shape)

x_train = x_train.reshape(x_train.shape[0], -1)
x_test = x_test.reshape(x_test.shape[0], -1)


Total data: (22770, 60, 71)
x_train.shape = (18216, 60, 71)
y_train.shape = (18216,)
x_test.shape = (4554, 60, 71)
y_test.shape = (4554,)


In [7]:
def rf_objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 5, 30),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
    }
    model = RandomForestClassifier(random_state=42, **params)
    model.fit(x_train, y_train)
    preds = model.predict(x_test)
    accuracy = accuracy_score(y_test, preds)
    return accuracy

def xgb_objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.3, log=True),
    }
    model = xgb.XGBClassifier(random_state=42, **params)
    model.fit(x_train, y_train)
    preds = model.predict(x_test)
    accuracy = accuracy_score(y_test, preds)
    return accuracy

def catboost_objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 50, 500),
        'depth': trial.suggest_int('depth', 3, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.3, log=True),
    }
    model = CatBoostClassifier(random_state=42, verbose=0, **params)
    model.fit(x_train, y_train)
    preds = model.predict(x_test)
    accuracy = accuracy_score(y_test, preds)
    return accuracy

def mlp_objective(trial):
    params = {
        'hidden_layer_sizes': trial.suggest_categorical('hidden_layer_sizes', [(50,), (100,), (100, 50), (200, 100)]),
        'activation': trial.suggest_categorical('activation', ['relu', 'tanh', 'logistic']),
        'solver': trial.suggest_categorical('solver', ['adam', 'sgd', 'lbfgs']),
    }
    model = MLPClassifier(random_state=42, **params)
    model.fit(x_train, y_train)
    preds = model.predict(x_test)
    accuracy = accuracy_score(y_test, preds)
    return accuracy


In [8]:
# Define a callback to stop optimization early if the performance stops improving
def early_stopping_callback(study, trial):
    if study.best_value - trial.value > 0.01:  # You can adjust the threshold
        return True

study_rf = optuna.create_study(direction='maximize')
study_rf.optimize(rf_objective, n_trials=100, callbacks=[early_stopping_callback])
best_rf = RandomForestClassifier(random_state=42, **study_rf.best_params)

study_xgb = optuna.create_study(direction='maximize')
study_xgb.optimize(xgb_objective, n_trials=100, callbacks=[early_stopping_callback])
best_xgb = xgb.XGBClassifier(random_state=42, **study_xgb.best_params)

study_catboost = optuna.create_study(direction='maximize')
study_catboost.optimize(catboost_objective, n_trials=100, callbacks=[early_stopping_callback])
best_catboost = CatBoostClassifier(random_state=42, verbose=0, **study_catboost.best_params)

study_mlp = optuna.create_study(direction='maximize')
study_mlp.optimize(mlp_objective, n_trials=100, callbacks=[early_stopping_callback])
best_mlp = MLPClassifier(random_state=42, **study_mlp.best_params)



[I 2024-07-09 18:39:52,871] A new study created in memory with name: no-name-0cc618d9-e337-4f04-a99b-07200b13bfac
[I 2024-07-09 18:44:13,721] Trial 0 finished with value: 0.7301273605621432 and parameters: {'n_estimators': 370, 'max_depth': 11, 'min_samples_split': 9}. Best is trial 0 with value: 0.7301273605621432.
[I 2024-07-09 18:46:31,265] Trial 1 finished with value: 0.7334211682037769 and parameters: {'n_estimators': 163, 'max_depth': 14, 'min_samples_split': 12}. Best is trial 1 with value: 0.7334211682037769.
[I 2024-07-09 18:55:10,642] Trial 2 finished with value: 0.7397891963109354 and parameters: {'n_estimators': 484, 'max_depth': 26, 'min_samples_split': 5}. Best is trial 2 with value: 0.7397891963109354.
[I 2024-07-09 18:59:23,871] Trial 3 finished with value: 0.7422046552481335 and parameters: {'n_estimators': 254, 'max_depth': 21, 'min_samples_split': 15}. Best is trial 3 with value: 0.7422046552481335.
[I 2024-07-09 19:01:26,071] Trial 4 finished with value: 0.734299516

In [None]:
# Initialize models with best parameters from Optuna
models = [
    ("RandomForestClassifier", best_rf),
    ("XGBoostClassifier", best_xgb),
    ("CatBoostClassifier", best_catboost),
    ("MLPClassifier", best_mlp)
]

# Initialize StackingClassifier
stacking_classifier = StackingClassifier(
    estimators=models,
    final_estimator=LogisticRegression()
)

# Train Stacking model
stacking_classifier.fit(x_train, y_train)

# Evaluate model
y_train_pred = stacking_classifier.predict(x_train)
y_test_pred = stacking_classifier.predict(x_test)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f'Train Accuracy: {train_accuracy}')
print(f'Test Accuracy: {test_accuracy}')


In [None]:
# Calculate confusion matrix
train_cm = confusion_matrix(y_train, y_train_pred)
test_cm = confusion_matrix(y_test, y_test_pred)

# Plot confusion matrix
fig, ax = plt.subplots(1, 2, figsize=(12, 5))

sns.heatmap(train_cm, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Train Confusion Matrix')
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')

sns.heatmap(test_cm, annot=True, fmt='d', cmap='Blues', ax=ax[1])
ax[1].set_title('Test Confusion Matrix')
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')

plt.tight_layout()
plt.show()
