In [None]:
# Project-local model factory. The cells below call exam_models.pick_model(...) and
# exam_models.DTR() on this instance to obtain the estimators they train.
from Models import ML_Models
exam_models = ML_Models()

In [None]:
import yfinance as yf
import pandas as pd
import datetime as dt
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_validate
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Download settings handed to yfinance.
start_date = "2012-01-01"
end_date = dt.date.today()
main_col = "Adj Close"
interval = "1d"
stocks_list = [
    "EQNR.OL", "DNB.OL", "TEL.OL", "NHY.OL", "AKRBP.OL",
    "YAR.OL", "MOWI.OL", "BZ=F", "OSEBX.OL",
]

# Indicator columns to derive for every ticker and to use as model features.
indicators = [
    "MA5", "MA20", "MA50", "MA200",
    "MIN", "MAX",
    "LOG_RET", "MOM", "VOLA", "DIFF",
]

# Models to use for forecasting/prediction.
# The names must match the ones understood by the pick_model function in Models.py.
models = [
    "LR", "DTR", "MLP",
    "XGBoost", "XGBoost_LR",
    "ADA", "ADA_LR",
    "GBR",
    "Bagging", "Bagging_LR", "Bagging_MLP",
    "StackedRegressor",
]

# Metrics used to evaluate each model, mapped to the label shown in reports.
# MAE, MSE, RMSE and MAPE carry the "neg_" prefix so that scikit-learn's
# cross_validate recognises them as scorer names.
pretty_metric_names = {
    "r2": "R^2: ",
    "neg_mean_absolute_error": "MAE: ",
    "neg_mean_squared_error": "MSE: ",
    "neg_root_mean_squared_error": "RMSE: ",
    "neg_mean_absolute_percentage_error": "MAPE: ",
}
# Scorer names in reporting order (dicts preserve insertion order).
metric_names = list(pretty_metric_names)

# Collecting data from Yahoo Finance

In [None]:
# Maps each ticker symbol to the price-history frame returned by yfinance.
stock_data = {}
for ticker in stocks_list:
    print(f"Downloading {ticker} data")
    # Fetch the history for this ticker over the configured date range and interval.
    stock_data[ticker] = yf.download(
        ticker,
        start=start_date,
        end=end_date,
        interval=interval,
    )

print("All the data is now downloaded!")

# Save fetched data to csv
#for ticker in stocks_list:
#    stock_data[ticker].to_csv("raw_data/data_"+ticker+".csv")

# Preprocessing the data

In [None]:
def add_indicator_columns(data, indicators, price_col=None):
    """Add a lagged "Label" column and the requested indicator columns to ``data`` in place.

    "Label" holds the previous row's value of the price column. Every indicator is
    derived from "Label":

    * MA5 / MA20 / MA50 / MA200 - rolling mean over 5 / 20 / 50 / 200 rows
    * MIN / MAX                 - rolling minimum / maximum over 20 rows
    * LOG_RET                   - log of Label divided by the previous Label
    * MOM / VOLA                - rolling 20-row mean / standard deviation of LOG_RET
    * DIFF                      - Label minus the previous Label

    Rows containing a missing value in any column (including the warm-up rows of
    the rolling windows) are dropped afterwards.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the price column. It is modified in place.
    indicators : collection of str
        Names of the indicators to add. Names that are not listed above are ignored.
    price_col : str, optional
        Column to build "Label" from. Defaults to the notebook-level ``main_col``.

    Returns
    -------
    None

    Notes
    -----
    The function is not idempotent: calling it again on the same frame recomputes
    the columns and drops the new warm-up rows as well.
    """
    if price_col is None:
        # Fall back to the notebook-wide configuration when no column is given.
        price_col = main_col

    label_name = "Label"
    data[label_name] = data[price_col].shift(periods=1)
    label = data[label_name]
    log_ret = np.log(label / label.shift(1))

    # NOTE(review): each indicator on row t is computed from a window that includes
    # Label[t] itself. The notebook also uses "Label" as the prediction target, so
    # the features appear to contain the target - confirm this is intended.
    #
    # The insertion order below defines the order in which columns are appended.
    builders = {
        "MA5": lambda: label.rolling(5).mean(),
        "MA20": lambda: label.rolling(20).mean(),
        "MA50": lambda: label.rolling(50).mean(),
        "MA200": lambda: label.rolling(200).mean(),
        "MIN": lambda: label.rolling(20).min(),
        "MAX": lambda: label.rolling(20).max(),
        "LOG_RET": lambda: log_ret,
        "MOM": lambda: log_ret.rolling(20).mean(),
        "VOLA": lambda: log_ret.rolling(20).std(),
        "DIFF": lambda: label.diff(),
    }
    for name, build in builders.items():
        if name in indicators:
            data[name] = build()

    # Remove rows with missing values.
    data.dropna(axis=0, inplace=True)

In [None]:
def create_X_y_arrays(data, label_name, feature_cols=None):
    """Extract the feature matrix and target vector from ``data`` as NumPy arrays.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the feature columns and the target column.
    label_name : str
        Name of the target column.
    feature_cols : sequence of str, optional
        Columns to use as features, in order. Defaults to the notebook-level
        ``indicators`` list.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` holds the feature columns (one row per row of
        ``data``) and ``y`` holds the values of ``label_name``.
    """
    # list(...) so that a tuple is treated as a list of columns, not as a single key.
    columns = list(indicators if feature_cols is None else feature_cols)
    X = data.loc[:, columns].to_numpy()
    y = data[label_name].to_numpy()
    return X, y

In [None]:
n_splits = 5
tscv = TimeSeriesSplit(n_splits=n_splits)

def create_X_y_train_test_split(X, y, current_stock):
    """Split ``X`` and ``y`` using the last fold produced by the shared ``tscv`` splitter.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix, one row per observation in chronological order.
    y : numpy.ndarray
        Target vector aligned with ``X``.
    current_stock : str
        Ticker the arrays belong to. Kept for backward compatibility; the split
        is now derived from ``X`` itself, so this value is not used.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` for the final fold.
    """
    # Split the array that is actually being indexed, so the fold indices always
    # match X and y, and slice only once instead of on every fold.
    *_, (train_index, test_index) = tscv.split(X)
    return X[train_index], X[test_index], y[train_index], y[test_index]

In [None]:
# Enrich every downloaded frame with the configured indicator columns.
# The frames are modified in place, so re-running this cell without re-downloading
# recomputes the columns and drops a further batch of warm-up rows.
for frame in stock_data.values():
    add_indicator_columns(data=frame, indicators=indicators)

# Training models

In [None]:
# Filled by train_models:
#   cv_results["<ticker>_model_<model>"]             -> cross_validate output
#   trained_models["trained_model_<model>_<ticker>"] -> fitted estimator
cv_results = {}
trained_models = {}

# THIS FUNCTION CAN HAVE HIGH RUNTIME BASED ON THE AMOUNT OF MODELS AND DATA.
# INPUT MUST BE A LIST.
def train_models(input_models):
    """Cross-validate and fit each requested model for every downloaded ticker.

    For every ticker in ``stock_data`` and every name in ``input_models`` the
    estimator returned by ``exam_models.pick_model`` is scored with
    ``cross_validate`` on the full arrays (using ``tscv`` and ``metric_names``)
    and then fitted on the training part of the last fold.

    Parameters
    ----------
    input_models : list of str
        Model names accepted by ``exam_models.pick_model``.

    Returns
    -------
    dict
        The module-level ``cv_results`` dictionary, updated in place.
    """
    # Scorer mapping handed to cross_validate; result key and scorer name are the same string.
    scoring_map = {name: name for name in metric_names}

    for ticker, data in stock_data.items():
        X, y = create_X_y_arrays(data=data, label_name="Label")
        # Only the training part of the last fold is needed here.
        X_train, _, y_train, _ = create_X_y_train_test_split(X=X, y=y, current_stock=ticker)

        for model_key in input_models:
            estimator = exam_models.pick_model(model=model_key)

            # Score on every fold with scikit-learn's cross_validate.
            fold_scores = cross_validate(
                estimator,
                X,
                y,
                cv=tscv,
                scoring=scoring_map,
                return_train_score=True,
                n_jobs=-1,
                verbose=0,
            )

            estimator.fit(X_train, y_train)
            cv_results[ticker+"_model_"+model_key] = fold_scores
            trained_models["trained_model_"+model_key+"_"+ticker] = estimator

    return cv_results

In [None]:
# Cross-validate and fit every configured model for every ticker (can take a long time).
# train_models returns the module-level cv_results dict, so both names refer to the same object.
cv_stocks_models = train_models(input_models=models)

# Predicting values based on trained models 

In [None]:
# Filled by predict_trained_models: ticker -> frame with the actual main_col
# values of the last test fold and one "<model> Prediction" column per model.
stock_predictions = {}
def predict_trained_models(input_models):
    """Predict the last test fold of every ticker with the previously fitted models.

    Parameters
    ----------
    input_models : list of str
        Model names for which a fitted estimator exists in ``trained_models``
        (stored under ``"trained_model_<model>_<ticker>"``).

    Returns
    -------
    dict
        The module-level ``stock_predictions`` dictionary, updated in place.

    Raises
    ------
    KeyError
        If a requested model has not been trained for a ticker.
    """
    for ticker, data in stock_data.items():
        X, _ = create_X_y_arrays(data=data, label_name="Label")

        # Only the final fold is needed; compute the split once and reuse its
        # indices for both the feature rows and the matching rows of the frame.
        *_, (_, test_index) = tscv.split(X)
        X_test = X[test_index]

        # Positional selection keeps one row per test sample even if the index
        # contains duplicate labels.
        prediction = data.iloc[test_index][[main_col]].copy(deep=True)
        stock_predictions[ticker] = prediction

        for model_i in input_models:
            model = trained_models["trained_model_"+model_i+"_"+ticker]
            prediction.loc[:, model_i+" Prediction"] = model.predict(X_test)

    return stock_predictions

In [None]:
# Run every fitted model on each ticker's last test fold.
# predict_trained_models returns the module-level stock_predictions dict, so both names refer to the same object.
predicted_stock_data = predict_trained_models(input_models=models)

In [None]:
# Show, per ticker, the table of actual main_col values next to each model's predictions.
for ticker in stocks_list:
    print(ticker)
    display(predicted_stock_data[ticker])

In [None]:
def _summarise_cv_scores(res, prefix):
    """Return (report text, {pretty metric name: mean score}) for the "train" or "test" scores in ``res``."""
    report_lines = []
    mean_by_metric = {}

    for metric in metric_names:
        scores = res[prefix+"_"+metric]

        if metric.startswith("neg"):
            # These scorers are reported negated; flip the sign back to get the error value.
            scores = -scores

        formatted = ", ".join(f"{x:.4f}" for x in scores.tolist())
        # The pretty names already end in ": ", so no extra separator is inserted.
        report_lines.append(f"{pretty_metric_names[metric]}{formatted}\n")
        mean_by_metric[pretty_metric_names[metric]] = np.mean(scores)

    return "".join(report_lines), mean_by_metric


def print_calculated_metrics():
    """Print the per-fold cross-validation scores and collect their means.

    For every ticker in ``stocks_list`` and every model in ``models`` the stored
    ``cv_results`` entry is printed fold by fold, for both the training and the
    test scores. Metrics whose name starts with "neg" are sign-flipped first.

    Returns
    -------
    dict
        Maps each ticker to a DataFrame indexed by the pretty metric names, with
        the columns "<model> Model Train" and "<model> Model Test" holding the
        mean score over all folds.

    Raises
    ------
    KeyError
        If ``cv_results`` has no entry for a ticker/model combination.
    """
    metrics_df_output_new = {}

    for ticker in stocks_list:
        print(ticker)

        metrics_df = pd.DataFrame(index=list(pretty_metric_names.values()))
        metrics_df_output_new[ticker] = metrics_df

        for model_name in models:
            res = cv_results[ticker+"_model_"+model_name]
            # cross_validate stores one fit time per fold, which gives the real fold count.
            fold_count = len(res["fit_time"])

            print("Model: "+model_name)

            for prefix, heading, column_suffix in (
                ("train", "Training", "Train"),
                ("test", "Testing", "Test"),
            ):
                report, means = _summarise_cv_scores(res, prefix)

                print(f"{heading} scores for data split 1 - {fold_count}:")
                # The report ends with a newline, so this leaves one blank line after it.
                print(report)

                for pretty_name, mean_value in means.items():
                    metrics_df.loc[pretty_name, model_name+" Model "+column_suffix] = mean_value

            # Extra blank line between models.
            print()

        print("-"*50+"\n")

    return metrics_df_output_new

In [None]:
# Print the per-fold scores and keep the per-ticker frames of mean train/test scores.
metrics_output = print_calculated_metrics()

# Plots for insights about the data

In [None]:
# PLOT FOR SHOWING DIFFERENT DATA SPLITS FOR THE DIFFERENT STOCKS
# A dedicated splitter is used so the shared `tscv` object that the training and
# prediction functions rely on is not rebound by this cell.
plot_splitter = TimeSeriesSplit(n_splits=n_splits)

for ticker in stocks_list:
    data = stock_data[ticker]
    idx = data.index
    prices = data[main_col]

    # squeeze=False always returns a 2-D array of axes, also when n_splits == 1.
    fig, axes_grid = plt.subplots(n_splits, figsize=(16,20), squeeze=False)
    sub_plots = axes_grid[:, 0]

    folds = plot_splitter.split(data)
    for current_split, (ax, (train_index, test_index)) in enumerate(zip(sub_plots, folds), start=1):
        ax.plot(idx[train_index], prices.iloc[train_index], label=f"Training data {current_split}", color="blue")
        ax.plot(idx[test_index], prices.iloc[test_index], label=f"Test data {current_split}", color="red")
        # Same x-range on every panel so the growing training window is visible.
        ax.set_xlim(idx[0], idx[-1])
        ax.set_title(f"Train / test split {current_split} for {ticker}")
        ax.set_xlabel("Date")
        ax.set_ylabel(f"{main_col}")
        ax.legend()

    fig.tight_layout()

    #fig.savefig("data_splits_plots/"+ticker+".png")

    # Show each ticker's figure as it is produced (works with the inline backend,
    # unlike Figure.show(), and is not limited to the last figure).
    plt.show()

In [None]:
# PLOT FOR COMPARING ACTUAL TO THE DIFFERENT MODELS PREDICTED VALUES
# Created a new list excluding XGBoost, ADA, GBR and Bagging (These are based on DTR).
models_for_plot = ["LR", "DTR", "MLP", "XGBoost_LR", "ADA_LR", "Bagging_LR", "Bagging_MLP", "StackedRegressor"]
for ticker, data in stock_data.items():
    X, y = create_X_y_arrays(data=data, label_name="Label")
    # Only the target values of the last test fold are plotted as "Actual".
    _, _, _, y_test = create_X_y_train_test_split(X=X, y=y, current_stock=ticker)

    predictions = stock_predictions[ticker]
    # The prediction frame is indexed by the dates of the last test fold, so its
    # index is used directly instead of rebuilding the positions from array lengths.
    test_dates = predictions.index

    fig, ax = plt.subplots(figsize=(32,16))

    ax.plot(test_dates, y_test, color='purple', label='Actual', linewidth=4.0)
    for model_name in models_for_plot:
        ax.plot(test_dates, predictions[model_name+" Prediction"], label=model_name)
    ax.set_title(f'Actual vs Predicted, {ticker}')
    ax.set_xlabel('Date')
    ax.set_ylabel('Price')
    ax.legend()

    #fig.savefig("actual_predicted_plots/"+ticker+".png")
    plt.show()

In [None]:
# CORRELATION PLOT BETWEEN THE DIFFERENT STOCKS
# One column per ticker holding its main_col series, aligned on the date index.
adj_close_prices = pd.DataFrame(
    {symbol: frame[main_col] for symbol, frame in stock_data.items()}
)

# Pairwise correlation between the tickers' price series.
corr = adj_close_prices.corr()

sns.heatmap(
    corr,
    annot=True,
    cmap="coolwarm",
)
plt.title("Correlation between stocks")

#plt.savefig("correlation_plots/correlation_stocks.png")
plt.show()

# Saving the data

In [None]:
#for ticker in stocks_list:
#    metrics_output[ticker].to_csv("saved_metrics/stock_"+ticker+".csv")
#    stock_data[ticker].to_pickle("saved_data_pickle/stock_"+ticker+".pkl")
#    predicted_stock_data[ticker].to_csv("saved_predictions/predicted_data_"+ticker+".csv")

# Grid search to try and find optimal parameters

In [None]:
from sklearn.model_selection import GridSearchCV

# Ticker whose data is used for the search. Change this value to tune on another stock.
grid_search_ticker = "DNB.OL"

# Direct lookup: an unknown ticker fails immediately instead of silently leaving
# X_train / y_train undefined or left over from an earlier cell.
X, y = create_X_y_arrays(data=stock_data[grid_search_ticker], label_name="Label")
X_train, X_test, y_train, y_test = create_X_y_train_test_split(X=X, y=y, current_stock=grid_search_ticker)


param_grid = {
    'criterion': ["squared_error", "friedman_mse", "absolute_error", "poisson"],
    'max_depth': [1,2,3,4,5,6,7,8,9,10,15,20,50,75,100,250,500,750,1000],
    'splitter': ["best", "random"],
    'min_samples_leaf': [1, 2, 4],
    # None (use all features) replaces the former "auto" option, which newer
    # scikit-learn releases no longer accept for tree regressors and which meant
    # the same thing there.
    # NOTE(review): assumes exam_models.DTR() returns a scikit-learn DecisionTreeRegressor - confirm.
    'max_features': [None, "sqrt", "log2"],
    # NOTE(review): random_state is a seed rather than a tuning parameter, and
    # None makes the corresponding candidates non-reproducible - consider fixing it.
    'random_state': [None, 42]
}

# TESTED MAINLY ON DTR AND ENSEMBLE METHODS.
# n_jobs=-1 evaluates the candidates in parallel, as the cross_validate call in train_models does.
grid_search = GridSearchCV(estimator=exam_models.DTR(), param_grid=param_grid, cv=tscv, n_jobs=-1)

grid_search.fit(X_train, y_train)

# Print the best parameters and score
print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)