<a href="https://colab.research.google.com/github/JurairatRod/DADS6003_trading-model/blob/main/DADS6003_trading_ml.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1 : Find stock ratio

In [None]:
import yfinance as yf
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

# Ticker universe and look-back window used for the Yahoo Finance downloads below.
# Alternative universe kept for reference: ['TSLA', 'GOOGL', 'MSFT']
symbols = [
    "ERW.BK",
    "TISCO.BK",
    "SPRC.BK",
]
start_date, end_date = "2013-01-01", "2023-10-31"
def find_returns_rate(symbol, start_date, end_date):
    """Estimate a non-negative expected return (in percent) for one symbol.

    Downloads the price history with yfinance, fits a linear regression that
    predicts ``Close`` from the previous row's close (``Lag1``), feeds the most
    recent close into the model to obtain a next-step price, and converts the
    difference into a percentage of the most recent close.

    Parameters
    ----------
    symbol : ticker passed to ``yf.download``.
    start_date, end_date : passed to ``yf.download`` as ``start`` / ``end``.

    Returns
    -------
    The predicted percentage change relative to the latest close, or ``0``
    when that change is negative.

    Raises
    ------
    ValueError
        If the download yields too few rows to fit the model.
    """
    data = yf.download(symbol, start=start_date, end=end_date)
    if data.empty:
        raise ValueError(
            f"No price data returned for {symbol} between {start_date} and {end_date}"
        )

    # Feature engineering: the previous row's close is the single predictor.
    data['Lag1'] = data['Close'].shift(1)

    # The first row has no predecessor (Lag1 is NaN); drop it and any other incomplete rows.
    data = data.dropna()
    if len(data) < 2:
        # train_test_split needs at least one row on each side of the split.
        raise ValueError(
            f"Not enough price history for {symbol} between {start_date} and {end_date}"
        )

    X = data[['Lag1']]
    y = data['Close']

    # Same seeded 80/20 split as before; only the training part is used, so the
    # unused test-set prediction has been dropped.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=42)

    lr_model = LinearRegression()
    lr_model.fit(X_train, y_train)

    # Predict from the latest close. A DataFrame with the training column name
    # is used so the input matches what the model was fitted on (a bare array
    # makes scikit-learn warn about missing feature names).
    latest_close = data['Close'].iloc[-1]
    latest_data_input = pd.DataFrame({'Lag1': [latest_close]})
    future_price = lr_model.predict(latest_data_input)[0]

    # Percentage change relative to the latest close.
    return_on_investment = (future_price - latest_close) / latest_close * 100

    # Negative expectations are clamped to 0 so the symbol gets no allocation.
    return max(return_on_investment, 0)


In [None]:
# Expected (non-negative) return rate per symbol, as produced by find_returns_rate.
rate = {symbol: find_returns_rate(symbol, start_date, end_date) for symbol in symbols}

# Normalise the rates into budget weights. The total is computed once instead
# of once per symbol.
total_rate = sum(rate.values())
if total_rate == 0:
    # Every rate was clamped to 0, so there is nothing to normalise by; fail
    # with an explicit message instead of a bare ZeroDivisionError.
    raise ValueError(
        "No symbol has a positive predicted return; cannot compute allocation ratios."
    )

ratio = {symbol: value / total_rate for symbol, value in rate.items()}

print(ratio)

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
{'ERW.BK': 0.0, 'TISCO.BK': 0.18126090168863104, 'SPRC.BK': 0.8187390983113689}


# 2 : Prediction models

## Import libraries and data

In [None]:
import yfinance as yf
import pandas as pd
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Create a stratified 10-fold cross-validator (shuffled with a fixed seed).
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Candidate classifiers. The random forest is seeded so that model selection
# and the resulting trading signals are reproducible across runs; without a
# seed its bootstrap / feature sampling changes on every execution.
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(random_state=42)
}

# Hyperparameter grids for grid search, keyed by the same names as `models`.
param_grids = {
    'Logistic Regression': {'C': [0.001, 0.01, 0.1, 1, 10, 100]},
    'Random Forest': {'n_estimators': [50, 100, 150], 'max_depth': [None, 10, 20, 30]}
}


## Define the data-loading helper function

In [None]:
##### get data #####
def get_data(symbol,start_date,end_date):
    """Download prices for ``symbol`` and add a lagged close and a trading signal.

    Returns the yfinance frame with two extra columns:

    * ``Lag1``   - the previous row's ``Close``.
    * ``Signal`` - ``1`` (Buy) when ``Close`` is above ``Lag1``, ``-1`` (Sell)
      when it is below, ``0`` (Hold) when the two are equal.

    Rows containing missing values (including the first row, which has no
    previous close) are removed.
    """
    prices = yf.download(symbol, start=start_date, end=end_date)

    # Previous close as the only engineered feature; incomplete rows are dropped.
    prices['Lag1'] = prices['Close'].shift(1)
    prices = prices.dropna()

    # Three-way label built from two boolean masks: +1 up, -1 down, 0 unchanged.
    went_up = prices['Close'] > prices['Lag1']
    went_down = prices['Close'] < prices['Lag1']
    prices['Signal'] = went_up.astype('int64') - went_down.astype('int64')

    return prices

## Train and find the best model for each stock

In [None]:
# Training window. NOTE: this re-binds start_date / end_date, which section 1
# set to a longer window; re-running section 1 after this cell will therefore
# use these values instead.
start_date, end_date = "2023-01-01", "2023-10-31"

# Prediction window. It starts one day early because the first downloaded row
# is dropped once the lagged close is computed.
start_date_test = "2023-10-31"
# One day past the last wanted date (presumably because yfinance treats `end`
# as exclusive - confirm).
end_date_test = "2024-01-01"

# Total budget to distribute across the symbols.
budget = 10_000

In [None]:
# For every symbol: tune both candidate models, keep the more accurate one,
# retrain it on the full training window, predict signals for the prediction
# window and simulate trading with that symbol's share of the budget.
after_budget = []  # final portfolio value per symbol, in `symbols` order
for i in symbols:
    ################ Get Data ################
    # Reuse get_data() instead of repeating the download / lag / label steps inline.
    data = get_data(i, start_date, end_date)

    # Define features and labels
    X = data[['Lag1']]
    y = data['Signal']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=7)

    ################ Train model ################
    results = {}
    for model_name, model in models.items():

        grid_search = GridSearchCV(model, param_grid=param_grids[model_name], cv=cv, scoring='accuracy')

        # Fit the grid search on the training data
        grid_search.fit(X_train, y_train)

        # Score the tuned model on the held-out split. Previously the estimator
        # was re-fitted on folds of the test split via cross_val_predict, which
        # did not measure the model that had actually been trained.
        accuracy = accuracy_score(y_test, grid_search.predict(X_test))

        results[model_name] = {
            'best_params': grid_search.best_params_,
            'accuracy': accuracy,
            # Keep the tuned estimator so the reported best parameters are the
            # ones actually used for trading below.
            'estimator': grid_search.best_estimator_,
        }

    # Select the best model based on accuracy
    best_model_name = max(results, key=lambda key: results[key]['accuracy'])
    # Use the tuned estimator, not models[best_model_name]: the latter is the
    # untuned default instance shared by every symbol.
    best_model = results[best_model_name]['estimator']

    print(f"################ BEST MODEL for {i} stock ################")
    print(f"\nBest Model: {best_model_name}")
    print(f"\nBest parameter: {results[best_model_name]['best_params']}")
    print(f"\nAccuracy: {results[best_model_name]['accuracy']}")

    ################ Train and Fit ################
    # Train the best model on the entire dataset
    best_model.fit(X, y)

    # Predict the trading signals for the specified date range
    prediction_data = yf.download(i, start=start_date_test, end=end_date_test)
    prediction_data['Lag1'] = prediction_data['Close'].shift(1)
    prediction_data = prediction_data.dropna()
    prediction_signals = best_model.predict(prediction_data[['Lag1']])

    # Create a DataFrame with the trading signals. .copy() gives an independent
    # frame so the column assignments below do not write into a slice of
    # prediction_data.
    trading_signals = prediction_data[['Close']].copy()
    trading_signals['Signal'] = prediction_signals
    # Signal of the day before; 9 is a sentinel for the first row, which has none.
    trading_signals['Lag1'] = trading_signals['Signal'].shift(1).fillna(9)
    # Previous day's signal + today's signal (a Buy on the first row gives 10).
    trading_signals['Final_signal'] = trading_signals['Signal'] + trading_signals['Lag1']

    # Calculate the investment timeline and remaining money
    initial_investment = budget*ratio[i]
    remaining_money = initial_investment  # tracks the current portfolio value
    stocks = 0
    investment_timeline = []
    sig = ''
    print(trading_signals)

    print(f"\nInitial investment: {initial_investment}")

    if initial_investment == 0:
        print(f"\nWe do not invesment {i}")
    else:
        for date, signal in trading_signals.iterrows():
            close = signal['Close']
            if signal['Signal'] == 1 and (signal['Final_signal'] <= 1 or signal['Final_signal'] == 10):  # Buy
                # Buy signal that does not follow another Buy (or is the first row):
                # convert the whole portfolio value into shares at today's close.
                sig = 'Buy'
                stocks = remaining_money/close
                remaining_money = stocks*close
            elif (signal['Signal'] == 0) or (signal['Signal'] == 1 and signal['Final_signal'] >= 1):  # Hold
                # Hold signal, or a Buy that follows a Buy: keep the position and
                # mark it to today's close if any shares are held.
                sig = 'Hold'
                if stocks != 0:
                    remaining_money = stocks*close
            elif signal['Signal'] == -1:  # Sell
                # NOTE(review): the position is closed at the previously recorded
                # value, not at today's close, whereas Buy uses today's close.
                # Confirm this asymmetry is intended.
                sig = 'Sell'
                stocks = 0
            investment_timeline.append((date, sig, remaining_money, stocks, stocks*close))

        # Print the investment timeline and remaining money
        print("Investment Daily Timeline:")
        for entry in investment_timeline:
            print(f"{entry[0]} - {entry[1]} - Remaining Money: {entry[2]:.2f} - No. of Stocks: {entry[3]} - Value: {entry[4]}")

        # Label the result with the last simulated date instead of a hard-coded
        # one, so it stays correct when the prediction window changes.
        last_date = trading_signals.index[-1]
        print(f"\nRemaining Money on {last_date:%d %B %Y}: {remaining_money:.2f}")
    after_budget.append(remaining_money)

[*********************100%%**********************]  1 of 1 completed
################ BEST MODEL for ERW.BK stock ################

Best Model: Logistic Regression

Best parameter: {'C': 100}

Accuracy: 0.43902439024390244
[*********************100%%**********************]  1 of 1 completed
            Close  Signal  Lag1  Final_signal
Date                                         
2023-11-01   5.15      -1   9.0           8.0
2023-11-02   5.20      -1  -1.0          -2.0
2023-11-03   5.15      -1  -1.0          -2.0
2023-11-06   5.20      -1  -1.0          -2.0
2023-11-07   5.25      -1  -1.0          -2.0
2023-11-08   5.20      -1  -1.0          -2.0
2023-11-09   5.10      -1  -1.0          -2.0
2023-11-10   5.00      -1  -1.0          -2.0
2023-11-13   5.10      -1  -1.0          -2.0
2023-11-14   5.15      -1  -1.0          -2.0
2023-11-15   5.30      -1  -1.0          -2.0
2023-11-16   5.40      -1  -1.0          -2.0
2023-11-17   5.45      -1  -1.0          -2.0
2023-11-20   5.50 

In [None]:
# Net result across all symbols compared with the starting budget.
net_result = sum(after_budget) - budget

if net_result > 0:
    print(f"From our budget {budget:,.0f} we gain {net_result:,.2f}")
elif net_result == 0:
    print(f"From our budget {budget:,.0f} we do not gain or loss anything.")
elif net_result < 0:
    print(f"From our budget {budget:,.0f} we loss {net_result:,.2f}")

From our budget 10,000 we gain 1,246.04
