In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

2024-07-03 04:13:30.247834: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-03 04:13:30.247885: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-03 04:13:30.249351: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


/kaggle/input/finpros/FPT.csv
/kaggle/input/finpros/VIC.csv
/kaggle/input/finpros/PNJ.csv
/kaggle/input/finpros/MSN.csv


In [2]:
def load_and_preprocess_data(file_path):
    data = pd.read_csv(file_path)
    data['Date/Time'] = pd.to_datetime(data['Date/Time'])
    data = data.sort_values('Date/Time')
    data['SMA_5'] = data['Close'].rolling(window=5).mean()
    data['EMA_5'] = data['Close'].ewm(span=5, adjust=False).mean()
    data['Price_Change'] = data['Close'].shift(-1) - data['Close']
    data = data.dropna()
    return data

In [3]:
file_paths = ['/kaggle/input/finpros/FPT.csv',
              '/kaggle/input/finpros/VIC.csv',
              '/kaggle/input/finpros/PNJ.csv',
              '/kaggle/input/finpros/MSN.csv']
ticker_data = {file_path.split('/')[-1].split('.')[0]: load_and_preprocess_data(file_path) for file_path in file_paths}

In [4]:
def prepare_data(df, features, target, test_size=0.2, random_state=42):
    X = df[features].values
    y = df[target].values
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    
    scaler_X = StandardScaler()
    scaler_y = StandardScaler()
    
    X_train = scaler_X.fit_transform(X_train)
    X_test = scaler_X.transform(X_test)
    
    y_train = scaler_y.fit_transform(y_train.reshape(-1, 1))
    y_test = scaler_y.transform(y_test.reshape(-1, 1))
    
    return X_train, X_test, y_train, y_test, scaler_y

In [5]:
def train_linear_regression(X_train, y_train, X_test, y_test):
    model = LinearRegression()
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    
    return mse

In [6]:
def train_random_forest(X_train, y_train, X_test, y_test, scaler_y):
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train.ravel())
    
    y_pred = model.predict(X_test)
    y_pred = y_pred.reshape(-1, 1)
    y_pred = scaler_y.inverse_transform(y_pred)
    y_test_actual = scaler_y.inverse_transform(y_test)
    
    mse = mean_squared_error(y_test_actual, y_pred)
    
    return mse

In [7]:
# Tạo các sequences cho LSTM model
def create_sequences(X, y, time_steps=1):
    Xs, ys = [], []
    for i in range(len(X) - time_steps):
        Xs.append(X[i:(i + time_steps)])
        ys.append(y[i + time_steps])
    return np.array(Xs), np.array(ys)

In [8]:
def train_lstm(X_train, y_train, X_test, y_test, scaler_y, time_steps=10):
    X_train_seq, y_train_seq = create_sequences(X_train, y_train, time_steps)
    X_test_seq, y_test_seq = create_sequences(X_test, y_test, time_steps)
    
    model = Sequential()
    model.add(LSTM(50, return_sequences=True, input_shape=(time_steps, X_train_seq.shape[2])))
    model.add(Dropout(0.2))
    model.add(LSTM(50, return_sequences=False))
    model.add(Dropout(0.2))
    model.add(Dense(1))
    
    model.compile(optimizer='adam', loss='mean_squared_error')
    model.fit(X_train_seq, y_train_seq, epochs=50, batch_size=32, validation_split=0.2, verbose=0)
    
    y_pred_seq = model.predict(X_test_seq)
    y_pred = scaler_y.inverse_transform(y_pred_seq)
    y_test_actual = scaler_y.inverse_transform(y_test_seq)
    
    mse = mean_squared_error(y_test_actual, y_pred)
    
    return mse

In [9]:
features = ['Open', 'High', 'Low', 'Close', 'Volume', 'SMA_5', 'EMA_5']
target = 'Price_Change'

linear_results = {}
random_forest_results = {}
lstm_results = {}

for ticker, df in ticker_data.items():
    X_train, X_test, y_train, y_test, scaler_y = prepare_data(df, features, target)
    
    linear_results[ticker] = train_linear_regression(X_train, y_train, X_test, y_test)
    random_forest_results[ticker] = train_random_forest(X_train, y_train, X_test, y_test, scaler_y)
    lstm_results[ticker] = train_lstm(X_train, y_train, X_test, y_test, scaler_y)

  super().__init__(**kwargs)


[1m609/609[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


  super().__init__(**kwargs)


[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


  super().__init__(**kwargs)


[1m783/783[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step


  super().__init__(**kwargs)


[1m846/846[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step


In [10]:
print("Linear Regression Results:", linear_results)

Linear Regression Results: {'FPT': 0.8183147288351014, 'VIC': 0.8414866080176265, 'PNJ': 0.6129769835787839, 'MSN': 0.8183591786014576}


In [11]:
print("Random Forest Results:", random_forest_results)

Random Forest Results: {'FPT': 0.0038439401806865973, 'VIC': 0.038127656100330906, 'PNJ': 0.3051492191710746, 'MSN': 0.04089037565053573}


In [12]:
print("LSTM Results:", lstm_results)

LSTM Results: {'FPT': 0.0037370339945244015, 'VIC': 0.038007220390615465, 'PNJ': 0.5016347866365292, 'MSN': 0.046256256276829194}


# Nhận xét:
Sau quá trình thử nghiệm lựa chọn mô hình, nhận thấy mô hình Random Forest và mô hình học sâu LSTM có kết quả sai số nhỏ hơn Linear Regression, từ đó có thể lựa chọn mô hình Random Forest/LSTM để dự đoán giá cổ phiếu