In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn
import matplotlib.pyplot as plt
%matplotlib inline

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

**shift(0,30,60,90,120,150,180,210,240,270,300,330,360)**

In [None]:
# Read the dataset, drop the "Unnamed: 0" column (presumably a saved index —
# TODO confirm) and parse the ISO-formatted Date column.
df = (
    pd.read_csv("UK_research2.csv", encoding="utf-8")
    .drop(columns="Unnamed: 0")
    .assign(Date=lambda frame: pd.to_datetime(frame["Date"], format="%Y-%m-%d"))
)
# Disabled lag experiment: shifts the headline and market columns down by 30
# rows; the rows left empty by the shift are removed by the dropna() below.
#df[['Headline', 'gold', 'oil', 'bond', 'cad', 'cny', 'eur', 'jpy', 'usd']] = df[['Headline', 'gold', 'oil', 'bond', 'cad', 'cny', 'eur', 'jpy', 'usd']].shift(30)
# Discard incomplete rows and renumber the index from 0.
df = df.dropna().reset_index(drop=True)
df

In [None]:
from sklearn.preprocessing import StandardScaler

# Numeric feature columns that are standardised (zero mean, unit variance) in place.
market_columns = ['gold', 'oil', 'bond', 'cad', 'cny', 'eur', 'jpy', 'usd']

# NOTE(review): the scaler is fitted on every row before any train/test split,
# so test-set statistics leak into the scaled training features. Consider
# fitting on the training rows only.
scaler = StandardScaler()
df[market_columns] = scaler.fit_transform(df[market_columns])
df

In [None]:
# Column dtypes and non-null counts after cleaning and scaling.
df.info()

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
stopwords_object = stopwords.words('english')
df["Headline"] = df["Headline"].str.lower()

# Built once instead of on every call: set membership is O(1) (the NLTK list
# is scanned linearly), and the stemmer/lemmatizer objects are stateless
# across headlines.
_stopword_set = frozenset(stopwords_object)
_stemmer = PorterStemmer()
_lemmatizer = WordNetLemmatizer()


def preprocessing_text(data):
    """Normalise one headline into a space-separated string of word stems.

    Steps: lower-case, replace every non-letter character with a space,
    tokenize, drop English stop words, Porter-stem, then WordNet-lemmatize.

    Parameters
    ----------
    data : str or scalar
        Raw headline. Missing values (None / NaN / NA) yield an empty string
        instead of the literal token "nan"; any other non-string value is
        converted with ``str``.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ("" if nothing is left).
    """
    if not isinstance(data, str):
        if data is None or (pd.api.types.is_scalar(data) and pd.isna(data)):
            return ""
        data = str(data)

    # Lower-casing here keeps the function correct on its own: the NLTK
    # stop-word list is lower-case, so mixed-case input would otherwise
    # slip through the filter.
    letters_only = re.sub("[^a-zA-Z]", " ", data.lower())

    tokens = word_tokenize(letters_only)
    word_tokens = [w for w in tokens if w not in _stopword_set]

    # NOTE(review): lemmatizing already-stemmed tokens is kept to preserve the
    # existing feature vocabulary, but stems are often not dictionary words,
    # so the lemmatizer is likely close to a no-op here.
    stem_words = [_stemmer.stem(w) for w in word_tokens]
    lem_words = [_lemmatizer.lemmatize(w) for w in stem_words]

    return " ".join(lem_words)

In [None]:
# Store the cleaned version of every headline in a new column; the
# original text in "Headline" is left untouched.
df["Headline2"] = df["Headline"].apply(preprocessing_text)
df

## **TF-IDF**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer with scikit-learn's default settings.
tfidf = TfidfVectorizer()

In [None]:
# Learn the vocabulary and IDF weights from all cleaned headlines and
# vectorise them in one step.
# NOTE(review): fitting happens before any train/test split, so the
# vocabulary/IDF weights also see the rows later used for testing.
tfidf_result = tfidf.fit_transform(df["Headline2"])
# Densify the result and keep one TF-IDF vector (1-D array) per row.
df["tfidf_Headline"] = list(tfidf_result.toarray())

In [None]:
# Expand the per-row TF-IDF vectors into one numeric column per term.
# Assigning the stacked 2-D matrix to a single DataFrame column fails in
# pandas, and a column that holds arrays is an object column that scikit-learn
# estimators cannot fit on, so the vectors are spread into real columns.
tfidf_matrix = np.array(df["tfidf_Headline"].tolist())
tfidf_features = pd.DataFrame(
    tfidf_matrix,
    index=df.index,
    # Positional names avoid clashes between vocabulary terms and the
    # market columns (e.g. a token that is itself "gold" or "oil").
    columns=[f"tfidf_{i}" for i in range(tfidf_matrix.shape[1])],
)
X = pd.concat(
    [tfidf_features, df[['gold', 'oil', 'bond', 'cad', 'cny', 'eur', 'jpy', 'usd']]],
    axis=1,
)
y = df["ETF"].values

# 80 / 20 random split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### **Random Forest**

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score

# Fit a random forest on the training split and score it on the held-out rows.
rf = RandomForestRegressor(n_estimators=100, max_depth=20, min_samples_split=2, random_state=42)
rf.fit(X_train, y_train)

rf_predictions = rf.predict(X_test)

# RMSE is computed as sqrt(MSE): the `squared=False` shortcut was deprecated
# in scikit-learn 1.4 and removed in 1.6, while this form works on every version.
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_predictions))
print(f"Random Forest RMSE: {rf_rmse}")

rf_mae = mean_absolute_error(y_test, rf_predictions)
print(f"Random Forest MAE: {rf_mae}")

rf_mape = mean_absolute_percentage_error(y_test, rf_predictions)
print(f"Random Forest MAPE: {rf_mape}")

r2 = r2_score(y_test, rf_predictions)
print(f'Random Forest R제곱: {r2}')

In [None]:
# Importance the fitted forest assigned to each input column, one per line.
feature_importances = rf.feature_importances_

print(
    *(f"{name}: {score}" for name, score in zip(X_train.columns, feature_importances)),
    sep="\n",
)

In [None]:
import matplotlib.pyplot as plt

# Overlay the held-out targets and the random-forest predictions on one
# wide axis; the x axis is simply the position within the test split.
fig, ax = plt.subplots(figsize=(20, 3))
ax.plot(y_test, label='Actual', color='green')
ax.plot(rf_predictions, label='Predicted', color='orange', linestyle='-')
ax.set_title('M+1')
ax.legend()
plt.show()

### **LSTM**

In [None]:
from tensorflow import keras
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.layers import PReLU

# 80 / 10 / 10 train / validation / test split (the 20 % hold-out is halved).
# This overwrites the X_train / X_test / y_* names of the earlier split.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Add a trailing axis of length 1: inputs become (rows, n_columns, 1), so
# every input column is fed to the LSTM as one step carrying a single value.
# Targets become column vectors of shape (rows, 1).
X_lstm_train = X_train.values[..., np.newaxis]
y_lstm_train = y_train.reshape(-1, 1)

X_lstm_val = X_val.values[..., np.newaxis]
y_lstm_val = y_val.reshape(-1, 1)

X_lstm_test = X_test.values[..., np.newaxis]
y_lstm_test = y_test.reshape(-1, 1)

# One LSTM layer followed by two PReLU-activated dense layers and a linear
# single-unit regression head.
lstm_model = Sequential([
    LSTM(784, input_shape=X_lstm_train.shape[1:]),
    Dense(256),
    PReLU(),
    Dense(64),
    PReLU(),
    Dense(1, activation='linear'),
])
lstm_model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mape'])

In [None]:
# Layer-by-layer overview of the network and its parameter counts.
lstm_model.summary()

In [None]:
# Train for a fixed 300 epochs while tracking loss/MAPE on the validation split.
history = lstm_model.fit(X_lstm_train, y_lstm_train, epochs=300, batch_size=32,
                         validation_data=(X_lstm_val, y_lstm_val),verbose=1)

lstm_predictions = lstm_model.predict(X_lstm_test)

# RMSE is computed as sqrt(MSE): the `squared=False` shortcut was deprecated
# in scikit-learn 1.4 and removed in 1.6, while this form works on every version.
lstm_rmse = np.sqrt(mean_squared_error(y_lstm_test, lstm_predictions))
print(f"LSTM RMSE: {lstm_rmse}")

lstm_mae = mean_absolute_error(y_lstm_test, lstm_predictions)
print(f"LSTM MAE: {lstm_mae}")

lstm_mape = mean_absolute_percentage_error(y_lstm_test, lstm_predictions)
print(f"LSTM MAPE: {lstm_mape}")

r2 = r2_score(y_lstm_test, lstm_predictions)
print(f'LSTM R제곱: {r2}')

### **Hybrid**

In [None]:
# Same LSTM pipeline as above, restricted to five of the market columns
# (no headline features).
X = df[['gold', 'oil', 'cny', 'jpy', 'usd']]
y = df["ETF"].values

# 80 / 10 / 10 train / validation / test split (the 20 % hold-out is halved).
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


# Inputs are reshaped to (rows, n_columns, 1); targets to (rows, 1).
X_lstm_train = np.reshape(X_train.values, (X_train.shape[0], X_train.shape[1], 1))
y_lstm_train = np.reshape(y_train, (y_train.shape[0], 1))

X_lstm_val = np.reshape(X_val.values, (X_val.shape[0], X_val.shape[1], 1))
y_lstm_val = np.reshape(y_val, (y_val.shape[0], 1))

X_lstm_test = np.reshape(X_test.values, (X_test.shape[0], X_test.shape[1], 1))
y_lstm_test = np.reshape(y_test, (y_test.shape[0], 1))

# Rebinds `lstm_model` to a fresh, untrained network with the same layout.
lstm_model = Sequential()
lstm_model.add(LSTM(784, input_shape=(X_lstm_train.shape[1], X_lstm_train.shape[2])))
lstm_model.add(Dense(256))
lstm_model.add(PReLU())
lstm_model.add(Dense(64))
lstm_model.add(PReLU())
lstm_model.add(Dense(1, activation='linear'))
lstm_model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mape'])

history = lstm_model.fit(X_lstm_train, y_lstm_train, epochs=300, batch_size=32,
                         validation_data=(X_lstm_val, y_lstm_val),verbose=1)

lstm_predictions = lstm_model.predict(X_lstm_test)

# RMSE is computed as sqrt(MSE): the `squared=False` shortcut was deprecated
# in scikit-learn 1.4 and removed in 1.6, while this form works on every version.
lstm_rmse = np.sqrt(mean_squared_error(y_lstm_test, lstm_predictions))
print(f"LSTM RMSE: {lstm_rmse}")

lstm_mae = mean_absolute_error(y_lstm_test, lstm_predictions)
print(f"LSTM MAE: {lstm_mae}")

lstm_mape = mean_absolute_percentage_error(y_lstm_test, lstm_predictions)
print(f"LSTM MAPE: {lstm_mape}")

r2 = r2_score(y_lstm_test, lstm_predictions)
print(f'LSTM R제곱: {r2}')

## **BERT**

In [None]:
# %pip (unlike !pip) installs into the environment of the running kernel,
# so the packages are importable from this notebook.
%pip install pandas transformers torch

In [None]:
from transformers import BertTokenizer, BertModel
import torch

# Pretrained uncased BERT tokenizer and encoder, loaded by checkpoint name
# (fetched from the Hugging Face hub unless already cached locally).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Copy of the "Headline" text column that is used as the BERT input.
df['headline']=df["Headline"]

In [None]:
# Encode every headline to BERT token ids, including the special tokens.
# Truncating to the encoder's position limit guarantees that an unusually
# long headline cannot overflow BERT's position embeddings in the forward pass.
tokenized_headlines = df['headline'].apply(
    lambda text: tokenizer.encode(
        text,
        add_special_tokens=True,
        truncation=True,
        max_length=model.config.max_position_embeddings,
    )
)

# Right-pad every sequence with id 0 up to the longest headline so they stack
# into one rectangular tensor. The attention mask built afterwards treats
# id 0 as padding, so the pad value must stay 0.
max_len = max(map(len, tokenized_headlines))
padded_headlines = torch.tensor(
    [ids + [0] * (max_len - len(ids)) for ids in tokenized_headlines]
)

In [None]:
# 1.0 for every non-zero token id, 0.0 for the zero padding added above.
attention_mask = (padded_headlines != 0).float()

In [None]:
# Inference only: gradient tracking is switched off for the forward pass.
# NOTE(review): every headline is pushed through BERT in a single batch;
# for a large dataset this may exhaust memory — consider mini-batches.
with torch.no_grad():
    outputs = model(padded_headlines, attention_mask=attention_mask)

In [None]:
# Hidden state at position 0 of every sequence (the leading special token,
# [CLS] for BERT), used as one fixed-size embedding per headline.
bert_embeddings = outputs.last_hidden_state[:, 0, :]

In [None]:
# Sanity check: (number of headlines, hidden size).
bert_embeddings.shape

In [None]:
# Convert the embedding tensor to nested Python lists and store one
# embedding list per row.
bert_headline = bert_embeddings.tolist()
df['bert_headline'] = bert_headline
df

### **Random Forest**

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score


# Expand the per-row BERT embeddings into one numeric column per dimension.
# Assigning the stacked 2-D matrix to a single DataFrame column fails in
# pandas, and a column that holds lists is an object column that scikit-learn
# estimators cannot fit on, so the embeddings are spread into real columns.
bert_matrix = np.array(df['bert_headline'].tolist())
bert_features = pd.DataFrame(
    bert_matrix,
    index=df.index,
    columns=[f"bert_{i}" for i in range(bert_matrix.shape[1])],
)
X = pd.concat(
    [bert_features, df[['gold', 'oil', 'bond', 'cad', 'cny', 'eur', 'jpy', 'usd']]],
    axis=1,
)
y = df["ETF"].values

# 80 / 20 random split; note the seed (20) differs from the TF-IDF run (42).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=20)

rf2 = RandomForestRegressor(n_estimators=100, max_depth=20, min_samples_split=2, random_state=20)
rf2.fit(X_train, y_train)

rf2_predictions = rf2.predict(X_test)

# RMSE is computed as sqrt(MSE): the `squared=False` shortcut was deprecated
# in scikit-learn 1.4 and removed in 1.6, while this form works on every version.
rf_rmse = np.sqrt(mean_squared_error(y_test, rf2_predictions))
print(f"Random Forest RMSE: {rf_rmse}")

rf_mae = mean_absolute_error(y_test, rf2_predictions)
print(f"Random Forest MAE: {rf_mae}")

rf_mape = mean_absolute_percentage_error(y_test, rf2_predictions)
print(f"Random Forest MAPE: {rf_mape}")

r2 = r2_score(y_test, rf2_predictions)
print(f'Random Forest R제곱: {r2}')

In [None]:
# Importance the fitted forest assigned to each input column, one per line.
feature_importances = rf2.feature_importances_

print(
    *(f"{name}: {score}" for name, score in zip(X_train.columns, feature_importances)),
    sep="\n",
)

### **LSTM**

In [None]:
from keras.layers import PReLU

from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import LSTM, Dense

# Expand the per-row BERT embeddings into one numeric column per dimension.
# Assigning the stacked 2-D matrix to a single DataFrame column fails in
# pandas, and an object column of lists cannot be converted to a numeric
# tensor for Keras, so the embeddings are spread into real columns.
bert_matrix = np.array(df['bert_headline'].tolist())
bert_features = pd.DataFrame(
    bert_matrix,
    index=df.index,
    columns=[f"bert_{i}" for i in range(bert_matrix.shape[1])],
)
X = pd.concat(
    [bert_features, df[['gold', 'oil', 'bond', 'cad', 'cny', 'eur', 'jpy', 'usd']]],
    axis=1,
)
y = df["ETF"].values

# 80 / 10 / 10 train / validation / test split (the 20 % hold-out is halved).
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Inputs are reshaped to (rows, n_columns, 1), so every input column is fed
# to the LSTM as one step carrying a single value; targets to (rows, 1).
X_lstm_train = np.reshape(X_train.values, (X_train.shape[0], X_train.shape[1], 1))
y_lstm_train = np.reshape(y_train, (y_train.shape[0], 1))

X_lstm_val = np.reshape(X_val.values, (X_val.shape[0], X_val.shape[1], 1))
y_lstm_val = np.reshape(y_val, (y_val.shape[0], 1))

X_lstm_test = np.reshape(X_test.values, (X_test.shape[0], X_test.shape[1], 1))
y_lstm_test = np.reshape(y_test, (y_test.shape[0], 1))

# One LSTM layer followed by two PReLU-activated dense layers and a linear
# single-unit regression head.
lstm2_model = Sequential()
lstm2_model.add(LSTM(784, input_shape=(X_lstm_train.shape[1], X_lstm_train.shape[2])))
lstm2_model.add(Dense(256))
lstm2_model.add(PReLU())
lstm2_model.add(Dense(64))
lstm2_model.add(PReLU())
lstm2_model.add(Dense(1, activation='linear'))
lstm2_model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mape'])

In [None]:
# Layer-by-layer overview of the network and its parameter counts.
lstm2_model.summary()

In [None]:
# Train for a fixed 300 epochs while tracking loss/MAPE on the validation split.
history = lstm2_model.fit(X_lstm_train, y_lstm_train, epochs=300, batch_size=32,
                         validation_data=(X_lstm_val, y_lstm_val),verbose=1)

lstm_predictions = lstm2_model.predict(X_lstm_test)

# RMSE is computed as sqrt(MSE): the `squared=False` shortcut was deprecated
# in scikit-learn 1.4 and removed in 1.6, while this form works on every version.
lstm_rmse = np.sqrt(mean_squared_error(y_lstm_test, lstm_predictions))
print(f"LSTM RMSE: {lstm_rmse}")

lstm_mae = mean_absolute_error(y_lstm_test, lstm_predictions)
print(f"LSTM MAE: {lstm_mae}")

lstm_mape = mean_absolute_percentage_error(y_lstm_test, lstm_predictions)
print(f"LSTM MAPE: {lstm_mape}")

r2 = r2_score(y_lstm_test, lstm_predictions)
print(f'LSTM R제곱: {r2}')

### **Hybrid**

In [None]:
# Same LSTM pipeline as above, restricted to five of the market columns
# (no headline features).
X = df[['gold', 'oil', 'cny', 'jpy', 'usd']]
y = df["ETF"].values

# 80 / 10 / 10 train / validation / test split (the 20 % hold-out is halved).
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Inputs are reshaped to (rows, n_columns, 1); targets to (rows, 1).
X_lstm_train = np.reshape(X_train.values, (X_train.shape[0], X_train.shape[1], 1))
y_lstm_train = np.reshape(y_train, (y_train.shape[0], 1))

X_lstm_val = np.reshape(X_val.values, (X_val.shape[0], X_val.shape[1], 1))
y_lstm_val = np.reshape(y_val, (y_val.shape[0], 1))

X_lstm_test = np.reshape(X_test.values, (X_test.shape[0], X_test.shape[1], 1))
y_lstm_test = np.reshape(y_test, (y_test.shape[0], 1))

# Rebinds `lstm2_model` to a fresh, untrained network with the same layout.
lstm2_model = Sequential()
lstm2_model.add(LSTM(784, input_shape=(X_lstm_train.shape[1], X_lstm_train.shape[2])))
lstm2_model.add(Dense(256))
lstm2_model.add(PReLU())
lstm2_model.add(Dense(64))
lstm2_model.add(PReLU())
lstm2_model.add(Dense(1, activation='linear'))
lstm2_model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mape'])

history = lstm2_model.fit(X_lstm_train, y_lstm_train, epochs=300, batch_size=32,
                         validation_data=(X_lstm_val, y_lstm_val),verbose=1)

lstm_predictions = lstm2_model.predict(X_lstm_test)

# RMSE is computed as sqrt(MSE): the `squared=False` shortcut was deprecated
# in scikit-learn 1.4 and removed in 1.6, while this form works on every version.
lstm_rmse = np.sqrt(mean_squared_error(y_lstm_test, lstm_predictions))
print(f"LSTM RMSE: {lstm_rmse}")

lstm_mae = mean_absolute_error(y_lstm_test, lstm_predictions)
print(f"LSTM MAE: {lstm_mae}")

lstm_mape = mean_absolute_percentage_error(y_lstm_test, lstm_predictions)
print(f"LSTM MAPE: {lstm_mape}")

r2 = r2_score(y_lstm_test, lstm_predictions)
print(f'LSTM R제곱: {r2}')