<a href="https://colab.research.google.com/github/KillShotAK/UoS-Forex-Trading-Robot/blob/main/TTE_SPLIT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import wordnet as wn
from nltk.wsd import lesk
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch
import pickle

# Fetch the NLTK data packages this notebook relies on (presumably the
# tokenizer models for `nltk.word_tokenize` and the WordNet corpus for `lesk`).
for resource in ('punkt', 'wordnet'):
    nltk.download(resource)

def preprocess_article(article):
    """Reduce an article to a space-joined string of disambiguated sense words.

    The text is tokenized with `nltk.word_tokenize`; every token is passed to
    `lesk` with the whole token list as context. Tokens for which `lesk`
    yields nothing are dropped; for the rest, the part of the synset name
    before the first '.' is kept.

    Args:
        article: Raw article text. Anything that is not a `str` (for example
            a missing cell) is treated as empty.

    Returns:
        The kept sense words joined by single spaces, or "" when `article`
        is not a string.
    """
    if not isinstance(article, str):
        return ""

    tokens = nltk.word_tokenize(article)
    sense_words = []
    for token in tokens:
        synset = lesk(tokens, token)
        if synset:
            sense_words.append(synset.name().split('.')[0])
    return ' '.join(sense_words)

# Read the EUR/USD news articles from the Excel workbook
news_df = pd.read_excel('/content/EURUSD_news.xlsx')

# Add a cleaned copy of every article body (non-string cells become "")
news_df['processed_body'] = news_df['articleBody'].apply(preprocess_article)

# Sentiment name -> integer class id used for training
# NOTE(review): confirm these ids agree with the label order of the
# "yiyanghkust/finbert-tone" checkpoint loaded below.
label_map = {'positive': 2, 'neutral': 1, 'negative': 0}

# Placeholder labels: every row gets a randomly drawn sentiment (fixed seed),
# so they are for demonstration only and carry no real signal.
np.random.seed(42)
news_df['sentiment'] = np.random.choice(list(label_map), size=len(news_df))

# Numeric labels, both as a column and as a plain list for the split later on
news_df['label'] = news_df['sentiment'].map(label_map)
labels = news_df['label'].tolist()

# Pretrained tokenizer and sequence-classification model
tokenizer = BertTokenizer.from_pretrained("yiyanghkust/finbert-tone")
model = BertForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone")

# Run on the GPU when one is available, otherwise on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

def tokenize_function(examples):
    """Tokenize `examples` with the module-level `tokenizer`.

    Every sequence is truncated and padded to exactly 512 tokens.

    Args:
        examples: Text input forwarded unchanged to `tokenizer`.

    Returns:
        Whatever `tokenizer` returns for these settings.
    """
    encoded = tokenizer(
        examples,
        max_length=512,
        truncation=True,
        padding="max_length",
    )
    return encoded

# Hold out 20% of the processed articles (and their labels) for validation
train_texts, val_texts, train_labels, val_labels = train_test_split(
    news_df['processed_body'].tolist(),
    labels,
    test_size=0.2,
)

# Tokenize the training texts first, then the validation texts
train_encodings, val_encodings = (
    tokenize_function(texts) for texts in (train_texts, val_texts)
)

class ForexDataset(torch.utils.data.Dataset):
    """Dataset that pairs per-sample encodings with their labels.

    Args:
        encodings: Mapping from field name to a sequence indexable by sample
            position (one entry per sample).
        labels: Sequence of labels, one per sample; its length defines the
            dataset length.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per encoding field, followed by the label under 'labels'.
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the tokenized splits so the trainer can index them sample by sample
train_dataset = ForexDataset(encodings=train_encodings, labels=train_labels)
val_dataset = ForexDataset(encodings=val_encodings, labels=val_labels)

# Fine-tuning configuration: 3 epochs, batches of 16, 500 warm-up steps
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
)

# NOTE(review): no evaluation schedule is set in `training_args`; confirm
# whether `eval_dataset` is actually consumed during training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

# Predict sentiment
def predict_sentiment(text):
    """Return the predicted class id for `text`.

    Uses the module-level `tokenizer`, `model` and `device`. The input is
    truncated to at most 512 tokens before being passed to the model.

    Args:
        text: The text to classify.

    Returns:
        The index of the highest-scoring class as a Python int.
    """
    # The model may still be in training mode after fine-tuning; switch to
    # evaluation mode so dropout is disabled and predictions are deterministic.
    model.eval()

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        outputs = model(**inputs)

    # Softmax is monotonic, so the argmax of the raw logits is the same class.
    return torch.argmax(outputs.logits, dim=-1).item()

# Class id -> sentiment name, obtained by inverting `label_map`
reverse_label_map = {class_id: sentiment for sentiment, class_id in label_map.items()}

# Classify every processed article, then translate the ids back to names
news_df['predicted_label'] = news_df['processed_body'].apply(predict_sentiment)
news_df['predicted_sentiment'] = news_df['predicted_label'].map(reverse_label_map)

# Keep only the columns wanted in the final output
output_df = news_df[
    ['title', 'articleBody', 'Date', 'predicted_sentiment']
]

def save_dataframe(df, name):
    """Write `df` to `<name>.xlsx` (without the index) and to `<name>.pkl`.

    Args:
        df: Object providing `to_excel`; it is also pickled as-is.
        name: Output path without extension, shared by both files.
    """
    excel_path = f'{name}.xlsx'
    pickle_path = f'{name}.pkl'

    df.to_excel(excel_path, index=False)

    with open(pickle_path, 'wb') as handle:
        pickle.dump(df, handle)

# Persist the complete prediction table before it is split
save_dataframe(output_df, 'forex_sentiment_full')

# 70% train; the remaining 30% is halved into test and validation
train_df, temp_df = train_test_split(output_df, test_size=0.3, random_state=42)
test_df, val_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Persist each split under its own file name
for split_df, file_stem in (
    (train_df, 'forex_sentiment_train'),
    (test_df, 'forex_sentiment_test'),
    (val_df, 'forex_sentiment_val'),
):
    save_dataframe(split_df, file_stem)

# Report how many rows ended up in each set
print("Data has been processed and saved to both Excel and Pickle files.")
for set_name, frame in (
    ("Full", output_df),
    ("Train", train_df),
    ("Test", test_df),
    ("Validation", val_df),
):
    print(f"{set_name} set size: {len(frame)}")

# Sanity check: reload the full-dataset Pickle file written above and show
# its first rows
with open('forex_sentiment_full.pkl', 'rb') as f:
    loaded_full_df = pickle.load(f)
print(loaded_full_df.head())

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

Step,Training Loss
500,1.4527
1000,1.1952


Data has been processed and saved to both Excel and Pickle files.
Full set size: 7413
Train set size: 5189
Test set size: 1112
Validation set size: 1112
                                               title  \
0  EUR/USD pulls back on Friday, still heads for ...   
1  EUR/USD: Fed should help for another visit to ...   
2  EUR/USD: Wider US-DE yield spreads and risk-of...   
3  Forex Today: A hectic weeks kicks off with a s...   
4      EUR/USD probing lows near 1.1720 ahead of IFO   

                                         articleBody                  Date  \
0  US dollar recovers ground as Euro gets hits by...  2018-09-21T16:34:04Z   
1  Next week, the Federal Reserve will meet. Acco...  2018-09-21T17:41:48Z   
2  The EUR/USD fell on Friday, tracking the 10-ye...  2018-09-24T03:44:36Z   
3  Markets opened quietly this week, belying the ...  2018-09-24T05:19:56Z   
4  The pair has started the week on a soft footin...  2018-09-24T07:02:33Z   

  predicted_sentiment  
0            posi