<a href="https://colab.research.google.com/github/KillShotAK/UoS-Forex-Trading-Robot/blob/main/wsdprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
!pip install transformers
!pip install nltk
!pip install pandas
!pip install openpyxl



In [7]:
import pandas as pd
import nltk
from nltk.corpus import wordnet as wn
from nltk.wsd import lesk

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('wordnet')

def preprocess_article(article):
    if not isinstance(article, str):
        return ""
    tokens = nltk.word_tokenize(article)
    disambiguated_tokens = [lesk(tokens, token) for token in tokens]
    # Remove None values
    disambiguated_tokens = [token for token in disambiguated_tokens if token]
    return ' '.join([token.name().split('.')[0] for token in disambiguated_tokens])

# Load the EUR/USD news data
news_df = pd.read_excel('/content/EURUSD_news.xlsx')

# Apply preprocessing
news_df['processed_body'] = news_df['articleBody'].apply(preprocess_article)

# Display the first few rows of the dataset
news_df[['title', 'processed_body']].head()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,title,processed_body
0,"EUR/USD pulls back on Friday, still heads for ...",uracil dollar recover ground arsenic euro get ...
1,EUR/USD: Fed should help for another visit to ...,next workweek federal substitute will meet har...
2,EUR/USD: Wider US-DE yield spreads and risk-of...,fell on friday trailing spread between uracil ...
3,Forex Today: A hectic weeks kicks off with a s...,marketplace unfold quietly workweek contradict...
4,EUR/USD probing lows near 1.1720 ahead of IFO,pair hour_angle begin workweek on deoxyadenosi...


In [8]:
import numpy as np

# Generate random sentiment labels for demonstration
np.random.seed(42)
news_df['sentiment'] = np.random.choice(['positive', 'neutral', 'negative'], size=len(news_df))

# Display the first few rows with sentiment labels
news_df[['title', 'processed_body', 'sentiment']].head()

Unnamed: 0,title,processed_body,sentiment
0,"EUR/USD pulls back on Friday, still heads for ...",uracil dollar recover ground arsenic euro get ...,negative
1,EUR/USD: Fed should help for another visit to ...,next workweek federal substitute will meet har...,positive
2,EUR/USD: Wider US-DE yield spreads and risk-of...,fell on friday trailing spread between uracil ...,negative
3,Forex Today: A hectic weeks kicks off with a s...,marketplace unfold quietly workweek contradict...,negative
4,EUR/USD probing lows near 1.1720 ahead of IFO,pair hour_angle begin workweek on deoxyadenosi...,positive


In [9]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch

# Convert sentiment labels to numerical values
label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
news_df['label'] = news_df['sentiment'].map(label_map)
labels = news_df['label'].tolist()

# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained("yiyanghkust/finbert-tone")
model = BertForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone")

# Ensure model is on the correct device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Tokenize data
def tokenize_function(examples):
    return tokenizer(examples, padding="max_length", truncation=True, max_length=512)

# Prepare dataset
train_texts, val_texts, train_labels, val_labels = train_test_split(news_df['processed_body'].tolist(), labels, test_size=0.2)
train_encodings = tokenize_function(train_texts)
val_encodings = tokenize_function(val_texts)

class ForexDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = ForexDataset(train_encodings, train_labels)
val_dataset = ForexDataset(val_encodings, val_labels)

# Fine-tune model
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()


Step,Training Loss
500,1.4527
1000,1.1952


TrainOutput(global_step=1113, training_loss=1.3049306535335243, metrics={'train_runtime': 1650.0346, 'train_samples_per_second': 10.782, 'train_steps_per_second': 0.675, 'total_flos': 4680787701381120.0, 'train_loss': 1.3049306535335243, 'epoch': 3.0})

In [10]:
# Predict sentiment
def predict_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
    outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    return torch.argmax(probs, dim=1).item()

# Map the numerical labels back to sentiment labels
reverse_label_map = {0: 'negative', 1: 'neutral', 2: 'positive'}

news_df['predicted_label'] = news_df['processed_body'].apply(predict_sentiment)
news_df['predicted_sentiment'] = news_df['predicted_label'].map(reverse_label_map)

# Display the first few rows with predicted sentiment
print(news_df[['title', 'predicted_sentiment']].head())

# Count the sentiment labels
sentiment_counts = news_df['predicted_sentiment'].value_counts()
print(sentiment_counts)


                                               title predicted_sentiment
0  EUR/USD pulls back on Friday, still heads for ...            positive
1  EUR/USD: Fed should help for another visit to ...            positive
2  EUR/USD: Wider US-DE yield spreads and risk-of...             neutral
3  Forex Today: A hectic weeks kicks off with a s...            positive
4      EUR/USD probing lows near 1.1720 ahead of IFO            negative
predicted_sentiment
positive    3424
neutral     2016
negative    1973
Name: count, dtype: int64
