In [2]:
!pip install gnews

Collecting gnews
  Downloading gnews-0.3.7-py3-none-any.whl.metadata (17 kB)
Collecting feedparser~=6.0.2 (from gnews)
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting dnspython~=1.16.0 (from gnews)
  Downloading dnspython-1.16.0-py2.py3-none-any.whl.metadata (1.8 kB)
Collecting sgmllib3k (from feedparser~=6.0.2->gnews)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Downloading gnews-0.3.7-py3-none-any.whl (15 kB)
Downloading dnspython-1.16.0-py2.py3-none-any.whl (188 kB)
   ---------------------------------------- 0.0/188.4 kB ? eta -:--:--
   ------ -------------------------------- 30.7/188.4 kB 640.0 kB/s eta 0:00:01
   -------------- ------------------------ 71.7/188.4 kB 777.7 kB/s eta 0:00:01
   ---------------------- --------------- 112.6/188.4 kB 819.2 kB/s eta 0:00:01
   ------------------------ ------------- 122.9/188.4 kB 798.9 kB/s eta 0:00:01
  

In [3]:
from gnews import GNews


# Publisher domains to query through Google News.
# Only the Bloomberg domain is queried in this cell; site_cnbc is not used here.
site_cnbc = "CNBC.com"
site_bloomberg = "bloomberg.com"

# Call the method on an instance instead of passing `self=` to the function
# looked up on the class; the result is the same but this is the normal call form.
test = GNews().get_news_by_site(site=site_bloomberg)

# The lookup can come back empty (no results, network problem), in which case
# indexing [0] would raise; report that explicitly instead.
if test:
    print(test[0])
else:
    print(f"No articles returned for {site_bloomberg}")

{'title': 'BlackRock, Citadel Back Upstart Texas Challenge to NYSE, Nasdaq - Bloomberg', 'description': 'BlackRock, Citadel Back Upstart Texas Challenge to NYSE, Nasdaq  Bloomberg', 'published date': 'Wed, 05 Jun 2024 01:50:00 GMT', 'url': 'https://news.google.com/rss/articles/CBMicGh0dHBzOi8vd3d3LmJsb29tYmVyZy5jb20vbmV3cy9hcnRpY2xlcy8yMDI0LTA2LTA1L2JsYWNrcm9jay1jaXRhZGVsLWJhY2stdXBzdGFydC10ZXhhcy1jaGFsbGVuZ2UtdG8tbnlzZS1uYXNkYXHSAQA?oc=5&hl=en-US&gl=US&ceid=US:en', 'publisher': {'href': 'https://www.bloomberg.com', 'title': 'Bloomberg'}}


In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification

# Single source for the checkpoint identifier shared by the tokenizer and the model.
FINBERT_CHECKPOINT = 'yiyanghkust/finbert-tone'

# Both objects are loaded from the same checkpoint so the vocabulary matches the weights.
tokenizer = BertTokenizer.from_pretrained(FINBERT_CHECKPOINT)
model = BertForSequenceClassification.from_pretrained(FINBERT_CHECKPOINT)

class FinancialNewsDataset(Dataset):
    """Dataset that tokenizes the ``'headline'`` column of a DataFrame on access.

    Args:
        news_df: DataFrame with a ``'headline'`` column; rows are addressed by
            position (``iloc``).
        tokenizer: Object exposing ``encode_plus(text, max_length=...,
            truncation=..., padding=..., return_tensors=...)``.
        max_len: Length every headline is padded or truncated to.
    """

    def __init__(self, news_df, tokenizer, max_len):
        self.news_df = news_df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.news_df)

    def __getitem__(self, idx):
        """Tokenize the headline at position ``idx``.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'``, each with the
            leading batch axis added by ``return_tensors='pt'`` removed.
        """
        headline = self.news_df.iloc[idx]['headline']
        inputs = self.tokenizer.encode_plus(
            headline,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )
        # squeeze(0) drops only the batch axis. A bare squeeze() would also
        # collapse the sequence axis when max_len == 1, yielding a 0-d tensor.
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
        }

# Parameters
max_len = 128       # every headline is padded/truncated to this many tokens
batch_size = 16
shuffle_seed = 42   # fixes the shuffle order so reruns visit batches identically

# Create dataset and dataloader
# NOTE: `news_df` is not created in any visible cell above (the recorded run of
# this cell failed with NameError). It must be loaded before this cell and must
# contain a 'headline' column, which FinancialNewsDataset reads.
dataset = FinancialNewsDataset(news_df, tokenizer, max_len)
dataloader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
    # Dedicated seeded generator: reproducible shuffling without touching the
    # global torch RNG state.
    generator=torch.Generator().manual_seed(shuffle_seed),
)


NameError: name 'news_df' is not defined

In [None]:
from transformers import AdamW, BertForMaskedLM, BertForSequenceClassification, get_linear_schedule_with_warmup

# Training configuration
NUM_EPOCHS = 3
MLM_PROBABILITY = 0.15  # share of real tokens hidden per batch

# Load FinBERT with a masked-language-modelling head. The previous code loaded
# the sequence-classification head and passed `labels=input_ids`, which hands a
# (batch, seq_len) tensor to a loss that expects one class index per example.
# NOTE(review): the checkpoint appears to be a classification checkpoint, so the
# MLM head weights are presumably newly initialised on load - confirm.
model = BertForMaskedLM.from_pretrained('yiyanghkust/finbert-tone')

# Set up optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(dataloader) * NUM_EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

# Training loop
model.train()
for epoch in range(NUM_EPOCHS):
    epoch_losses = []
    for batch in dataloader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']

        # Never mask [CLS], [SEP] or padding positions.
        is_special = (
            (input_ids == tokenizer.cls_token_id)
            | (input_ids == tokenizer.sep_token_id)
            | (attention_mask == 0)
        )
        draw = torch.rand(input_ids.shape, device=input_ids.device)
        masked = (draw < MLM_PROBABILITY) & ~is_special
        if not masked.any():
            # Nothing to predict in this batch; the loss would be NaN.
            continue

        # -100 marks positions the loss must ignore; only masked tokens are scored.
        labels = input_ids.masked_fill(~masked, -100)
        masked_input_ids = input_ids.masked_fill(masked, tokenizer.mask_token_id)

        outputs = model(
            input_ids=masked_input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        epoch_losses.append(loss.item())

    # Report the epoch mean rather than the last batch's loss; this also avoids
    # a NameError on `loss` when the dataloader yields no usable batch.
    mean_loss = sum(epoch_losses) / len(epoch_losses) if epoch_losses else float('nan')
    print(f'Epoch {epoch + 1}/{NUM_EPOCHS}, Loss: {mean_loss}')


In [None]:
def embed_news_headlines(news_df, model, tokenizer, max_len):
    """Return one [CLS] embedding per headline as a 2-D NumPy array.

    Args:
        news_df: DataFrame with a ``'headline'`` column.
        model: Callable model that accepts the tokenizer's output as keyword
            arguments plus ``output_hidden_states=True`` and returns an object
            with a ``hidden_states`` sequence. It is switched to eval mode.
        tokenizer: Object exposing ``encode_plus`` that returns ``'pt'`` tensors.
        max_len: Length every headline is padded or truncated to.

    Returns:
        Array of shape ``(number of headlines, hidden size)``. For an empty
        ``news_df`` the array has zero rows.
    """
    import numpy as np  # local import: numpy is not imported in the visible cells

    model.eval()
    embeddings = []
    with torch.no_grad():
        for headline in news_df['headline']:
            inputs = tokenizer.encode_plus(
                headline,
                max_length=max_len,
                truncation=True,
                padding='max_length',
                return_tensors='pt'
            )
            # Request hidden states explicitly: models with a task head
            # (classification, masked LM) do not expose `last_hidden_state`
            # on their output, but do return `hidden_states` when asked.
            outputs = model(**inputs, output_hidden_states=True)
            # Last layer, first token ([CLS]) of the single-item batch.
            cls_embedding = outputs.hidden_states[-1][:, 0, :].cpu().numpy()
            embeddings.append(cls_embedding)

    if not embeddings:
        # np.vstack raises on an empty list; return a zero-row array instead.
        hidden_size = getattr(getattr(model, 'config', None), 'hidden_size', 0)
        return np.empty((0, hidden_size), dtype=np.float32)
    return np.vstack(embeddings)

# Embed every headline, producing a stacked 2-D array with one row per headline.
news_embeddings = embed_news_headlines(news_df, model, tokenizer, max_len)
# list(...) splits the 2-D array into its rows so each DataFrame cell holds one
# headline's vector. Note that this adds the 'embedding' column to news_df in place.
news_df['embedding'] = list(news_embeddings)

In [None]:
class SP500Dataset(Dataset):
    def __init__(self, news_df, price_df, n_days, max_len):
        """Store the inputs and build the feature/label arrays immediately.

        Args:
            news_df: DataFrame read by create_features_labels via its 'date'
                and 'embedding' columns.
            price_df: DataFrame read by create_features_labels via its 'date'
                and 'log_return' columns.
            n_days: Length of the look-back window, in rows of price_df.
            max_len: Stored on the instance; not used by create_features_labels.
        """
        self.news_df = news_df
        self.price_df = price_df
        self.n_days = n_days
        self.max_len = max_len
        # NOTE: tokenizer and model are not constructor parameters; these two
        # lines read module-level names, so both must already exist when an
        # instance is created. Neither is used by create_features_labels.
        self.tokenizer = tokenizer
        self.model = model
        # Must run last: create_features_labels reads the attributes set above.
        self.X, self.Y = self.create_features_labels()
        
    def create_features_labels(self):
        X, Y = [], []
        for i in range(self.n_days, len(self.price_df) - 1):
            news_subset = self.news_df.loc[self.news_df['date'].isin(self.price_df['date'].iloc[i-self.n_days:i])]
            if len(news_subset) < self.n_days:
                continue  # Skip if there are not enough news articles

            embeddings = np.vstack(news_subset['embedding'].values)
            log_returns = self.price_df['log_return'].iloc[i-self.n_days:i].values

            features = np.hstack([embeddings.flatten(), log_returns])
            X.append(features)
            
            Y.append(self.price_df['log_return'].iloc[i + 1])
        
        return np.array(X), np.array(Y)
    
    def __len__(
