In [1]:
import pandas as pd
import torch
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    pipeline,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the news articles (title/body text plus tickers) and the price table.
# NOTE(review): stock_df is not used by the cells visible here — confirm it is needed.
news_df = pd.read_csv('data/news.csv')
stock_df = pd.read_csv('data/price.csv')
# Preview the first rows as the cell output.
news_df.head()

Unnamed: 0,publication_datetime,title,body,tickers
0,2017-01-03,World News: Police Question Netanyahu Over Gifts,"""We pay attention to publications in the media...",EL
1,2017-01-03,Business News: Nestle Turns to New CEO for Hea...,"Nestle, the world's largest packaged-food comp...",GIS
2,2017-01-03,Business News: Vermont Drug Law Faces Limits -...,"The Vermont law, enacted in June, instructed s...",ABBV
3,2017-01-03,Life & Arts -- Travel: How Hotel Companies Lau...,Travelers are about to see a flurry of new hot...,HLT
4,2017-01-03,Businesses Ready to Ramp Up Investment --- Aft...,The Federal Reserve last month signaled intere...,HD


## Pretrained FinBERT

This code defines a function `FinBERT_sentiment_score` that analyzes financial text to determine its sentiment using a pre-trained FinBERT model. The function:

1. Loads the "yiyanghkust/finbert-tone" pre-trained model and tokenizer
2. Processes input text (accepting either strings or lists of text)
3. Tokenizes and truncates the text if it exceeds the maximum token length
4. Performs inference using the FinBERT model to get sentiment predictions
5. Converts the model's output (which has three classes: negative, neutral, positive) into a normalized score on a -1 to 1 scale:
    * Negative sentiment returns a negative value (-1 to 0)
    * Neutral sentiment returns exactly 0
    * Positive sentiment returns a positive value (0 to 1)

The magnitude of the score is the model's softmax probability for the predicted class, i.e. its confidence in the prediction. If any error occurs during processing, the function prints the error message and returns 0 (neutral sentiment).

In [3]:
# Class order published for 'yiyanghkust/finbert-tone'. Used only when the
# loaded checkpoint's config does not expose readable label names.
_FINBERT_TONE_LABELS = {0: 'neutral', 1: 'positive', 2: 'negative'}

# (tokenizer, model) pairs already loaded in this kernel, keyed by checkpoint name.
_FINBERT_CACHE = {}


def _load_finbert(model_name='yiyanghkust/finbert-tone'):
    """Return the (tokenizer, model) pair for ``model_name``, loading it only once."""
    if model_name not in _FINBERT_CACHE:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name)
        _FINBERT_CACHE[model_name] = (tokenizer, model)
    return _FINBERT_CACHE[model_name]


def FinBERT_sentiment_score(heading, max_length=512):
    """
    Compute a sentiment score with pretrained FinBERT on a -1 to 1 scale.

    Parameters
    ----------
    heading : str or list of str
        Text to score. A list is joined with single spaces into one string.
    max_length : int, default 512
        Maximum number of tokens; longer inputs are truncated by the tokenizer.

    Returns
    -------
    float or int
        ``-p`` when the predicted class is negative, ``+p`` when it is positive
        (``p`` being the softmax probability of that class), and ``0`` when the
        predicted class is neutral. ``0`` is also returned, after printing the
        error, if anything raises while loading the model or scoring the text.
    """
    try:
        # Loading the checkpoint is expensive, so it is done once and reused
        # instead of being repeated for every scored text.
        tokenizer, finbert = _load_finbert()

        # If heading is a list, join it into a single string
        if isinstance(heading, list):
            heading = ' '.join(heading)

        # Truncate text if it's too long
        tokens = tokenizer(heading, truncation=True, max_length=max_length, return_tensors="pt")

        # Get prediction
        with torch.no_grad():
            outputs = finbert(**tokens)
            predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)

        # Highest-probability class of the single input row
        predicted_class = torch.argmax(predictions[0]).item()
        score = predictions[0][predicted_class].item()

        # Resolve the class index through the model's own label names rather
        # than assuming an order: finbert-tone uses 0=neutral, 1=positive,
        # 2=negative, not negative/neutral/positive.
        id2label = getattr(finbert.config, 'id2label', None) or {}
        label = str(id2label.get(predicted_class, '')).lower()
        if label not in ('negative', 'neutral', 'positive'):
            label = _FINBERT_TONE_LABELS.get(predicted_class, 'neutral')

        # Map the prediction to -1 to 1 scale
        if label == 'negative':
            return -score
        if label == 'positive':
            return score
        return 0

    except Exception as e:
        print(f"Error processing text: {str(e)}")
        return 0  # Return neutral sentiment in case of error


## VADER Sentiment

This function performs sentiment analysis on text using VADER (Valence Aware Dictionary and sEntiment Reasoner), a lexicon and rule-based sentiment analysis tool. It:

1. Downloads the VADER lexicon (silently) if not already present
2. Creates a SentimentIntensityAnalyzer from NLTK
3. Handles input text whether it's a string or list format
4. Analyzes the text to get polarity scores (positive, negative, neutral)
5. Converts the analysis into a normalized score from -1 to 1:
    * Returns positive score (0 to 1) if positive sentiment dominates
    * Returns negative score (-1 to 0) if negative sentiment dominates
    * Returns neutral score if neutral sentiment dominates
6. Returns 0 (neutral) if any errors occur during processing

Unlike the FinBERT function, which uses a deep learning model specifically trained for financial text, this function uses a general-purpose lexicon-based approach that may be less specialized but is computationally lighter.

In [4]:

# Holds the shared analyzer under the key 'analyzer' once it has been built.
_VADER_CACHE = {}


def _get_vader_analyzer():
    """Return a shared SentimentIntensityAnalyzer, fetching the lexicon only once."""
    if 'analyzer' not in _VADER_CACHE:
        nltk.download('vader_lexicon', quiet=True)
        _VADER_CACHE['analyzer'] = SentimentIntensityAnalyzer()
    return _VADER_CACHE['analyzer']


def VADER_sentiment_score(heading):
    """
    Compute a sentiment score with VADER on a -1 to 1 scale.

    Parameters
    ----------
    heading : str or list of str
        Text to score. A list is joined with single spaces into one string.

    Returns
    -------
    float
        The ``pos`` proportion when it is the largest of ``neg``/``neu``/``pos``
        (ties go to ``pos``), minus the ``neg`` proportion when that is the
        largest, and ``0.0`` when the neutral proportion dominates. ``0`` is
        also returned, after printing the error, if anything raises.
    """
    try:
        # Reuse one analyzer instead of re-downloading the lexicon and
        # rebuilding the analyzer for every scored text.
        analyzer = _get_vader_analyzer()

        # If heading is a list, join it into a single string
        if isinstance(heading, list):
            heading = ' '.join(heading)

        result = analyzer.polarity_scores(heading)
        strongest = max(result['neg'], result['neu'], result['pos'])
        if result['pos'] == strongest:
            return result['pos']
        if result['neg'] == strongest:
            return -result['neg']
        # Neutral-dominant text carries no direction. Returning the `neu`
        # proportion here would report it as a positive score, so it is 0.
        return 0.0
    except Exception as e:
        print(f"Error processing text with VADER: {str(e)}")
        return 0


In [5]:

# Process news data in batches
def process_news_batch(news_df, batch_size=100):
    """
    Score every row of ``news_df`` with both sentiment functions.

    FinBERT is applied to the ``body`` column and VADER to the ``title``
    column. Rows are walked in positional order in slices of ``batch_size``,
    and one progress line is printed per slice.

    Parameters
    ----------
    news_df : pandas.DataFrame
        Must contain ``title`` and ``body`` columns.
    batch_size : int, default 100
        Number of rows per progress step; must be at least 1.

    Returns
    -------
    tuple of (list, list)
        ``(BERT_sentiment, VADER_sentiment)``, each with one score per row of
        ``news_df`` in row order.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would make the range below empty and silently
        # return no scores at all.
        raise ValueError("batch_size must be a positive integer")

    BERT_sentiment = []
    VADER_sentiment = []
    total_batches = (len(news_df) - 1) // batch_size + 1
    cuda_available = torch.cuda.is_available()

    for batch_number, start in enumerate(range(0, len(news_df), batch_size), start=1):
        batch = news_df.iloc[start:start + batch_size]
        print(f"Processing batch {batch_number}/{total_batches}")

        # Iterate the two columns directly; indexing batch.iloc[idx][col]
        # would materialise a full row Series for every lookup.
        for title, body in zip(batch['title'], batch['body']):
            score_BERT = FinBERT_sentiment_score(body)
            score_VADER = VADER_sentiment_score(title)

            BERT_sentiment.append(score_BERT)
            VADER_sentiment.append(score_VADER)

        # Clear GPU memory if using CUDA (once per batch is enough)
        if cuda_available:
            torch.cuda.empty_cache()

    return BERT_sentiment, VADER_sentiment


In [6]:
# Load and process data
news_df = pd.read_csv("data/news.csv")
BERT_sentiment, VADER_sentiment = process_news_batch(news_df)

# Add sentiment scores to dataframe
news_df['FinBERT score'] = BERT_sentiment
news_df['VADER score'] = VADER_sentiment
news_df['combined_sentiment'] = (news_df['FinBERT score'] + news_df['VADER score']) / 2


Processing batch 1/206
Processing batch 2/206
Processing batch 3/206
Processing batch 4/206
Processing batch 5/206
Processing batch 6/206
Processing batch 7/206
Processing batch 8/206
Processing batch 9/206
Processing batch 10/206
Processing batch 11/206
Processing batch 12/206
Processing batch 13/206
Processing batch 14/206
Processing batch 15/206
Processing batch 16/206
Processing batch 17/206
Processing batch 18/206
Processing batch 19/206
Processing batch 20/206
Processing batch 21/206
Processing batch 22/206
Processing batch 23/206
Processing batch 24/206
Processing batch 25/206
Processing batch 26/206
Processing batch 27/206
Processing batch 28/206
Processing batch 29/206
Processing batch 30/206
Processing batch 31/206
Processing batch 32/206
Processing batch 33/206
Processing batch 34/206
Processing batch 35/206
Processing batch 36/206
Processing batch 37/206
Processing batch 38/206
Processing batch 39/206
Processing batch 40/206
Processing batch 41/206
Processing batch 42/206
P

In [7]:
# Save results
# Writes news_df, including the sentiment columns added above, to a new CSV;
# index=False keeps the DataFrame index out of the file.
news_df.to_csv("data/news_w_sentiment2.csv", index=False)