In [1]:
import pandas as pd
from scipy.special import softmax
from tqdm import tqdm
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BertForSequenceClassification,
    BertTokenizer,
    pipeline,
)

# Register tqdm with pandas so that `progress_apply` (used further down in
# this notebook) is available and shows a progress bar.
tqdm.pandas()

2024-12-22 02:25:49.669640: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Read the crawled Yahoo Finance news CSV and preview the first rows.
file_path = '../Crawl_Data/Data/yahoo_news_20241221_044513.csv'
df = pd.read_csv(file_path)
df.head()

Unnamed: 0.1,Unnamed: 0,TITLE,LINK,CONTENT,RELEASE_TIME
0,0,"Tesla recall, Netflix-FIFA deal, Apple: Market...",https://finance.yahoo.com/video/tesla-recall-n...,Yahoo Finance host Rachelle Akuffo tackles thr...,3 hours ago
1,1,Elon Musk Told The U.N. If They Could Show A P...,https://finance.yahoo.com/news/elon-musk-told-...,In a world where hunger kills more people each...,9 minutes ago
2,2,"Magnificent Seven Stocks: Nvidia, Tesla Revers...",https://finance.yahoo.com/m/4205eaa9-f620-3a0b...,"Dubbed the Magnificent Seven stocks, Apple, Mi...",14 minutes ago
3,3,"These Stocks Are Moving the Most Today: FedEx,...",https://finance.yahoo.com/m/57c66b45-0b6c-3ce7...,"FedEx, the shipping and logistics giant, plans...",1 hour ago
4,4,Hyundai and Samsung in talks for automotive ch...,https://finance.yahoo.com/news/hyundai-samsung...,Hyundai aims to establish a domestic supply ch...,1 hour ago


In [3]:
# Print the (rows, columns) shape, then show the column names as the cell value.
print(df.shape)
list(df.columns)

(400, 5)


['Unnamed: 0', 'TITLE', 'LINK', 'CONTENT', 'RELEASE_TIME']

In [4]:
# Total number of missing cells across the whole frame.
df.isna().sum().sum()

0

In [5]:
# Keep only the text/time columns and expose the row index as an explicit
# 'Id' column so per-article results can be joined back later.
df = (
    df.loc[:, ['TITLE', 'CONTENT', 'RELEASE_TIME']]
    .reset_index()
    .rename(columns={'index': 'Id'})
)
df.head()

Unnamed: 0,Id,TITLE,CONTENT,RELEASE_TIME
0,0,"Tesla recall, Netflix-FIFA deal, Apple: Market...",Yahoo Finance host Rachelle Akuffo tackles thr...,3 hours ago
1,1,Elon Musk Told The U.N. If They Could Show A P...,In a world where hunger kills more people each...,9 minutes ago
2,2,"Magnificent Seven Stocks: Nvidia, Tesla Revers...","Dubbed the Magnificent Seven stocks, Apple, Mi...",14 minutes ago
3,3,"These Stocks Are Moving the Most Today: FedEx,...","FedEx, the shipping and logistics giant, plans...",1 hour ago
4,4,Hyundai and Samsung in talks for automotive ch...,Hyundai aims to establish a domestic supply ch...,1 hour ago


In [6]:
# Pick the article body at index label 2 as a running example for the models.
text_example = df.loc[2, 'CONTENT']
text_example

'Dubbed the Magnificent Seven stocks, Apple, Microsoft, Google parent Alphabet, Amazon.com, Nvidia, Meta Platforms and Tesla lived up to their name in 2023 with big gains. And all of them boasted solid year-to-date gains in the final month of 2024.'

## FinancialBERT

In [7]:
# Load the FinancialBERT sentiment checkpoint (3 output labels) and the
# tokenizer published in the same hub repository.
Fin_model = BertForSequenceClassification.from_pretrained(
    "ahmedrachid/FinancialBERT-Sentiment-Analysis",
    num_labels=3,
)
Fin_tokenizer = BertTokenizer.from_pretrained(
    "ahmedrachid/FinancialBERT-Sentiment-Analysis"
)



In [8]:
def polarity_scores_financialBERT(text, max_length=512):
    """Return FinancialBERT class probabilities for a single text.

    The text is tokenized with ``Fin_tokenizer``, run through ``Fin_model``
    and the first row of logits is converted to probabilities with softmax.

    Args:
        text: Text to score.
        max_length: Maximum number of tokens passed to the model. Longer
            inputs are truncated instead of being fed to the model at full
            length, which previously surfaced as a ``RuntimeError`` for long
            articles (BERT-style models accept a limited number of positions).

    Returns:
        dict mapping 'financialBERT_neg', 'financialBERT_neu' and
        'financialBERT_pos' to the softmax scores at logit indices 0, 1 and 2.
        NOTE(review): this assumes the checkpoint orders its labels
        negative/neutral/positive -- confirm against ``Fin_model.config.id2label``.
    """
    # Local import: torch is not imported at the top of the notebook, but it is
    # already required because the tokenizer returns PyTorch tensors ('pt').
    import torch

    encoded_text = Fin_tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        output = Fin_model(**encoded_text)
    # output[0] holds the logits; [0] selects the single sequence in the batch.
    scores = softmax(output[0][0].numpy())
    return {
        'financialBERT_neg': scores[0],
        'financialBERT_neu': scores[1],
        'financialBERT_pos': scores[2],
    }

In [9]:
# Score the example article with FinancialBERT and display the probabilities.
res = polarity_scores_financialBERT(text_example)
res

{'financialBERT_neg': 8.471948e-05,
 'financialBERT_neu': 8.4637024e-05,
 'financialBERT_pos': 0.9998306}

## DistilRoBERTa

In [10]:
# Load the DistilRoBERTa financial-news sentiment checkpoint and its tokenizer.
model_name = "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"
distilroberta_model = AutoModelForSequenceClassification.from_pretrained(model_name)
distilroberta_tokenizer = AutoTokenizer.from_pretrained(model_name)

In [11]:
def polarity_scores_distilRoberta(text, max_length=512):
    """Return DistilRoBERTa class probabilities for a single text.

    The text is tokenized with ``distilroberta_tokenizer``, run through
    ``distilroberta_model`` and the first row of logits is converted to
    probabilities with softmax.

    Args:
        text: Text to score.
        max_length: Maximum number of tokens passed to the model. Longer
            inputs are truncated instead of being fed to the model at full
            length, which previously surfaced as a ``RuntimeError`` for long
            articles.

    Returns:
        dict mapping 'distilRoberta_neg', 'distilRoberta_neu' and
        'distilRoberta_pos' to the softmax scores at logit indices 0, 1 and 2.
        NOTE(review): this assumes the checkpoint orders its labels
        negative/neutral/positive -- confirm against
        ``distilroberta_model.config.id2label``.
    """
    # Local import: torch is not imported at the top of the notebook, but it is
    # already required because the tokenizer returns PyTorch tensors ('pt').
    import torch

    encoded_text = distilroberta_tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        output = distilroberta_model(**encoded_text)
    # output[0] holds the logits; [0] selects the single sequence in the batch.
    scores = softmax(output[0][0].numpy())
    return {
        'distilRoberta_neg': scores[0],
        'distilRoberta_neu': scores[1],
        'distilRoberta_pos': scores[2],
    }

In [12]:
# Score the same example article with DistilRoBERTa and display the probabilities.
res = polarity_scores_distilRoberta(text_example)
res

{'distilRoberta_neg': 0.00024128746,
 'distilRoberta_neu': 8.422791e-05,
 'distilRoberta_pos': 0.99967444}

In [13]:
# Score every article body with both models and collect the per-class
# probabilities in a dict keyed by the article's Id.
sentiment_results = {}
for cid, content in tqdm(zip(df['Id'], df['CONTENT']), total=len(df)):
    try:
        financialbert_results = polarity_scores_financialBERT(content)
        distilroberta_results = polarity_scores_distilRoberta(content)
    except RuntimeError:
        # Best-effort: report the failing article and leave it out of the
        # results rather than aborting the whole run.
        print(f'Broke for cid {cid}')
        continue
    # Key prefixes differ per model, so merging the two dicts cannot collide.
    sentiment_results[cid] = {**financialbert_results, **distilroberta_results}

  0%|          | 0/400 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|██████████| 400/400 [02:50<00:00,  2.35it/s]


In [14]:
# One row per article: transpose the {Id: scores} mapping and turn the Id
# index into a regular column.
results_df = (
    pd.DataFrame(sentiment_results)
    .transpose()
    .reset_index()
    .rename(columns={'index': 'Id'})
)
results_df.head()

Unnamed: 0,Id,financialBERT_neg,financialBERT_neu,financialBERT_pos,distilRoberta_neg,distilRoberta_neu,distilRoberta_pos
0,0,0.00028,0.996714,0.003006,0.000332,0.001653,0.998015
1,1,0.00257,0.98041,0.017021,0.000523,0.981101,0.018375
2,2,8.5e-05,8.5e-05,0.999831,0.000241,8.4e-05,0.999674
3,3,0.008248,0.074243,0.91751,0.004953,0.000112,0.994934
4,4,9.5e-05,0.99983,7.5e-05,5.7e-05,0.999788,0.000155


## The Transformers Pipeline

In [15]:
# Wrap the already-loaded models/tokenizers in transformers pipelines.
Financial_pipe = pipeline(
    "sentiment-analysis",
    model=Fin_model,
    tokenizer=Fin_tokenizer,
)
Distilroberta_pipe = pipeline(
    "text-classification",
    model=distilroberta_model,
    tokenizer=distilroberta_tokenizer,
)

In [16]:
# Sanity check: run both pipelines on the example article. The first result is
# printed, the second is shown as the cell's value.
print(Financial_pipe(text_example))
Distilroberta_pipe(text_example)

[{'label': 'positive', 'score': 0.9998306035995483}]


[{'label': 'positive', 'score': 0.9996744394302368}]

In [17]:
def get_sentiment_through_pipeline(text, pipeline, max_length=512):
    """Classify one text with a pipeline and return its top label and score.

    Args:
        text: Text to classify.
        pipeline: Callable (e.g. a transformers text-classification pipeline)
            invoked as ``pipeline(text, truncation=True, max_length=...)`` and
            expected to return a sequence whose first item is a dict with
            'label' and 'score' keys.
        max_length: Maximum number of tokens kept when truncating the input.
            Without truncation, over-length texts fail inside the model and
            were silently reported as ``(None, None)``.

    Returns:
        ``(label, score)`` on success, ``(None, None)`` if anything raised;
        the error is printed so the failing text can be identified.
    """
    try:
        # Truncate over-length inputs instead of letting the model fail on them.
        result = pipeline(text, truncation=True, max_length=max_length)[0]
        return result['label'], result['score']
    except Exception as e:
        # Deliberate best-effort: one bad row must not abort a whole apply().
        print(f"Error processing text: {text} - {e}")
        return None, None
    
# Run each pipeline over every article body and store its (label, score)
# pair in the two columns named for that model.
pipeline_targets = (
    (['FinancialBERT_label', 'FinancialBERT_score'], Financial_pipe),
    (['Distilroberta_label', 'Distilroberta_score'], Distilroberta_pipe),
)
for target_columns, sentiment_pipe in pipeline_targets:
    df[target_columns] = df['CONTENT'].progress_apply(
        # Bind the pipeline as a default argument so each lambda keeps its own.
        lambda text, pipe=sentiment_pipe: pd.Series(get_sentiment_through_pipeline(text, pipe))
    )

100%|██████████| 400/400 [01:57<00:00,  3.41it/s]
100%|██████████| 400/400 [00:52<00:00,  7.55it/s]


In [18]:
# Attach the article text and pipeline labels to the per-class scores.
# The join key is spelled out: without `on`, merge uses every column the two
# frames share, which is only 'Id' on a first run but silently changes once
# results_df already carries df's columns (e.g. when the cell is re-run).
# `validate` asserts that 'Id' is unique on both sides.
results_df = results_df.merge(df, how='left', on='Id', validate='one_to_one')
results_df.head()

Unnamed: 0,Id,financialBERT_neg,financialBERT_neu,financialBERT_pos,distilRoberta_neg,distilRoberta_neu,distilRoberta_pos,TITLE,CONTENT,RELEASE_TIME,FinancialBERT_label,FinancialBERT_score,Distilroberta_label,Distilroberta_score
0,0,0.00028,0.996714,0.003006,0.000332,0.001653,0.998015,"Tesla recall, Netflix-FIFA deal, Apple: Market...",Yahoo Finance host Rachelle Akuffo tackles thr...,3 hours ago,neutral,0.996714,positive,0.998015
1,1,0.00257,0.98041,0.017021,0.000523,0.981101,0.018375,Elon Musk Told The U.N. If They Could Show A P...,In a world where hunger kills more people each...,9 minutes ago,neutral,0.98041,neutral,0.981101
2,2,8.5e-05,8.5e-05,0.999831,0.000241,8.4e-05,0.999674,"Magnificent Seven Stocks: Nvidia, Tesla Revers...","Dubbed the Magnificent Seven stocks, Apple, Mi...",14 minutes ago,positive,0.999831,positive,0.999674
3,3,0.008248,0.074243,0.91751,0.004953,0.000112,0.994934,"These Stocks Are Moving the Most Today: FedEx,...","FedEx, the shipping and logistics giant, plans...",1 hour ago,positive,0.91751,positive,0.994934
4,4,9.5e-05,0.99983,7.5e-05,5.7e-05,0.999788,0.000155,Hyundai and Samsung in talks for automotive ch...,Hyundai aims to establish a domestic supply ch...,1 hour ago,neutral,0.99983,neutral,0.999788


In [20]:
# Write the merged per-article results to an Excel workbook, omitting the index.
results_df.to_excel('financial_news_results.xlsx', index=False)

In [19]:
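# Optional check (left disabled): number of articles on which the two
# pipelines assign different labels.
# mismatched_count = (results_df['FinancialBERT_label'] != results_df['Distilroberta_label']).sum()
# mismatched_count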