# US

In [12]:
import pandas as pd

In [13]:
us_news = pd.read_csv("data_final/us_news.csv")
us_news

Unnamed: 0.1,Unnamed: 0,Headlines,Date
0,0,"Trump tax cut to dent BP profits by $1.5bn, co...",2018-01-02
1,1,The major indexes trade lower in January,2018-01-02
2,2,UK services grow faster than forecast despite ...,2018-01-04
3,3,Nils Pratley on finance \n\n\n 'Melt-up' coi...,2018-01-05
4,4,Last year's S&P 500 losers could be prime for ...,2018-01-05
...,...,...,...
3995,3995,Tesla must face lawsuit claiming racism at Cal...,2019-12-31
3996,3996,White House adviser says China trade deal sign...,2019-12-31
3997,3997,UK minimum wage to rise by four times rate of ...,2019-12-31
3998,3998,"Flotations, corporate collapses and Brexit: th...",2019-12-31


In [14]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm

# Load FinBERT model
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
model.eval()

texts = us_news["Headlines"].astype(str).tolist()

# Label map
label_map = {0: "negative", 1: "neutral", 2: "positive"}

# Inference function
def predict_sentiment(texts, batch_size=32):
    sentiments = []
    scores = []

    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=128)
        with torch.no_grad():
            outputs = model(**inputs)
        probs = F.softmax(outputs.logits, dim=-1)
        labels = torch.argmax(probs, dim=1).tolist()
        sentiments.extend([label_map[label] for label in labels])
        scores.extend(probs.tolist())

    return sentiments, scores

# Run
sentiments, scores = predict_sentiment(texts)

# Save results
us_news["finbert_sentiment"] = sentiments
us_news["finbert_pos_score"] = [s[2] for s in scores]
us_news["finbert_neu_score"] = [s[1] for s in scores]
us_news["finbert_neg_score"] = [s[0] for s in scores]


100%|██████████| 125/125 [00:24<00:00,  5.07it/s]


In [15]:
from sklearn.preprocessing import MinMaxScaler

us_news["finbert_sentiment_score"] = (
    us_news["finbert_pos_score"] - us_news["finbert_neg_score"]
)

scaler = MinMaxScaler(feature_range=(-1, 1))
us_news["score"] = scaler.fit_transform(
    us_news[["finbert_sentiment_score"]]
)

In [16]:
us_news = us_news.drop(columns = ['finbert_sentiment', 'finbert_pos_score', 'finbert_neu_score', 'finbert_neg_score', 'finbert_sentiment_score'])
us_news

Unnamed: 0.1,Unnamed: 0,Headlines,Date,score
0,0,"Trump tax cut to dent BP profits by $1.5bn, co...",2018-01-02,-0.313838
1,1,The major indexes trade lower in January,2018-01-02,0.019266
2,2,UK services grow faster than forecast despite ...,2018-01-04,-0.991852
3,3,Nils Pratley on finance \n\n\n 'Melt-up' coi...,2018-01-05,0.130513
4,4,Last year's S&P 500 losers could be prime for ...,2018-01-05,0.742684
...,...,...,...,...
3995,3995,Tesla must face lawsuit claiming racism at Cal...,2019-12-31,0.144928
3996,3996,White House adviser says China trade deal sign...,2019-12-31,-0.075984
3997,3997,UK minimum wage to rise by four times rate of ...,2019-12-31,-0.765114
3998,3998,"Flotations, corporate collapses and Brexit: th...",2019-12-31,0.811519


In [6]:
us_news.to_csv("data_final/us_news_scored.csv")