# US

In [17]:
import pandas as pd

In [18]:
us_news = pd.read_csv("data_final/us_news.csv")
us_news

Unnamed: 0,Headlines,Date
0,Bankers work around the clock to iron out EU f...,2018-01-02
1,"Business live UK, US and eurozone manufacturi...",2018-01-02
2,The major indexes trade lower in January,2018-01-02
3,UK construction industry optimism slumps to fi...,2018-01-03
4,US blocks MoneyGram sale to Alibaba boss over ...,2018-01-04
...,...,...
3995,White House adviser says China trade deal sign...,2019-12-31
3996,White House expecting agreement with China 'wi...,2019-12-31
3997,"Global stocks end 2019 near record highs, doll...",2019-12-31
3998,The 2010s: what just happened? \n\n\n It sho...,2019-12-31


In [19]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm

# Load FinBERT model
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
model.eval()

texts = us_news["Headlines"].astype(str).tolist()

# Label map
label_map = {0: "negative", 1: "neutral", 2: "positive"}

# Inference function
def predict_sentiment(texts, batch_size=32):
    sentiments = []
    scores = []

    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=128)
        with torch.no_grad():
            outputs = model(**inputs)
        probs = F.softmax(outputs.logits, dim=-1)
        labels = torch.argmax(probs, dim=1).tolist()
        sentiments.extend([label_map[label] for label in labels])
        scores.extend(probs.tolist())

    return sentiments, scores

# Run
sentiments, scores = predict_sentiment(texts)

# Save results
us_news["finbert_sentiment"] = sentiments
us_news["finbert_pos_score"] = [s[2] for s in scores]
us_news["finbert_neu_score"] = [s[1] for s in scores]
us_news["finbert_neg_score"] = [s[0] for s in scores]


100%|██████████| 125/125 [00:24<00:00,  5.19it/s]


In [20]:
from sklearn.preprocessing import MinMaxScaler

us_news["finbert_sentiment_score"] = (
    us_news["finbert_pos_score"] - us_news["finbert_neg_score"]
)

scaler = MinMaxScaler(feature_range=(-1, 1))
us_news["score"] = scaler.fit_transform(
    us_news[["finbert_sentiment_score"]]
)

In [22]:
us_news = us_news.drop(columns = [ 'finbert_sentiment', 'finbert_pos_score', 'finbert_neu_score', 'finbert_neg_score', 'finbert_sentiment_score'])
us_news

Unnamed: 0,Headlines,Date,score
0,Bankers work around the clock to iron out EU f...,2018-01-02,0.901855
1,"Business live UK, US and eurozone manufacturi...",2018-01-02,-0.988718
2,The major indexes trade lower in January,2018-01-02,0.020964
3,UK construction industry optimism slumps to fi...,2018-01-03,0.011402
4,US blocks MoneyGram sale to Alibaba boss over ...,2018-01-04,0.067345
...,...,...,...
3995,White House adviser says China trade deal sign...,2019-12-31,-0.074281
3996,White House expecting agreement with China 'wi...,2019-12-31,-0.575454
3997,"Global stocks end 2019 near record highs, doll...",2019-12-31,-0.073071
3998,The 2010s: what just happened? \n\n\n It sho...,2019-12-31,0.600924


In [23]:
us_news.to_csv("data_scored/us_news_scored.csv", index=False)

### translate to chinese then score

In [24]:
from transformers import BertTokenizer, BertModel
import torch


model_path = "FinBERT_L-12_H-768_A-12_pytorch"
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertModel.from_pretrained(model_path)

us_news_path = "data_final/us_news_to_cn.csv"
us_news_df = pd.read_csv(us_news_path)
us_texts = us_news_df["Headlines"].astype(str).tolist()

# Extract [CLS] vectors as sentence vectors
def get_cls_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    cls_embedding = outputs.last_hidden_state[:, 0, :]  # shape: (1, 768)
    return cls_embedding.squeeze().numpy()

# Apply to all news, track process with tqdm
embeddings = [get_cls_embedding(text) for text in tqdm(us_texts)]

embed_df = pd.DataFrame(embeddings)
embed_df.to_csv("data_final/us_to_cn_news_bert.csv", index=False, encoding='utf-8-sig')

100%|██████████| 4000/4000 [02:44<00:00, 24.39it/s]


In [25]:
import xgboost as xgb

us_news_bert = pd.read_csv("data_final/us_to_cn_news_bert.csv")
us_news = pd.read_csv("data_final/us_news.csv")

df_train_bert = pd.read_csv("sentiment_source/cn_train_bert.csv")
X_train = df_train_bert.drop(columns=["score"])
y_train = df_train_bert["score"]


# Fit XGBoost model
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, max_depth=5, learning_rate=0.1)
xgb_model.fit(X_train, y_train)

# Predict sentiment scores on financial news
xgb_scores = xgb_model.predict(us_news_bert)

# Normalize the scores to [-1, 1]
min_score, max_score = xgb_scores.min(), xgb_scores.max()
xgb_scores_norm = 2 * (xgb_scores - min_score) / (max_score - min_score) - 1 if max_score != min_score else np.zeros_like(xgb_scores)

# Combine with date and headlines
us_news["score"] = xgb_scores_norm
us_news

Unnamed: 0,Headlines,Date,score
0,Bankers work around the clock to iron out EU f...,2018-01-02,0.230518
1,"Business live UK, US and eurozone manufacturi...",2018-01-02,0.425864
2,The major indexes trade lower in January,2018-01-02,0.271628
3,UK construction industry optimism slumps to fi...,2018-01-03,0.012676
4,US blocks MoneyGram sale to Alibaba boss over ...,2018-01-04,-0.194586
...,...,...,...
3995,White House adviser says China trade deal sign...,2019-12-31,0.592355
3996,White House expecting agreement with China 'wi...,2019-12-31,0.413195
3997,"Global stocks end 2019 near record highs, doll...",2019-12-31,0.077357
3998,The 2010s: what just happened? \n\n\n It sho...,2019-12-31,0.362057


In [27]:
# us_news = us_news.drop(columns = ['Unnamed: 0'])
us_news.to_csv("data_scored/us_to_cn_scored.csv", encoding='utf-8-sig')