# CN


In [2]:
import pandas as pd

In [3]:
cn_score_source = pd.read_excel('sentiment_source/cn_train_data.xlsx')
cn_score_source.info(), cn_score_source.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7337 entries, 0 to 7336
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    7337 non-null   object
 1   score   7337 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 114.8+ KB


(None,
                                                 text  score
 0  苏州市物价局 您好您的来信收悉根据印发关于市区机动车停放服务收费改革意见的通知苏府号规定机动...      0
 1  触目惊心东北名男子街头上演武林风倒地男子被车碾死 月日凌晨时分大连市的太原街和同泰街交会出一...     -1
 2  以恋爱为名一女子诱骗网友到萍乡一传销窝点非法拘禁天 中国江西网讯徐政武见习记者薛柏武记者周再...     -1
 3  杭州芊颜化妆品有限公司被爆涉传老板很嚣张-浦口人网 杭州芊颜化妆品有限公司被爆涉传老板很嚣张...     -1
 4  蹊跷泰州一灵车停在屋里莫名起火把房子烧了麦泽利安的头颅-言 蹊跷泰州一灵车停在屋里莫名起火把...     -1)

### TF-IDF, Ridge/XGBoost

In [4]:
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
import numpy as np

In [10]:
# Step 1: Chinese word segmentation
def tokenize(text):
    return " ".join(jieba.cut(text))

cn_score_source['text_tokenized'] = cn_score_source['text'].apply(tokenize)

# Step 2: TF-IDF vectorization
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(cn_score_source['text_tokenized'])
y = cn_score_source['score']

# Step 3: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Train Ridge regression
ridge_model = Ridge()
ridge_model.fit(X_train, y_train)
y_pred_ridge = ridge_model.predict(X_test)

# Step 5: Train XGBoost regression
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, max_depth=5, learning_rate=0.1)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

# Step 6: Evaluate both models
def evaluate(y_true, y_pred):
    return {
        "RMSE": np.sqrt(mean_squared_error(y_true, y_pred)),
        "MAE": mean_absolute_error(y_true, y_pred),
        "R2": r2_score(y_true, y_pred)
    }

ridge_metrics = evaluate(y_test, y_pred_ridge)
xgb_metrics = evaluate(y_test, y_pred_xgb)

ridge_metrics, xgb_metrics

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/t5/m480qyfn2vnbgmm98d9vyd2m0000gn/T/jieba.cache
Loading model cost 0.335 seconds.
Prefix dict has been built successfully.


({'RMSE': 0.5069569343992312,
  'MAE': 0.3963599292140159,
  'R2': 0.3948572548864834},
 {'RMSE': 0.5174405119816862,
  'MAE': 0.42531049980135505,
  'R2': 0.36957046523674986})

### Chinese FinBert

In [None]:
from transformers import BertTokenizer, BertModel
import torch
from tqdm import tqdm

model_path = "FinBERT_L-12_H-768_A-12_pytorch"
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertModel.from_pretrained(model_path)
model.eval()

cn_train_texts = cn_score_source["text"].astype(str).tolist()

# Extract [CLS] vectors as sentence vectors
def get_cls_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    cls_embedding = outputs.last_hidden_state[:, 0, :]  # shape: (1, 768)
    return cls_embedding.squeeze().numpy()

# Apply to all news, track process with tqdm
embeddings = [get_cls_embedding(text) for text in tqdm(cn_train_texts)]

embed_df = pd.DataFrame(embeddings)
embed_df["score"] = cn_score_source["score"].values
embed_df.to_csv("sentiment_source/cn_train_bert.csv", index=False)

100%|██████████| 7337/7337 [06:01<00:00, 20.32it/s]


In [8]:
from transformers import BertTokenizer, BertModel
import torch

model_path = "FinBERT_L-12_H-768_A-12_pytorch"
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertModel.from_pretrained(model_path)

cn_news_path = "data_final/cn_news.csv"
cn_news_df = pd.read_csv(cn_news_path)
cn_texts = cn_news_df["Headlines"].astype(str).tolist()

# Extract [CLS] vectors as sentence vectors
def get_cls_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    cls_embedding = outputs.last_hidden_state[:, 0, :]  # shape: (1, 768)
    return cls_embedding.squeeze().numpy()

# Apply to all news, track process with tqdm
embeddings = [get_cls_embedding(text) for text in tqdm(cn_texts)]

embed_df = pd.DataFrame(embeddings)
embed_df.to_csv("data_final/cn_news_bert.csv", index=False)

100%|██████████| 3578/3578 [02:38<00:00, 22.60it/s]


In [7]:
# Ridge, MLP, XGBoost for evaluation
from sklearn.linear_model import Ridge
from sklearn.neural_network import MLPRegressor
import xgboost as xgb

df_train_bert = pd.read_csv("sentiment_source/cn_train_bert.csv")
X = df_train_bert.drop(columns=["score"])
y = df_train_bert["score"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


models = {
    "Ridge": Ridge(),
    "MLP": MLPRegressor(hidden_layer_sizes=(512, 128), max_iter=300, random_state=42),
    "XGboost": xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, max_depth=5, learning_rate=0.1)
}

# Train, predict and evaluate
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    results[name] = {
        "RMSE": np.sqrt(mean_squared_error(y_test, preds)),
        "MAE": mean_absolute_error(y_test, preds),
        "R2": r2_score(y_test, preds)
    }

results


{'Ridge': {'RMSE': 0.4775876112290374,
  'MAE': 0.37125230181712104,
  'R2': 0.4629412456046471},
 'MLP': {'RMSE': 0.49274726626273363,
  'MAE': 0.364651839244405,
  'R2': 0.4283053313501518},
 'XGboost': {'RMSE': 0.4610354389982535,
  'MAE': 0.3459513115125476,
  'R2': 0.49952277909281206}}

### choose XGBoost

In [12]:
cn_news_bert = pd.read_csv("data_final/cn_news_bert.csv")
cn_news = pd.read_csv("data_final/cn_news.csv")

X_train = df_train_bert.drop(columns=["score"])
y_train = df_train_bert["score"]

# Fit XGBoost model
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, max_depth=5, learning_rate=0.1)
xgb_model.fit(X_train, y_train)

# Predict sentiment scores on financial news
xgb_scores = xgb_model.predict(cn_news_bert)

# Normalize the scores to [-1, 1]
min_score, max_score = xgb_scores.min(), xgb_scores.max()
xgb_scores_norm = 2 * (xgb_scores - min_score) / (max_score - min_score) - 1 if max_score != min_score else np.zeros_like(xgb_scores)

# Combine with date and headlines
cn_news["score"] = xgb_scores_norm
cn_news

Unnamed: 0.1,Unnamed: 0,Headlines,time,score
0,0,【2018年基金展望：公募弱冠重构江湖 私募借势探路国际】2018年是贯彻党的十九大精神的开...,2018-01-01,0.336547
1,1,【2017年基金排名新鲜出炉 价值投资者尽享“王者盛宴”】随着2017年最后一个交易日结束，...,2018-01-01,0.363028
2,2,【全年PMI增长平稳 中国经济换挡进入新阶段】近日，国家统计局服务业调查中心和中国物流与采购...,2018-01-01,0.356259
3,3,【四部委：新能源车免征购置税延至2020年】12月27日，财政部等四部委发布公告称，财政部、...,2018-01-01,0.002112
4,4,【新华社：2018年A股“高歌”开门红 资本市场发展新年将“更上层楼”】2018年第一个交易...,2018-01-02,0.468627
...,...,...,...,...
3573,3573,【中国人民银行行长易纲新年致辞】 原图,2019-12-31,0.091625
3574,3574,【人民银行行长易纲看望慰问外汇储备经营管理人员】网页链接,2019-12-31,-0.092145
3575,3575,【北向资金连续30日净流入 科技和医药股是今年两条增持主线】2019年是外资快速涌入A股市场...,2019-12-31,0.283359
3576,3576,【隔夜外盘】欧美股市普跌，道指跌超180点，纳指跌0.67%，失守9000点整数关口；蔚来收...,2019-12-31,-0.249491


In [None]:
cn_news = cn_news.drop(column = ['Unnamed: 0'])
cn_news.to_csv("data_final/cn_news_scored.csv", encoding='utf-8-sig')