In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
import optuna
import time
import os
import matplotlib.pyplot as plt
import seaborn as sns
from opencc import OpenCC
from sklearn.metrics import mean_absolute_error
from scipy.stats import pearsonr
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


### Data preprocessing 

In [3]:
data_path = 'data'
sm = SentenceTransformer('infgrad/stella-base-zh-v3-1792d')
# sm = SentenceTransformer("aspire/acge_text_embedding")

Some weights of BertModel were not initialized from the model checkpoint at infgrad/stella-base-zh-v3-1792d and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
def sentence_embedding(sm, text):
    embedding = sm.encode(text)
    return embedding

In [23]:
for fold_csv in os.listdir(data_path):
    if fold_csv == 'CVAT_4_SD.csv':
        df = pd.read_csv(os.path.join('data', 'CVAT_4_SD.csv'), sep="\t", encoding="utf-8")
        continue
    cdf = pd.read_csv(os.path.join('data', fold_csv), sep="\t", encoding="utf-8")
    df = pd.concat([df, cdf])

df[df['Valence_SD']<1.3]

Unnamed: 0,No.,Text,Valence_Mean,Arousal_Mean,Valence_SD,Arousal_SD,Category
0,2761,呂興巨集患有老年癡呆的68歲鄖陽區老人呂興巨集，於5月22日上午從鄖陽區南化塘鎮走失後，一直...,2.889,5.000,0.567,1.414,news
1,1736,其次，不願罷免與馬總統的施政表現無直接相關，純粹只是不願意看到罷免案所引發的政治動盪，會讓臺...,3.500,5.143,0.500,0.639,political
2,1759,美國官員表示，德黑蘭的正面迴應，暗示今年美伊舉行會談的機會增加。,5.222,5.625,0.786,0.696,political
3,2755,彌月禮盒真的很多種類，光是挑選就眼花撩亂和老公光是吃一堆油飯和蛋糕就吃得好飽後來看到明月堂可...,6.375,4.857,0.484,1.457,news
4,2742,我非常佩服姚明的一點，不僅僅是他苦練球技和體能，用一場場比賽戰績證明自己，更是高超的情商和豁...,7.000,5.111,0.707,2.283,news
...,...,...,...,...,...,...,...
589,2009,可愛到爆的皮卡丘和傑尼龜、可達鴨等寶可夢一起在過去20多年裡征服的不僅是全世界的童心，還有大...,5.667,5.200,0.471,1.327,news
590,2024,5月7日，米家空調伴侶2新品開售，價格79元，讓家中傳統空調變智慧，比169元的1代便宜了一...,6.222,4.333,0.916,1.764,news
591,673,您的入住滿意就是對我們接待服務工作最好的肯定與激勵！,6.778,5.667,0.786,1.155,hotel
592,411,謝謝樓上兩位大哥的熱心回覆我已經去原廠詢問了原因就是戶外溫度低所以空調會自動調節車內溫度後來...,6.556,6.000,0.497,0.756,car


In [61]:
def data_processing(csv, sm):
    embeddings = []
    cc = OpenCC('t2s')
    df = pd.read_csv(os.path.join('data', csv), sep="\t", encoding="utf-8")
    for text in df['Text']:
        text = cc.convert(text)
        embedding = sentence_embedding(sm, text)
        embeddings.append(embedding)
    x = pd.DataFrame(embeddings)
    y_v = df[f"Valence_Mean"]
    y_a = df[f"Arousal_Mean"]
    return [x, y_v, y_a]

In [62]:
x_datasets = []
v_datasets = []
a_datasets = []
for fold_csv in os.listdir(data_path):
    print(f"\n----Processing {fold_csv}----\n")
    data = data_processing(fold_csv,sm)
    x_datasets.append(data[0])
    v_datasets.append(data[1])
    a_datasets.append(data[2])


----Processing CVAT_4_SD.csv----


----Processing CVAT_2_SD.csv----


----Processing CVAT_5_SD.csv----


----Processing CVAT_1_SD.csv----


----Processing CVAT_3_SD.csv----



### Modeling

In [78]:
def Fold_trainer(label):
    for fold_index in range(5):
        x_train = pd.concat([df for index, df in enumerate(x_datasets) if index != fold_index])
        x_test = x_datasets[fold_index]
        if label == 'v':
            y_train = pd.concat([df for index, df in enumerate(v_datasets) if index != fold_index])
            y_test = v_datasets[fold_index]
            train_dmatrix = xgb.DMatrix(x_train, label=y_train)
            test_dmatrix = xgb.DMatrix(x_test, label=y_test)
        else:
            y_train = pd.concat([df for index, df in enumerate(a_datasets) if index != fold_index])
            y_test = a_datasets[fold_index]
            train_dmatrix = xgb.DMatrix(x_train, label=y_train)
            test_dmatrix = xgb.DMatrix(x_test, label=y_test)
        if fold_index == 0:
            metric = 'mae'
            base_params = {
                    'objective': 'reg:squarederror',
                    'eval_metric': metric,
            }
            def objective(trial):
                params = {
                    'tree_method': trial.suggest_categorical('tree_method', ['approx', 'hist']),
                    'max_depth': trial.suggest_int('max_depth', 3, 12),
                    'min_child_weight': trial.suggest_int('min_child_weight', 1, 250),
                    'subsample': trial.suggest_float('subsample', 0.1, 1.0),
                    'colsample_bynode': trial.suggest_float('colsample_bynode', 0.1, 1.0),
                    'reg_lambda': trial.suggest_float('reg_lambda', 0.001, 25, log=True),
                    'learning_rate': 0.3,
                }
                pruning_callback = optuna.integration.XGBoostPruningCallback(trial, f'valid-{metric}')
                num_boost_round = 10000
                params.update(base_params)
                model = xgb.train(params, train_dmatrix, num_boost_round=num_boost_round,evals=[(train_dmatrix, 'train'), (test_dmatrix, 'valid')], early_stopping_rounds=50, verbose_eval=0,
                      callbacks=[pruning_callback])
                # predictions = model.predict(test_dmatrix)
                # mae = mean_absolute_error(y_test, predictions)
                trial.set_user_attr('best_iteration', model.best_iteration)
                return model.best_score
            
            sampler = optuna.samplers.TPESampler(seed=42)
            study = optuna.create_study(direction='minimize', sampler=sampler)
            tic = time.time()
            while time.time() - tic < 10:
                study.optimize(objective, n_trials=1)
                # print('Stage 1 ==============================')
                # print(f'best score = {study.best_trial.value}')
                # print('boosting params ---------------------------')
                # print(f'fixed learning rate: {0.3}')
                # print(f'best boosting round: {study.best_trial.user_attrs["best_iteration"]}')
                # print('best tree params --------------------------')
                for k, v in study.best_trial.params.items():
                    print(k, ':', v)
            # print("\nkjsfjljkvsfhbskljbskfgjbsflkbfhsklb\n")

            low_learning_rate = 0.01
            params = {}
            params.update(base_params)
            params.update(study.best_trial.params)
            params['learning_rate'] = low_learning_rate
            model_stage2 = xgb.train(params=params, dtrain=train_dmatrix, 
                                    num_boost_round=10000,
                                    evals=[(train_dmatrix, 'train'), (test_dmatrix, 'valid')],
                                    early_stopping_rounds=50,
                                    verbose_eval=0)
            print('=======================Fold 1 ==============================')
            print(f'best score = {mean_absolute_error(test_dmatrix.get_label(), model_stage2.predict(test_dmatrix))}')
            print('boosting params ---------------------------')
            print(f'fixed learning rate: {params["learning_rate"]}')
            print(f'best boosting round: {model_stage2.best_iteration}')
            continue
        model = xgb.train(params=params, dtrain=train_dmatrix, 
                                    num_boost_round=model_stage2.best_iteration,
                                    evals=[(train_dmatrix, 'train'), (test_dmatrix, 'valid')],
                                    early_stopping_rounds=50,
                                    verbose_eval=0)
        print(f'=========================Fold {fold_index+1} ==============================')
        print(f'best score = {mean_absolute_error(test_dmatrix.get_label(), model.predict(test_dmatrix))}')
        return params, model_stage2.best_iteration

In [73]:
def Train_full(label, params, num_boost_round):
    x_train = pd.concat([df for index, df in enumerate(x_datasets)])
    if label == 'v':
        y_train = pd.concat([df for index, df in enumerate(v_datasets)])
        train_dmatrix = xgb.DMatrix(x_train, label=y_train)
    else:
        y_train = pd.concat([df for index, df in enumerate(a_datasets)])
        train_dmatrix = xgb.DMatrix(x_train, label=y_train)
    model = xgb.train(params=params, dtrain=train_dmatrix, 
                                num_boost_round=num_boost_round,
                                early_stopping_rounds=50,
                                verbose_eval=0)
    return model

In [79]:
best_param, baest_iteration = Fold_trainer('v')
final_model = Train_full('v',best_param, baest_iteration )

[I 2024-05-09 18:17:50,948] A new study created in memory with name: no-name-114d7867-2fa7-4c99-a992-446c2d2e8ccc
[I 2024-05-09 18:17:51,945] Trial 0 finished with value: 0.6432997764963092 and parameters: {'tree_method': 'hist', 'max_depth': 10, 'min_child_weight': 150, 'subsample': 0.24041677639819287, 'colsample_bynode': 0.2403950683025824, 'reg_lambda': 0.001800728515054226}. Best is trial 0 with value: 0.6432997764963092.


tree_method : hist
max_depth : 10
min_child_weight : 150
subsample : 0.24041677639819287
colsample_bynode : 0.2403950683025824
reg_lambda : 0.001800728515054226


[I 2024-05-09 18:18:10,612] Trial 1 finished with value: 0.6407127711508009 and parameters: {'tree_method': 'approx', 'max_depth': 10, 'min_child_weight': 6, 'subsample': 0.9729188669457949, 'colsample_bynode': 0.8491983767203796, 'reg_lambda': 0.008587261143813469}. Best is trial 1 with value: 0.6407127711508009.


tree_method : approx
max_depth : 10
min_child_weight : 6
subsample : 0.9729188669457949
colsample_bynode : 0.8491983767203796
reg_lambda : 0.008587261143813469

kjsfjljkvsfhbskljbskfgjbsflkbfhsklb

best score = 0.5910483002662659
boosting params ---------------------------
fixed learning rate: 0.01
best boosting round: 2513


ValueError: Must have at least 1 validation dataset for early stopping.