In [3]:
import pandas as pd
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import xgboost as xgb
import plotly.express as px

import nltk
# Fetch the NLTK resources (Punkt tokenizer data and the stop-word lists) into
# the local nltk_data directory. Each call returns True on success; the last
# call is the cell's final expression, so its return value is what the cell
# displays.
# NOTE(review): none of these resources is referenced by the cells visible
# here — confirm they are needed by a later part of the notebook.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt to /home/b10173209/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/b10173209/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/b10173209/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [14]:
# NOTE(review): absolute, user-specific paths — adjust when running elsewhere.
train_path = '/home/b10173209/test.file/train.txt'
test_path = '/home/b10173209/test.file/test.txt'
val_path = '/home/b10173209/test.file/val.txt'

# The files are read as header-less, ';'-separated records: text first, label second.
train_data = pd.read_csv(train_path, sep=';', names=['text', 'label'])
test_data = pd.read_csv(test_path, sep=';', names=['text', 'label'])
val_data = pd.read_csv(val_path, sep=';', names=['text', 'label'])

# Encode the labels as integers using ONE shared label -> code mapping.
# Encoding every split on its own derives the codes from the sorted labels
# present in that split only, so a split that happens to miss a class would
# silently shift its codes and disagree with the other splits.
# The categories come from the train + validation labels (the data the model
# is fitted on); a test label outside that set is encoded as -1, as are
# missing labels.
label_categories = sorted(
    pd.concat([train_data['label'], val_data['label']]).dropna().unique()
)
label_dtype = pd.CategoricalDtype(categories=label_categories)
train_data['label'] = train_data['label'].astype(label_dtype).cat.codes
test_data['label'] = test_data['label'].astype(label_dtype).cat.codes
val_data['label'] = val_data['label'].astype(label_dtype).cat.codes

# Merge the training and validation sets; cross-validation re-splits them later.
data = pd.concat([train_data, val_data], ignore_index=True)
train_labels = data['label']
test_labels = test_data['label']


In [23]:
# Stratified k-fold cross-validation: shuffled splits with a fixed seed so the
# fold assignment is reproducible between runs.
num_folds = 5
skf = StratifiedKFold(
    n_splits=num_folds,
    shuffle=True,
    random_state=42,
)

# XGBoost hyperparameters (a single fixed setting, not a grid that is searched).
learning_rate, max_depth, n_estimators = 0.1, 9, 300

# Accumulator for the per-fold validation result frames.
all_validation_results = list()

In [16]:
# Custom tokenizer
def custom_tokenizer(text):
    """Split ``text`` into tokens with ``jieba.lcut``.

    Args:
        text: The string to segment.

    Returns:
        The value returned by ``jieba.lcut(text)`` (per jieba's documentation,
        a list of token strings).

    Raises:
        ImportError: If the ``jieba`` package is not installed.

    NOTE(review): no visible cell passes this function to the vectorizer —
    confirm whether it is meant to be used as ``TfidfVectorizer(tokenizer=...)``.
    """
    # Imported here because the notebook's import cell does not import jieba;
    # without this the function raises NameError on its first call.
    import jieba

    tokens = jieba.lcut(text)
    return tokens

# Text preprocessing function
def preprocess_text(text):
    """Lower-case ``text`` and remove punctuation.

    Every character that is neither a word character (``\\w``: letters, digits,
    underscore) nor whitespace is deleted; whitespace itself is left untouched.

    Args:
        text: The string to normalise.

    Returns:
        The lower-cased string with punctuation characters removed.
    """
    # Imported here because the notebook's import cell does not import `re`;
    # without this the function raises NameError on its first call.
    import re

    processed_text = text.lower()  # convert the text to lower case
    processed_text = re.sub(r'[^\w\s]', '', processed_text)  # strip punctuation
    return processed_text

In [17]:
# Build the TF-IDF vectorizer: learn the vocabulary and IDF weights from the
# combined train + validation texts, then map the test texts onto that same
# fitted vocabulary.
vectorizer = TfidfVectorizer(
    max_features=None,  # no cap on the vocabulary size
)
X_train_full = vectorizer.fit_transform(data["text"])
X_test = vectorizer.transform(test_data["text"])

In [24]:
# Run stratified k-fold cross-validation with a fixed XGBoost configuration,
# collect every fold's validation predictions and export them to one Excel file.

# Re-create the accumulator in this cell so that running the cell again does
# not append a second copy of every fold's rows to results left over from a
# previous run.
all_validation_results = []

for fold, (train_index, val_index) in enumerate(skf.split(X_train_full, train_labels)):
    print(f"Fold: {fold + 1}/{num_folds}")

    # Split the feature matrix and the labels by the positional fold indices
    X_train_fold = X_train_full[train_index]
    X_val_fold = X_train_full[val_index]
    train_labels_fold = train_labels.iloc[train_index]
    val_labels_fold = train_labels.iloc[val_index]

    # Initialize and train a fresh XGBoost classifier for this fold
    xgb_model = xgb.XGBClassifier(
        learning_rate=learning_rate,
        max_depth=max_depth,
        n_estimators=n_estimators,
        random_state=42
    )
    xgb_model.fit(X_train_fold, train_labels_fold)

    # Validation predictions
    predictions_val = xgb_model.predict(X_val_fold)

    # Test predictions
    # NOTE(review): not consumed anywhere in this cell; after the loop only the
    # last fold's predictions remain — confirm a later cell relies on them.
    predictions_test = xgb_model.predict(X_test)

    # Store this fold's validation rows; 'id' is the row position in `data`
    results = pd.DataFrame({
        'Fold': [fold + 1] * len(val_index),
        'id': val_index,  # Replace with actual IDs if available
        'Labels': val_labels_fold.values,
        'Predictions': predictions_val,
        'Text': data['text'].iloc[val_index].values
    })
    all_validation_results.append(results)

# Combine and save results
all_validation_results_df = pd.concat(all_validation_results, ignore_index=True)
excel_path = f'/home/b10173209/test.file/X/integrate_validation_results_lr_{learning_rate}_md_{max_depth}_ne_{n_estimators}.xlsx'
# Make sure the output directory exists; to_excel does not create it.
os.makedirs(os.path.dirname(excel_path), exist_ok=True)
all_validation_results_df.to_excel(excel_path, index=False)
print(f"Validation results saved to: {excel_path}")

Fold: 1/5
Fold: 2/5
Fold: 3/5
Fold: 4/5
Fold: 5/5
Validation results saved to: /home/b10173209/test.file/X/integrate_validation_results_lr_0.1_md_9_ne_300.xlsx
