<a href="https://colab.research.google.com/github/JacquotQ/GDPR-compliance-with-Glass-Box/blob/main/violation_result.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

!pip install torch==2.1.0+cu118 torchvision==0.16.0+cu118 --extra-index-url https://download.pytorch.org/whl/cu118
!pip install transformers==4.28.0 datasets==2.16.1
!pip install scikit-learn==1.2.2 seaborn==0.12.2 accelerate==0.24.1 imbalanced-learn==0.10.1
!pip install numpy==1.25.2 pandas==2.0.3
!pip install evaluate



Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu118


In [2]:
from google.colab import drive

In [3]:

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# violation result

In [6]:
import os
import time
import json
import warnings
import logging
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
import torch.nn as nn

warnings.filterwarnings('ignore')

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger('Violationresult')

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class WeightedLossTrainer(Trainer):
    """
    一个自定义的 Trainer，用于在计算损失时应用类别权重。
    """
    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        # 将外部传入的 class_weights 保存为类的属性
        if class_weights is not None:
            logger.info(f"WeightedLossTrainer initialized with class weights: {class_weights}")
            self.class_weights = torch.tensor(class_weights, dtype=torch.float).to(self.args.device)
        else:
            self.class_weights = None

    def compute_loss(self, model, inputs, return_outputs=False):
        """
        重写 compute_loss 方法。
        """
        # 从模型的输入中分离出标签
        labels = inputs.pop("labels")
        # 正向传播，获取模型的原始输出 (logits)
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # 定义带权重的交叉熵损失函数
        # 如果 self.class_weights 存在，就使用它
        loss_fct = nn.CrossEntropyLoss(weight=self.class_weights)

        # 计算损失
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))

        return (loss, outputs) if return_outputs else loss

class TransformerClassifier:
    def __init__(self, model_name="JQ1984/legalbert_gdpr_pretrained", num_labels=None, output_dir="/content/drive/MyDrive/result", multi_label=False):
        self.model_name = model_name
        self.num_labels = num_labels
        self.output_dir = output_dir
        self.multi_label = multi_label
        self.tokenizer = None
        self.model = None
        self.trainer = None
        self.train_time = None # User's original attribute

        # Attributes from user's original prepare_data/load_external_test_data
        self.label_encoder = None
        self.kf = None # KFold object
        self.fold_datasets = [] # List of (train_dataset, val_dataset) tuples for K-Fold

        # Attributes needed for external test data handling by original methods and baseline
        self.X_test_external_original = None # List of text strings for external test set
        self.y_test_external = None # Numpy array of labels for external test set
        self.external_test_dataset = None # Hugging Face Dataset for external test set

        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        # For baseline model outputs (minimal addition for baseline methods to function)
        self.baseline_output_dir = os.path.join(self.output_dir, "baseline_model_results")
        if not os.path.exists(self.baseline_output_dir):
            os.makedirs(self.baseline_output_dir)
        self.baseline_model_last_fold = None # Stores the pipeline from the last fold of baseline CV


    def _features_to_text(self, df):
        texts = []
        for _, row in df.iterrows():
            text = ""

            # 原有的特征处理逻辑
            for col, val in row.items():
                if col == 'gdpr_clause' and isinstance(val, str):
                    clauses = [clause.strip() for clause in str(val).split(',')]
                    clause_text = " and ".join(clauses)
                    text += f"GDPR clauses are {clause_text}. "

                elif col == 'Date' and isinstance(val, str):
                    text += f"Date is {val}. "

                elif col in ['country', 'company_industry'] and isinstance(val, str):
                    text += f"{col} is {val}. "

                elif isinstance(val, (int, float)):
                    if val == 1:
                        feature_name = col.replace('_', ' ').lower()

                        if col in ['data_category_Children_data', 'data_category_Special_category_data']:
                            text += f"SENSITIVE DATA: {feature_name} is true. "

                        elif col in ['free_speech_exception', 'country_security_exception', 'Criminal_investigation_exception']:
                            text += f"EXCEPTION EXISTS: {feature_name} is true. "

                        else:
                            text += f"{feature_name} is true. "

                elif isinstance(val, str):
                    text += f"{col} is {val}. "

            texts.append(text)
        return texts

    def prepare_data(self, df, target_columns, n_splits=5, use_balance=True):
        """Prepare data for K-fold cross validation"""
        logger.info(f"Preparing dataset for {n_splits}-fold cross validation, target columns: {target_columns}")

        exclude_columns = ['fine_amount']
        exclude_columns.extend([col for col in df.columns if col.startswith('violation_nature_')])

        if 'gdpr_clause' in df.columns:
            exclude_columns.append('gdpr_clause')

        df_copy = df.drop(columns=exclude_columns, errors='ignore')
        logger.info(f"Excluded columns: {exclude_columns}")

        if not target_columns:
            raise ValueError("Target columns not specified. Ensure the dataset contains a 'violation_result' column or provide a custom list of target columns.")

        missing_cols = [col for col in target_columns if col not in df_copy.columns]
        if missing_cols:
            raise ValueError(f"The following target columns are missing from the dataset after exclusions: {missing_cols}")

        for col in df_copy.select_dtypes(include=['object']).columns:
            df_copy[col] = df_copy[col].fillna('')
        for col in df_copy.select_dtypes(include=['number']).columns:
            if col not in target_columns:
                df_copy[col] = df_copy[col].fillna(df_copy[col].median())

        target_col = target_columns[0]

        if df_copy[target_col].dtype == 'object':
            logger.info(f"Target column '{target_col}' is categorical, converting to numeric categories")
            le = LabelEncoder()
            df_copy[target_col] = le.fit_transform(df_copy[target_col])
            self.label_encoder = le
            logger.info(f"Category mapping: {dict(zip(le.classes_, le.transform(le.classes_)))}")

        X = df_copy.drop(columns=target_columns)
        Y = df_copy[target_col].astype('int64')

        # 对所有分类特征进行编码（用于SMOTE）
        label_encoders = {}
        X_encoded = X.copy()
        for col in X.select_dtypes(include=['object']).columns:
            le = LabelEncoder()
            X_encoded[col] = le.fit_transform(X[col])
            label_encoders[col] = le
            logger.info(f"Encoded categorical column '{col}' for SMOTE")

        # Set self.num_labels if not already set or if it mismatches
        if self.num_labels is None or self.num_labels != len(Y.unique()):
            if self.num_labels is not None and self.num_labels != len(Y.unique()):
                logger.warning(f"Initialized num_labels ({self.num_labels}) does not match unique values in target ({len(Y.unique())}). Updating.")
            self.num_labels = len(Y.unique())

        logger.info(f"Feature count (after exclusions for text generation): {X.shape[1]}, Sample count: {X.shape[0]}")
        logger.info(f"Category count: {self.num_labels}, Category distribution: {Y.value_counts().to_dict()}")

        X_text = self._features_to_text(X)  # 使用原始X生成文本

        self.kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
        self.fold_datasets = []

        for fold_idx, (train_idx, val_idx) in enumerate(self.kf.split(X_text)):
            if use_balance:
                # 使用编码后的数据进行SMOTE
                X_train_fold_encoded = X_encoded.iloc[train_idx]
                y_train_fold = Y.iloc[train_idx]

                logger.info(f"Fold {fold_idx + 1} - Original training distribution: {Counter(y_train_fold)}")
                smote = SMOTE(random_state=42)
                X_train_fold_balanced, y_train_fold_balanced = smote.fit_resample(X_train_fold_encoded, y_train_fold)
                logger.info(f"Fold {fold_idx + 1} - Balanced training distribution: {Counter(y_train_fold_balanced)}")

                # 将平衡后的编码数据转换回DataFrame格式
                X_train_fold_balanced_df = pd.DataFrame(X_train_fold_balanced, columns=X_encoded.columns)

                # 将编码的值转换回原始类别（用于文本生成）
                X_train_fold_decoded = X_train_fold_balanced_df.copy()
                for col, le in label_encoders.items():
                    X_train_fold_decoded[col] = le.inverse_transform(X_train_fold_balanced_df[col].astype(int))

                # 转换为文本
                X_train_fold_text = self._features_to_text(X_train_fold_decoded)
                y_train_fold = y_train_fold_balanced
            else:
                X_train_fold_text = [X_text[i] for i in train_idx]
                y_train_fold = Y.iloc[train_idx].values

            # 验证集保持不变
            X_val_fold_text = [X_text[i] for i in val_idx]
            y_val_fold = Y.iloc[val_idx].values

            train_dataset = Dataset.from_dict({'text': X_train_fold_text, 'labels': y_train_fold})
            val_dataset = Dataset.from_dict({'text': X_val_fold_text, 'labels': y_val_fold})
            self.fold_datasets.append((train_dataset, val_dataset))

        logger.info(f"Created {n_splits} folds for cross-validation stored in self.fold_datasets")
        return True


    def load_external_test_data(self, file_path, target_columns): # User's original method
        """Load external test dataset"""
        logger.info(f"Loading external test data from {file_path}")

        if file_path.endswith('.csv'):
            test_df = pd.read_csv(file_path, sep=';')
        else:
            raise ValueError("Unsupported file format")

        if 'Affected_data_volume' in test_df.columns:
            logger.info("Handling Affected_data_volume column")
            if test_df['Affected_data_volume'].dtype == 'object':
                test_df['Affected_data_volume'] = pd.to_numeric(
                    test_df['Affected_data_volume'].replace('unspecific', 0),
                    errors='coerce'
                ).fillna(0)

        for col in test_df.select_dtypes(include=['object']).columns:
            test_df[col] = test_df[col].fillna('')
        for col in test_df.select_dtypes(include=['number']).columns:
            if col not in target_columns:
                test_df[col] = test_df[col].fillna(test_df[col].median())

        target_col = target_columns[0]
        has_labels_in_file = False
        if target_col in test_df.columns:
            has_labels_in_file = True
            if hasattr(self, 'label_encoder') and self.label_encoder and test_df[target_col].dtype == 'object':
                # Fill NaNs before checking unique or transforming
                test_df[target_col] = test_df[target_col].fillna(self.label_encoder.classes_[0])
                unknown_categories = set(test_df[target_col].unique()) - set(self.label_encoder.classes_)
                if unknown_categories:
                    logger.warning(f"Unknown categories found in the test set: {unknown_categories}")
                    mode_category = self.label_encoder.classes_[0]
                    for cat in unknown_categories:
                        test_df.loc[test_df[target_col] == cat, target_col] = mode_category
                test_df[target_col] = self.label_encoder.transform(test_df[target_col])

            self.y_test_external = test_df[target_col].astype('int64').values
        else:
            self.y_test_external = None

        X_test_df = test_df.drop(columns=[target_col] if target_col in test_df.columns else [], errors='ignore')
        self.X_test_external_original = self._features_to_text(X_test_df)

        if has_labels_in_file and self.y_test_external is not None:
            self.external_test_dataset = Dataset.from_dict(
                {'text': self.X_test_external_original, 'labels': self.y_test_external}
            )
        else:
            self.external_test_dataset = Dataset.from_dict({'text': self.X_test_external_original})

        logger.info(f"External test size: {len(self.external_test_dataset)}")
        return True

    def load_model(self): # User's original method
        logger.info(f"Loading model: {self.model_name}")
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        if self.num_labels is None: # Ensure num_labels is set
            raise ValueError("self.num_labels has not been set. Run prepare_data first.")
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name,
            num_labels=self.num_labels,
            problem_type="multi_label_classification" if self.multi_label else None
        )
        self.model.to(device)

    def tokenize_data(self, dataset, max_length=256): # User's original method
        """Tokenize a single dataset"""
        if not self.tokenizer: # Added check
            logger.error("Tokenizer not available. Load model first.")
            return None
        def tokenize_function(examples):
            return self.tokenizer(
                examples['text'],
                padding="max_length",
                truncation=True,
                max_length=max_length
            )
        return dataset.map(tokenize_function, batched=True)

 # In the TransformerClassifier class

    def train_and_evaluate_kfold(self, epochs=30, batch_size=16, learning_rate=3e-5, weight_decay=0.01, class_weights=None, early_stopping_patience: int = 5):
        """Train and evaluate using K-fold cross validation, but avoid multiple model loads"""
        logger.info("Starting K-fold cross validation training")

        if class_weights is not None:
            logger.info(f"Using class weights for training: {class_weights}")

        self.load_model()

        fold_results = []
        fold_accuracies = []
        all_fold_train_times = []

        # --- FIX START ---
        # Capture the label encoder in a local variable that compute_metrics can access.
        label_encoder_for_metrics = self.label_encoder
        # --- FIX END ---

        for fold, (train_dataset, val_dataset) in enumerate(self.fold_datasets):
            logger.info(f"Training fold {fold+1}/{len(self.fold_datasets)}")

            fold_output_dir = os.path.join(self.output_dir, f"fold_{fold+1}")
            os.makedirs(fold_output_dir, exist_ok=True)

            tokenized_train = self.tokenize_data(train_dataset)
            tokenized_val = self.tokenize_data(val_dataset)

            if tokenized_train is None or tokenized_val is None:
                # ... (rest of the error handling code is fine)
                continue

            if fold > 0:
                # ... (rest of the model re-initialization is fine)
                self.model = AutoModelForSequenceClassification.from_pretrained(
                    self.model_name,
                    num_labels=self.num_labels,
                    problem_type="multi_label_classification" if self.multi_label else None,
                    from_tf=False,
                    local_files_only=True
                ).to(device)


            training_args = TrainingArguments(
                output_dir=fold_output_dir,
                num_train_epochs=epochs,
                per_device_train_batch_size=batch_size,
                per_device_eval_batch_size=batch_size,
                weight_decay=weight_decay,
                learning_rate=learning_rate,
                logging_dir=f"{fold_output_dir}/logs",
                evaluation_strategy="epoch",
                save_strategy="epoch",
                load_best_model_at_end=True,
                metric_for_best_model="f1",
                greater_is_better=True,
                push_to_hub=False,
                report_to="none"
            )

            def compute_metrics(eval_pred):
                logits, labels = eval_pred
                preds = np.argmax(logits, axis=1)

                precision, recall, f1, _ = precision_recall_fscore_support(
                    labels, preds, average='macro', zero_division=0
                )
                acc = accuracy_score(labels, preds)

                print("\n--- Classification Report for this evaluation step ---")
                # --- FIX START ---
                # Use the local variable instead of self.label_encoder
                target_names = label_encoder_for_metrics.classes_ if label_encoder_for_metrics is not None else None
                # --- FIX END ---
                print(classification_report(labels, preds, target_names=target_names, zero_division=0))
                print("------------------------------------------------------\n")

                return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}

            current_trainer = WeightedLossTrainer(
                model=self.model,
                args=training_args,
                train_dataset=tokenized_train,
                eval_dataset=tokenized_val,
                compute_metrics=compute_metrics,
                tokenizer=self.tokenizer,
                class_weights=class_weights
            )

            start_time_fold = time.time()
            current_trainer.train()
            # ... (the rest of your method is fine)
            current_fold_train_time = time.time() - start_time_fold
            all_fold_train_times.append(current_fold_train_time)


            eval_results = current_trainer.evaluate()
            fold_accuracy = eval_results['eval_accuracy']
            fold_accuracies.append(fold_accuracy)
            logger.info(f"Fold {fold+1} training time: {current_fold_train_time:.2f}s, Accuracy: {fold_accuracy:.4f}, F1: {eval_results.get('eval_f1', 0.0):.4f}")

            with open(os.path.join(fold_output_dir, 'eval_results.json'), 'w') as f:
                json.dump(eval_results, f)
            fold_results.append({
                'fold': fold+1,
                'eval_results': eval_results,
                'training_time': current_fold_train_time
            })
            current_trainer.save_model(os.path.join(fold_output_dir, "best_model"))
            if fold == len(self.fold_datasets) - 1:
                self.trainer = current_trainer

        # ... (the rest of your method is fine, including the final print statements and return)
        print("\n===== K-fold Cross Validation Accuracies =====")
        for i, acc in enumerate(fold_accuracies):
            print(f"Fold {i+1} Accuracy: {acc:.4f}")
        print("==============================================")

        avg_results = {
            'avg_accuracy': np.mean(fold_accuracies) if fold_accuracies else 0.0,
            'avg_f1': np.mean([res['eval_results'].get('eval_f1', 0.0) for res in fold_results if 'eval_results' in res]) if fold_results else 0.0,
            'avg_precision': np.mean([res['eval_results'].get('eval_precision', 0.0) for res in fold_results if 'eval_results' in res]) if fold_results else 0.0,
            'avg_recall': np.mean([res['eval_results'].get('eval_recall', 0.0) for res in fold_results if 'eval_results' in res]) if fold_results else 0.0,
            'avg_training_time': np.mean(all_fold_train_times) if all_fold_train_times else 0.0
        }
        with open(os.path.join(self.output_dir, 'avg_kfold_results.json'), 'w') as f:
            json.dump(avg_results, f)
        logger.info(f"K-fold cross validation complete. Average Accuracy: {avg_results['avg_accuracy']:.4f}, Average F1: {avg_results['avg_f1']:.4f}")
        return fold_results, avg_results, fold_accuracies

    def evaluate_external_test(self): # User's original method
        """Evaluate on the external test set using the last fold's model"""
        logger.info("Evaluating model on external test set")

        if not hasattr(self, 'external_test_dataset') or self.external_test_dataset is None:
            logger.error("No external test dataset loaded")
            return None

        if not self.trainer:
            logger.error("No trainer available. Run train_and_evaluate_kfold first.")
            return None

        tokenized_test = self.tokenize_data(self.external_test_dataset)
        if tokenized_test is None:
            logger.error("Tokenization of external test set failed.")
            return None

        # User's original logic for checking labels in external_test_dataset directly
        has_labels_in_dataset = 'labels' in self.external_test_dataset.features

        if has_labels_in_dataset and self.trainer: # Check based on dataset features
            test_predictions_output = self.trainer.predict(tokenized_test)
            preds = np.argmax(test_predictions_output.predictions, axis=1)

            # Use self.y_test_external for metrics as per user's original code,
            # assuming it's the ground truth for the external test set.
            if self.y_test_external is None:
                logger.error("self.y_test_external is None, cannot calculate external test metrics.")
                return {'predictions': preds} # Return predictions if no ground truth labels

            cm = confusion_matrix(self.y_test_external, preds)
            report_dict = classification_report(
                self.y_test_external,
                preds,
                output_dict=True,
                zero_division=0,
                target_names=self.label_encoder.classes_ if hasattr(self, 'label_encoder') and self.label_encoder else None
            )
            report_df = pd.DataFrame(report_dict).transpose()
            report_df.to_csv(os.path.join(self.output_dir, 'external_test_classification_report.csv'))

            print("\nExternal Test Set Classification Report:")
            print(report_df)

            return {
                'test_results': test_predictions_output.metrics,
                'confusion_matrix': cm,
                'classification_report': report_df
            }
        elif self.trainer:
            logger.info("External test data has no labels in dataset features or self.y_test_external is None. Generating predictions only.")
            test_predictions_output = self.trainer.predict(tokenized_test)
            preds = np.argmax(test_predictions_output.predictions, axis=1)
            pred_df = pd.DataFrame({'prediction': preds})
            pred_df.to_csv(os.path.join(self.output_dir, 'external_test_predictions.csv'), index=False)
            print("\nPredictions for external test set saved to 'external_test_predictions.csv'")
            return {'predictions': preds}
        else:
            logger.error("No trainer available for prediction.")
            return None

    # --- BASELINE MODEL METHODS START ---
    def train_and_evaluate_baseline_kfold(self): # Removed n_splits_param, will use len(self.fold_datasets)
        """Train and evaluate baseline model (TF-IDF + Logistic Regression) using K-fold cross validation."""
        logger.info("Starting K-fold cross validation for Baseline model")

        if not self.fold_datasets:
            logger.error("Fold datasets not prepared for baseline. Run prepare_data first.")
            return [], {}, []

        fold_results_baseline = []
        fold_accuracies_baseline = []
        all_train_times_baseline = []

        actual_n_splits = len(self.fold_datasets)
        if actual_n_splits == 0:
            logger.error("No fold datasets available for baseline K-fold.")
            return [], {}, []

        logger.info(f"Baseline K-fold will run for {actual_n_splits} folds.")

        for fold, (train_hf_dataset, val_hf_dataset) in enumerate(self.fold_datasets):
            logger.info(f"Training baseline fold {fold+1}/{actual_n_splits}")

            X_train_text = train_hf_dataset['text']
            y_train = np.array(train_hf_dataset['labels'])
            X_val_text = val_hf_dataset['text']
            y_val = np.array(val_hf_dataset['labels'])

            pipeline = make_pipeline(
                TfidfVectorizer(max_features=5000, ngram_range=(1, 2), min_df=2, max_df=0.95),
                LogisticRegression(solver='liblinear', random_state=(42 + fold), C=1.0, class_weight='balanced')
            )

            start_time = time.time()
            try:
                pipeline.fit(X_train_text, y_train)
            except Exception as e:
                logger.error(f"Error fitting baseline pipeline for fold {fold+1}: {e}")
                fold_accuracies_baseline.append(0.0)
                fold_results_baseline.append({
                    'fold': fold+1,
                    'eval_results': {'eval_accuracy': 0.0, 'eval_f1': 0.0, 'eval_precision': 0.0, 'eval_recall': 0.0},
                    'training_time': 0, 'status': 'training_error'
                })
                continue

            train_time_fold = time.time() - start_time
            all_train_times_baseline.append(train_time_fold)

            preds_val = pipeline.predict(X_val_text)

            acc_val = accuracy_score(y_val, preds_val)
            # Storing full metrics for baseline as well, though user only asked for accuracy print
            precision_val, recall_val, f1_val, _ = precision_recall_fscore_support(y_val, preds_val, average='macro', zero_division=0)

            fold_accuracies_baseline.append(acc_val)
            eval_results_fold_baseline = {'eval_accuracy': acc_val, 'eval_f1': f1_val, 'eval_precision': precision_val, 'eval_recall': recall_val}

            logger.info(f"Baseline Fold {fold+1} training time: {train_time_fold:.2f}s, Accuracy: {acc_val:.4f}")

            fold_output_dir_bl = os.path.join(self.baseline_output_dir, f"fold_{fold+1}")
            os.makedirs(fold_output_dir_bl, exist_ok=True)
            with open(os.path.join(fold_output_dir_bl, 'eval_results_baseline.json'), 'w') as f:
                json.dump(eval_results_fold_baseline, f)

            fold_results_baseline.append({
                'fold': fold+1,
                'eval_results': eval_results_fold_baseline,
                'training_time': train_time_fold,
                'status': 'success'
            })

            if fold == actual_n_splits - 1:
                self.baseline_model_last_fold = pipeline
                logger.info(f"Saved baseline pipeline from fold {fold+1} to self.baseline_model_last_fold.")

        avg_accuracy_baseline = np.mean([acc for acc in fold_accuracies_baseline if isinstance(acc, float)]) if fold_accuracies_baseline else 0.0
        logger.info(f"Baseline K-fold cross validation complete. Average Accuracy: {avg_accuracy_baseline:.4f}")

        avg_results_baseline_dict = {
            'avg_accuracy': avg_accuracy_baseline,
            'avg_f1': np.mean([res['eval_results'].get('eval_f1', 0.0) for res in fold_results_baseline if res.get('status') == 'success' and 'eval_results' in res]) if fold_results_baseline else 0.0,
            'avg_precision': np.mean([res['eval_results'].get('eval_precision', 0.0) for res in fold_results_baseline if res.get('status') == 'success' and 'eval_results' in res]) if fold_results_baseline else 0.0,
            'avg_recall': np.mean([res['eval_results'].get('eval_recall', 0.0) for res in fold_results_baseline if res.get('status') == 'success' and 'eval_results' in res]) if fold_results_baseline else 0.0,
            'avg_training_time': np.mean(all_train_times_baseline) if all_train_times_baseline else 0.0
        }
        with open(os.path.join(self.baseline_output_dir, 'avg_kfold_results_baseline.json'), 'w') as f:
            json.dump(avg_results_baseline_dict, f)

        return fold_results_baseline, avg_results_baseline_dict, fold_accuracies_baseline

    def evaluate_baseline_external_test(self):
        """Evaluate the baseline model on the external test set."""
        logger.info("Evaluating Baseline model on external test set")
        if self.X_test_external_original is None:
            logger.error("No external test data text (X_test_external_original) available for Baseline.")
            return None # Return None or an empty dict
        if self.baseline_model_last_fold is None:
            logger.error("No Baseline model (pipeline) available from K-fold training (self.baseline_model_last_fold is None).")
            return None

        X_test_text = self.X_test_external_original # List of text strings

        # Check if external test labels are available
        if self.y_test_external is not None:
            logger.info("External test set has labels. Evaluating Baseline model with metrics.")
            try:
                preds_test_baseline = self.baseline_model_last_fold.predict(X_test_text)
            except Exception as e:
                logger.error(f"Error predicting with baseline model on external test data: {e}")
                return None

            acc_test = accuracy_score(self.y_test_external, preds_test_baseline)
            precision_test, recall_test, f1_test, _ = precision_recall_fscore_support(self.y_test_external, preds_test_baseline, average='macro', zero_division=0)

            test_metrics_baseline = {
                'test_accuracy': acc_test,
                'test_f1': f1_test,
                'test_precision': precision_test,
                'test_recall': recall_test
            }
            cm_baseline = confusion_matrix(self.y_test_external, preds_test_baseline)
            target_names_for_report = None
            if hasattr(self, 'label_encoder') and self.label_encoder is not None:
                try:
                    # Ensure all label indices are within the range of label_encoder.classes_
                    max_label_idx = max(np.max(self.y_test_external), np.max(preds_test_baseline))
                    if max_label_idx < len(self.label_encoder.classes_):
                        target_names_for_report = self.label_encoder.classes_
                    else:
                        logger.warning(f"Max label index ({max_label_idx}) for baseline external test is out of bounds for label encoder classes (size {len(self.label_encoder.classes_)}). Reporting without target names.")
                except Exception as e_classes:
                    logger.warning(f"Could not determine target names for baseline classification report: {e_classes}")

            report_dict_baseline = classification_report(
                self.y_test_external, preds_test_baseline, output_dict=True, zero_division=0,
                target_names=target_names_for_report
            )
            report_df_baseline = pd.DataFrame(report_dict_baseline).transpose()
            report_df_baseline.to_csv(os.path.join(self.baseline_output_dir, 'baseline_external_test_classification_report.csv'))
            logger.info(f"\nBaseline External Test Set Classification Report:\n{report_df_baseline}")
            logger.info(f"Baseline External Test Metrics: {test_metrics_baseline}")
            return {'test_metrics': test_metrics_baseline, 'confusion_matrix': cm_baseline.tolist(), 'classification_report_df': report_df_baseline.to_dict()}
        else: # No labels for external test set
            logger.info("External test set has no labels. Generating predictions with Baseline model.")
            try:
                preds_test_baseline = self.baseline_model_last_fold.predict(X_test_text)
                pd.DataFrame({'prediction': preds_test_baseline}).to_csv(os.path.join(self.baseline_output_dir, 'baseline_external_test_predictions.csv'), index=False)
                logger.info("Baseline predictions for external test set saved.")
                return {'predictions': preds_test_baseline.tolist()}
            except Exception as e:
                logger.error(f"Error predicting with baseline model on external test data (no labels): {e}")
                return None


## Train

In [7]:
from collections import Counter

if __name__ == '__main__':
    # 定义文件路径
    train_file_path = '/content/drive/MyDrive/Thesis/FINALFI.csv'
    test_file_path = '/content/drive/MyDrive/Thesis/Testdataset.csv'

    # 加载训练数据
    try:
        df = pd.read_csv(train_file_path, sep=';')
    except FileNotFoundError:
        logger.error(f"Training file not found at {train_file_path}. Please check the path.")
        exit()
    except Exception as e:
        logger.error(f"Error reading training file: {e}")
        exit()

    target_columns = ['violation_result']

    # ==============================================================================
    #  修改部分 START: 计算权重并传入训练函数
    # ==============================================================================

    # 1. 计算类别权重
    #    我们使用原始的、未经过SMOTE处理的标签来计算权重
    logger.info("Calculating class weights for weighted loss...")
    le_for_weights = LabelEncoder()
    y_labels = le_for_weights.fit_transform(df[target_columns[0]])
    class_names = le_for_weights.classes_

    weights = compute_class_weight(
        class_weight='balanced',
        classes=np.unique(y_labels),
        y=y_labels
    )
    logger.info(f"Detected Classes: {class_names}")
    logger.info(f"Calculated Class Weights (for [0, 1]): {weights}")

    # 2. 创建分类器实例
    #    修改输出目录，以防覆盖之前的结果
    classifier = TransformerClassifier(
        model_name="JQ1984/legalbert_gdpr_pretrained",
        output_dir="./violation_result_model_weighted", # <--- 修改了输出目录
        multi_label=False
    )

    # 3. 准备数据 (SMOTE可以继续使用，作为辅助)
    if not classifier.prepare_data(df, target_columns, n_splits=5, use_balance=True):
        logger.error("Data preparation failed. Exiting.")
        exit()

    # 4. 加载外部测试集
    try:
        if not classifier.load_external_test_data(test_file_path, target_columns):
            logger.warning("Loading external test data failed or file not found.")
    except FileNotFoundError:
        logger.warning(f"Test file not found at {test_file_path}. External evaluation will be skipped.")
    except Exception as e:
        logger.warning(f"Error loading external test data: {e}.")

    # 5. 调用训练函数，并传入计算好的权重
    logger.info("\n--- Starting Transformer Model K-Fold Cross Validation (with Weighted Loss) ---")
    transformer_fold_results, transformer_avg_results, transformer_fold_accuracies = classifier.train_and_evaluate_kfold(
        epochs=30,
        class_weights=weights # <--- 关键！将权重传入
    )

    # ==============================================================================
    #  修改部分 END
    # ==============================================================================

    # 打印 Transformer 的 K-fold 准确率
    print("\nTransformer K-fold Cross Validation Accuracies:")
    for i, acc in enumerate(transformer_fold_accuracies):
        print(f"Fold {i+1} Accuracy: {acc:.4f}")

    # 打印 Transformer 的平均结果
    print("\nAverage K-fold Cross Validation Results:")
    for metric, value in transformer_avg_results.items():
        print(f"{metric}: {value:.4f}")

    # 在外部测试集上评估 Transformer
    logger.info("\n--- Evaluating Transformer Model on External Test Set ---")
    transformer_test_results = classifier.evaluate_external_test()

    # (可选) 在外部测试集上评估基线模型
    logger.info("\n--- Evaluating Baseline Model on External Test Set ---")
    baseline_test_results = classifier.evaluate_baseline_external_test()
    if baseline_test_results and 'test_metrics' in baseline_test_results:
        logger.info(f"Baseline external test metrics: {baseline_test_results['test_metrics']}")

    print(f"\nTraining and evaluation complete. All results saved in {classifier.output_dir}")

Some weights of the model checkpoint at JQ1984/legalbert_gdpr_pretrained were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at JQ1984/legalbert_gdpr_pretrained and are newly initialized: ['classifier.weigh

Map:   0%|          | 0/3292 [00:00<?, ? examples/s]

Map:   0%|          | 0/483 [00:00<?, ? examples/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.971801,0.418219,0.404998,0.574609,0.624009
2,No log,0.764285,0.658385,0.572985,0.595259,0.683184
3,0.326100,0.706648,0.743271,0.622751,0.613979,0.686312
4,0.326100,0.866939,0.643892,0.565647,0.595411,0.686346
5,0.270400,0.72126,0.652174,0.573913,0.601132,0.697029
6,0.270400,0.897971,0.602484,0.542415,0.59594,0.691218
7,0.270400,0.759505,0.73499,0.61837,0.611783,0.687286
8,0.240000,0.914946,0.68323,0.592405,0.605142,0.697747
9,0.240000,0.889249,0.6853,0.583261,0.592625,0.669817
10,0.214000,0.825744,0.724638,0.617062,0.613944,0.698704



--- Classification Report for this evaluation step ---
              precision    recall  f1-score   support

           0       0.19      0.92      0.32        71
           1       0.96      0.33      0.49       412

    accuracy                           0.42       483
   macro avg       0.57      0.62      0.40       483
weighted avg       0.85      0.42      0.47       483

------------------------------------------------------


--- Classification Report for this evaluation step ---
              precision    recall  f1-score   support

           0       0.26      0.72      0.38        71
           1       0.93      0.65      0.76       412

    accuracy                           0.66       483
   macro avg       0.60      0.68      0.57       483
weighted avg       0.83      0.66      0.71       483

------------------------------------------------------


--- Classification Report for this evaluation step ---
              precision    recall  f1-score   support

           


--- Classification Report for this evaluation step ---
              precision    recall  f1-score   support

           0       0.32      0.54      0.40        71
           1       0.91      0.81      0.85       412

    accuracy                           0.77       483
   macro avg       0.62      0.67      0.63       483
weighted avg       0.82      0.77      0.79       483

------------------------------------------------------



Map:   0%|          | 0/3296 [00:00<?, ? examples/s]

Map:   0%|          | 0/483 [00:00<?, ? examples/s]

Some weights of the model checkpoint at JQ1984/legalbert_gdpr_pretrained were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at JQ1984/legalbert_gdpr_pretrained and are newly initialized: ['classifier.weigh

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.819948,0.320911,0.320419,0.577793,0.58874
2,No log,0.701161,0.63354,0.572897,0.614396,0.722219
3,0.342900,0.814204,0.639752,0.57623,0.613694,0.720247
4,0.342900,0.743631,0.652174,0.586084,0.617997,0.727564
5,0.287900,0.726556,0.643892,0.582667,0.62062,0.733946
6,0.287900,0.694634,0.641822,0.579457,0.617157,0.727097
7,0.287900,0.803222,0.536232,0.501787,0.599108,0.687421
8,0.266500,0.844696,0.666667,0.596005,0.620594,0.730471
9,0.266500,0.797051,0.741201,0.62101,0.612319,0.678667
10,0.244300,0.762808,0.652174,0.586084,0.617997,0.727564



--- Classification Report for this evaluation step ---
              precision    recall  f1-score   support

           0       0.18      0.97      0.30        73
           1       0.98      0.20      0.34       410

    accuracy                           0.32       483
   macro avg       0.58      0.59      0.32       483
weighted avg       0.86      0.32      0.33       483

------------------------------------------------------


--- Classification Report for this evaluation step ---
              precision    recall  f1-score   support

           0       0.27      0.85      0.41        73
           1       0.96      0.60      0.73       410

    accuracy                           0.63       483
   macro avg       0.61      0.72      0.57       483
weighted avg       0.85      0.63      0.69       483

------------------------------------------------------


--- Classification Report for this evaluation step ---
              precision    recall  f1-score   support

           


--- Classification Report for this evaluation step ---
              precision    recall  f1-score   support

           0       0.36      0.64      0.46        73
           1       0.93      0.80      0.86       410

    accuracy                           0.77       483
   macro avg       0.64      0.72      0.66       483
weighted avg       0.84      0.77      0.80       483

------------------------------------------------------



Map:   0%|          | 0/3304 [00:00<?, ? examples/s]

Map:   0%|          | 0/482 [00:00<?, ? examples/s]

Some weights of the model checkpoint at JQ1984/legalbert_gdpr_pretrained were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at JQ1984/legalbert_gdpr_pretrained and are newly initialized: ['classifier.weigh

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.844897,0.508299,0.479702,0.586173,0.654654
2,No log,0.752456,0.560166,0.523324,0.605952,0.696137
3,0.323700,0.767687,0.626556,0.572063,0.616711,0.719504
4,0.323700,0.748179,0.651452,0.575393,0.598639,0.680808
5,0.265800,0.679164,0.711618,0.629426,0.632308,0.732564
6,0.265800,0.868667,0.676349,0.598719,0.613921,0.706281
7,0.265800,0.854037,0.711618,0.617598,0.617593,0.70048
8,0.237000,0.82241,0.736515,0.622367,0.61412,0.677826
9,0.237000,1.012507,0.728216,0.620256,0.614175,0.683595
10,0.214500,0.897421,0.761411,0.643755,0.630509,0.692604



--- Classification Report for this evaluation step ---
              precision    recall  f1-score   support

           0       0.23      0.87      0.36        76
           1       0.95      0.44      0.60       406

    accuracy                           0.51       482
   macro avg       0.59      0.65      0.48       482
weighted avg       0.83      0.51      0.56       482

------------------------------------------------------


--- Classification Report for this evaluation step ---
              precision    recall  f1-score   support

           0       0.25      0.89      0.39        76
           1       0.96      0.50      0.66       406

    accuracy                           0.56       482
   macro avg       0.61      0.70      0.52       482
weighted avg       0.85      0.56      0.61       482

------------------------------------------------------


--- Classification Report for this evaluation step ---
              precision    recall  f1-score   support

           


--- Classification Report for this evaluation step ---
              precision    recall  f1-score   support

           0       0.35      0.59      0.44        76
           1       0.91      0.79      0.85       406

    accuracy                           0.76       482
   macro avg       0.63      0.69      0.64       482
weighted avg       0.82      0.76      0.78       482

------------------------------------------------------



Map:   0%|          | 0/3274 [00:00<?, ? examples/s]

Map:   0%|          | 0/482 [00:00<?, ? examples/s]

Some weights of the model checkpoint at JQ1984/legalbert_gdpr_pretrained were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at JQ1984/legalbert_gdpr_pretrained and are newly initialized: ['classifier.weigh

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.856426,0.609959,0.541714,0.600724,0.727659
2,No log,0.84566,0.60166,0.530186,0.589362,0.701881
3,0.321200,0.858932,0.603734,0.531737,0.589914,0.703068
4,0.321200,0.710513,0.692946,0.596123,0.61223,0.740119
5,0.271900,0.735728,0.707469,0.603292,0.61233,0.734415
6,0.271900,0.861183,0.628631,0.550455,0.596825,0.71732
7,0.271900,0.722136,0.676349,0.589205,0.614752,0.751645
8,0.238400,0.804106,0.665975,0.575005,0.602611,0.72468
9,0.238400,0.847339,0.690871,0.583178,0.597222,0.703886
10,0.211400,0.903727,0.721992,0.602848,0.604527,0.707683



--- Classification Report for this evaluation step ---
              precision    recall  f1-score   support

           0       0.23      0.89      0.36        61
           1       0.97      0.57      0.72       421

    accuracy                           0.61       482
   macro avg       0.60      0.73      0.54       482
weighted avg       0.88      0.61      0.67       482

------------------------------------------------------


--- Classification Report for this evaluation step ---
              precision    recall  f1-score   support

           0       0.22      0.84      0.35        61
           1       0.96      0.57      0.71       421

    accuracy                           0.60       482
   macro avg       0.59      0.70      0.53       482
weighted avg       0.87      0.60      0.67       482

------------------------------------------------------


--- Classification Report for this evaluation step ---
              precision    recall  f1-score   support

           


--- Classification Report for this evaluation step ---
              precision    recall  f1-score   support

           0       0.29      0.67      0.41        61
           1       0.94      0.77      0.85       421

    accuracy                           0.76       482
   macro avg       0.62      0.72      0.63       482
weighted avg       0.86      0.76      0.79       482

------------------------------------------------------



Map:   0%|          | 0/3298 [00:00<?, ? examples/s]

Map:   0%|          | 0/482 [00:00<?, ? examples/s]

Some weights of the model checkpoint at JQ1984/legalbert_gdpr_pretrained were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at JQ1984/legalbert_gdpr_pretrained and are newly initialized: ['classifier.weigh

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.82949,0.558091,0.502594,0.568273,0.632699
2,No log,0.902772,0.622407,0.552939,0.59135,0.676223
3,0.336200,0.876002,0.634855,0.564267,0.598397,0.689185
4,0.336200,0.732806,0.697095,0.599431,0.60469,0.686472
5,0.274600,0.848453,0.684647,0.585207,0.593767,0.667884
6,0.274600,0.83404,0.684647,0.587403,0.596426,0.67351
7,0.274600,0.938219,0.676349,0.576545,0.587455,0.657367
8,0.254200,0.918245,0.599585,0.537173,0.58669,0.668403
9,0.254200,0.837834,0.721992,0.607902,0.604107,0.673008
10,0.229700,1.085899,0.715768,0.58994,0.587324,0.641206



--- Classification Report for this evaluation step ---
              precision    recall  f1-score   support

           0       0.22      0.74      0.34        73
           1       0.92      0.53      0.67       409

    accuracy                           0.56       482
   macro avg       0.57      0.63      0.50       482
weighted avg       0.81      0.56      0.62       482

------------------------------------------------------


--- Classification Report for this evaluation step ---
              precision    recall  f1-score   support

           0       0.25      0.75      0.38        73
           1       0.93      0.60      0.73       409

    accuracy                           0.62       482
   macro avg       0.59      0.68      0.55       482
weighted avg       0.83      0.62      0.68       482

------------------------------------------------------


--- Classification Report for this evaluation step ---
              precision    recall  f1-score   support

           


--- Classification Report for this evaluation step ---
              precision    recall  f1-score   support

           0       0.34      0.56      0.42        73
           1       0.91      0.80      0.85       409

    accuracy                           0.77       482
   macro avg       0.62      0.68      0.64       482
weighted avg       0.82      0.77      0.79       482

------------------------------------------------------


===== K-fold Cross Validation Accuracies =====
Fold 1 Accuracy: 0.7660
Fold 2 Accuracy: 0.7723
Fold 3 Accuracy: 0.7614
Fold 4 Accuracy: 0.7552
Fold 5 Accuracy: 0.7656

Transformer K-fold Cross Validation Accuracies:
Fold 1 Accuracy: 0.7660
Fold 2 Accuracy: 0.7723
Fold 3 Accuracy: 0.7614
Fold 4 Accuracy: 0.7552
Fold 5 Accuracy: 0.7656

Average K-fold Cross Validation Results:
avg_accuracy: 0.7641
avg_f1: 0.6390
avg_precision: 0.6261
avg_recall: 0.6968
avg_training_time: 4672.6800


Map:   0%|          | 0/100 [00:00<?, ? examples/s]


--- Classification Report for this evaluation step ---
              precision    recall  f1-score   support

           0       0.27      0.25      0.26        12
           1       0.90      0.91      0.90        88

    accuracy                           0.83       100
   macro avg       0.59      0.58      0.58       100
weighted avg       0.82      0.83      0.83       100

------------------------------------------------------



ERROR:Violationresult:No Baseline model (pipeline) available from K-fold training (self.baseline_model_last_fold is None).



External Test Set Classification Report:
              precision    recall  f1-score  support
0              0.272727  0.250000  0.260870    12.00
1              0.898876  0.909091  0.903955    88.00
accuracy       0.830000  0.830000  0.830000     0.83
macro avg      0.585802  0.579545  0.582412   100.00
weighted avg   0.823739  0.830000  0.826785   100.00

Training and evaluation complete. All results saved in ./violation_result_model_weighted
