In [None]:
import re

def extract_t5_results(output_file_path):
    """
    Print the T5/mT5 evaluation figures found in a captured training log.

    The log is read as UTF-8, tqdm progress-bar lines and blank lines are
    removed, and four report sections are printed to stdout:

    1. baseline (non-trained) F1 / accuracy per model,
    2. 5-fold cross-validation mean ± std per model,
    3. the paired t-test / Cohen's d comparison of F1 scores,
    4. baseline-vs-trained improvement per model.

    A section whose pattern is not found in the log prints only its header.

    Args:
        output_file_path: Path of the text log to parse.

    Returns:
        None. All results are written to stdout.

    Raises:
        OSError: If the file cannot be opened (e.g. FileNotFoundError).
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(output_file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Clean content - drop tqdm progress bars and whitespace-only lines so the
    # multi-line patterns below see the result lines back to back.
    noise_markers = ('%|', 'it/s', 'loss=', '██')
    cleaned_lines = [
        line for line in content.split('\n')
        if line.strip() and not any(marker in line for marker in noise_markers)
    ]
    cleaned_content = '\n'.join(cleaned_lines)

    # Statistics may be negative (t, Cohen's d) or printed in scientific
    # notation (very small p-values); a plain [0-9.]+ would reject those and
    # make the whole comparison block silently disappear.
    signed_num = r'[-+]?[0-9.]+(?:[eE][-+]?[0-9]+)?'

    print("=== BASELINE (NON-TRAINED) RESULTS ===")
    # From evaluate_non_trained_models function
    baseline_pattern = r'✅ (.*?) baseline results:\s+F1: ([0-9.]+)\s+Accuracy: ([0-9.]+)'
    baseline_matches = re.findall(baseline_pattern, cleaned_content)

    for model, f1, acc in baseline_matches:
        print(f"{model}: F1={f1}, Accuracy={acc}")

    print("\n=== 5-FOLD CROSS-VALIDATION RESULTS ===")
    # From the k-fold loop - final summary.
    # The model name is restricted to a single line ([^\n]): with DOTALL a
    # plain (.*?) would start at the first 📊 anywhere in the log and swallow
    # every line up to the summary header as the "model name".
    summary_pattern = (
        r'📊 ([^\n]*?) - 5-Fold Summary:\s*'
        r'F1 Scores: \[(.*?)\]\s*'
        r'Mean F1: ([0-9.]+) ± ([0-9.]+)\s*'
        r'Accuracy: \[(.*?)\]\s*'
        r'Mean Accuracy: ([0-9.]+) ± ([0-9.]+)'
    )
    # DOTALL is kept so the bracketed per-fold lists may wrap across lines.
    summary_matches = re.findall(summary_pattern, cleaned_content, re.DOTALL)

    for model, _f1_scores, f1_mean, f1_std, _acc_scores, acc_mean, acc_std in summary_matches:
        print(f"{model}:")
        print(f"  F1: {f1_mean} ± {f1_std}")
        print(f"  Accuracy: {acc_mean} ± {acc_std}")

    print("\n=== STATISTICAL COMPARISON (5-FOLD) ===")
    # From the statistical analysis section; model names are single-line for
    # the same reason as in the summary pattern above.
    stat_f1_pattern = (
        r'--- F1 Score Comparison \(5-fold\) ---\s*'
        r'([^\n]*?) - Mean F1: ([0-9.]+) ± ([0-9.]+)\s*'
        r'([^\n]*?) - Mean F1: ([0-9.]+) ± ([0-9.]+)\s*'
        r'Paired t-test \(5-fold\): t=(' + signed_num + r'), p=(' + signed_num + r')\s*'
        r'Significance: (.*?)\s*'
        r"Cohen's d \(effect size\): (" + signed_num + r')'
    )
    stat_match = re.search(stat_f1_pattern, cleaned_content, re.DOTALL)

    if stat_match:
        model1, f1_1, std1, model2, f1_2, std2, t_stat, p_val, sig, cohens_d = stat_match.groups()
        print("F1 Comparison:")
        print(f"  {model1.strip()}: {f1_1} ± {std1}")
        print(f"  {model2.strip()}: {f1_2} ± {std2}")
        print(f"  t-test: t={t_stat}, p={p_val}")
        print(f"  Significance: {sig.strip()}")
        print(f"  Cohen's d: {cohens_d}")

    print("\n=== IMPROVEMENT CALCULATIONS ===")
    # From baseline comparison section. [^:\n] keeps the model name on one
    # line; [^:] alone also matches newlines and could glue preceding
    # colon-free lines onto the name.
    improvement_pattern = (
        r'(mt5-[^:\n]+):\s*'
        r'Baseline \(non-trained\): F1=([0-9.]+), Acc=([0-9.]+)\s*'
        r'5-fold trained: F1=([0-9.]+), Acc=([0-9.]+)\s*'
        r'Improvement: F1=([+-][0-9.]+), Acc=([+-][0-9.]+)'
    )
    improvement_matches = re.findall(improvement_pattern, cleaned_content, re.IGNORECASE)

    for model, base_f1, base_acc, trained_f1, trained_acc, f1_imp, acc_imp in improvement_matches:
        print(f"{model}:")
        print(f"  Baseline: F1={base_f1}, Acc={base_acc}")
        print(f"  Trained: F1={trained_f1}, Acc={trained_acc}")
        print(f"  Improvement: F1={f1_imp}, Acc={acc_imp}")

# Usage: pass the path of the captured run log, e.g.
#   extract_t5_results('your_output_file.out')
# NOTE(review): the path below is absolute and machine-specific; the cell will
# fail with FileNotFoundError on any other machine.
extract_t5_results('/home/liorkob/M.Sc/thesis/citation-prediction/t5/t5_compre-5966530.out')

=== BASELINE (NON-TRAINED) RESULTS ===
mt5-mlm-final: F1=0.5097, Accuracy=0.3586
mt5-base: F1=0.5000, Accuracy=0.3333

=== 5-FOLD CROSS-VALIDATION RESULTS ===
LEGAL CITATION PREDICTION RESULTS:
F1 Score: 0.5097
Precision: 0.3424
Recall: 0.9966
Accuracy: 0.3586
Prediction Distribution: [ 23 847]
True Label Distribution: [579 291]
AUC-ROC: 0.5173
Classification Report:
              precision    recall  f1-score   support
           0       0.96      0.04      0.07       579
           1       0.34      1.00      0.51       291
    accuracy                           0.36       870
   macro avg       0.65      0.52      0.29       870
weighted avg       0.75      0.36      0.22       870
✅ mt5-mlm-final baseline results:
   F1: 0.5097
   Accuracy: 0.3586
   Threshold: 4.7980
📋 Evaluating base model: mt5-base
🔍 Finding best threshold...
Best threshold: 4.0205 (F1: 0.5000)
📊 LEGAL CITATION PREDICTION RESULTS:
F1 Score: 0.5000
Precision: 0.3337
Recall: 0.9966
Accuracy: 0.3333
Prediction Dist

: 

In [None]:
def extract_t5_results(output_file_path):
    """
    Print a training log with tqdm progress bars and blank lines removed.

    Debug variant: unlike a result parser, this only dumps the filtered log so
    the surviving lines can be inspected. Note that it reuses the name
    `extract_t5_results`, so running this cell replaces any earlier definition
    of that function in the kernel.

    Args:
        output_file_path: Path of the UTF-8 text log to read.

    Returns:
        None. The filtered log is written to stdout.
    """
    # Substrings that identify a tqdm progress-bar line.
    noise_markers = ('%|', 'it/s', 'loss=', '██')

    with open(output_file_path, 'r', encoding='utf-8') as log_file:
        raw_text = log_file.read()

    # Keep only non-blank rows that carry none of the progress-bar markers.
    kept_rows = [
        row
        for row in raw_text.split('\n')
        if row.strip() and not any(marker in row for marker in noise_markers)
    ]

    print('\n'.join(kept_rows))
# Dump the noise-filtered log of this run for manual inspection.
# NOTE(review): absolute, machine-specific path; not portable to other machines.
extract_t5_results('/home/liorkob/M.Sc/thesis/citation-prediction/t5/t5_compre-5966530.out')

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Using device: cuda
Loading and combining datasets for k-fold cross-validation...
Total samples for k-fold CV: 5791
Legal Dataset created: 870 samples
Label distribution: [579 291]
Sample input length: 2293 characters
🔍 EVALUATING NON-TRAINED (BASE) MODELS
📋 Evaluating base model: mt5-mlm-final
🔍 Finding best threshold...
Best threshold: 4.7980 (F1: 0.5097)
📊 LEGAL CITATION PREDICTION RESULTS:
F1 Score: 0.5097
Precision: 0.3424
Recall: 0.9966
Acc

: 