##ALBERT

In [None]:
"""
ALBERT Fine-tuning for NEDA/EDA Classification
Medical text classification using Hugging Face Transformers
"""

# ============================================================================
# STEP 1: Install Required Libraries
# ============================================================================
print("üì¶ Installing required libraries...")
!pip install -q transformers datasets accelerate openpyxl scikit-learn

üì¶ Installing required libraries...


In [None]:
# ============================================================================
# STEP 2: Import Libraries
# ============================================================================
print("üìö Importing libraries...")
import os
import pandas as pd
import numpy as np
import torch
from google.colab import files, userdata
from transformers import (
    AlbertTokenizer,
    AlbertForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from datasets import Dataset, DatasetDict
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

print("‚úÖ All libraries imported successfully!")
print(f"üî• PyTorch version: {torch.__version__}")
print(f"ü§ó Transformers library loaded")
print(f"üíª CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"üéÆ GPU: {torch.cuda.get_device_name(0)}")



üìö Importing libraries...
‚úÖ All libraries imported successfully!
üî• PyTorch version: 2.8.0+cu126
ü§ó Transformers library loaded
üíª CUDA available: True
üéÆ GPU: NVIDIA A100-SXM4-40GB


In [None]:
# ============================================================================
# STEP 3: Get HuggingFace Token from Colab Secrets
# ============================================================================
print("\nüîë Retrieving HuggingFace token from secrets...")
try:
    HF_TOKEN = userdata.get('HF_TOKEN')
    print("‚úÖ HuggingFace token retrieved successfully!")

    # Login to HuggingFace
    from huggingface_hub import login
    login(token=HF_TOKEN)
    print("‚úÖ Logged in to HuggingFace Hub!")
except Exception as e:
    print(f"‚ö†Ô∏è Warning: Could not retrieve HF_TOKEN from secrets: {e}")
    print("You can continue without it, but won't be able to push to Hub")
    HF_TOKEN = None


üîë Retrieving HuggingFace token from secrets...
‚úÖ HuggingFace token retrieved successfully!
‚úÖ Logged in to HuggingFace Hub!


In [None]:
# ============================================================================
# STEP 4: Upload Excel Files
# ============================================================================
print("\nüìÅ Please upload your training Excel file...")
train_uploaded = files.upload()
train_filename = list(train_uploaded.keys())[0]
print(f"‚úÖ Training file uploaded: {train_filename}")

print("\nüìÅ Please upload your test Excel file...")
test_uploaded = files.upload()
test_filename = list(test_uploaded.keys())[0]
print(f"‚úÖ Test file uploaded: {test_filename}")



üìÅ Please upload your training Excel file...


Saving train.xlsx to train.xlsx
‚úÖ Training file uploaded: train.xlsx

üìÅ Please upload your test Excel file...


Saving test.xlsx to test.xlsx
‚úÖ Test file uploaded: test.xlsx


In [None]:
# ============================================================================
# STEP 5: Load and Prepare Data
# ============================================================================
print("\nüìä Loading data from Excel files...")
train_df = pd.read_excel(train_filename)
test_df = pd.read_excel(test_filename)

print(f"‚úÖ Training data shape: {train_df.shape}")
print(f"‚úÖ Test data shape: {test_df.shape}")
print(f"\nüìã Training data columns: {train_df.columns.tolist()}")
print(f"\nüîç First few rows of training data:")
print(train_df.head())




üìä Loading data from Excel files...
‚úÖ Training data shape: (6472, 3)
‚úÖ Test data shape: (3373, 3)

üìã Training data columns: ['MSC research database ID', 'input', 'output']

üîç First few rows of training data:
   MSC research database ID  \
0                         1   
1                         1   
2                         1   
3                         1   
4                         1   

                                               input  \
0  Age:31.  Gender:female.  Diagnosis:rrms. Has n...   
1  Age:31.  Gender:female.  Diagnosis:rrms. Has n...   
2  Age:31.  Gender:female.  Diagnosis:rrms. Has n...   
3  Age:31.  Gender:female.  Diagnosis:rrms. Has n...   
4  Age:31.  Gender:female.  Diagnosis:rrms. Has n...   

                                        output  
0  After 7 months the patient will be in NEDA.  
1   After 7 months the patient will be in EDA.  
2   After 0 months the patient will be in EDA.  
3   After 4 months the patient will be in EDA.  
4  After 6

In [None]:
# ============================================================================
# STEP 6: Process Labels - Extract NEDA/EDA from Output
# ============================================================================
print("\nüè∑Ô∏è Processing labels...")

def extract_label(output_text):
    """Extract NEDA or EDA from output text"""
    output_text = str(output_text).upper()
    if 'NEDA' in output_text:
        return 'NEDA'
    elif 'EDA' in output_text:
        return 'EDA'
    else:
        return None

# Apply label extraction
train_df['label_text'] = train_df['output'].apply(extract_label)
test_df['label_text'] = test_df['output'].apply(extract_label)

# Remove rows with None labels
train_df = train_df[train_df['label_text'].notna()].reset_index(drop=True)
test_df = test_df[test_df['label_text'].notna()].reset_index(drop=True)

# Create label mapping
label2id = {'NEDA': 0, 'EDA': 1}
id2label = {0: 'NEDA', 1: 'EDA'}

# Convert to numeric labels
train_df['label'] = train_df['label_text'].map(label2id)
test_df['label'] = test_df['label_text'].map(label2id)

print(f"‚úÖ Labels extracted successfully!")
print(f"üìä Training set after label extraction: {len(train_df)} samples")
print(f"üìä Test set after label extraction: {len(test_df)} samples")
print(f"\nüìà Training label distribution:")
print(train_df['label_text'].value_counts())
print(f"\nüìà Test label distribution:")
print(test_df['label_text'].value_counts())



üè∑Ô∏è Processing labels...
‚úÖ Labels extracted successfully!
üìä Training set after label extraction: 6472 samples
üìä Test set after label extraction: 3373 samples

üìà Training label distribution:
label_text
NEDA    4989
EDA     1483
Name: count, dtype: int64

üìà Test label distribution:
label_text
NEDA    2630
EDA      743
Name: count, dtype: int64


In [None]:
# ============================================================================
# STEP 7: Initialize Tokenizer and Model
# ============================================================================
print("\nü§ñ Loading ALBERT model and tokenizer...")
MODEL_NAME = "albert-base-v2"

tokenizer = AlbertTokenizer.from_pretrained(MODEL_NAME)
model = AlbertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    id2label=id2label,
    label2id=label2id
)

print(f"‚úÖ Model loaded: {MODEL_NAME}")
print(f"‚úÖ Number of parameters: {model.num_parameters():,}")




ü§ñ Loading ALBERT model and tokenizer...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


‚úÖ Model loaded: albert-base-v2
‚úÖ Number of parameters: 11,685,122


In [None]:
# ============================================================================
# STEP 8: Tokenize Data
# ============================================================================
print("\nüî§ Tokenizing data...")

def tokenize_function(examples):
    """Tokenize input texts"""
    return tokenizer(
        examples['input'],
        padding='max_length',
        truncation=True,
        max_length=512
    )

# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_df[['input', 'label']])
test_dataset = Dataset.from_pandas(test_df[['input', 'label']])

# Tokenize datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Set format for PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

print(f"‚úÖ Training dataset: {len(train_dataset)} samples")
print(f"‚úÖ Test dataset: {len(test_dataset)} samples")


üî§ Tokenizing data...


Map:   0%|          | 0/6472 [00:00<?, ? examples/s]

Map:   0%|          | 0/3373 [00:00<?, ? examples/s]

‚úÖ Training dataset: 6472 samples
‚úÖ Test dataset: 3373 samples


In [None]:

# ============================================================================
# STEP 9: Define Evaluation Metrics
# ============================================================================
print("\nüìä Setting up evaluation metrics...")

def compute_metrics(eval_pred):
    """Compute metrics for evaluation"""
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Calculate metrics
    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='binary'
    )

    # Confusion matrix
    cm = confusion_matrix(labels, predictions)

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'confusion_matrix': cm.tolist()
    }

print("‚úÖ Evaluation metrics configured!")





üìä Setting up evaluation metrics...
‚úÖ Evaluation metrics configured!


In [None]:
# ============================================================================
# STEP 10: Setup Training Arguments
# ============================================================================
print("\n‚öôÔ∏è Configuring training arguments...")

training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=30,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=2,
    report_to='none',  # Disable wandb/tensorboard
    push_to_hub=False,  # Set to True if you want to push to Hub
    optim='adamw_torch',  # Use AdamW optimizer (Adam with weight decay)
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-8,
)

print("‚úÖ Training arguments configured!")
print(f"   - Learning rate: {training_args.learning_rate}")
print(f"   - Batch size: {training_args.per_device_train_batch_size}")
print(f"   - Epochs: {training_args.num_train_epochs}")
print(f"   - Device: {'GPU' if torch.cuda.is_available() else 'CPU'}")




‚öôÔ∏è Configuring training arguments...
‚úÖ Training arguments configured!
   - Learning rate: 2e-05
   - Batch size: 8
   - Epochs: 30
   - Device: GPU


In [None]:
# ============================================================================
# STEP 11: Initialize Trainer
# ============================================================================
print("\nüéØ Initializing Trainer...")

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print("‚úÖ Trainer initialized successfully!")





üéØ Initializing Trainer...
‚úÖ Trainer initialized successfully!


In [None]:
# ============================================================================
# STEP 12: Train the Model
# ============================================================================
print("\nüöÄ Starting training...")
print("=" * 80)

train_result = trainer.train()

print("\n" + "=" * 80)
print("‚úÖ Training completed!")
print(f"üìä Training Loss: {train_result.training_loss:.4f}")
print(f"‚è±Ô∏è Training Time: {train_result.metrics['train_runtime']:.2f} seconds")



üöÄ Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Confusion Matrix
1,0.5365,0.529978,0.779721,0.0,0.0,0.0,"[[2630, 0], [743, 0]]"
2,0.464,0.527031,0.779721,0.0,0.0,0.0,"[[2630, 0], [743, 0]]"
3,0.5236,0.526867,0.779721,0.0,0.0,0.0,"[[2630, 0], [743, 0]]"
4,0.5295,0.527289,0.779721,0.0,0.0,0.0,"[[2630, 0], [743, 0]]"
5,0.5289,0.527028,0.779721,0.0,0.0,0.0,"[[2630, 0], [743, 0]]"
6,0.6756,0.529052,0.779721,0.0,0.0,0.0,"[[2630, 0], [743, 0]]"
7,0.5392,0.526671,0.779721,0.0,0.0,0.0,"[[2630, 0], [743, 0]]"
8,0.5502,0.535423,0.779721,0.0,0.0,0.0,"[[2630, 0], [743, 0]]"
9,0.5936,0.527373,0.779721,0.0,0.0,0.0,"[[2630, 0], [743, 0]]"
10,0.5142,0.527905,0.779721,0.0,0.0,0.0,"[[2630, 0], [743, 0]]"



‚úÖ Training completed!
üìä Training Loss: 0.4914
‚è±Ô∏è Training Time: 5094.79 seconds


In [None]:
# ============================================================================
# STEP 13: Evaluate on Test Set
# ============================================================================
print("\nüìà Evaluating on test set...")

eval_results = trainer.evaluate()

print("\n" + "=" * 80)
print("üìä EVALUATION RESULTS")
print("=" * 80)
print(f"‚úÖ Accuracy:  {eval_results['eval_accuracy']:.4f}")
print(f"‚úÖ Precision: {eval_results['eval_precision']:.4f}")
print(f"‚úÖ Recall:    {eval_results['eval_recall']:.4f}")
print(f"‚úÖ F1 Score:  {eval_results['eval_f1']:.4f}")
print(f"\nüéØ Confusion Matrix:")
print(f"   {eval_results['eval_confusion_matrix']}")
print("=" * 80)





üìà Evaluating on test set...



üìä EVALUATION RESULTS
‚úÖ Accuracy:  0.7234
‚úÖ Precision: 0.3534
‚úÖ Recall:    0.3082
‚úÖ F1 Score:  0.3293

üéØ Confusion Matrix:
   [[2211, 419], [514, 229]]


In [None]:
# ============================================================================
# STEP 14: Save Model
# ============================================================================
print("\nüíæ Saving fine-tuned model...")

model_save_path = './albert_neda_eda_classifier'
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

print(f"‚úÖ Model saved to: {model_save_path}")





üíæ Saving fine-tuned model...
‚úÖ Model saved to: ./albert_neda_eda_classifier


In [None]:
# ============================================================================
# STEP 15: Test Predictions on Sample Data
# ============================================================================
print("\nüß™ Testing predictions on sample data...")

# Get some test samples
test_samples = test_df.sample(min(5, len(test_df)))

print("\n" + "=" * 80)
print("üîÆ SAMPLE PREDICTIONS")
print("=" * 80)

for idx, row in test_samples.iterrows():
    # Tokenize input
    inputs = tokenizer(
        row['input'],
        return_tensors='pt',
        truncation=True,
        max_length=512
    ).to(model.device)

    # Get prediction
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
        predicted_class = torch.argmax(predictions, dim=-1).item()

    predicted_label = id2label[predicted_class]
    true_label = row['label_text']
    confidence = predictions[0][predicted_class].item()

    print(f"\nüìù Input: {row['input'][:100]}...")
    print(f"‚úÖ True Label: {true_label}")
    print(f"üîÆ Predicted: {predicted_label} (Confidence: {confidence:.4f})")
    print(f"{'‚úÖ CORRECT' if predicted_label == true_label else '‚ùå INCORRECT'}")
    print("-" * 80)


üß™ Testing predictions on sample data...

üîÆ SAMPLE PREDICTIONS

üìù Input: Age:36.  Gender:female.  Diagnosis:rrms. Has not converted to SPMS. Initial presentation motor weakn...
‚úÖ True Label: NEDA
üîÆ Predicted: NEDA (Confidence: 0.8917)
‚úÖ CORRECT
--------------------------------------------------------------------------------

üìù Input: Age:34.  Gender:female.  Diagnosis:rrms. Has not converted to SPMS. Initial presentation motor weakn...
‚úÖ True Label: NEDA
üîÆ Predicted: EDA (Confidence: 0.8270)
‚ùå INCORRECT
--------------------------------------------------------------------------------

üìù Input: Age:48.  Gender:female.  Diagnosis:rrms. Has not converted to SPMS. Initial presentation motor weakn...
‚úÖ True Label: NEDA
üîÆ Predicted: NEDA (Confidence: 0.8917)
‚úÖ CORRECT
--------------------------------------------------------------------------------

üìù Input: Age:38.  Gender:female.  Diagnosis:rrms. Has not converted to SPMS. Initial presentation sensory, 

In [None]:
# ============================================================================
# STEP 16: Save Model to Google Drive
# ============================================================================
print("\nüíæ Saving fine-tuned model to Google Drive...")

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Define model save path in Google Drive
model_save_path = '/content/drive/MyDrive/ALBERT_finetunned_Lynn'

# Create the directory if it doesn't exist
import os
if not os.path.exists(model_save_path):
    os.makedirs(model_save_path)
    print(f"Created directory: {model_save_path}")

# Save the model and tokenizer using the already initialized trainer and tokenizer
try:
    trainer.save_model(model_save_path)
    tokenizer.save_pretrained(model_save_path)
    print(f"‚úÖ Model saved to: {model_save_path}")
except NameError:
    print("‚ö†Ô∏è Error: trainer or tokenizer objects not found. Please ensure previous steps were executed.")

##TinyBert

In [None]:
"""
TinyBERT Fine-tuning for NEDA/EDA Classification
Medical text classification using Hugging Face Transformers
"""

# ============================================================================
# STEP 1: Install Required Libraries
# ============================================================================
print("üì¶ Installing required libraries...")
!pip install -q transformers datasets accelerate openpyxl scikit-learn huggingface_hub

# ============================================================================
# STEP 2: Import Libraries
# ============================================================================
print("üìö Importing libraries...")
import os
import pandas as pd
import numpy as np
import torch
from google.colab import files, userdata
from transformers import (
    BertTokenizerFast,
    BertForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

print("‚úÖ All libraries imported successfully!")
print(f"üî• PyTorch version: {torch.__version__}")
print(f"ü§ó Transformers library loaded")
print(f"üíª CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"üéÆ GPU: {torch.cuda.get_device_name(0)}")

# ============================================================================
# STEP 3: Get HuggingFace Token from Colab Secrets (optional)
# ============================================================================
print("\nüîë Retrieving HuggingFace token from secrets...")
try:
    from huggingface_hub import login
    HF_TOKEN = userdata.get('HF_TOKEN')
    if HF_TOKEN:
        login(token=HF_TOKEN)
        print("‚úÖ Logged in to HuggingFace Hub!")
    else:
        print("‚ÑπÔ∏è No HF_TOKEN found in Colab secrets (that's okay).")
except Exception as e:
    print(f"‚ö†Ô∏è Warning: Could not retrieve HF_TOKEN from secrets: {e}")
    HF_TOKEN = None

# ============================================================================
# STEP 4: Upload Excel Files
# ============================================================================
print("\nüìÅ Please upload your training Excel file...")
train_uploaded = files.upload()
train_filename = list(train_uploaded.keys())[0]
print(f"‚úÖ Training file uploaded: {train_filename}")

print("\nüìÅ Please upload your test Excel file...")
test_uploaded = files.upload()
test_filename = list(test_uploaded.keys())[0]
print(f"‚úÖ Test file uploaded: {test_filename}")

# ============================================================================
# STEP 5: Load and Prepare Data
# ============================================================================
print("\nüìä Loading data from Excel files...")
train_df = pd.read_excel(train_filename)
test_df = pd.read_excel(test_filename)

print(f"‚úÖ Training data shape: {train_df.shape}")
print(f"‚úÖ Test data shape: {test_df.shape}")
print(f"\nüìã Training data columns: {train_df.columns.tolist()}")
print(f"\nüîç First few rows of training data:")
print(train_df.head())

# Expecting columns: 'input' (text) and 'output' (string containing 'NEDA' or 'EDA')

# ============================================================================
# STEP 6: Process Labels - Extract NEDA/EDA from Output
# ============================================================================
print("\nüè∑Ô∏è Processing labels...")

def extract_label(output_text):
    """Extract NEDA or EDA from output text"""
    output_text = str(output_text).upper()
    if 'NEDA' in output_text:
        return 'NEDA'
    elif 'EDA' in output_text:
        return 'EDA'
    else:
        return None

# Apply label extraction
train_df['label_text'] = train_df['output'].apply(extract_label)
test_df['label_text'] = test_df['output'].apply(extract_label)

# Remove rows with None labels
train_df = train_df[train_df['label_text'].notna()].reset_index(drop=True)
test_df = test_df[test_df['label_text'].notna()].reset_index(drop=True)

# Create label mapping
label2id = {'NEDA': 0, 'EDA': 1}
id2label = {0: 'NEDA', 1: 'EDA'}

# Convert to numeric labels
train_df['label'] = train_df['label_text'].map(label2id)
test_df['label'] = test_df['label_text'].map(label2id)

print(f"‚úÖ Labels extracted successfully!")
print(f"üìä Training set after label extraction: {len(train_df)} samples")
print(f"üìä Test set after label extraction: {len(test_df)} samples")
print(f"\nüìà Training label distribution:")
print(train_df['label_text'].value_counts())
print(f"\nüìà Test label distribution:")
print(test_df['label_text'].value_counts())

# ============================================================================
# STEP 7: Initialize Tokenizer and Model (TinyBERT)
# ============================================================================
print("\nü§ñ Loading TinyBERT model and tokenizer...")

# Common TinyBERT options (pick one):
#   "huawei-noah/TinyBERT_General_4L_312D"  -> very small (fastest)
#   "huawei-noah/TinyBERT_General_6L_768D"  -> bigger (better quality)
MODEL_NAME = "huawei-noah/TinyBERT_General_6L_768D"

tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    id2label=id2label,
    label2id=label2id
)

# Count parameters
try:
    n_params = model.num_parameters()
except:
    n_params = sum(p.numel() for p in model.parameters())

print(f"‚úÖ Model loaded: {MODEL_NAME}")
print(f"‚úÖ Number of parameters: {n_params:,}")

# ============================================================================
# STEP 8: Tokenize Data
# ============================================================================
print("\nüî§ Tokenizing data...")

def tokenize_function(examples):
    """Tokenize input texts"""
    return tokenizer(
        examples['input'],
        padding='max_length',
        truncation=True,
        max_length=256  # TinyBERT-friendly; change to 512 if needed
    )

# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_df[['input', 'label']])
test_dataset = Dataset.from_pandas(test_df[['input', 'label']])

# Tokenize datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Set format for PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

print(f"‚úÖ Training dataset: {len(train_dataset)} samples")
print(f"‚úÖ Test dataset: {len(test_dataset)} samples")

# ============================================================================
# STEP 9: Define Evaluation Metrics
# ============================================================================
print("\nüìä Setting up evaluation metrics...")

def compute_metrics(eval_pred):
    """Compute metrics for evaluation"""
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='binary', zero_division=0
    )
    cm = confusion_matrix(labels, predictions)

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'confusion_matrix': cm.tolist()
    }

print("‚úÖ Evaluation metrics configured!")

# ============================================================================
# STEP 10: Setup Training Arguments
# ============================================================================
print("\n‚öôÔ∏è Configuring training arguments...")

seed = 42
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=30,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=2,
    report_to='none',  # Disable wandb/tensorboard
    push_to_hub=False,  # Set to True if you want to push to Hub
    optim='adamw_torch',  # Use AdamW optimizer (Adam with weight decay)
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-8,
)

print("‚úÖ Training arguments configured!")
print(f"   - Learning rate: {training_args.learning_rate}")
print(f"   - Batch size: {training_args.per_device_train_batch_size}")
print(f"   - Epochs: {training_args.num_train_epochs}")
print(f"   - Device: {'GPU' if torch.cuda.is_available() else 'CPU'}")

# ============================================================================
# STEP 11: Initialize Trainer
# ============================================================================
print("\nüéØ Initializing Trainer...")

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print("‚úÖ Trainer initialized successfully!")

# ============================================================================
# STEP 12: Train the Model
# ============================================================================
print("\nüöÄ Starting training...")
print("=" * 80)

train_result = trainer.train()

print("\n" + "=" * 80)
print("‚úÖ Training completed!")
print(f"üìä Training Loss: {train_result.training_loss:.4f}" if hasattr(train_result, 'training_loss') else "üìä Training done.")
if 'train_runtime' in train_result.metrics:
    print(f"‚è±Ô∏è Training Time: {train_result.metrics['train_runtime']:.2f} seconds")

# ============================================================================
# STEP 13: Evaluate on Test Set
# ============================================================================
print("\nüìà Evaluating on test set...")

eval_results = trainer.evaluate()

print("\n" + "=" * 80)
print("üìä EVALUATION RESULTS")
print("=" * 80)
print(f"‚úÖ Accuracy:  {eval_results.get('eval_accuracy', 0):.4f}")
print(f"‚úÖ Precision: {eval_results.get('eval_precision', 0):.4f}")
print(f"‚úÖ Recall:    {eval_results.get('eval_recall', 0):.4f}")
print(f"‚úÖ F1 Score:  {eval_results.get('eval_f1', 0):.4f}")
print(f"\nüéØ Confusion Matrix:")
print(f"   {eval_results.get('eval_confusion_matrix', [])}")
print("=" * 80)

# ============================================================================
# STEP 14: Save Model
# ============================================================================
print("\nüíæ Saving fine-tuned model...")

model_save_path = './tinybert_neda_eda_classifier'
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

print(f"‚úÖ Model saved to: {model_save_path}")

# ============================================================================
# STEP 15: Test Predictions on Sample Data
# ============================================================================
print("\nüß™ Testing predictions on sample data...")

# Get some test samples
test_samples = test_df.sample(min(5, len(test_df)), random_state=seed)

print("\n" + "=" * 80)
print("üîÆ SAMPLE PREDICTIONS")
print("=" * 80)

model.eval()
for _, row in test_samples.iterrows():
    inputs = tokenizer(
        row['input'],
        return_tensors='pt',
        truncation=True,
        max_length=256
    ).to(model.device)

    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
        pred_id = torch.argmax(probs, dim=-1).item()

    predicted_label = id2label[pred_id]
    true_label = row['label_text']
    confidence = probs[0][pred_id].item()

    print(f"\nüìù Input: {row['input'][:100]}...")
    print(f"‚úÖ True Label: {true_label}")
    print(f"üîÆ Predicted: {predicted_label} (Confidence: {confidence:.4f})")
    print(f"{'‚úÖ CORRECT' if predicted_label == true_label else '‚ùå INCORRECT'}")
    print("-" * 80)

# ============================================================================
# STEP 16: Optional - Push to HuggingFace Hub
# ============================================================================
if HF_TOKEN:
    print("\nü§ó Would you like to push the model to HuggingFace Hub? (y/n)")
    push_choice = input().lower().strip()

    if push_choice == 'y':
        print("üìù Enter your HuggingFace username:")
        username = input().strip()
        print("üìù Enter model name (e.g., tinybert-neda-eda-classifier):")
        model_name = input().strip()

        repo_name = f"{username}/{model_name}"
        try:
            print(f"\n‚¨ÜÔ∏è Pushing model to {repo_name}...")
            trainer.push_to_hub(repo_name)
            print(f"‚úÖ Model successfully pushed to https://huggingface.co/{repo_name}")
        except Exception as e:
            print(f"‚ùå Error pushing to Hub: {e}")
else:
    print("\n‚ö†Ô∏è Skipping HuggingFace Hub push (no token available)")

# ============================================================================
# STEP 17: Download Trained Model
# ============================================================================
print("\nüíæ Would you like to download the trained model? (y/n)")
download_choice = input().lower().strip()

if download_choice == 'y':
    print("\nüì¶ Creating zip file...")
    !zip -r tinybert_neda_eda_classifier.zip {model_save_path}
    print("‚¨áÔ∏è Downloading model...")
    files.download('tinybert_neda_eda_classifier.zip')
    print("‚úÖ Model downloaded successfully!")

print("\n" + "=" * 80)
print("üéâ ALL DONE!")
print("=" * 80)
print("‚úÖ Model training completed successfully!")
print("‚úÖ Model evaluated on test set")
print("‚úÖ Model saved locally")
print("\nüìå Next steps:")
print("   1. Review the evaluation metrics above")
print("   2. Test the model with your own inputs")
print("   3. Fine-tune hyperparameters if needed")
print("   4. Deploy the model for inference")
print("=" * 80)


üì¶ Installing required libraries...
üìö Importing libraries...
‚úÖ All libraries imported successfully!
üî• PyTorch version: 2.8.0+cu126
ü§ó Transformers library loaded
üíª CUDA available: True
üéÆ GPU: NVIDIA A100-SXM4-40GB

üîë Retrieving HuggingFace token from secrets...
‚úÖ Logged in to HuggingFace Hub!

üìÅ Please upload your training Excel file...


Saving train.xlsx to train (1).xlsx
‚úÖ Training file uploaded: train (1).xlsx

üìÅ Please upload your test Excel file...


Saving test.xlsx to test (1).xlsx
‚úÖ Test file uploaded: test (1).xlsx

üìä Loading data from Excel files...
‚úÖ Training data shape: (6472, 3)
‚úÖ Test data shape: (3373, 3)

üìã Training data columns: ['MSC research database ID', 'input', 'output']

üîç First few rows of training data:
   MSC research database ID  \
0                         1   
1                         1   
2                         1   
3                         1   
4                         1   

                                               input  \
0  Age:31.  Gender:female.  Diagnosis:rrms. Has n...   
1  Age:31.  Gender:female.  Diagnosis:rrms. Has n...   
2  Age:31.  Gender:female.  Diagnosis:rrms. Has n...   
3  Age:31.  Gender:female.  Diagnosis:rrms. Has n...   
4  Age:31.  Gender:female.  Diagnosis:rrms. Has n...   

                                        output  
0  After 7 months the patient will be in NEDA.  
1   After 7 months the patient will be in EDA.  
2   After 0 months the patient will 

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_6L_768D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


‚úÖ Model loaded: huawei-noah/TinyBERT_General_6L_768D
‚úÖ Number of parameters: 66,956,546

üî§ Tokenizing data...


Map:   0%|          | 0/6472 [00:00<?, ? examples/s]

Map:   0%|          | 0/3373 [00:00<?, ? examples/s]

‚úÖ Training dataset: 6472 samples
‚úÖ Test dataset: 3373 samples

üìä Setting up evaluation metrics...
‚úÖ Evaluation metrics configured!

‚öôÔ∏è Configuring training arguments...
‚úÖ Training arguments configured!
   - Learning rate: 2e-05
   - Batch size: 8
   - Epochs: 30
   - Device: GPU

üéØ Initializing Trainer...
‚úÖ Trainer initialized successfully!

üöÄ Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Confusion Matrix
1,0.5239,0.518769,0.779721,0.0,0.0,0.0,"[[2630, 0], [743, 0]]"
2,0.4772,0.526306,0.779721,0.0,0.0,0.0,"[[2630, 0], [743, 0]]"
3,0.5069,0.514623,0.779721,0.0,0.0,0.0,"[[2630, 0], [743, 0]]"
4,0.5227,0.517905,0.778832,0.459459,0.02288,0.04359,"[[2610, 20], [726, 17]]"
5,0.5609,0.518197,0.775274,0.409639,0.04576,0.082324,"[[2581, 49], [709, 34]]"
6,0.707,0.515361,0.774088,0.434483,0.084791,0.141892,"[[2548, 82], [680, 63]]"
7,0.4922,0.518236,0.778239,0.475248,0.064603,0.113744,"[[2577, 53], [695, 48]]"
8,0.522,0.511904,0.780611,0.527273,0.039031,0.072682,"[[2604, 26], [714, 29]]"
9,0.5444,0.522516,0.780018,0.509804,0.034993,0.065491,"[[2605, 25], [717, 26]]"
10,0.4913,0.52776,0.774385,0.4,0.048452,0.086435,"[[2576, 54], [707, 36]]"



‚úÖ Training completed!
üìä Training Loss: 0.4529
‚è±Ô∏è Training Time: 1417.84 seconds

üìà Evaluating on test set...



üìä EVALUATION RESULTS
‚úÖ Accuracy:  0.7139
‚úÖ Precision: 0.3080
‚úÖ Recall:    0.2396
‚úÖ F1 Score:  0.2695

üéØ Confusion Matrix:
   [[2230, 400], [565, 178]]

üíæ Saving fine-tuned model...
‚úÖ Model saved to: ./tinybert_neda_eda_classifier

üß™ Testing predictions on sample data...

üîÆ SAMPLE PREDICTIONS

üìù Input: Age:36.  Gender:female.  Diagnosis:rrms. Has not converted to SPMS. Initial presentation motor weakn...
‚úÖ True Label: NEDA
üîÆ Predicted: NEDA (Confidence: 0.9300)
‚úÖ CORRECT
--------------------------------------------------------------------------------

üìù Input: Age:34.  Gender:female.  Diagnosis:rrms. Has not converted to SPMS. Initial presentation motor weakn...
‚úÖ True Label: NEDA
üîÆ Predicted: EDA (Confidence: 0.8573)
‚ùå INCORRECT
--------------------------------------------------------------------------------

üìù Input: Age:48.  Gender:female.  Diagnosis:rrms. Has not converted to SPMS. Initial presentation motor weakn...
‚úÖ True Label: N

##funnel transformer

In [None]:

# ============================================================================
# STEP 1: Install Required Libraries
# ============================================================================
print("üì¶ Installing required libraries...")
!pip install -q transformers datasets openpyxl scikit-learn huggingface_hub accelerate


üì¶ Installing required libraries...


In [None]:

# ============================================================================
# STEP 2: Import Libraries
# ============================================================================
print("\nüìö Importing libraries...")

import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from google.colab import files, userdata
from transformers import (
    FunnelTokenizer,
    FunnelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from huggingface_hub import login
import warnings
warnings.filterwarnings('ignore')

print("‚úÖ Libraries imported successfully!")



üìö Importing libraries...
‚úÖ Libraries imported successfully!


In [None]:
# ============================================================================
# STEP 3: Login to Hugging Face (Optional but Recommended)
# ============================================================================
print("\nüîê Logging into Hugging Face...")

try:
    HF_TOKEN = userdata.get('HF_TOKEN')
    login(token=HF_TOKEN)
    print("‚úÖ Successfully logged into Hugging Face!")
except Exception as e:
    print(f"‚ö†Ô∏è Could not login to HF: {e}")
    print("Continuing without HF login...")


üîê Logging into Hugging Face...
‚úÖ Successfully logged into Hugging Face!


In [None]:
# ============================================================================
# STEP 4: Upload Training and Testing Files
# ============================================================================
print("\nüìÅ Please upload your Excel files...")
print("Expected format: 'input' column (text) and 'output' column (labels)")

# Check if files already exist
if not os.path.exists('/content/train.xlsx'):
    print("\nüì§ Upload TRAINING file (train.xlsx):")
    train_uploaded = files.upload()
else:
    print("‚úÖ Training file already exists!")

if not os.path.exists('/content/test.xlsx'):
    print("\nüì§ Upload TESTING file (test.xlsx):")
    test_uploaded = files.upload()
else:
    print("‚úÖ Testing file already exists!")



üìÅ Please upload your Excel files...
Expected format: 'input' column (text) and 'output' column (labels)

üì§ Upload TRAINING file (train.xlsx):


Saving train.xlsx to train.xlsx

üì§ Upload TESTING file (test.xlsx):


Saving test.xlsx to test.xlsx


In [None]:
# ============================================================================
# STEP 5: Load Data from Excel Files
# ============================================================================
print("\nüìä Loading data from Excel files...")

train_df = pd.read_excel('/content/train.xlsx')
test_df = pd.read_excel('/content/test.xlsx')

print(f"‚úÖ Training data loaded: {train_df.shape[0]} rows, {train_df.shape[1]} columns")
print(f"‚úÖ Testing data loaded: {test_df.shape[0]} rows, {test_df.shape[1]} columns")

# Preview data
print("\nüìã Training data preview:")
print(train_df.head())
print("\nüìã Testing data preview:")
print(test_df.head())


üìä Loading data from Excel files...
‚úÖ Training data loaded: 6472 rows, 3 columns
‚úÖ Testing data loaded: 3373 rows, 3 columns

üìã Training data preview:
   MSC research database ID  \
0                         1   
1                         1   
2                         1   
3                         1   
4                         1   

                                               input  \
0  Age:31.  Gender:female.  Diagnosis:rrms. Has n...   
1  Age:31.  Gender:female.  Diagnosis:rrms. Has n...   
2  Age:31.  Gender:female.  Diagnosis:rrms. Has n...   
3  Age:31.  Gender:female.  Diagnosis:rrms. Has n...   
4  Age:31.  Gender:female.  Diagnosis:rrms. Has n...   

                                        output  
0  After 7 months the patient will be in NEDA.  
1   After 7 months the patient will be in EDA.  
2   After 0 months the patient will be in EDA.  
3   After 4 months the patient will be in EDA.  
4  After 6 months the patient will be in NEDA.  

üìã Testing data pr

In [None]:
# ============================================================================
# STEP 6: Process Labels
# ============================================================================
print("\nüè∑Ô∏è Processing labels...")

def extract_label(output_text):
    """Extract NEDA or EDA from output text"""
    output_text = str(output_text).upper()
    if 'NEDA' in output_text:
        return 'NEDA'
    elif 'EDA' in output_text:
        return 'EDA'
    else:
        return None

# Apply label extraction
train_df['label_text'] = train_df['output'].apply(extract_label)
test_df['label_text'] = test_df['output'].apply(extract_label)

# Remove rows with None labels
train_df = train_df[train_df['label_text'].notna()].reset_index(drop=True)
test_df = test_df[test_df['label_text'].notna()].reset_index(drop=True)

# Create label mapping
label2id = {'NEDA': 0, 'EDA': 1}
id2label = {0: 'NEDA', 1: 'EDA'}

# Convert to numeric labels
train_df['label'] = train_df['label_text'].map(label2id)
test_df['label'] = test_df['label_text'].map(label2id)

print(f"‚úÖ Labels extracted successfully!")
print(f"\nüìä Training set: {len(train_df)} samples")
print(f"   - NEDA: {sum(train_df['label']==0)} ({sum(train_df['label']==0)/len(train_df)*100:.1f}%)")
print(f"   - EDA:  {sum(train_df['label']==1)} ({sum(train_df['label']==1)/len(train_df)*100:.1f}%)")
print(f"\nüìä Testing set: {len(test_df)} samples")
print(f"   - NEDA: {sum(test_df['label']==0)} ({sum(test_df['label']==0)/len(test_df)*100:.1f}%)")
print(f"   - EDA:  {sum(test_df['label']==1)} ({sum(test_df['label']==1)/len(test_df)*100:.1f}%)")



üè∑Ô∏è Processing labels...
‚úÖ Labels extracted successfully!

üìä Training set: 6472 samples
   - NEDA: 4989 (77.1%)
   - EDA:  1483 (22.9%)

üìä Testing set: 3373 samples
   - NEDA: 2630 (78.0%)
   - EDA:  743 (22.0%)


In [None]:
# ============================================================================
# STEP 7: Split Training Data into Train and Validation Sets
# ============================================================================
print("\n‚úÇÔ∏è Splitting training data into train and validation sets...")

train_data, val_data = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,
    stratify=train_df['label']  # Maintain class balance
)

print(f"‚úÖ Final split:")
print(f"   - Training:   {len(train_data)} samples")
print(f"   - Validation: {len(val_data)} samples")
print(f"   - Testing:    {len(test_df)} samples")


‚úÇÔ∏è Splitting training data into train and validation sets...
‚úÖ Final split:
   - Training:   5177 samples
   - Validation: 1295 samples
   - Testing:    3373 samples


In [None]:
# ============================================================================
# STEP 8: Compute Class Weights for Imbalanced Data
# ============================================================================
print("\n‚öñÔ∏è Computing class weights to handle class imbalance...")

train_labels = train_data['label'].values
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_labels),
    y=train_labels
)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32)

print(f"‚úÖ Class weights computed:")
print(f"   - NEDA (class 0): {class_weights[0]:.4f}")
print(f"   - EDA  (class 1): {class_weights[1]:.4f}")


‚öñÔ∏è Computing class weights to handle class imbalance...
‚úÖ Class weights computed:
   - NEDA (class 0): 0.6486
   - EDA  (class 1): 2.1825


In [None]:
# ============================================================================
# STEP 9: Initialize Tokenizer and Model
# ============================================================================
print("\nü§ñ Loading Funnel Transformer model and tokenizer...")

MODEL_NAME = "funnel-transformer/medium"

tokenizer = FunnelTokenizer.from_pretrained(MODEL_NAME)
model = FunnelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    id2label=id2label,
    label2id=label2id
)

# Count parameters
n_params = sum(p.numel() for p in model.parameters())
print(f"‚úÖ Model loaded: {MODEL_NAME}")
print(f"‚úÖ Number of parameters: {n_params:,}")



ü§ñ Loading Funnel Transformer model and tokenizer...


tokenizer_config.json:   0%|          | 0.00/201 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/153 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/700 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/524M [00:00<?, ?B/s]

Some weights of FunnelForSequenceClassification were not initialized from the model checkpoint at funnel-transformer/medium and are newly initialized: ['classifier.linear_hidden.bias', 'classifier.linear_hidden.weight', 'classifier.linear_out.bias', 'classifier.linear_out.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


‚úÖ Model loaded: funnel-transformer/medium
‚úÖ Number of parameters: 116,203,778


In [None]:
# ============================================================================
# STEP 10: Tokenize Data
# ============================================================================
print("\nüî§ Tokenizing data...")

def tokenize_function(examples):
    """Tokenize input texts"""
    return tokenizer(
        examples['input'],
        padding='max_length',
        truncation=True,
        max_length=256
    )

# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_data[['input', 'label']])
val_dataset = Dataset.from_pandas(val_data[['input', 'label']])
test_dataset = Dataset.from_pandas(test_df[['input', 'label']])

# Tokenize datasets
print("   Tokenizing training set...")
train_dataset = train_dataset.map(tokenize_function, batched=True)
print("   Tokenizing validation set...")
val_dataset = val_dataset.map(tokenize_function, batched=True)
print("   Tokenizing test set...")
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Set format for PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

print(f"‚úÖ Tokenization complete!")
print(f"   - Training dataset:   {len(train_dataset)} samples")
print(f"   - Validation dataset: {len(val_dataset)} samples")
print(f"   - Test dataset:       {len(test_dataset)} samples")


üî§ Tokenizing data...
   Tokenizing training set...


Map:   0%|          | 0/5177 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/524M [00:00<?, ?B/s]

   Tokenizing validation set...


Map:   0%|          | 0/1295 [00:00<?, ? examples/s]

   Tokenizing test set...


Map:   0%|          | 0/3373 [00:00<?, ? examples/s]

‚úÖ Tokenization complete!
   - Training dataset:   5177 samples
   - Validation dataset: 1295 samples
   - Test dataset:       3373 samples


In [None]:
# ============================================================================
# STEP 11: Define Evaluation Metrics
# ============================================================================
print("\nüìä Setting up evaluation metrics...")

def compute_metrics(eval_pred):
    """Compute metrics for evaluation"""
    logits, labels = eval_pred

    # Convert logits to tensor if needed
    logits = torch.tensor(logits) if isinstance(logits, np.ndarray) else logits

    # Get predictions
    predictions = torch.argmax(logits, axis=-1).cpu().numpy()
    labels = labels if isinstance(labels, np.ndarray) else labels.cpu().numpy()

    # Compute metrics
    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='binary', zero_division=0
    )
    cm = confusion_matrix(labels, predictions)

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'confusion_matrix': cm.tolist()
    }

print("‚úÖ Evaluation metrics configured!")


print("‚úÖ Custom trainer class created!")





üìä Setting up evaluation metrics...
‚úÖ Evaluation metrics configured!
‚úÖ Custom trainer class created!


In [None]:
# ============================================================================
# STEP 12: Create Custom Trainer with Weighted Loss
# ============================================================================
print("\nüéØ Creating custom trainer with weighted loss...")

class WeightedLossTrainer(Trainer):
    """Custom Trainer with weighted cross-entropy loss"""

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # Use weighted cross-entropy loss
        loss_fct = nn.CrossEntropyLoss(weight=self.class_weights.to(logits.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))

        return (loss, outputs) if return_outputs else loss



üéØ Creating custom trainer with weighted loss...


In [None]:
# ============================================================================
# STEP 13: Setup Training Arguments
# ============================================================================
print("\n‚öôÔ∏è Configuring training arguments...")

training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=30,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    greater_is_better=True,
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=2,
    report_to='none',
    push_to_hub=False,
    warmup_ratio=0.2,
    lr_scheduler_type='linear',
    seed=42,
    gradient_accumulation_steps=2,
    max_grad_norm=1.0,
)

print("‚úÖ Training arguments configured!")
print(f"   - Learning rate: {training_args.learning_rate}")
print(f"   - Batch size: {training_args.per_device_train_batch_size}")
print(f"   - Epochs: {training_args.num_train_epochs}")
print(f"   - Warmup ratio: {training_args.warmup_ratio}")






‚öôÔ∏è Configuring training arguments...
‚úÖ Training arguments configured!
   - Learning rate: 2e-05
   - Batch size: 32
   - Epochs: 30
   - Warmup ratio: 0.2


In [None]:
# ============================================================================
# STEP 14: Initialize Trainer
# ============================================================================
print("\nüéØ Initializing weighted trainer...")

trainer = WeightedLossTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,  # Use validation set during training
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    class_weights=class_weights_tensor,
)

print("‚úÖ Trainer initialized successfully!")





üéØ Initializing weighted trainer...
‚úÖ Trainer initialized successfully!


In [None]:
# ============================================================================
# STEP 15: Train the Model
# ============================================================================
print("\n" + "="*70)
print("üöÄ STARTING TRAINING")
print("="*70)
print("\nThis may take a while. Training progress will be shown below...")
print("-"*70)

train_result = trainer.train()

print("\n" + "="*70)
print("‚úÖ TRAINING COMPLETED!")
print("="*70)
print(f"üìä Final Training Loss: {train_result.training_loss:.4f}")
print(f"‚è±Ô∏è Training Time: {train_result.metrics['train_runtime']:.2f} seconds")
print(f"‚ö° Training Speed: {train_result.metrics['train_samples_per_second']:.2f} samples/sec")


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 97, 'bos_token_id': 96, 'pad_token_id': 0}.



üöÄ STARTING TRAINING

This may take a while. Training progress will be shown below...
----------------------------------------------------------------------


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Confusion Matrix
1,0.6937,0.689956,0.611583,0.288934,0.474747,0.359236,"[[651, 347], [156, 141]]"
2,0.6836,0.677108,0.499614,0.271186,0.700337,0.390977,"[[439, 559], [89, 208]]"
3,0.6785,0.664724,0.542085,0.293296,0.707071,0.41461,"[[492, 506], [87, 210]]"
4,0.647,0.672585,0.665637,0.334146,0.461279,0.387553,"[[725, 273], [160, 137]]"
5,0.6655,0.695137,0.702703,0.360759,0.383838,0.371941,"[[796, 202], [183, 114]]"
6,0.6839,0.699404,0.731274,0.41115,0.397306,0.40411,"[[829, 169], [179, 118]]"
7,0.6913,0.729864,0.751351,0.420382,0.222222,0.290749,"[[907, 91], [231, 66]]"
8,0.6371,0.663823,0.711197,0.368601,0.363636,0.366102,"[[813, 185], [189, 108]]"
9,0.634,0.668686,0.674131,0.359551,0.538721,0.431267,"[[713, 285], [137, 160]]"
10,0.614,0.659359,0.636293,0.333333,0.585859,0.424908,"[[650, 348], [123, 174]]"



‚úÖ TRAINING COMPLETED!
üìä Final Training Loss: 0.5233
‚è±Ô∏è Training Time: 2101.13 seconds
‚ö° Training Speed: 73.92 samples/sec


In [None]:
# ============================================================================
# STEP 16: Evaluate on Validation Set
# ============================================================================
print("\n" + "="*70)
print("üìä VALIDATION SET EVALUATION")
print("="*70)

val_results = trainer.evaluate(val_dataset)

print(f"\n‚úÖ Validation Accuracy:  {val_results['eval_accuracy']:.4f}")
print(f"‚úÖ Validation Precision: {val_results['eval_precision']:.4f}")
print(f"‚úÖ Validation Recall:    {val_results['eval_recall']:.4f}")
print(f"‚úÖ Validation F1 Score:  {val_results['eval_f1']:.4f}")
print(f"\nüìä Validation Confusion Matrix:")
cm = val_results['eval_confusion_matrix']
print(f"                  Predicted NEDA | Predicted EDA")
print(f"Actual NEDA:      {cm[0][0]:>14} | {cm[0][1]:>13}")
print(f"Actual EDA:       {cm[1][0]:>14} | {cm[1][1]:>13}")




üìä VALIDATION SET EVALUATION



‚úÖ Validation Accuracy:  0.7073
‚úÖ Validation Precision: 0.3921
‚úÖ Validation Recall:    0.5017
‚úÖ Validation F1 Score:  0.4402

üìä Validation Confusion Matrix:
                  Predicted NEDA | Predicted EDA
Actual NEDA:                 767 |           231
Actual EDA:                  148 |           149


In [None]:
# ============================================================================
# STEP 17: Final Evaluation on Test Set (Held-Out Data)
# ============================================================================
print("\n" + "="*70)
print("üéØ FINAL TEST SET EVALUATION (HELD-OUT DATA)")
print("="*70)

test_results = trainer.evaluate(test_dataset)

print(f"\n‚úÖ Test Accuracy:  {test_results['eval_accuracy']:.4f}")
print(f"‚úÖ Test Precision: {test_results['eval_precision']:.4f}")
print(f"‚úÖ Test Recall:    {test_results['eval_recall']:.4f}")
print(f"‚úÖ Test F1 Score:  {test_results['eval_f1']:.4f}")
print(f"\nüìä Test Confusion Matrix:")
cm = test_results['eval_confusion_matrix']
print(f"                  Predicted NEDA | Predicted EDA")
print(f"Actual NEDA:      {cm[0][0]:>14} | {cm[0][1]:>13}")
print(f"Actual EDA:       {cm[1][0]:>14} | {cm[1][1]:>13}")




üéØ FINAL TEST SET EVALUATION (HELD-OUT DATA)



‚úÖ Test Accuracy:  0.6537
‚úÖ Test Precision: 0.3141
‚úÖ Test Recall:    0.4832
‚úÖ Test F1 Score:  0.3807

üìä Test Confusion Matrix:
                  Predicted NEDA | Predicted EDA
Actual NEDA:                1846 |           784
Actual EDA:                  384 |           359


In [None]:
# ============================================================================
# STEP 18: Save the Best Model
# ============================================================================
print("\n" + "="*70)
print("üíæ SAVING MODEL")
print("="*70)

# Save model locally
save_path = './best_funnel_model'
trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)

print(f"‚úÖ Model saved to: {save_path}")
print(f"‚úÖ Tokenizer saved to: {save_path}")



üíæ SAVING MODEL
‚úÖ Model saved to: ./best_funnel_model
‚úÖ Tokenizer saved to: ./best_funnel_model


In [None]:
# ============================================================================
# STEP 19: Summary
# ============================================================================
print("\n" + "="*70)
print("üéâ TRAINING PIPELINE COMPLETED SUCCESSFULLY!")
print("="*70)
print(f"\nüìä Final Results Summary:")
print(f"   Training samples:   {len(train_dataset)}")
print(f"   Validation samples: {len(val_dataset)}")
print(f"   Test samples:       {len(test_dataset)}")
print(f"\n   Test Accuracy:  {test_results['eval_accuracy']:.4f}")
print(f"   Test Precision: {test_results['eval_precision']:.4f}")
print(f"   Test Recall:    {test_results['eval_recall']:.4f}")
print(f"   Test F1 Score:  {test_results['eval_f1']:.4f}")
print(f"\nüíæ Model saved at: {save_path}")
print("="*70)


üéâ TRAINING PIPELINE COMPLETED SUCCESSFULLY!

üìä Final Results Summary:
   Training samples:   5177
   Validation samples: 1295
   Test samples:       3373

   Test Accuracy:  0.6537
   Test Precision: 0.3141
   Test Recall:    0.4832
   Test F1 Score:  0.3807

üíæ Model saved at: ./best_funnel_model


##ModernBERT

In [None]:

# ============================================================================
# STEP 1: Install Required Libraries
# ============================================================================
print("üì¶ Installing required libraries...")
!pip install -q transformers datasets openpyxl scikit-learn huggingface_hub accelerate


üì¶ Installing required libraries...


In [None]:

# ============================================================================
# STEP 2: Import Libraries
# ============================================================================
print("\nüìö Importing libraries...")

import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from google.colab import files, userdata
from transformers import (
    FunnelTokenizer,
    FunnelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from huggingface_hub import login
import warnings
warnings.filterwarnings('ignore')
from transformers import PreTrainedTokenizerFast, BertForSequenceClassification


print("‚úÖ Libraries imported successfully!")



üìö Importing libraries...
‚úÖ Libraries imported successfully!


In [None]:
# ============================================================================
# STEP 3: Login to Hugging Face (Optional but Recommended)
# ============================================================================
print("\nüîê Logging into Hugging Face...")

try:
    HF_TOKEN = userdata.get('HF_TOKEN')
    login(token=HF_TOKEN)
    print("‚úÖ Successfully logged into Hugging Face!")
except Exception as e:
    print(f"‚ö†Ô∏è Could not login to HF: {e}")
    print("Continuing without HF login...")


üîê Logging into Hugging Face...
‚úÖ Successfully logged into Hugging Face!


In [None]:
# ============================================================================
# STEP 4: Upload Training and Testing Files
# ============================================================================
print("\nüìÅ Please upload your Excel files...")
print("Expected format: 'input' column (text) and 'output' column (labels)")

# Check if files already exist
if not os.path.exists('/content/train.xlsx'):
    print("\nüì§ Upload TRAINING file (train.xlsx):")
    train_uploaded = files.upload()
else:
    print("‚úÖ Training file already exists!")

if not os.path.exists('/content/test.xlsx'):
    print("\nüì§ Upload TESTING file (test.xlsx):")
    test_uploaded = files.upload()
else:
    print("‚úÖ Testing file already exists!")



üìÅ Please upload your Excel files...
Expected format: 'input' column (text) and 'output' column (labels)

üì§ Upload TRAINING file (train.xlsx):


Saving train.xlsx to train.xlsx

üì§ Upload TESTING file (test.xlsx):


Saving test.xlsx to test.xlsx


In [None]:
# ============================================================================
# STEP 5: Load Data from Excel Files
# ============================================================================
print("\nüìä Loading data from Excel files...")

train_df = pd.read_excel('/content/train.xlsx')
test_df = pd.read_excel('/content/test.xlsx')

print(f"‚úÖ Training data loaded: {train_df.shape[0]} rows, {train_df.shape[1]} columns")
print(f"‚úÖ Testing data loaded: {test_df.shape[0]} rows, {test_df.shape[1]} columns")

# Preview data
print("\nüìã Training data preview:")
print(train_df.head())
print("\nüìã Testing data preview:")
print(test_df.head())


üìä Loading data from Excel files...
‚úÖ Training data loaded: 6472 rows, 3 columns
‚úÖ Testing data loaded: 3373 rows, 3 columns

üìã Training data preview:
   MSC research database ID  \
0                         1   
1                         1   
2                         1   
3                         1   
4                         1   

                                               input  \
0  Age:31.  Gender:female.  Diagnosis:rrms. Has n...   
1  Age:31.  Gender:female.  Diagnosis:rrms. Has n...   
2  Age:31.  Gender:female.  Diagnosis:rrms. Has n...   
3  Age:31.  Gender:female.  Diagnosis:rrms. Has n...   
4  Age:31.  Gender:female.  Diagnosis:rrms. Has n...   

                                        output  
0  After 7 months the patient will be in NEDA.  
1   After 7 months the patient will be in EDA.  
2   After 0 months the patient will be in EDA.  
3   After 4 months the patient will be in EDA.  
4  After 6 months the patient will be in NEDA.  

üìã Testing data pr

In [None]:
# ============================================================================
# STEP 6: Process Labels
# ============================================================================
print("\nüè∑Ô∏è Processing labels...")

def extract_label(output_text):
    """Extract NEDA or EDA from output text"""
    output_text = str(output_text).upper()
    if 'NEDA' in output_text:
        return 'NEDA'
    elif 'EDA' in output_text:
        return 'EDA'
    else:
        return None

# Apply label extraction
train_df['label_text'] = train_df['output'].apply(extract_label)
test_df['label_text'] = test_df['output'].apply(extract_label)

# Remove rows with None labels
train_df = train_df[train_df['label_text'].notna()].reset_index(drop=True)
test_df = test_df[test_df['label_text'].notna()].reset_index(drop=True)

# Create label mapping
label2id = {'NEDA': 0, 'EDA': 1}
id2label = {0: 'NEDA', 1: 'EDA'}

# Convert to numeric labels
train_df['label'] = train_df['label_text'].map(label2id)
test_df['label'] = test_df['label_text'].map(label2id)

print(f"‚úÖ Labels extracted successfully!")
print(f"\nüìä Training set: {len(train_df)} samples")
print(f"   - NEDA: {sum(train_df['label']==0)} ({sum(train_df['label']==0)/len(train_df)*100:.1f}%)")
print(f"   - EDA:  {sum(train_df['label']==1)} ({sum(train_df['label']==1)/len(train_df)*100:.1f}%)")
print(f"\nüìä Testing set: {len(test_df)} samples")
print(f"   - NEDA: {sum(test_df['label']==0)} ({sum(test_df['label']==0)/len(test_df)*100:.1f}%)")
print(f"   - EDA:  {sum(test_df['label']==1)} ({sum(test_df['label']==1)/len(test_df)*100:.1f}%)")



üè∑Ô∏è Processing labels...
‚úÖ Labels extracted successfully!

üìä Training set: 6472 samples
   - NEDA: 4989 (77.1%)
   - EDA:  1483 (22.9%)

üìä Testing set: 3373 samples
   - NEDA: 2630 (78.0%)
   - EDA:  743 (22.0%)


In [None]:
# ============================================================================
# STEP 7: Split Training Data into Train and Validation Sets
# ============================================================================
print("\n‚úÇÔ∏è Splitting training data into train and validation sets...")

train_data, val_data = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,
    stratify=train_df['label']  # Maintain class balance
)

print(f"‚úÖ Final split:")
print(f"   - Training:   {len(train_data)} samples")
print(f"   - Validation: {len(val_data)} samples")
print(f"   - Testing:    {len(test_df)} samples")


‚úÇÔ∏è Splitting training data into train and validation sets...
‚úÖ Final split:
   - Training:   5177 samples
   - Validation: 1295 samples
   - Testing:    3373 samples


In [None]:
# ============================================================================
# STEP 8: Compute Class Weights for Imbalanced Data
# ============================================================================
print("\n‚öñÔ∏è Computing class weights to handle class imbalance...")

train_labels = train_data['label'].values
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_labels),
    y=train_labels
)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32)

print(f"‚úÖ Class weights computed:")
print(f"   - NEDA (class 0): {class_weights[0]:.4f}")
print(f"   - EDA  (class 1): {class_weights[1]:.4f}")


‚öñÔ∏è Computing class weights to handle class imbalance...
‚úÖ Class weights computed:
   - NEDA (class 0): 0.6486
   - EDA  (class 1): 2.1825


In [None]:
# ============================================================================
# STEP 9: Initialize Tokenizer and Model
# ============================================================================
print("\nü§ñ Loading Funnel Transformer model and tokenizer...")

MODEL_NAME = "answerdotai/ModernBERT-base"

tokenizer = PreTrainedTokenizerFast.from_pretrained(MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    id2label=id2label,
    label2id=label2id
)


# Count parameters
n_params = sum(p.numel() for p in model.parameters())
print(f"‚úÖ Model loaded: {MODEL_NAME}")
print(f"‚úÖ Number of parameters: {n_params:,}")



ü§ñ Loading Funnel Transformer model and tokenizer...


You are using a model of type modernbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.


model.safetensors:   0%|          | 0.00/599M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['bert.embeddings.LayerNorm.bias', 'bert.embeddings.LayerNorm.weight', 'bert.embeddings.position_embeddings.weight', 'bert.embeddings.token_type_embeddings.weight', 'bert.embeddings.word_embeddings.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.dense.bias', 'bert.encoder.layer.0.attention.output.dense.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.

‚úÖ Model loaded: answerdotai/ModernBERT-base
‚úÖ Number of parameters: 136,579,586


In [None]:
# ============================================================================
# STEP 10: Tokenize Data
# ============================================================================
print("\nüî§ Tokenizing data...")

def tokenize_function(examples):
    """Tokenize input texts"""
    return tokenizer(
        examples['input'],
        padding='max_length',
        truncation=True,
        max_length=256
    )

# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_data[['input', 'label']])
val_dataset = Dataset.from_pandas(val_data[['input', 'label']])
test_dataset = Dataset.from_pandas(test_df[['input', 'label']])

# Tokenize datasets
print("   Tokenizing training set...")
train_dataset = train_dataset.map(tokenize_function, batched=True)
print("   Tokenizing validation set...")
val_dataset = val_dataset.map(tokenize_function, batched=True)
print("   Tokenizing test set...")
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Set format for PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

print(f"‚úÖ Tokenization complete!")
print(f"   - Training dataset:   {len(train_dataset)} samples")
print(f"   - Validation dataset: {len(val_dataset)} samples")
print(f"   - Test dataset:       {len(test_dataset)} samples")


üî§ Tokenizing data...
   Tokenizing training set...


Map:   0%|          | 0/5177 [00:00<?, ? examples/s]

   Tokenizing validation set...


Map:   0%|          | 0/1295 [00:00<?, ? examples/s]

   Tokenizing test set...


Map:   0%|          | 0/3373 [00:00<?, ? examples/s]

‚úÖ Tokenization complete!
   - Training dataset:   5177 samples
   - Validation dataset: 1295 samples
   - Test dataset:       3373 samples


In [None]:
# ============================================================================
# STEP 11: Define Evaluation Metrics
# ============================================================================
print("\nüìä Setting up evaluation metrics...")

def compute_metrics(eval_pred):
    """Compute metrics for evaluation"""
    logits, labels = eval_pred

    # Convert logits to tensor if needed
    logits = torch.tensor(logits) if isinstance(logits, np.ndarray) else logits

    # Get predictions
    predictions = torch.argmax(logits, axis=-1).cpu().numpy()
    labels = labels if isinstance(labels, np.ndarray) else labels.cpu().numpy()

    # Compute metrics
    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='binary', zero_division=0
    )
    cm = confusion_matrix(labels, predictions)

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'confusion_matrix': cm.tolist()
    }

print("‚úÖ Evaluation metrics configured!")


print("‚úÖ Custom trainer class created!")





üìä Setting up evaluation metrics...
‚úÖ Evaluation metrics configured!
‚úÖ Custom trainer class created!


In [None]:
# ============================================================================
# STEP 12: Create Custom Trainer with Weighted Loss
# ============================================================================
print("\nüéØ Creating custom trainer with weighted loss...")

class WeightedLossTrainer(Trainer):
    """Custom Trainer with weighted cross-entropy loss"""

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # Use weighted cross-entropy loss
        loss_fct = nn.CrossEntropyLoss(weight=self.class_weights.to(logits.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))

        return (loss, outputs) if return_outputs else loss



üéØ Creating custom trainer with weighted loss...


In [None]:
# ============================================================================
# STEP 13: Setup Training Arguments
# ============================================================================
print("\n‚öôÔ∏è Configuring training arguments...")

training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy='epoch',
    save_strategy='epoch',
    learning_rate=1e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    greater_is_better=True,
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=2,
    report_to='none',
    push_to_hub=False,
    warmup_ratio=0.1,  # Adjust warmup ratio
    lr_scheduler_type='cosine',  # Use a different scheduler
    seed=42,
    gradient_accumulation_steps=1,  # Adjust gradient accumulation
    max_grad_norm=0.5,  # Reduce gradient clipping
)


print("‚úÖ Training arguments configured!")
print(f"   - Learning rate: {training_args.learning_rate}")
print(f"   - Batch size: {training_args.per_device_train_batch_size}")
print(f"   - Epochs: {training_args.num_train_epochs}")
print(f"   - Warmup ratio: {training_args.warmup_ratio}")






‚öôÔ∏è Configuring training arguments...
‚úÖ Training arguments configured!
   - Learning rate: 0.0001
   - Batch size: 16
   - Epochs: 10
   - Warmup ratio: 0.1


In [None]:
# ============================================================================
# STEP 14: Initialize Trainer
# ============================================================================
print("\nüéØ Initializing weighted trainer...")

trainer = WeightedLossTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,  # Use validation set during training
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    class_weights=class_weights_tensor,
)

print("‚úÖ Trainer initialized successfully!")





üéØ Initializing weighted trainer...
‚úÖ Trainer initialized successfully!


In [None]:
# ============================================================================
# STEP 15: Train the Model
# ============================================================================
print("\n" + "="*70)
print("üöÄ STARTING TRAINING")
print("="*70)
print("\nThis may take a while. Training progress will be shown below...")
print("-"*70)

train_result = trainer.train()

print("\n" + "="*70)
print("‚úÖ TRAINING COMPLETED!")
print("="*70)
print(f"üìä Final Training Loss: {train_result.training_loss:.4f}")
print(f"‚è±Ô∏è Training Time: {train_result.metrics['train_runtime']:.2f} seconds")
print(f"‚ö° Training Speed: {train_result.metrics['train_samples_per_second']:.2f} samples/sec")



üöÄ STARTING TRAINING

This may take a while. Training progress will be shown below...
----------------------------------------------------------------------


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Confusion Matrix
1,0.7502,0.713667,0.770656,0.0,0.0,0.0,"[[998, 0], [297, 0]]"
2,0.7277,0.692187,0.770656,0.0,0.0,0.0,"[[998, 0], [297, 0]]"
3,0.6885,0.749163,0.229344,0.229344,1.0,0.373116,"[[0, 998], [0, 297]]"


KeyboardInterrupt: 

In [None]:
# ============================================================================
# STEP 16: Evaluate on Validation Set
# ============================================================================
print("\n" + "="*70)
print("üìä VALIDATION SET EVALUATION")
print("="*70)

val_results = trainer.evaluate(val_dataset)

print(f"\n‚úÖ Validation Accuracy:  {val_results['eval_accuracy']:.4f}")
print(f"‚úÖ Validation Precision: {val_results['eval_precision']:.4f}")
print(f"‚úÖ Validation Recall:    {val_results['eval_recall']:.4f}")
print(f"‚úÖ Validation F1 Score:  {val_results['eval_f1']:.4f}")
print(f"\nüìä Validation Confusion Matrix:")
cm = val_results['eval_confusion_matrix']
print(f"                  Predicted NEDA | Predicted EDA")
print(f"Actual NEDA:      {cm[0][0]:>14} | {cm[0][1]:>13}")
print(f"Actual EDA:       {cm[1][0]:>14} | {cm[1][1]:>13}")



In [None]:
# ============================================================================
# STEP 17: Final Evaluation on Test Set (Held-Out Data)
# ============================================================================
print("\n" + "="*70)
print("üéØ FINAL TEST SET EVALUATION (HELD-OUT DATA)")
print("="*70)

test_results = trainer.evaluate(test_dataset)

print(f"\n‚úÖ Test Accuracy:  {test_results['eval_accuracy']:.4f}")
print(f"‚úÖ Test Precision: {test_results['eval_precision']:.4f}")
print(f"‚úÖ Test Recall:    {test_results['eval_recall']:.4f}")
print(f"‚úÖ Test F1 Score:  {test_results['eval_f1']:.4f}")
print(f"\nüìä Test Confusion Matrix:")
cm = test_results['eval_confusion_matrix']
print(f"                  Predicted NEDA | Predicted EDA")
print(f"Actual NEDA:      {cm[0][0]:>14} | {cm[0][1]:>13}")
print(f"Actual EDA:       {cm[1][0]:>14} | {cm[1][1]:>13}")



In [None]:
# ============================================================================
# STEP 18: Save the Best Model
# ============================================================================
print("\n" + "="*70)
print("üíæ SAVING MODEL")
print("="*70)

# Save model locally
save_path = './best_funnel_model'
trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)

print(f"‚úÖ Model saved to: {save_path}")
print(f"‚úÖ Tokenizer saved to: {save_path}")


In [None]:
# ============================================================================
# STEP 19: Summary
# ============================================================================
print("\n" + "="*70)
print("üéâ TRAINING PIPELINE COMPLETED SUCCESSFULLY!")
print("="*70)
print(f"\nüìä Final Results Summary:")
print(f"   Training samples:   {len(train_dataset)}")
print(f"   Validation samples: {len(val_dataset)}")
print(f"   Test samples:       {len(test_dataset)}")
print(f"\n   Test Accuracy:  {test_results['eval_accuracy']:.4f}")
print(f"   Test Precision: {test_results['eval_precision']:.4f}")
print(f"   Test Recall:    {test_results['eval_recall']:.4f}")
print(f"   Test F1 Score:  {test_results['eval_f1']:.4f}")
print(f"\nüíæ Model saved at: {save_path}")
print("="*70)

##Gemma 3n

In [None]:
# Install necessary libraries
!pip install -q transformers datasets openpyxl scikit-learn
import numpy as np
import os
import pandas as pd
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
import torch
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

# Check if the required files exist and load them
train_file = '/content/train.xlsx'
test_file = '/content/test.xlsx'

# Ensure files are uploaded
if not os.path.exists(train_file) or not os.path.exists(test_file):
    print("Please upload 'train.xlsx' and 'test.xlsx'.")
else:
    # Load the training and testing data
    train_df = pd.read_excel(train_file)
    test_df = pd.read_excel(test_file)

    # Rename columns to match the expected format (text, label)
    train_df = train_df.rename(columns={'input': 'text', 'output': 'label'})
    test_df = test_df.rename(columns={'input': 'text', 'output': 'label'})

    # Convert labels to numeric values
    train_df['label'] = train_df['label'].astype('category').cat.codes
    test_df['label'] = test_df['label'].astype('category').cat.codes

    # Split the test dataset into validation and test sets
    test_df, val_df = train_test_split(test_df, test_size=0.2, random_state=42)

    # Convert the DataFrames to datasets
    train_dataset = Dataset.from_pandas(train_df)
    test_dataset = Dataset.from_pandas(test_df)
    val_dataset = Dataset.from_pandas(val_df)

    # Combine the datasets into a DatasetDict for easier handling
    datasets = DatasetDict({
        'train': train_dataset,
        'test': test_dataset,
        'validation': val_dataset
    })

    # Initialize the tokenizer and model
    model_name = "google/gemma-3-1b-it"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(train_df['label'].unique()))

    # Tokenize the dataset with a fixed max_length
    def tokenize_function(examples):
        return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=128)

    tokenized_datasets = datasets.map(tokenize_function, batched=True)

    # Remove unnecessary columns
    tokenized_datasets = tokenized_datasets.remove_columns(['text'])

    # Define training arguments
    training_args = TrainingArguments(
        output_dir='./results',
        eval_strategy='epoch',
        save_strategy='epoch',
        learning_rate=1e-4,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=10,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model='f1',
        greater_is_better=True,
        logging_dir='./logs',
        logging_steps=10,
        save_total_limit=2,
        report_to='none',
        push_to_hub=False,
        warmup_ratio=0.1,  # Adjust warmup ratio
        lr_scheduler_type='cosine',  # Use a different scheduler
        seed=42,
        gradient_accumulation_steps=1,  # Adjust gradient accumulation
        max_grad_norm=0.5,  # Reduce gradient clipping
    )

    # Define a compute_metrics function for multiple metrics
    # Define a compute_metrics function for multiple metrics
    def compute_metrics(p):
        predictions, labels = p
        # Ensure predictions are a tensor before applying torch.argmax
        predictions = torch.tensor(predictions) if isinstance(predictions, np.ndarray) else predictions
        predictions = torch.argmax(predictions, axis=-1)

        # Calculate metrics
        accuracy = accuracy_score(labels, predictions)
        precision = precision_score(labels, predictions, average='weighted')
        recall = recall_score(labels, predictions, average='weighted')
        f1 = f1_score(labels, predictions, average='weighted')

        return {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1
        }


    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['validation'],
        compute_metrics=compute_metrics
    )

    # Train the model
    print("\nüöÄ Starting training...")
    trainer.train()

    # Save the trained model
    trainer.save_model('./final_model')

    # Evaluate on the test set manually
    print("\nüöÄ Evaluating on the test dataset...")
    results = trainer.evaluate(tokenized_datasets['test'])
    print(f"Test Results: {results}")


Some weights of Gemma3TextForSequenceClassification were not initialized from the model checkpoint at google/gemma-3-1b-it and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/6472 [00:00<?, ? examples/s]

Map:   0%|          | 0/2698 [00:00<?, ? examples/s]

Map:   0%|          | 0/675 [00:00<?, ? examples/s]


üöÄ Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5612,7.025692,0.001481,4.9e-05,0.001481,9.6e-05
2,3.4726,7.084282,0.005926,3.5e-05,0.005926,7e-05


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


KeyboardInterrupt: 

##Longformer

In [None]:
# ============================================================================
# STEP 1: Install Required Libraries
# ============================================================================
print("üì¶ Installing required libraries...")
!pip install -q transformers datasets openpyxl scikit-learn huggingface_hub

# ============================================================================
# STEP 2: Import Libraries
# ============================================================================
import os
import pandas as pd
import torch
from google.colab import files, userdata
from transformers import LongformerTokenizer, LongformerForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

import numpy as np


# ============================================================================
# STEP 3: Check if the Files are Uploaded
# ============================================================================
if not os.path.exists('/mnt/data/train.xlsx') or not os.path.exists('/mnt/data/test.xlsx'):
    print("üìÅ Please upload the training and test Excel files...")
    train_uploaded = files.upload()
    test_uploaded = files.upload()
else:
    print("‚úÖ Files already uploaded!")

# ============================================================================
# STEP 4: Load Data from Excel Files
# ============================================================================
train_df = pd.read_excel('/content/train.xlsx')
test_df = pd.read_excel('/content/test.xlsx')

print(f"‚úÖ Training data shape: {train_df.shape}")
print(f"‚úÖ Test data shape: {test_df.shape}")

# Inspect the first few rows to ensure data is loaded correctly
print("\nüìã Training data preview:")
print(train_df.head())
print("\nüìã Test data preview:")
print(test_df.head())

# ============================================================================
# STEP 5: Process Labels
# ============================================================================
print("\nüè∑Ô∏è Processing labels...")

# Ensure columns 'input' and 'output' are present in your dataset
def extract_label(output_text):
    """Extract NEDA or EDA from output text"""
    output_text = str(output_text).upper()
    if 'NEDA' in output_text:
        return 'NEDA'
    elif 'EDA' in output_text:
        return 'EDA'
    else:
        return None

# Apply label extraction
train_df['label_text'] = train_df['output'].apply(extract_label)
test_df['label_text'] = test_df['output'].apply(extract_label)

# Remove rows with None labels
train_df = train_df[train_df['label_text'].notna()].reset_index(drop=True)
test_df = test_df[test_df['label_text'].notna()].reset_index(drop=True)

# Create label mapping
label2id = {'NEDA': 0, 'EDA': 1}
id2label = {0: 'NEDA', 1: 'EDA'}

# Convert to numeric labels
train_df['label'] = train_df['label_text'].map(label2id)
test_df['label'] = test_df['label_text'].map(label2id)

print(f"‚úÖ Labels extracted successfully!")
print(f"üìä Training set after label extraction: {len(train_df)} samples")
print(f"üìä Test set after label extraction: {len(test_df)} samples")

# ============================================================================
# STEP 6: Train-Test Split (for Validation)
# ============================================================================
print("\nüìä Splitting training data into training and validation sets...")

train_dataset, val_dataset = train_test_split(train_df, test_size=0.1, random_state=42)

# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_dataset[['input', 'label']])
val_dataset = Dataset.from_pandas(val_dataset[['input', 'label']])
test_dataset = Dataset.from_pandas(test_df[['input', 'label']])

# ============================================================================
# STEP 7: Initialize Tokenizer and Model (Longformer)
# ============================================================================
print("\nü§ñ Loading Longformer model and tokenizer...")

MODEL_NAME = "allenai/longformer-base-4096"

tokenizer = LongformerTokenizer.from_pretrained(MODEL_NAME)
model = LongformerForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    id2label=id2label,
    label2id=label2id
)

# Count parameters
n_params = sum(p.numel() for p in model.parameters())
print(f"‚úÖ Model loaded: {MODEL_NAME}")
print(f"‚úÖ Number of parameters: {n_params:,}")

# ============================================================================
# STEP 8: Tokenize Data
# ============================================================================
print("\nüî§ Tokenizing data...")

def tokenize_function(examples):
    """Tokenize input texts"""
    return tokenizer(
        examples['input'],
        padding='max_length',
        truncation=True,
        max_length=1024   # Longformer-friendly max length
    )

# Tokenize datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Set format for PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

print(f"‚úÖ Training dataset: {len(train_dataset)} samples")
print(f"‚úÖ Validation dataset: {len(val_dataset)} samples")
print(f"‚úÖ Test dataset: {len(test_dataset)} samples")

# ============================================================================
# STEP 9: Define Evaluation Metrics
# ============================================================================
print("\nüìä Setting up evaluation metrics...")

def compute_metrics(eval_pred):
    """Compute metrics for evaluation"""
    logits, labels = eval_pred

    # Convert logits to tensor (if they are numpy arrays)
    logits = torch.tensor(logits) if isinstance(logits, np.ndarray) else logits

    # Get predictions by applying argmax on logits
    predictions = torch.argmax(logits, axis=-1)

    # Compute accuracy, precision, recall, and F1 score
    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='binary', zero_division=0
    )
    cm = confusion_matrix(labels, predictions)

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'confusion_matrix': cm.tolist()
    }


print("‚úÖ Evaluation metrics configured!")

# ============================================================================
# STEP 10: Setup Training Arguments
# ============================================================================
print("\n‚öôÔ∏è Configuring training arguments...")

# Adjusted batch size and sequence length
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=4,  # Reduced batch size
    per_device_eval_batch_size=4,   # Reduced eval batch size
    num_train_epochs=30,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=2,
    report_to='none',  # Disable wandb/tensorboard
    push_to_hub=False,  # Set to True if you want to push to Hub
    optim='adamw_torch',  # Use AdamW optimizer (Adam with weight decay)
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-8,
    fp16=True,  # Enable mixed precision
    gradient_accumulation_steps=2,  # Accumulate gradients over 2 steps
)

print("‚úÖ Training arguments configured!")

# ============================================================================
# STEP 11: Initialize Trainer
# ============================================================================
print("\nüéØ Initializing Trainer...")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

print("‚úÖ Trainer initialized successfully!")

# ============================================================================
# STEP 12: Train the Model
# ============================================================================
print("\nüöÄ Starting training...")

train_result = trainer.train()

print("\n‚úÖ Training completed!")
print(f"üìä Training Loss: {train_result.training_loss:.4f}")
print(f"‚è±Ô∏è Training Time: {train_result.metrics['train_runtime']:.2f} seconds")

# ============================================================================
# STEP 13: Evaluate on Test Set
# ============================================================================
print("\nüìà Evaluating on test set...")

eval_results = trainer.evaluate()

print("\nüìä EVALUATION RESULTS")
print(f"‚úÖ Accuracy:  {eval_results['eval_accuracy']:.4f}")
print(f"‚úÖ Precision: {eval_results['eval_precision']:.4f}")
print(f"‚úÖ Recall:    {eval_results['eval_recall']:.4f}")
print(f"‚úÖ F1 Score:  {eval_results['eval_f1']:.4f}")
print(f"üéØ Confusion Matrix:")
print(f"   {eval_results['eval_confusion_matrix']}")

# ============================================================================
# STEP 14: Save Model
# ============================================================================
print("\nüíæ Saving fine-tuned model...")

model_save_path = './longformer_neda_eda_classifier'
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

print(f"‚úÖ Model saved to: {model_save_path}")

# ============================================================================
# STEP 15: Download Trained Model
# ============================================================================
print("\nüíæ Would you like to download the trained model? (y/n)")
download_choice = input().lower().strip()

if download_choice == 'y':
    print("\nüì¶ Creating zip file...")
    !zip -r longformer_neda_eda_classifier.zip {model_save_path}
    print("‚¨áÔ∏è Downloading model...")
    files.download('longformer_neda_eda_classifier.zip')
    print("‚úÖ Model downloaded successfully!")

print("\nüéâ ALL DONE!")


üì¶ Installing required libraries...
üìÅ Please upload the training and test Excel files...


‚úÖ Training data shape: (6472, 3)
‚úÖ Test data shape: (3373, 3)

üìã Training data preview:
   MSC research database ID  \
0                         1   
1                         1   
2                         1   
3                         1   
4                         1   

                                               input  \
0  Age:31.  Gender:female.  Diagnosis:rrms. Has n...   
1  Age:31.  Gender:female.  Diagnosis:rrms. Has n...   
2  Age:31.  Gender:female.  Diagnosis:rrms. Has n...   
3  Age:31.  Gender:female.  Diagnosis:rrms. Has n...   
4  Age:31.  Gender:female.  Diagnosis:rrms. Has n...   

                                        output  
0  After 7 months the patient will be in NEDA.  
1   After 7 months the patient will be in EDA.  
2   After 0 months the patient will be in EDA.  
3   After 4 months the patient will be in EDA.  
4  After 6 months the patient will be in NEDA.  

üìã Test data preview:
   MSC research database ID  \
0                         7   


Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


‚úÖ Model loaded: allenai/longformer-base-4096
‚úÖ Number of parameters: 148,660,994

üî§ Tokenizing data...


Map:   0%|          | 0/5824 [00:00<?, ? examples/s]

Map:   0%|          | 0/648 [00:00<?, ? examples/s]

Map:   0%|          | 0/3373 [00:00<?, ? examples/s]

‚úÖ Training dataset: 5824 samples
‚úÖ Validation dataset: 648 samples
‚úÖ Test dataset: 3373 samples

üìä Setting up evaluation metrics...
‚úÖ Evaluation metrics configured!

‚öôÔ∏è Configuring training arguments...
‚úÖ Training arguments configured!

üéØ Initializing Trainer...
‚úÖ Trainer initialized successfully!

üöÄ Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Confusion Matrix
1,0.4765,0.619023,0.736111,0.0,0.0,0.0,"[[477, 0], [171, 0]]"
2,0.5237,0.555857,0.733025,0.375,0.017544,0.03352,"[[472, 5], [168, 3]]"
3,0.4777,0.572686,0.734568,0.0,0.0,0.0,"[[476, 1], [171, 0]]"
4,0.589,0.564108,0.731481,0.434783,0.05848,0.103093,"[[464, 13], [161, 10]]"
5,0.5539,0.58107,0.736111,0.0,0.0,0.0,"[[477, 0], [171, 0]]"
6,0.4803,0.568499,0.734568,0.466667,0.040936,0.075269,"[[469, 8], [164, 7]]"
7,0.4736,0.591706,0.734568,0.466667,0.040936,0.075269,"[[469, 8], [164, 7]]"
8,0.558,0.563879,0.742284,0.543478,0.146199,0.230415,"[[456, 21], [146, 25]]"
9,0.2914,0.64721,0.736111,0.5,0.040936,0.075676,"[[470, 7], [164, 7]]"
10,0.481,0.555791,0.734568,0.466667,0.040936,0.075269,"[[469, 8], [164, 7]]"



‚úÖ Training completed!
üìä Training Loss: 0.4215
‚è±Ô∏è Training Time: 11151.35 seconds

üìà Evaluating on test set...



üìä EVALUATION RESULTS
‚úÖ Accuracy:  0.7315
‚úÖ Precision: 0.4845
‚úÖ Recall:    0.2749
‚úÖ F1 Score:  0.3507
üéØ Confusion Matrix:
   [[427, 50], [124, 47]]

üíæ Saving fine-tuned model...
‚úÖ Model saved to: ./longformer_neda_eda_classifier

üíæ Would you like to download the trained model? (y/n)
n

üéâ ALL DONE!


##T5

In [17]:
# Install the necessary libraries
!pip install transformers datasets evaluate huggingface_hub

# Import required libraries
import torch
from datasets import Dataset
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq,
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd
import os
from huggingface_hub import login
from google.colab import userdata
import gc

# --- Authentication using Hugging Face Token ---
hf_token = userdata.get('HF_TOKEN')  # Assuming your HF_TOKEN is saved in Colab's secret storage

if hf_token:
    login(token=hf_token)  # Login using the Hugging Face token
else:
    print("Hugging Face token not found!")

# --- LOAD DATA ---
train_data = pd.read_excel('/content/train.xlsx')  # Replace with your correct file path
test_data = pd.read_excel('/content/test.xlsx')  # Replace with your correct file path

# --- PREPROCESS ---
def preprocess_data(df):
    return pd.DataFrame({
        'input_text': df['input'],
        'labels': df['output'].apply(lambda x: 'EDA' if 'EDA' in x else 'NEDA')  # Use text labels for T5
    })

train_df = preprocess_data(train_data)
test_df = preprocess_data(test_data)

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

split = train_dataset.train_test_split(test_size=0.2)
train_dataset, val_dataset = split["train"], split["test"]

# --- TOKENIZER ---
tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")  # Corrected model path

def tokenize_fn(batch):
    inputs = tokenizer(batch['input_text'], padding="max_length", truncation=True, max_length=128)  # Reduced max_length
    labels = tokenizer(batch['labels'], padding="max_length", truncation=True, max_length=128)  # Reduced max_length
    inputs['labels'] = labels['input_ids']  # Set the 'labels' to the tokenized labels
    return inputs

train_dataset = train_dataset.map(tokenize_fn, batched=True)
val_dataset = val_dataset.map(tokenize_fn, batched=True)
test_dataset = test_dataset.map(tokenize_fn, batched=True)

# --- MODEL ---
model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small")  # Corrected model path

# --- DATA COLLATOR ---
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# --- METRICS ---
def compute_metrics(eval_pred):
    preds, labels = eval_pred
    preds = torch.argmax(torch.tensor(preds), dim=-1)[:, 0]  # Get the first token prediction for classification
    accuracy = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

# --- TRAINING ARGS ---
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy='epoch',  # We evaluate every epoch now
    save_strategy='epoch',  # Save after every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=1,  # Adjusted to 1 to fit in memory
    per_device_eval_batch_size=1,   # Adjusted to 1 to fit in memory
    num_train_epochs=30,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=2,
    report_to='none',  # Disable wandb/tensorboard
    push_to_hub=False,  # Set to True if you want to push to Hugging Face Hub
    optim='adamw_torch',  # Use AdamW optimizer (Adam with weight decay)
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-8,
    fp16=True,  # Enable mixed precision
    gradient_accumulation_steps=2,  # Accumulate gradients over 2 steps
)

# --- TRAINER ---
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# --- CLEAR GPU CACHE BEFORE TRAIN ---
torch.cuda.empty_cache()  # Clear GPU cache
gc.collect()  # Collect unreferenced memory

# --- TRAIN ---
trainer.train()

# --- EVALUATE ---
test_results = trainer.evaluate(test_dataset)
print(test_results)

# --- SAVE MODEL ---
model.save_pretrained("./fine_tuned_t5")




tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/5177 [00:00<?, ? examples/s]

Map:   0%|          | 0/1295 [00:00<?, ? examples/s]

Map:   0%|          | 0/3373 [00:00<?, ? examples/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 18.63 GiB. GPU 0 has a total capacity of 39.56 GiB of which 18.62 GiB is free. Process 17776 has 20.93 GiB memory in use. Of the allocated memory 19.62 GiB is allocated by PyTorch, and 826.01 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

###T5-Small with LoRA

In [2]:
# Install necessary libraries
!pip install transformers datasets evaluate huggingface_hub peft bitsandbytes accelerate

# Import required libraries
import torch
from datasets import Dataset
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq,
    BitsAndBytesConfig,
    TrainerCallback,
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd
import os
from huggingface_hub import login
from google.colab import userdata
import gc
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training

# --- Authentication using Hugging Face Token ---
hf_token = userdata.get('HF_TOKEN')

if hf_token:
    login(token=hf_token)
else:
    print("Hugging Face token not found!")

# --- LOAD DATA ---
train_data = pd.read_excel('/content/train.xlsx')
test_data = pd.read_excel('/content/test.xlsx')

# --- PREPROCESS ---
def preprocess_data(df):
    return pd.DataFrame({
        'input_text': df['input'],
        'labels': df['output'].apply(lambda x: 'EDA' if 'EDA' in x else 'NEDA')
    })

train_df = preprocess_data(train_data)
test_df = preprocess_data(test_data)

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

split = train_dataset.train_test_split(test_size=0.2)
train_dataset, val_dataset = split["train"], split["test"]

# --- TOKENIZER ---
tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")

def tokenize_fn(batch):
    inputs = tokenizer(
        batch['input_text'],
        padding="max_length",
        truncation=True,
        max_length=128
    )
    labels = tokenizer(
        batch['labels'],
        padding="max_length",
        truncation=True,
        max_length=128
    )
    # Replace padding token ids with -100 so they're ignored in loss
    labels['input_ids'] = [
        [-100 if token == tokenizer.pad_token_id else token for token in label]
        for label in labels['input_ids']
    ]
    inputs['labels'] = labels['input_ids']
    return inputs

train_dataset = train_dataset.map(tokenize_fn, batched=True)
val_dataset = val_dataset.map(tokenize_fn, batched=True)
test_dataset = test_dataset.map(tokenize_fn, batched=True)

# --- QLoRA Configuration ---
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16  # Changed to bfloat16 for better stability
)

# --- MODEL ---
model = T5ForConditionalGeneration.from_pretrained(
    "google-t5/t5-small",
    quantization_config=bnb_config,
    device_map="auto"
)

# Prepare model for k-bit training (CRITICAL for QLoRA)
model = prepare_model_for_kbit_training(model)

# Configure LoRA
lora_config = LoraConfig(
    r=8,  # Increased for better performance
    lora_alpha=32,  # 4x the rank
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# --- DATA COLLATOR ---
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# --- METRICS ---
def compute_metrics(eval_pred):
    preds, labels = eval_pred

    # Replace -100 with pad_token_id for decoding
    preds = [[token if token != -100 else tokenizer.pad_token_id for token in pred] for pred in preds]
    labels = [[token if token != -100 else tokenizer.pad_token_id for token in label] for label in labels]

    # Decode predictions
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Convert to binary (EDA=1, NEDA=0)
    pred_binary = [1 if 'EDA' in pred.upper() else 0 for pred in decoded_preds]
    label_binary = [1 if 'EDA' in label.upper() else 0 for label in decoded_labels]

    accuracy = accuracy_score(label_binary, pred_binary)
    precision, recall, f1, _ = precision_recall_fscore_support(
        label_binary, pred_binary, average='binary', zero_division=0
    )
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

# --- MEMORY CLEANUP CALLBACK ---
class MemoryCleanupCallback(TrainerCallback):
    def on_epoch_end(self, args, state, control, **kwargs):
        gc.collect()
        torch.cuda.empty_cache()

# --- TRAINING ARGS ---
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy='epoch',
    save_strategy='epoch',
    learning_rate=3e-4,  # Higher LR for LoRA
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,  # Can be higher for eval
    num_train_epochs=30,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=2,
    report_to='none',
    push_to_hub=False,
    optim='adamw_torch',
    fp16=False,
    bf16=True,  # Use bf16 with quantization
    gradient_accumulation_steps=2,
    gradient_checkpointing=False,  # DISABLED - incompatible with quantized models
    max_grad_norm=1.0,
    dataloader_pin_memory=True,
    dataloader_num_workers=2,
    warmup_steps=100,  # Added warmup
)

# --- TRAINER ---
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[MemoryCleanupCallback()],
)

# --- CLEAR GPU CACHE BEFORE TRAIN ---
torch.cuda.empty_cache()
gc.collect()

# --- TRAIN ---
print("Starting training...")
trainer.train()

# --- EVALUATE ---
print("\nEvaluating on test set...")
test_results = trainer.evaluate(test_dataset)
print(test_results)

# --- SAVE MODEL ---
print("\nSaving model...")
model.save_pretrained("./fine_tuned_t5_lora")
tokenizer.save_pretrained("./fine_tuned_t5_lora")
print("Training complete!")



Map:   0%|          | 0/5177 [00:00<?, ? examples/s]

Map:   0%|          | 0/1295 [00:00<?, ? examples/s]

Map:   0%|          | 0/3373 [00:00<?, ? examples/s]

trainable params: 294,912 || all params: 60,801,536 || trainable%: 0.4850
Starting training...


  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 18.02 GiB. GPU 0 has a total capacity of 39.56 GiB of which 17.95 GiB is free. Process 17714 has 21.60 GiB memory in use. Of the allocated memory 18.54 GiB is allocated by PyTorch, and 2.55 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [7]:
# Install necessary libraries
!pip install transformers datasets evaluate huggingface_hub peft bitsandbytes accelerate

# Import required libraries
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq,
    BitsAndBytesConfig,
    TrainerCallback,
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd
import os
from huggingface_hub import login
from google.colab import userdata
import gc
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training
import evaluate

# --- Authentication using Hugging Face Token ---
hf_token = userdata.get('HF_TOKEN')

if hf_token:
    login(token=hf_token)
else:
    print("Hugging Face token not found!")

# --- LOAD DATA ---
train_data = pd.read_excel('/content/train.xlsx')
test_data = pd.read_excel('/content/test.xlsx')

# --- PREPROCESS DATA ---
def preprocess_data(df):
    """
    Preprocess data for DistilT5
    Adjust based on your task:
    - For classification: input ‚Üí label
    - For text generation: input ‚Üí output
    - For summarization: document ‚Üí summary
    """
    processed_data = []

    for idx, row in df.iterrows():
        # Adjust the input format based on your task
        # Example formats:
        # Classification: "classify: <text>"
        # Summarization: "summarize: <text>"
        # Translation: "translate English to French: <text>"
        # Question Answering: "question: <question> context: <context>"

        # For your EDA/NEDA classification task:
        input_text = f"classify: {row['input']}"
        target_text = 'EDA' if 'EDA' in str(row['output']) else 'NEDA'

        processed_data.append({
            'input_text': input_text,
            'target_text': target_text
        })

    return pd.DataFrame(processed_data)

train_df = preprocess_data(train_data)
test_df = preprocess_data(test_data)

print(f"Training samples: {len(train_df)}")
print(f"Test samples: {len(test_df)}")
print(f"\nSample input: {train_df['input_text'].iloc[0][:100]}...")
print(f"Sample target: {train_df['target_text'].iloc[0]}")

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Create validation split
split = train_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset, val_dataset = split["train"], split["test"]

print(f"\nTrain: {len(train_dataset)}, Val: {len(val_dataset)}, Test: {len(test_dataset)}")

# --- TOKENIZER ---
# Use the model name from the previously successful T5 attempt
model_name = "valhalla/distilt5-qg-hl-6-4"
print(f"\nLoading tokenizer: {model_name}")
# Set extra_ids=0 to prevent conflict with existing special tokens
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, extra_ids=0)

# Define tokenization parameters
max_input_length = 512
max_target_length = 128

def tokenize_fn(batch):
    """Tokenize inputs and targets"""
    # Tokenize inputs
    model_inputs = tokenizer(
        batch['input_text'],
        max_length=max_input_length,
        padding="max_length",
        truncation=True,
    )

    # Tokenize targets
    labels = tokenizer(
        batch['target_text'],
        max_length=max_target_length,
        padding="max_length",
        truncation=True,
    )

    # Replace padding token id with -100 (ignored in loss computation)
    labels['input_ids'] = [
        [-100 if token == tokenizer.pad_token_id else token for token in label]
        for label in labels['input_ids']
    ]

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Apply tokenization
print("\nTokenizing datasets...")
train_dataset = train_dataset.map(tokenize_fn, batched=True, remove_columns=['input_text', 'target_text'])
val_dataset = val_dataset.map(tokenize_fn, batched=True, remove_columns=['input_text', 'target_text'])
test_dataset = test_dataset.map(tokenize_fn, batched=True, remove_columns=['input_text', 'target_text'])
print("Tokenization complete!")

# --- QLoRA CONFIGURATION ---
print("\nConfiguring QLoRA...")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# --- MODEL LOADING ---
print(f"\nLoading model: {model_name}")
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

print("Model loaded successfully!")
print(f"Model device: {model.device}")

# Prepare model for k-bit training
print("\nPreparing model for k-bit training...")
model = prepare_model_for_kbit_training(model)

# --- LoRA CONFIGURATION ---
print("\nConfiguring LoRA...")
lora_config = LoraConfig(
    r=16,  # Rank - adjust based on task complexity (8-32)
    lora_alpha=32,  # Alpha - typically 2-4x the rank
    target_modules=["q", "v"],  # Target attention layers (can add "k", "o" for more params)
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
)

# Apply LoRA
model = get_peft_model(model, lora_config)
print("\nTrainable parameters:")
model.print_trainable_parameters()

# --- DATA COLLATOR ---
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True,
)

# --- METRICS ---
def compute_metrics(eval_pred):
    """Compute metrics for classification"""
    predictions, labels = eval_pred

    # Replace -100 with pad_token_id
    predictions = [[token if token != -100 else tokenizer.pad_token_id for token in pred] for pred in predictions]
    labels = [[token if token != -100 else tokenizer.pad_token_id for token in label] for label in labels]

    # Decode
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Clean up predictions and labels
    decoded_preds = [pred.strip().upper() for pred in decoded_preds]
    decoded_labels = [label.strip().upper() for label in decoded_labels]

    # Convert to binary for classification (EDA=1, NEDA=0)
    pred_binary = [1 if 'EDA' in pred else 0 for pred in decoded_preds]
    label_binary = [1 if 'EDA' in label else 0 for label in decoded_labels]

    # Compute metrics
    accuracy = accuracy_score(label_binary, pred_binary)
    precision, recall, f1, _ = precision_recall_fscore_support(
        label_binary, pred_binary, average='binary', zero_division=0
    )

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

# --- MEMORY CLEANUP CALLBACK ---
class MemoryCleanupCallback(TrainerCallback):
    def on_epoch_end(self, args, state, control, **kwargs):
        gc.collect()
        torch.cuda.empty_cache()

    def on_log(self, args, state, control, logs=None, **kwargs):
        if state.global_step % 100 == 0:
            gc.collect()
            torch.cuda.empty_cache()

# --- TRAINING ARGUMENTS ---
print("\nSetting up training arguments...")
training_args = TrainingArguments(
    output_dir='./results_distilt5',
    eval_strategy='epoch',
    save_strategy='epoch',
    learning_rate=5e-4,  # Higher LR for LoRA (3e-4 to 1e-3)
    per_device_train_batch_size=4,  # Adjust based on your GPU memory
    per_device_eval_batch_size=8,
    num_train_epochs=20,  # Adjust based on dataset size
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    greater_is_better=True,
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=2,
    report_to='none',
    push_to_hub=False,
    optim='adamw_torch', # Changed optimizer back
    fp16=False, # Keep as False if using bf16
    bf16=True,  # Use bf16 with quantization
    gradient_accumulation_steps=2,  # Effective batch size = 4 * 2 = 8
    gradient_checkpointing=True,  # Enable gradient checkpointing to save memory
    max_grad_norm=1.0, # Keep max_grad_norm
    dataloader_pin_memory=True,
    dataloader_num_workers=2,
    warmup_steps=100,
    warmup_ratio=0.1,
    lr_scheduler_type='cosine',
    seed=42,
)

# --- TRAINER ---
print("\nInitializing Trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[MemoryCleanupCallback()],
)

# --- CLEAR GPU CACHE ---
torch.cuda.empty_cache()
gc.collect()

# --- TRAIN ---
print("\n" + "="*60)
print("STARTING TRAINING")
print("="*60)
trainer.train()

# --- EVALUATE ON TEST SET ---
print("\n" + "="*60)
print("EVALUATING ON TEST SET")
print("="*60)
test_results = trainer.evaluate(test_dataset)
print("\nTest Results:")
for key, value in test_results.items():
    print(f"  {key}: {value:.4f}")

# --- SAVE MODEL ---
print("\n" + "="*60)
print("SAVING MODEL")
print("="*60)
output_dir = "./fine_tuned_distilt5_lora"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"‚úÖ LoRA adapters saved to: {output_dir}")

# Save merged model (optional - combines LoRA weights with base model)
print("\nSaving merged model...")
try:
    merged_model = model.merge_and_unload()
    merged_output_dir = f"{output_dir}_merged"
    merged_model.save_pretrained(merged_output_dir)
    tokenizer.save_pretrained(merged_output_dir)
    print(f"‚úÖ Merged model saved to: {merged_output_dir}")
except Exception as e:
    print(f"‚ö†Ô∏è  Could not save merged model: {e}")

print("\n" + "="*60)
print("TRAINING COMPLETE!")
print("="*60)

# --- INFERENCE EXAMPLE ---
print("\n" + "="*60)
print("TESTING INFERENCE")
print("="*60)

def predict(text, max_length=128):
    """Make a prediction on new text"""
    input_text = f"classify: {text}"
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            num_beams=4,
            early_stopping=True,
            no_repeat_ngram_size=2,
        )

    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return prediction.strip()

# Test on samples from test set
# Using train_df for sample inputs as test_df columns were removed after tokenization
if len(train_df) > 0:
    num_samples = min(3, len(train_df))
    print(f"\nTesting on {num_samples} random samples:\n")

    # Get random indices from original train_df to get input text and target label
    sample_indices = np.random.choice(len(train_df), num_samples, replace=False)

    for i in range(num_samples):
        idx = sample_indices[i]
        # Need to use original DataFrame structure to get input and target text
        sample_input = train_df['input_text'].iloc[idx].replace("classify: ", "")
        sample_target = train_df['target_text'].iloc[idx]

        prediction = predict(sample_input)

        print(f"Sample {i+1}:")
        print(f"  Input: {sample_input[:100]}...")
        print(f"  Expected: {sample_target}")
        print(f"  Predicted: {prediction}")
        print(f"  Correct: {'‚úÖ' if prediction.upper() == sample_target.upper() else '‚ùå'}")
        print()

print("\nüéâ All done!")

Training samples: 6472
Test samples: 3373

Sample input: classify: Age:31.  Gender:female.  Diagnosis:rrms. Has not converted to SPMS. Initial presentation c...
Sample target: EDA

Train: 5177, Val: 1295, Test: 3373

Loading tokenizer: valhalla/distilt5-qg-hl-6-4


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565



Tokenizing datasets...


Map:   0%|          | 0/5177 [00:00<?, ? examples/s]

Map:   0%|          | 0/1295 [00:00<?, ? examples/s]

Map:   0%|          | 0/3373 [00:00<?, ? examples/s]

Tokenization complete!

Configuring QLoRA...

Loading model: valhalla/distilt5-qg-hl-6-4


pytorch_model.bin:   0%|          | 0.00/208M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/208M [00:00<?, ?B/s]

Model loaded successfully!
Model device: cuda:0

Preparing model for k-bit training...

Configuring LoRA...

Trainable parameters:
trainable params: 458,752 || all params: 52,560,384 || trainable%: 0.8728

Setting up training arguments...

Initializing Trainer...

STARTING TRAINING


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch,Training Loss,Validation Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 17.51 GiB. GPU 0 has a total capacity of 39.56 GiB of which 17.44 GiB is free. Process 49861 has 22.11 GiB memory in use. Of the allocated memory 18.76 GiB is allocated by PyTorch, and 2.84 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)