In [1]:
import pandas as pd
import numpy as np
import torch
import ast
import os
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForMultipleChoice
from torch.optim import AdamW
from tqdm.auto import tqdm
from torch.profiler import profile, record_function, ProfilerActivity
from codecarbon import EmissionsTracker
import time
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt

In [2]:
MODEL_NAME = "microsoft/deberta-v3-small"
BATCH_SIZE = 4
EPOCHS = 3
LEARNING_RATE = 2e-5
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if torch.cuda.is_available():
    print(f"Success! Training on: {torch.cuda.get_device_name(0)}")
else:
    raise RuntimeError("GPU required for Profiler.")

Success! Training on: NVIDIA GeForce RTX 2070


In [3]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

train_df['answers'] = train_df['answers'].apply(ast.literal_eval)
test_df['answers'] = test_df['answers'].apply(ast.literal_eval)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)

class AMLDataset(Dataset):
    def __init__(self, df, tokenizer, is_test=False):
        self.df = df
        self.tokenizer = tokenizer
        self.is_test = is_test
    def __len__(self): return len(self.df)
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        contexts = [row['context']] * 4
        candidates = [f"{row['question']} {opt}" for opt in row['answers']]
        tokenized = self.tokenizer(contexts, candidates, padding="max_length", truncation=True, max_length=256, return_tensors="pt")
        item = {k: v for k, v in tokenized.items()}
        if not self.is_test: item['labels'] = torch.tensor(row['label'])
        return item

from sklearn.model_selection import train_test_split
train_data, val_data = train_test_split(train_df, test_size=0.1, random_state=42)
train_loader = DataLoader(AMLDataset(train_data, tokenizer), batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(AMLDataset(val_data, tokenizer), batch_size=BATCH_SIZE)
test_loader = DataLoader(AMLDataset(test_df, tokenizer, is_test=True), batch_size=BATCH_SIZE)

In [4]:
def measure_empirical_flops(model, loader):
    print("\nProfiling FLOPs ")
    model.train()
    
    # Create a dummy optimizer/scaler just for this one step
    optimizer = AdamW(model.parameters(), lr=2e-5)
    scaler = torch.amp.GradScaler('cuda')
    batch = next(iter(loader))
    
    input_ids = batch['input_ids'].to(DEVICE)
    attention_mask = batch['attention_mask'].to(DEVICE)
    labels = batch['labels'].to(DEVICE)

    # Run Profiler for 1 Step 
    with profile(
        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
        record_shapes=True,
        with_flops=True
    ) as prof:
        with record_function("model_train_step"):
            # Forward
            with torch.amp.autocast('cuda'):
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
            # Backward
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

    # 
    # Summing up all floating point operations recorded by the kernel
    flops_per_step = sum([e.flops for e in prof.key_averages()])
    
    print(f"Measured: {flops_per_step:,.0f} FLOPs per step")
    return flops_per_step

# Init Model & Measure
model = AutoModelForMultipleChoice.from_pretrained(MODEL_NAME).to(DEVICE)
flops_per_step = measure_empirical_flops(model, train_loader)

# Extrapolate to full training
total_steps = len(train_loader) * EPOCHS
empirical_total_flops = flops_per_step * total_steps
empirical_teraflops = empirical_total_flops / 1e12

print(f"Estimated Total Work: {empirical_teraflops:.4f} TeraFLOPs")

Some weights of DebertaV2ForMultipleChoice were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Profiling FLOPs 
Measured: 1,249,365,480,468 FLOPs per step
Estimated Total Work: 3913.0127 TeraFLOPs


In [5]:
# Trainig and tracking energy with CodeCarbon
print("\nFull Training with Energy Tracking")

# Re-initialize model/optimizer for clean training
model = AutoModelForMultipleChoice.from_pretrained(MODEL_NAME).to(DEVICE)
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
scaler = torch.amp.GradScaler('cuda')

# Start Tracker
tracker = EmissionsTracker(
    project_name="DeBERTa_Baseline_Full", 
    output_file="emissions_deberta.csv"
)
tracker.start()
start_time = time.time()

try:
    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0
        loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}")
        
        for batch in loop:
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            labels = batch['labels'].to(DEVICE)
            
            optimizer.zero_grad()
            with torch.amp.autocast('cuda'):
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
            
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            
            total_loss += loss.item()
            loop.set_postfix(loss=f"{loss.item():.4f}")
            
        # Validation
        model.eval()
        val_correct = 0
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(DEVICE)
                attention_mask = batch['attention_mask'].to(DEVICE)
                labels = batch['labels'].to(DEVICE)
                
                with torch.amp.autocast('cuda'):
                    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                val_correct += (torch.argmax(outputs.logits, dim=1) == labels).sum().item()
        
        print(f"Epoch {epoch+1} Val Acc: {val_correct/len(val_data):.4f}")
        tracker.flush()

finally:
    emissions = tracker.stop()
    duration = time.time() - start_time


    print("\n" + "="*40)
    print("SUSTAINABILITY Report")
    print("="*40)
    print(f"1. COMPUTATIONAL WORK (Torch Profiler)")
    print(f"   - Per Step: {flops_per_step:,.0f} FLOPs")
    print(f"   - Total:    {empirical_teraflops:.4f} TFLOPs")
    print("-" * 40)
    print(f"2. ENVIRONMENTAL IMPACT (CodeCarbon)")
    if emissions:
        print(f"Carbon:   {emissions:.6f} kg CO2")
        print(f"Energy:   {tracker.final_emissions_data.energy_consumed:.6f} kWh")
        print(f"Training Time:   {duration:.4f} seconds")
    else:
        print("   - Energy:   Run too short to measure.")
    print("="*40)


Full Training with Energy Tracking


Some weights of DebertaV2ForMultipleChoice were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[codecarbon INFO @ 15:24:44] [setup] RAM Tracking...
[codecarbon INFO @ 15:24:44] [setup] CPU Tracking...
 Windows OS detected: Please install Intel Power Gadget to measure CPU

[codecarbon INFO @ 15:24:46] CPU Model on constant consumption mode: Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz
[codecarbon INFO @ 15:24:46] [setup] GPU Tracking...
[codecarbon INFO @ 15:24:46] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 15:24:46] The below tracking methods have been set up:
                RAM Tracking Method: RAM power estimation model
                CPU Tracking Method: global constant
                GPU Tracking Method: pynvml
            
[codecar

Epoch 1/3:   0%|          | 0/1044 [00:00<?, ?it/s]

[codecarbon INFO @ 15:25:05] Energy consumed for RAM : 0.000083 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 15:25:05] Delta energy consumed for CPU with constant : 0.001126 kWh, power : 270.0 W
[codecarbon INFO @ 15:25:05] Energy consumed for All CPU : 0.001126 kWh
[codecarbon INFO @ 15:25:05] Energy consumed for all GPUs : 0.000447 kWh. Total GPU Power : 107.10049612694785 W
[codecarbon INFO @ 15:25:05] 0.001656 kWh of electricity and 0.000000 L of water were used since the beginning.
[codecarbon INFO @ 15:25:20] Energy consumed for RAM : 0.000167 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 15:25:20] Delta energy consumed for CPU with constant : 0.001126 kWh, power : 270.0 W
[codecarbon INFO @ 15:25:20] Energy consumed for All CPU : 0.002252 kWh
[codecarbon INFO @ 15:25:20] Energy consumed for all GPUs : 0.000909 kWh. Total GPU Power : 110.7445405716778 W
[codecarbon INFO @ 15:25:20] 0.003328 kWh of electricity and 0.000000 L of water were used since the beginning.
[codecarbon INFO @ 15

Epoch 1 Val Acc: 0.3642


Epoch 2/3:   0%|          | 0/1044 [00:00<?, ?it/s]

[codecarbon INFO @ 15:28:20] Energy consumed for RAM : 0.001166 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 15:28:20] Delta energy consumed for CPU with constant : 0.000508 kWh, power : 270.0 W
[codecarbon INFO @ 15:28:20] Energy consumed for All CPU : 0.015751 kWh
[codecarbon INFO @ 15:28:20] Energy consumed for all GPUs : 0.006381 kWh. Total GPU Power : 108.7614609130633 W
[codecarbon INFO @ 15:28:20] 0.023299 kWh of electricity and 0.000000 L of water were used since the beginning.
[codecarbon INFO @ 15:28:35] Energy consumed for RAM : 0.001249 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 15:28:35] Delta energy consumed for CPU with constant : 0.001124 kWh, power : 270.0 W
[codecarbon INFO @ 15:28:35] Energy consumed for All CPU : 0.016875 kWh
[codecarbon INFO @ 15:28:35] Energy consumed for all GPUs : 0.006833 kWh. Total GPU Power : 108.48894660750383 W
[codecarbon INFO @ 15:28:35] 0.024957 kWh of electricity and 0.000000 L of water were used since the beginning.
[codecarbon INFO @ 15

Epoch 2 Val Acc: 0.4375


Epoch 3/3:   0%|          | 0/1044 [00:00<?, ?it/s]

[codecarbon INFO @ 15:31:50] Energy consumed for RAM : 0.002333 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 15:31:50] Delta energy consumed for CPU with constant : 0.000606 kWh, power : 270.0 W
[codecarbon INFO @ 15:31:50] Energy consumed for All CPU : 0.031506 kWh
[codecarbon INFO @ 15:31:50] Energy consumed for all GPUs : 0.012721 kWh. Total GPU Power : 107.35594164985093 W
[codecarbon INFO @ 15:31:50] 0.046560 kWh of electricity and 0.000000 L of water were used since the beginning.
[codecarbon INFO @ 15:32:05] Energy consumed for RAM : 0.002416 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 15:32:05] Delta energy consumed for CPU with constant : 0.001126 kWh, power : 270.0 W
[codecarbon INFO @ 15:32:05] Energy consumed for All CPU : 0.032632 kWh
[codecarbon INFO @ 15:32:05] Energy consumed for all GPUs : 0.013176 kWh. Total GPU Power : 109.07251508752373 W
[codecarbon INFO @ 15:32:05] 0.048224 kWh of electricity and 0.000000 L of water were used since the beginning.
[codecarbon INFO @ 1

Epoch 3 Val Acc: 0.4397

SUSTAINABILITY Report
1. COMPUTATIONAL WORK (Torch Profiler)
   - Per Step: 1,249,365,480,468 FLOPs
   - Total:    3913.0127 TFLOPs
----------------------------------------
2. ENVIRONMENTAL IMPACT (CodeCarbon)
Carbon:   0.018473 kg CO2
Energy:   0.069027 kWh
Training Time:   623.4577 seconds


In [6]:
print("Generating submission")
model.eval()
predictions = []
with torch.no_grad():
    for batch in tqdm(test_loader):
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        with torch.amp.autocast('cuda'):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predictions.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())

pd.DataFrame({'id': test_df['id'], 'label': predictions}).to_csv('submission_baseline_debert_final.csv', index=False)
print("Done!")

Generating submission


  0%|          | 0/125 [00:00<?, ?it/s]

Done!
