<a href="https://colab.research.google.com/github/MHHamdan/LLM_Reasoning/blob/main/arena_(1).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd

# Parquet locations of the two competition splits
train_path = "/content/drive/MyDrive/GenerativeAI/wsdm-cup-multilingual-chatbot-arena/train.parquet"
test_path = "/content/drive/MyDrive/GenerativeAI/wsdm-cup-multilingual-chatbot-arena/test.parquet"

# Read both splits into DataFrames in one pass
train, test = (pd.read_parquet(path) for path in (train_path, test_path))

# Report (rows, columns) for each split
print("Train shape:", train.shape)
print("Test shape:", test.shape)


Train shape: (48439, 8)
Test shape: (3, 5)


In [4]:
# Count nulls per column before relying on the text fields
print(train.isnull().sum())

# Whitespace-delimited word counts for the prompt and both responses
for _text_col, _length_col in (
    ('prompt', 'prompt_length'),
    ('response_a', 'response_a_length'),
    ('response_b', 'response_b_length'),
):
    train[_length_col] = train[_text_col].apply(lambda text: len(text.split()))

# Summary statistics of the three length columns (displayed as the cell output)
train[['prompt_length', 'response_a_length', 'response_b_length']].describe()


id            0
prompt        0
response_a    0
response_b    0
winner        0
model_a       0
model_b       0
language      0
dtype: int64


Unnamed: 0,prompt_length,response_a_length,response_b_length
count,48439.0,48439.0,48439.0
mean,121.658808,293.452714,294.143294
std,364.760289,276.27887,275.31928
min,0.0,1.0,1.0
25%,9.0,94.0,94.0
50%,22.0,241.0,242.0
75%,73.0,419.0,421.0
max,7160.0,6476.0,6061.0


In [6]:
from transformers import AutoTokenizer

# Load the tokenizer from the local bert-base-multilingual-cased directory
tokenizer = AutoTokenizer.from_pretrained(
    "/content/drive/MyDrive/GenerativeAI/bert-base-multilingual-cased"
)


In [7]:
def tokenize_and_chunk(text, tokenizer, max_length=512):
    """
    Tokenize `text` and split it into chunks of at most `max_length` token ids.

    The text is encoded without special tokens, cut into windows that leave
    room for the tokenizer's special tokens, and every window is then wrapped
    with those special tokens. Each chunk is therefore a complete input on its
    own, and no chunk exceeds `max_length`.

    Args:
        text: The string to tokenize.
        tokenizer: A tokenizer exposing `encode`, `num_special_tokens_to_add`
            and `build_inputs_with_special_tokens`.
        max_length: Maximum number of ids per chunk, special tokens included.

    Returns:
        A `(token_count, chunks)` tuple. `token_count` is the length of the
        text encoded as ONE sequence with a single set of special tokens.
        `chunks` is a list of id lists; an empty text yields one chunk that
        contains only the special tokens. If tokenization fails, the error is
        printed and `(0, [])` is returned.

    Raises:
        ValueError: If `max_length` leaves no room for any content token.
    """
    special_count = tokenizer.num_special_tokens_to_add(pair=False)
    window = max_length - special_count
    if window < 1:
        # A bad max_length is a configuration error, not a per-row tokenization
        # failure, so it is reported loudly instead of being swallowed below.
        raise ValueError(
            f"max_length={max_length} leaves no room for content after "
            f"{special_count} special tokens"
        )

    try:
        content_ids = tokenizer.encode(text, add_special_tokens=False)
    except Exception as e:
        # str() keeps this handler from failing itself when `text` cannot be
        # sliced (e.g. None); for real strings the message is unchanged.
        print(f"Error tokenizing text: {str(text)[:50]}... -> {e}")
        return 0, []  # Best-effort: signal failure with 0 tokens and no chunks

    token_count = len(content_ids) + special_count

    # max(..., 1) guarantees one (special-tokens-only) chunk for empty content.
    window_starts = range(0, max(len(content_ids), 1), window)
    chunks = [
        tokenizer.build_inputs_with_special_tokens(content_ids[start:start + window])
        for start in window_starts
    ]
    return token_count, chunks


In [8]:
# Tokenize and chunk each text column, storing the token count and the chunk
# list in two sibling columns per source column.
for _text_col, _count_col, _chunk_col in (
    ('prompt', 'prompt_tokens', 'prompt_chunks'),
    ('response_a', 'response_a_tokens', 'response_a_chunks'),
    ('response_b', 'response_b_tokens', 'response_b_chunks'),
):
    _per_row = train[_text_col].apply(lambda x: tokenize_and_chunk(x, tokenizer))
    # Transpose the per-row (count, chunks) pairs into two parallel tuples
    train[_count_col], train[_chunk_col] = zip(*_per_row)

# Summarise the token counts
print(train[['prompt_tokens', 'response_a_tokens', 'response_b_tokens']].describe())


Token indices sequence length is longer than the specified maximum sequence length for this model (694 > 512). Running this sequence through the model will result in indexing errors


       prompt_tokens  response_a_tokens  response_b_tokens
count   48439.000000       48439.000000       48439.000000
mean      278.250046         624.627882         624.449245
std       892.652223         569.023964         555.217057
min         2.000000           3.000000           2.000000
25%        20.000000         249.000000         250.000000
50%        46.000000         521.000000         522.000000
75%       150.000000         843.000000         845.000000
max     29875.000000       17174.000000       12604.000000


In [9]:
import torch

def process_chunks(chunks, model, tokenizer):
    """
    Run every tokenized chunk through `model` and average the resulting logits.

    Inference runs with gradient tracking disabled and with the model
    temporarily switched to eval mode; the model's previous train/eval mode is
    restored before returning.

    Args:
        chunks: Non-empty sequence of token-id lists, one list per chunk.
        model: A torch module whose forward accepts `input_ids` and
            `attention_mask` and returns an object with a `.logits` tensor.
        tokenizer: Unused; kept so existing call sites keep working.

    Returns:
        A numpy array holding the element-wise mean of the per-chunk logits
        (each chunk contributes equally, regardless of its length).

    Raises:
        ValueError: If `chunks` is empty, since there is nothing to average.
    """
    if len(chunks) == 0:
        raise ValueError("process_chunks received no chunks to score")

    # The device does not change between chunks, so look it up once.
    model_device = next(model.parameters()).device

    was_training = model.training
    model.eval()
    results = []
    try:
        # no_grad avoids building an autograd graph for every forward pass.
        with torch.no_grad():
            for chunk in chunks:
                input_ids = torch.tensor([chunk], device=model_device)  # Add batch dimension
                inputs = {
                    "input_ids": input_ids,
                    # Every position is a real token (no padding), so mask is all ones
                    "attention_mask": torch.ones_like(input_ids),
                }
                outputs = model(**inputs)
                results.append(outputs.logits.detach().cpu().numpy())
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)

    # Aggregate results (unweighted mean over chunks)
    return sum(results) / len(results)


In [11]:
from transformers import AutoModelForSequenceClassification

# Load the checkpoint with a 2-label sequence-classification head
model = AutoModelForSequenceClassification.from_pretrained(
    "/content/drive/MyDrive/GenerativeAI/bert-base-multilingual-cased",
    num_labels=2,
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/GenerativeAI/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Prefer the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Score the chunks of each response and keep the pooled logits per row
for _chunk_col, _logit_col in (
    ('response_a_chunks', 'response_a_logits'),
    ('response_b_chunks', 'response_b_logits'),
):
    train[_logit_col] = train[_chunk_col].apply(
        lambda chunks: process_chunks(chunks, model, tokenizer)
    )

# Peek at the first few rows of logits
print(train[['response_a_logits', 'response_b_logits']].head())


In [None]:
import numpy as np

# Determine the predicted winner based on logits
def predict_winner(logits_a, logits_b):
    """
    Predict which response wins by comparing their class-1 logits.

    A two-way softmax over the two class-1 logits gives response A a
    probability above 0.5 exactly when its logit is the larger one, so the
    logits are compared directly. This avoids `exp` overflow/underflow, which
    turned the probability into NaN for large-magnitude logits.

    Args:
        logits_a: Logits for response A, indexable as `logits_a[0][1]`.
        logits_b: Logits for response B, indexable as `logits_b[0][1]`.

    Returns:
        "model_a" if A's class-1 logit is strictly greater than B's,
        otherwise "model_b" (ties go to "model_b").
    """
    score_a = logits_a[0][1]
    score_b = logits_b[0][1]
    return "model_a" if score_a > score_b else "model_b"

# Compare the two logit columns row by row to label each pair's winner
train['predicted_winner'] = [
    predict_winner(logits_a, logits_b)
    for logits_a, logits_b in zip(train['response_a_logits'], train['response_b_logits'])
]

# Show the logits next to the resulting prediction
print(train[['response_a_logits', 'response_b_logits', 'predicted_winner']].head())


In [None]:
# Fraction of rows whose predicted winner matches the recorded winner.
# NOTE(review): fine-tuning only happens in the final cell, so at this point
# the classifier head is still newly initialised — treat this as a baseline.
accuracy = train['predicted_winner'].eq(train['winner']).mean()
print(f"Training Accuracy: {accuracy:.2%}")


In [None]:
# Tokenize and chunk each text column of the test split, storing the token
# count and the chunk list in two sibling columns per source column.
for _text_col, _count_col, _chunk_col in (
    ('prompt', 'prompt_tokens', 'prompt_chunks'),
    ('response_a', 'response_a_tokens', 'response_a_chunks'),
    ('response_b', 'response_b_tokens', 'response_b_chunks'),
):
    _per_row = test[_text_col].apply(lambda x: tokenize_and_chunk(x, tokenizer))
    # Transpose the per-row (count, chunks) pairs into two parallel tuples
    test[_count_col], test[_chunk_col] = zip(*_per_row)

# Confirm the new token-count columns exist
print(test[['prompt_tokens', 'response_a_tokens', 'response_b_tokens']].head())


In [None]:
# Score the chunks of each test response and keep the pooled logits per row
for _chunk_col, _logit_col in (
    ('response_a_chunks', 'response_a_logits'),
    ('response_b_chunks', 'response_b_logits'),
):
    test[_logit_col] = test[_chunk_col].apply(
        lambda chunks: process_chunks(chunks, model, tokenizer)
    )

# Label the winner of every test pair from the two logit columns
test['predicted_winner'] = [
    predict_winner(logits_a, logits_b)
    for logits_a, logits_b in zip(test['response_a_logits'], test['response_b_logits'])
]


In [None]:
from transformers import TrainingArguments, Trainer

# Define training arguments
# NOTE(review): recent transformers releases call this option `eval_strategy`;
# rename `evaluation_strategy` if the installed version rejects it — confirm
# against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    learning_rate=2e-5,
)


class ArenaPairDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding one encoded example plus its label per index.

    Trainer needs an indexable dataset whose items are dicts containing a
    `labels` entry; a raw tokenizer output carries no labels and is not
    indexable per example in that form.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings  # mapping: field name -> (n_examples, seq_len) tensor
        self.labels = labels        # (n_examples,) integer tensor

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {name: tensor[idx] for name, tensor in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item


def build_pair_dataset(frame):
    """Encode `frame` as (prompt, response_a [SEP] response_b) pairs with labels.

    The tokenizer accepts at most two text segments per example, so the two
    responses are joined with the separator token and passed as the second
    segment. Label 0 means response A won, label 1 means response B won
    (assumes `winner` holds "model_a"/"model_b", as compared against in the
    accuracy cell).

    Returns:
        `(encodings, dataset)` — the tokenizer output and the dataset wrapping it.
    """
    joined_responses = (
        frame['response_a'] + f" {tokenizer.sep_token} " + frame['response_b']
    ).tolist()
    encodings = tokenizer(
        frame['prompt'].tolist(),
        joined_responses,
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors="pt",
    )
    labels = torch.tensor((frame['winner'] == 'model_b').astype(int).tolist())
    return encodings, ArenaPairDataset(encodings, labels)


# Hold out 10% of the rows so the per-epoch evaluation requested in
# `training_args` has data to run on. The split is positional and seeded so it
# is reproducible and does not depend on the frame's index.
_shuffled = torch.randperm(len(train), generator=torch.Generator().manual_seed(42)).tolist()
_n_eval = max(1, len(train) // 10)
eval_frame = train.iloc[_shuffled[:_n_eval]]
train_frame = train.iloc[_shuffled[_n_eval:]]

# Prepare the datasets for training
train_encodings, train_dataset = build_pair_dataset(train_frame)
eval_encodings, eval_dataset = build_pair_dataset(eval_frame)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Fine-tune the model
trainer.train()
