# **IR-Electra**
This notebook fine-tunes a pre-trained ELECTRA model for document relevance classification in Information Retrieval (IR).

Steps:
Generate Synthetic Dataset: Create query-document pairs for three categories (sports, tech, health), with labels indicating relevance (1 for relevant, 0 for non-relevant).

Preprocess Dataset: Use a custom dataset class to tokenize the query-document pairs and format them for model input.

Fine-Tune ELECTRA: Fine-tune the ELECTRA model on the dataset to classify documents as relevant or non-relevant.

Evaluate: Test the model on a separate test set and measure accuracy.

In [2]:
import random
import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from transformers import ElectraTokenizer, ElectraForSequenceClassification, Trainer, TrainingArguments

In [3]:
# Keyword vocabulary per topic; the synthetic queries and documents are built
# by sampling from these lists.
topics = dict(
    sports=["football", "basketball", "tennis", "match", "score"],
    tech=["AI", "machine learning", "cloud", "software", "algorithm"],
    health=["diet", "exercise", "nutrition", "wellness", "fitness"],
)

def generate_synthetic_data(num_samples=1000, seed=None, topic_keywords=None):
    """Build a synthetic query/document relevance dataset.

    For each sample a topic and one of its keywords form the query
    ("What is <keyword>?"). Two rows are emitted per sample: one document
    generated from the same topic (label 1) and one generated from a
    different, randomly chosen topic (label 0).

    Args:
        num_samples: Number of queries to generate; the result has
            ``2 * num_samples`` rows.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the output is reproducible and the global RNG state is
            left untouched. When ``None`` the module-level ``random``
            functions are used, exactly as before.
        topic_keywords: Optional mapping ``topic name -> list of keywords``.
            Defaults to the notebook-level ``topics`` dictionary.

    Returns:
        pandas.DataFrame with columns ``query``, ``document`` and ``label``.

    Raises:
        ValueError: If fewer than two topics are supplied, or a topic has
            fewer than three keywords (three are sampled per document).
    """
    keyword_map = topics if topic_keywords is None else topic_keywords
    topic_names = list(keyword_map)

    if len(topic_names) < 2:
        raise ValueError("at least two topics are required to build non-relevant documents")
    for name in topic_names:
        if len(keyword_map[name]) < 3:
            raise ValueError(f"topic {name!r} needs at least 3 keywords")

    # Fall back to the global RNG when no seed is given so that a caller's
    # earlier random.seed(...) still controls the output.
    rng = random if seed is None else random.Random(seed)

    def describe(name):
        """Render the templated document text for one topic."""
        keywords = keyword_map[name]
        listed = ", ".join(rng.sample(keywords, 3))
        return f"{name.capitalize()} is about {listed}. It involves {rng.choice(keywords)}."

    records = []
    for _ in range(num_samples):
        topic = rng.choice(topic_names)
        query = f"What is {rng.choice(keyword_map[topic])}?"
        relevant_doc = describe(topic)
        other_topic = rng.choice([name for name in topic_names if name != topic])
        non_relevant_doc = describe(other_topic)
        records.append({"query": query, "document": relevant_doc, "label": 1})
        records.append({"query": query, "document": non_relevant_doc, "label": 0})

    # Explicit columns keep the schema stable even when num_samples == 0.
    return pd.DataFrame(records, columns=["query", "document", "label"])

# Seed Python's RNG so the synthetic dataset is identical on every
# Restart & Run All; the generator draws from the `random` module.
DATA_SEED = 42
random.seed(DATA_SEED)
dataset = generate_synthetic_data(1000)


In [4]:
class IRDataset(Dataset):
    """Torch dataset that tokenizes query/document pairs for classification.

    Each item joins the query and document into the single string
    ``"<query> [SEP] <document>"``, tokenizes it to a fixed length, and
    returns the tensors a sequence-classification model expects.

    Args:
        dataframe: Frame with ``query``, ``document`` and ``label`` columns.
        tokenizer: Callable tokenizer returning a mapping with ``input_ids``
            and ``attention_mask`` tensors of shape ``(1, max_length)``.
        max_length: Length every example is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_length=256):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of query/document rows."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for row ``idx``."""
        # Fetch the row once instead of performing three positional lookups.
        row = self.data.iloc[idx]
        input_text = f"{row['query']} [SEP] {row['document']}"
        encoding = self.tokenizer(
            input_text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        return {
            # squeeze(0) drops only the batch axis; a bare squeeze() would
            # also collapse the sequence axis when max_length == 1.
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(int(row["label"]), dtype=torch.long)
        }


In [5]:
tokenizer = ElectraTokenizer.from_pretrained("google/electra-small-discriminator")
ir_dataset = IRDataset(dataset, tokenizer)

# 80/20 train/test split.
train_size = int(0.8 * len(ir_dataset))
test_size = len(ir_dataset) - train_size

# A dedicated seeded generator makes the split membership identical on every
# run; without it random_split draws from torch's unseeded global RNG.
SPLIT_SEED = 42
train_dataset, test_dataset = random_split(
    ir_dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [6]:
def _accuracy_metric(eval_pred):
    """Return the fraction of examples whose arg-max logit equals the label."""
    predicted_labels = eval_pred.predictions.argmax(axis=1)
    return {"accuracy": (predicted_labels == eval_pred.label_ids).mean()}


# Pre-trained ELECTRA discriminator with a 2-way classification head.
model = ElectraForSequenceClassification.from_pretrained(
    "google/electra-small-discriminator", num_labels=2
)

# Hyper-parameters collected in one place so they are easy to scan and tune.
training_config = dict(
    output_dir="./electra_ir",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    report_to="none",
)
training_args = TrainingArguments(**training_config)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=_accuracy_metric,
)


pytorch_model.bin:   0%|          | 0.00/54.2M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Run fine-tuning with the Trainer configured above; as the last expression
# of the cell, the returned TrainOutput is displayed.
trainer.train()

model.safetensors:   0%|          | 0.00/54.2M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy
1,0.5267,0.276298,1.0
2,0.2089,0.117136,1.0
3,0.1191,0.088075,1.0


TrainOutput(global_step=300, training_loss=0.28488834381103517, metrics={'train_runtime': 45.356, 'train_samples_per_second': 105.829, 'train_steps_per_second': 6.614, 'total_flos': 70607137996800.0, 'train_loss': 0.28488834381103517, 'epoch': 3.0})

In [8]:
# Save the fine-tuned model and tokenizer into the directory that the archive
# and download cells read ('/content/electra_model'). The previous
# '/content/v2/electra_model' target did not match those cells, so a fresh run
# would have zipped a missing or stale directory.
MODEL_DIR = '/content/electra_model'
model.save_pretrained(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR)


('/content/electra_model/tokenizer_config.json',
 '/content/electra_model/special_tokens_map.json',
 '/content/electra_model/vocab.txt',
 '/content/electra_model/added_tokens.json')

In [10]:
import shutil

# Zip the contents of '/content/electra_model' into '/content/electra_model.zip'
# (make_archive appends the '.zip' extension to the base name).
# NOTE(review): the root directory must be the one the save cell wrote to —
# verify the two paths agree.
shutil.make_archive('/content/electra_model', 'zip', '/content/electra_model')


'/content/electra_model.zip'

In [11]:
# Colab-only helper: this import is unavailable outside Google Colab.
from google.colab import files

# Trigger a browser download of the zipped model archive.
files.download('/content/electra_model.zip')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [9]:
# Evaluate on the Trainer's eval dataset (test_dataset was passed as
# eval_dataset when the Trainer was built). This is the same split scored
# after every epoch during training, so it is not an independent held-out set.
results = trainer.evaluate()
# 'eval_accuracy' is the "accuracy" value from compute_metrics, reported by the
# Trainer under the 'eval_' prefix.
print(f"Test Accuracy: {results['eval_accuracy']:.4f}")


Test Accuracy: 1.0000


In [None]:
def predict_relevance(query, document, model, tokenizer, device="cuda" if torch.cuda.is_available() else "cpu", max_length=256):
    """Classify a single query/document pair as relevant or not.

    The pair is encoded as the single string ``"<query> [SEP] <document>"``,
    padded or truncated to ``max_length`` — the same input format used by
    ``IRDataset`` during fine-tuning.

    Args:
        query: Query text.
        document: Candidate document text.
        model: Sequence-classification model whose output exposes ``.logits``.
            It is moved to ``device`` and switched to eval mode.
        tokenizer: Tokenizer returning ``input_ids`` and ``attention_mask``.
        device: Device to run inference on; defaults to CUDA when available.
        max_length: Token length to pad/truncate to. Defaults to 256, which
            matches the ``IRDataset`` default.

    Returns:
        ``"Relevant"`` if the arg-max class is 1, otherwise ``"Non-relevant"``.
    """
    model = model.to(device)
    model.eval()

    encoding = tokenizer(
        f"{query} [SEP] {document}",
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model(
            input_ids=encoding["input_ids"].to(device),
            attention_mask=encoding["attention_mask"].to(device),
        )

    predicted_class = torch.argmax(outputs.logits, dim=1).item()
    return "Relevant" if predicted_class == 1 else "Non-relevant"


In [None]:
# Sanity check: a sports query against a sports-topic document; the recorded
# output is "Relevant". The document literal carries a leading tab and a
# trailing space.
query = "What is football?"
document = "	Sports is about football, basketball, score. It involves score. "
print(predict_relevance(query, document, model, tokenizer))

Relevant


In [None]:
# Sanity check: the same sports query against a tech-topic document; the
# recorded output is "Non-relevant".
query = "What is football?"
document = "Tech is about machine learning, cloud, software. It involves software.   "
print(predict_relevance(query, document, model, tokenizer))

Non-relevant
