In [1]:
import pandas as pd

In [2]:
# Read the question/answer table; the path is relative to the notebook's working directory.
data = pd.read_csv("../Data/questions_table.csv")
# Last expression of the cell: display the column names that were read.
data.columns

Index(['question', 'answer', 'answer_type', 'derivation'], dtype='object')

In [3]:
# Keep only the rows whose answer_type is "span" or "arithmetic"; all other rows are dropped.
dataset = data[data["answer_type"].isin(["span", "arithmetic"])]

In [4]:
# Take an explicit copy of the two columns used for classification, so that the
# column assignment done later writes to this frame and not to a slice of `dataset`.
dataset_n = dataset[["question", "answer_type"]].copy()

In [5]:
import torch
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [6]:
# Report whether PyTorch can see a CUDA device in this environment.
print(torch.cuda.is_available())

True


In [7]:
# Replace the string labels with integer category codes (the column is overwritten).
# NOTE(review): pandas orders string categories lexically, so with the two labels kept
# earlier this should yield arithmetic -> 0 and span -> 1 — confirm before relying on it.
dataset_n['answer_type'] = dataset_n['answer_type'].astype('category').cat.codes  # Label encoding

# Hold out 30% of the rows for testing; random_state=42 fixes the shuffle.
# Questions and labels are passed as plain Python lists.
X_train, X_test, y_train, y_test = train_test_split(
    dataset_n['question'].tolist(), dataset_n['answer_type'].tolist(), test_size=0.3, random_state=42
)

In [8]:
# Load the pretrained tokenizer published with the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [9]:
class QADataset(Dataset):
    """Map-style dataset that tokenizes one text on every lookup.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of integer class ids, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')``; it
            must return a mapping whose values support ``.squeeze(0)``.
        max_length: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_length=32):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        # The size is taken from the texts alone.
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        # squeeze(0) removes the size-1 leading axis from every returned tensor.
        sample = {}
        for name, tensor in encoded.items():
            sample[name] = tensor.squeeze(0)
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

In [10]:
# Create datasets
# Wrap each split in a QADataset; max_length is not passed, so the class default applies.
train_dataset = QADataset(X_train, y_train, tokenizer)
test_dataset = QADataset(X_test, y_test, tokenizer)

In [11]:
# DataLoaders
# Batches of 16 examples; only the training loader shuffles.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

Batch_size = 16

In [12]:
# Load pre-trained BERT model
# The number of output classes is the count of distinct label ids in the training split.
num_labels = len(set(y_train))
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# As the last expression of the cell, this also displays the module's repr.
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [13]:
# Optimizer and loss function
# Use PyTorch's own AdamW: the `AdamW` class exported by transformers is deprecated in
# favour of torch.optim.AdamW. eps and weight_decay are pinned to the defaults of the
# transformers implementation (1e-6 and 0.0) so the hyperparameters stay the same;
# torch's own defaults would otherwise be 1e-8 and 0.01.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
# Cross-entropy over the raw logits returned by the classifier.
criterion = torch.nn.CrossEntropyLoss()



In [15]:
# Training loop
# Fine-tune `model` for `epochs` passes over train_loader, then score it on test_loader.
# NOTE(review): this cell uses whichever `optimizer`/`criterion` are currently bound in the
# kernel; `optimizer` is rebound to model_2's parameters further down, so run cells in order.
epochs = 1
model.train()  # switch to training mode once, before the epoch loop
for epoch in range(epochs):
    # Per-epoch running sums: batch losses and number of correctly classified examples.
    total_loss, correct = 0, 0
    for batch in train_loader:
        optimizer.zero_grad()
        # Every batch entry except 'labels' is forwarded to the model; the loss is
        # computed separately by `criterion`.
        inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
        labels = batch['labels'].to(device)
        outputs = model(**inputs)
        loss = criterion(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        correct += (outputs.logits.argmax(dim=-1) == labels).sum().item()
    # Loss is averaged over batches, accuracy over examples.
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}, Accuracy: {correct/len(train_dataset):.4f}")

# Evaluation
# Collect predicted and true class ids over the whole test loader without tracking gradients.
model.eval()
y_pred, y_true = [], []
with torch.no_grad():
    for batch in test_loader:
        inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
        labels = batch['labels'].to(device)
        outputs = model(**inputs)
        # Move results back to the CPU before handing them to scikit-learn.
        y_pred.extend(outputs.logits.argmax(dim=-1).cpu().numpy())
        y_true.extend(labels.cpu().numpy())

accuracy = accuracy_score(y_true, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")

Epoch 1, Loss: 0.0364, Accuracy: 0.9892
Test Accuracy: 0.9568


In [18]:
# Authenticate with the Hugging Face Hub through the interactive login widget.
# Credentials are entered in the widget and must not be pasted into notebook cells.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [19]:
# Upload the fine-tuned weights and the tokenizer files to the same Hub repository.
# NOTE(review): this pushes `model` (the batch-size-16 run), not `model_2` — confirm
# that this is the checkpoint intended for publication.
model.push_to_hub("rahul14/span-arithmetic-classification")
tokenizer.push_to_hub("rahul14/span-arithmetic-classification")

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


CommitInfo(commit_url='https://huggingface.co/rahul14/span-arithmetic-classification/commit/45d8bcdb09768fb15207f845161367ba37359b77', commit_message='Upload tokenizer', commit_description='', oid='45d8bcdb09768fb15207f845161367ba37359b77', pr_url=None, pr_revision=None, pr_num=None)

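[REDACTED: a Hugging Face access token was pasted here in plain text. Revoke that token in the Hugging Face account settings and authenticate with notebook_login() or an environment variable instead of storing credentials in the notebook.]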

Batch_size = 32

In [19]:
# DataLoaders
# Second experiment: same datasets as before, batches of 32; only the training loader shuffles.
train_loader_2 = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader_2 = DataLoader(test_dataset, batch_size=32)

In [20]:
# Load pre-trained BERT model
# A fresh classifier is loaded from the base checkpoint for the second experiment;
# it does not start from the weights of `model`.
num_labels = len(set(y_train))
model_2 = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# As the last expression of the cell, this also displays the module's repr.
model_2.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [21]:
# Optimizer and loss function
# Use PyTorch's own AdamW: the `AdamW` class exported by transformers is deprecated in
# favour of torch.optim.AdamW. eps and weight_decay are pinned to the defaults of the
# transformers implementation (1e-6 and 0.0) so the hyperparameters stay the same;
# torch's own defaults would otherwise be 1e-8 and 0.01.
# This rebinds the notebook-level `optimizer` name to model_2's parameters.
optimizer = torch.optim.AdamW(model_2.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
# Cross-entropy over the raw logits returned by the classifier.
criterion = torch.nn.CrossEntropyLoss()



In [22]:
# Training loop
# Fine-tune `model_2` for `epochs` passes over train_loader_2, then score it on test_loader_2.
# NOTE(review): this cell uses whichever `optimizer`/`criterion` are currently bound in the
# kernel; it only trains model_2 if `optimizer` was built from model_2.parameters().
epochs = 5
model_2.train()  # switch to training mode once, before the epoch loop
for epoch in range(epochs):
    # Per-epoch running sums: batch losses and number of correctly classified examples.
    total_loss, correct = 0, 0
    for batch in train_loader_2:
        optimizer.zero_grad()
        # Every batch entry except 'labels' is forwarded to the model; the loss is
        # computed separately by `criterion`.
        inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
        labels = batch['labels'].to(device)
        outputs = model_2(**inputs)
        loss = criterion(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        correct += (outputs.logits.argmax(dim=-1) == labels).sum().item()
    # Loss is averaged over batches, accuracy over examples.
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader_2):.4f}, Accuracy: {correct/len(train_dataset):.4f}")

# Evaluation
# Collect predicted and true class ids over the whole test loader without tracking gradients.
model_2.eval()
y_pred, y_true = [], []
with torch.no_grad():
    for batch in test_loader_2:
        inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
        labels = batch['labels'].to(device)
        outputs = model_2(**inputs)
        # Move results back to the CPU before handing them to scikit-learn.
        y_pred.extend(outputs.logits.argmax(dim=-1).cpu().numpy())
        y_true.extend(labels.cpu().numpy())

accuracy = accuracy_score(y_true, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")

Epoch 1, Loss: 0.2033, Accuracy: 0.9283
Epoch 2, Loss: 0.0999, Accuracy: 0.9680
Epoch 3, Loss: 0.0738, Accuracy: 0.9760
Epoch 4, Loss: 0.0542, Accuracy: 0.9826
Epoch 5, Loss: 0.0461, Accuracy: 0.9854
Test Accuracy: 0.9657


In [None]:
def predict_query(model, query, tokenizer, device):
    """Classify a single query and return the predicted class id.

    Args:
        model: Classifier exposing ``eval()`` and returning an object with a
            ``logits`` attribute when called with the tokenized inputs.
        query: Text to classify.
        tokenizer: Callable invoked with ``truncation=True``, ``padding=True``
            and ``return_tensors="pt"``; must return a mapping of tensors.
        device: Device the tokenized tensors are moved to before the forward pass.

    Returns:
        The index of the highest logit, as a Python number (via ``.item()``).
    """
    # Inference only: put the model in evaluation mode before the forward pass.
    model.eval()

    encoded = tokenizer(query, truncation=True, padding=True, return_tensors="pt")

    # Place every input tensor on the requested device.
    batch = {}
    for name, tensor in encoded.items():
        batch[name] = tensor.to(device)

    # No gradients are needed to pick the arg-max class.
    with torch.no_grad():
        logits = model(**batch).logits
        predicted = logits.argmax(dim=-1)

    return predicted.item()

In [53]:

# Example usage: classify one question with model_2, reusing the notebook's tokenizer and device.
custom_query = "What is the 2019 average defined benefit schemes?"
prediction = predict_query(model_2, custom_query, tokenizer, device)

In [54]:
def pred_label(pred):
    """Return "Arithmetic" when ``pred`` equals 0, and "Span" for any other value."""
    return "Arithmetic" if pred == 0 else "Span"

In [55]:
# Translate the integer class id held in `prediction` into its label name and display it.
pred_label(prediction)

'Arithmetic'

In [7]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Module-level cache for the Hub-hosted classifier; both slots start empty and are
# filled by load_model() on first use.
model_name = "rahul14/span-arithmetic-classification"
model = tokenizer = None


def load_model():
    """Return the cached ``(model, tokenizer)`` pair, loading it when needed.

    If either global is still ``None``, both are (re)loaded with
    ``from_pretrained(model_name)`` and the model is put in evaluation mode.
    Otherwise the objects already stored in the globals are returned unchanged.
    """
    global model, tokenizer
    already_loaded = model is not None and tokenizer is not None
    if not already_loaded:
        model = AutoModelForSequenceClassification.from_pretrained(model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model.eval()
    return model, tokenizer

def predict_query(query):
    """Classify ``query`` with the model returned by ``load_model()``.

    Args:
        query: Text to classify.

    Returns:
        The index of the highest logit, as a Python number (via ``.item()``).
    """
    # load_model() hands back the cached pair, loading it on first use.
    classifier, tok = load_model()

    encoded = tok(query, truncation=True, padding=True, return_tensors="pt")

    # No gradients are needed to pick the arg-max class.
    with torch.no_grad():
        logits = classifier(**encoded).logits
        predicted = logits.argmax(dim=-1)

    return predicted.item()

def pred_label(pred):
    """Map a predicted class id to its label: 0 is "Arithmetic", anything else "Span"."""
    label = "Span"
    if pred == 0:
        label = "Arithmetic"
    return label

In [8]:
# Example usage
# Classify one question with the single-argument predict_query and print both the
# integer class id and the label name derived from it.
query = "What is the 2019 average defined benefit schemes?"
prediction = predict_query(query)
label = pred_label(prediction)
print(f"Prediction: {prediction}, Label: {label}")

Prediction: 0, Label: Arithmetic


In [9]:
# Example usage
# Second sample question; rebinds `query`, `prediction` and `label` from the previous cell.
query = "What is the percentage change in net sales from Frozen Kefir between 2018 and 2019?"
prediction = predict_query(query)
label = pred_label(prediction)
print(f"Prediction: {prediction}, Label: {label}")

Prediction: 0, Label: Arithmetic
