In [49]:
from transformers import BartTokenizer, BartForSequenceClassification, Trainer, TrainingArguments
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import random
import nltk
from nltk.corpus import wordnet

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
data_labled = pd.read_csv(r"https://raw.githubusercontent.com/MohammadWaleed339/bert-for-classification/refs/heads/master/labeled_traning_data.csv", index_col = False)
data_labled['text'] = data_labled.apply(lambda row: f"[CLS] {row['article1']} [SEP] {row['article2']} [SEP]", axis=1)
data_labled = data_labled[['text', 'real_text_id']]
data_labled.columns = ['text', 'labels']

In [4]:
data_labled['labels'] = data_labled['labels'] - 1   #bcz target must be 0-1 not 1-2 so minus 1

In [5]:
data_labled

Unnamed: 0,text,labels
0,[CLS] The VIRSA (Visible Infrared Survey Teles...,0
1,[CLS] China\nThe goal of this project involves...,1
2,[CLS] Scientists can learn about how galaxies ...,0
3,[CLS] China\nThe study suggests that multiple ...,1
4,[CLS] Dinosaur Rex was excited about his new t...,1
...,...,...
90,[CLS] A main focus of modern cosmology is to u...,1
91,"[CLS] APEX, as its name suggests, serves as a ...",0
92,[CLS] FORS1 and FORS2 are early instruments of...,1
93,[CLS] The observations of the Pluto-Charon sys...,1


In [6]:
data_labled2 = pd.read_csv(r"https://raw.githubusercontent.com/MohammadWaleed339/bert-for-classification/refs/heads/master/labeled_traning_data.csv", index_col = False)
data_labled2['text'] = data_labled2.apply(lambda row: f"[CLS] { row['article2']} [SEP] {row['article1']} [SEP]", axis=1)
data_labled2 = data_labled2.drop(columns = ['article1', 'article2', 'folder_id'])

In [7]:
data_labled2['labels'] = data_labled2['real_text_id'].map(lambda x: 0 if x == 2 else 1)
data_labled2 = data_labled2.drop(columns = ['real_text_id']) 

In [8]:
data_labled2

Unnamed: 0,text,labels
0,[CLS] The China relay network has released a s...,1
1,[CLS] The project aims to achieve an accuracy ...,0
2,[CLS] Dinosaur eggshells offer clues about wha...,1
3,[CLS] The importance for understanding how sta...,0
4,[CLS] Analyzing how fast stars rotate within a...,0
...,...,...
90,[CLS] A key focus of modern cosmology is to un...,0
91,"[CLS] APEX, as its name suggests, serves as a ...",1
92,[CLS] FORS1 and FORS2 are early instruments of...,0
93,[CLS] The observations of the Pluto-Charon bin...,0


In [9]:
data_labled = pd.concat([data_labled, data_labled2], axis = 0)
data_labled

Unnamed: 0,text,labels
0,[CLS] The VIRSA (Visible Infrared Survey Teles...,0
1,[CLS] China\nThe goal of this project involves...,1
2,[CLS] Scientists can learn about how galaxies ...,0
3,[CLS] China\nThe study suggests that multiple ...,1
4,[CLS] Dinosaur Rex was excited about his new t...,1
...,...,...
90,[CLS] A key focus of modern cosmology is to un...,0
91,"[CLS] APEX, as its name suggests, serves as a ...",1
92,[CLS] FORS1 and FORS2 are early instruments of...,0
93,[CLS] The observations of the Pluto-Charon bin...,0


In [10]:
dataset = Dataset.from_pandas(data_labled)

In [11]:
dataset = dataset.train_test_split(test_size=0.3, seed=42)

In [12]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', '__index_level_0__'],
        num_rows: 133
    })
    test: Dataset({
        features: ['text', 'labels', '__index_level_0__'],
        num_rows: 57
    })
})

# **Using sciBert from here** 

In [13]:
# Load SciBERT
MODEL_NAME = "allenai/scibert_scivocab_uncased"
scibert_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model_scibert = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Trying data augmentation

In [14]:
# Download WordNet if not already
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\moham\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\moham\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [15]:
def synonym_replacement(sentence, n=1):
    """
    Replace n words in the sentence with their synonyms.
    
    Args:
        sentence (str): Input text
        n (int): Number of words to replace
    
    Returns:
        str: Augmented text
    """
    words = sentence.split()
    new_words = words.copy()
    random_word_list = list(set([word for word in words if wordnet.synsets(word)]))
    
    if len(random_word_list) == 0:
        return sentence  # No replaceable words found
    
    random.shuffle(random_word_list)
    num_replaced = 0
    
    for random_word in random_word_list:
        synonyms = wordnet.synsets(random_word)
        if not synonyms:
            continue
        # get synonym lemma
        synonym = synonyms[0].lemmas()[0].name()
        if synonym != random_word:  # avoid identical replacement
            new_words = [synonym if word == random_word else word for word in new_words]
            num_replaced += 1
        if num_replaced >= n:
            break
    
    return ' '.join(new_words)

In [16]:
text = 'the quick brown fox jumps over a lazy dog who is from the family of German sheperd'
synonym_replacement(text, n=2)

'the quick brown fox jumps over angstrom lazy dog who be from the family of German sheperd'

In [45]:
# Split into train/test
train_df, test_df = train_test_split(data_labled, test_size=0.3, random_state=47)

In [18]:
# Convert to HuggingFace Dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [19]:
# Tokenization function
def tokenize_function(example):
    # Tokenizer will handle [SEP] inside the text automatically
    return scibert_tokenizer(example["text"], truncation=True, padding="max_length", max_length = 512)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/133 [00:00<?, ? examples/s]

Map:   0%|          | 0/57 [00:00<?, ? examples/s]

In [20]:
# Set format for PyTorch
train_dataset = train_dataset.remove_columns(["text", "__index_level_0__", "token_type_ids"])
test_dataset = test_dataset.remove_columns(["text", "__index_level_0__", "token_type_ids"])
train_dataset.set_format("torch")
test_dataset.set_format("torch")

In [21]:
train_dataset

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 133
})

In [22]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./scibert-classifier",
    # evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=9,
    per_device_eval_batch_size=9,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50
)

# Trainer
trainer = Trainer(
    model=model_scibert,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

In [23]:
# Train model
trainer.train()

wandb: Currently logged in as: mohammadwaleed339 (mohammadwaleed339-aligarh-muslim-university) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin


Step,Training Loss


TrainOutput(global_step=45, training_loss=0.6312832302517362, metrics={'train_runtime': 50.6815, 'train_samples_per_second': 7.873, 'train_steps_per_second': 0.888, 'total_flos': 104981311088640.0, 'train_loss': 0.6312832302517362, 'epoch': 3.0})

In [24]:
# Evaluate
results = trainer.evaluate()
print("Evaluation:", results)

Evaluation: {'eval_loss': 0.5738421082496643, 'eval_runtime': 1.9273, 'eval_samples_per_second': 29.576, 'eval_steps_per_second': 3.632, 'epoch': 3.0}


In [28]:
import torch
from sklearn.metrics import accuracy_score

def evaluate_scibert(model, tokenizer, texts, labels, max_length=512, device=None):
    """
    Predicts labels with SciBERT and computes accuracy.
    
    Args:
        model: Trained SciBERT model (AutoModelForSequenceClassification).
        tokenizer: SciBERT tokenizer.
        texts: List of strings (each string = "text1 [SEP] text2").
        labels: List or tensor of true labels (0/1).
        max_length: Max token length (default=512).
        device: "cuda" or "cpu".
        
    Returns:
        accuracy: float, prediction accuracy
        preds: list of predicted labels
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    model.to(device)
    model.eval()

    preds = []
    with torch.no_grad():
        for text in texts:
            # Tokenize
            inputs = tokenizer(
                text,
                return_tensors="pt",
                truncation=True,
                padding="max_length",
                max_length=max_length
            )
            # Move to device
            inputs = {k: v.to(device) for k, v in inputs.items()}
            
            # Forward pass
            outputs = model(**inputs)
            pred = torch.argmax(outputs.logits, dim=-1).cpu().item()
            
            preds.append(pred)  
    # Calculate accuracy
    acc = accuracy_score(labels, preds)
    if isinstance(labels, torch.Tensor):
        labels = list(labels)

    correct = sum(v == u for v, u in zip(labels, preds))
    n_total = len(labels)
    print(f"out of {n_total} total {correct} are correct")
    return acc
    

In [46]:
# test accuracy
evaluate_scibert(model_scibert, scibert_tokenizer, test_df['text'], test_df['labels'], max_length=512, device=None)

out of 57 total 43 are correct


0.7543859649122807

In [50]:
# evaluate on train data to see if the models is over fitting.
evaluate_scibert(model_scibert, scibert_tokenizer, train_df['text'], train_df['labels'], max_length=512, device=None)

out of 133 total 100 are correct


0.7518796992481203

In [51]:
! git init

Reinitialized existing Git repository in C:/Users/moham/Jupyter_files/.git/


In [None]:
! git remote add origin 