# YCITE: Selecting the best model and training it with LIME

This notebook contains the code used to select the best model and retrain it with LIME.

### The next section fine-tunes six BERT-based models and evaluates their performance.

In [None]:
# bert

import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertForSequenceClassification, AutoTokenizer, AdamW, get_scheduler
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Seed PyTorch's global RNG so torch-side randomness (e.g. the shuffle=True
# DataLoaders built later in this cell) is repeatable, like the seeded split below
SEED = 42
torch.manual_seed(SEED)

# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize the BERT model (2 output labels) and its tokenizer
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.to(device)
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', cache_dir='./local_model')

# Training parameters
lr = 2e-5
weight_decay = 0.01
EPOCHS = 10
BATCH_SIZE = 32

# Load the dataset
training_data_path = "training_data.csv"
df = pd.read_csv(training_data_path)

# Handle missing values: a missing previous sentence becomes the placeholder
# 'N/A'; rows without a current sentence or a label are dropped
df['previous_sentence'] = df['previous_sentence'].fillna('N/A')
df.dropna(subset=['current_sentence', 'label'], inplace=True)
df['label'] = df['label'].astype(int)

# Split the data into train (80%), validation (10%), and test (10%)
train_val_df, test_df = train_test_split(df, test_size=0.1, random_state=SEED)
train_df, val_df = train_test_split(train_val_df, test_size=0.111, random_state=SEED)  # 10% of 90%

print(f"Train size: {len(train_df)}, Validation size: {len(val_df)}, Test size: {len(test_df)}")

# Define the tokenization function
def tokenize_pair(row):
    """Encode one (previous_sentence, current_sentence) pair as a list of token ids.

    Args:
        row: Record exposing ``'previous_sentence'`` and ``'current_sentence'``
            (a DataFrame row when called through ``df.apply(..., axis=1)``).

    Returns:
        The ``input_ids`` produced by the global ``tokenizer`` for the pair,
        truncated/padded to ``max_length=128``. Only ``input_ids`` is kept; the
        other fields of the tokenizer output are discarded.

    Raises:
        ValueError: If the tokenizer fails on this row.
    """
    try:
        tokens = tokenizer(
            row['previous_sentence'],
            row['current_sentence'],
            add_special_tokens=True,
            max_length=128,
            truncation=True,
            padding='max_length',
        )
    except Exception as e:
        # Returning [] here would make the id lists ragged and only fail later,
        # inside torch.tensor(), without saying which row was bad. Fail now.
        row_label = getattr(row, 'name', '?')
        raise ValueError(f"Error tokenizing pair at row {row_label}: {e}") from e
    # Without return_tensors the tokenizer already returns plain Python lists
    return tokens['input_ids']

# Turn one split of the data into a batched DataLoader
def prepare_data(df):
    """Tokenize every row of ``df`` and wrap ids and labels in a DataLoader.

    Each row is passed through ``tokenize_pair``; the resulting id lists and
    the ``'label'`` column are converted to ``torch.long`` tensors and paired in
    a ``TensorDataset``. The returned loader uses ``BATCH_SIZE``, shuffles, and
    pins memory when CUDA is available.
    """
    encoded_rows = df.apply(tokenize_pair, axis=1).tolist()
    features = torch.tensor(encoded_rows, dtype=torch.long)
    targets = torch.tensor(df['label'].tolist(), dtype=torch.long)

    return DataLoader(
        TensorDataset(features, targets),
        batch_size=BATCH_SIZE,
        shuffle=True,
        pin_memory=torch.cuda.is_available(),
    )

# One DataLoader per split, each built by prepare_data
train_loader = prepare_data(train_df)
val_loader = prepare_data(val_df)
test_loader = prepare_data(test_df)

# AdamW plus the "linear" schedule with zero warmup steps; the schedule spans
# one step per training batch across all epochs
optimizer = AdamW(
    model.parameters(),
    lr=lr,
    weight_decay=weight_decay,
    no_deprecation_warning=True,
)
total_steps = EPOCHS * len(train_loader)
scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# Training function with checkpoint saving
def train():
    """Fine-tune the global ``model`` for ``EPOCHS`` epochs.

    For every batch of ``train_loader`` this runs a forward/backward pass,
    clips the gradient norm to 1.0 and steps ``optimizer`` and ``scheduler``.
    After each epoch it prints the mean batch loss, saves the model's
    ``state_dict`` to ``./bert_checkpoint_epoch_<n>.pth`` and calls
    ``evaluate(val_loader)``.
    """
    for epoch in range(EPOCHS):
        # evaluate() at the end of the previous epoch puts the model in eval
        # mode, so training mode has to be re-enabled at the start of every
        # epoch, not just once before the loop.
        model.train()
        print(f"Epoch {epoch + 1}/{EPOCHS}")
        progress_bar = tqdm(train_loader, desc='Training', leave=False)
        total_loss = 0

        for batch in progress_bar:
            input_ids, labels = [x.to(device) for x in batch]

            optimizer.zero_grad()
            # Passing labels makes the model return the loss alongside the logits
            outputs = model(input_ids=input_ids, labels=labels)
            loss = outputs.loss
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            total_loss += loss.item()
            progress_bar.set_postfix(loss=loss.item())

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1} Loss: {avg_loss:.4f}")

        # Save checkpoint after each epoch
        checkpoint_path = f'./bert_checkpoint_epoch_{epoch + 1}.pth'
        torch.save(model.state_dict(), checkpoint_path)
        print(f"Saved checkpoint: {checkpoint_path}")

        # Evaluate after each epoch
        evaluate(val_loader)

# Report confusion matrix and accuracy for any DataLoader
def evaluate(dataloader):
    """Run the global ``model`` over ``dataloader`` and print its metrics.

    The model is switched to eval mode and run without gradient tracking. The
    predicted class is the argmax of the logits; the confusion matrix and the
    accuracy (as a percentage) are printed. Nothing is returned.
    """
    model.eval()
    y_pred, y_true = [], []

    with torch.no_grad():
        for token_ids, targets in dataloader:
            token_ids = token_ids.to(device)
            targets = targets.to(device)

            scores = model(token_ids).logits
            y_pred.extend(scores.argmax(dim=1).cpu().tolist())
            y_true.extend(targets.cpu().tolist())

    matrix = confusion_matrix(y_true, y_pred)
    acc = accuracy_score(y_true, y_pred)

    print("\nConfusion Matrix:")
    print(matrix)
    print(f"Accuracy: {acc * 100:.2f}%")

# Run the full training loop; train() also saves a checkpoint and evaluates
# on val_loader after every epoch
train()

# Final evaluation on the held-out test split
print("Evaluating on the Test Dataset:")
evaluate(test_loader)

# Save the weights (state_dict only) as they stand after the last epoch
torch.save(model.state_dict(), './fine_tuned_bert_model_final.pth')
print("Fine-tuned model saved successfully.")


In [None]:
# scibert

import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AdamW, get_scheduler
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Seed PyTorch's global RNG so torch-side randomness (e.g. the shuffle=True
# DataLoaders built later in this cell) is repeatable, like the seeded split below
SEED = 42
torch.manual_seed(SEED)

# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize SciBERT model (2 output labels) and tokenizer
model_name = "allenai/scibert_scivocab_uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir='./local_model')
model.to(device)

# Training parameters
lr = 2e-5
weight_decay = 0.01
EPOCHS = 10
BATCH_SIZE = 32

# Load the dataset
training_data_path = "training_data.csv"
df = pd.read_csv(training_data_path)

# Handle missing values: a missing previous sentence becomes the placeholder
# 'N/A'; rows without a current sentence or a label are dropped
df['previous_sentence'] = df['previous_sentence'].fillna('N/A')
df.dropna(subset=['current_sentence', 'label'], inplace=True)
df['label'] = df['label'].astype(int)

# Split the data into train (80%), validation (10%), and test (10%)
train_val_df, test_df = train_test_split(df, test_size=0.1, random_state=SEED)
train_df, val_df = train_test_split(train_val_df, test_size=0.111, random_state=SEED)

print(f"Train size: {len(train_df)}, Validation size: {len(val_df)}, Test size: {len(test_df)}")

def tokenize_pair(row):
    """Encode one (previous_sentence, current_sentence) pair as a list of token ids.

    Args:
        row: Record exposing ``'previous_sentence'`` and ``'current_sentence'``
            (a DataFrame row when called through ``df.apply(..., axis=1)``).

    Returns:
        The ``input_ids`` produced by the global ``tokenizer`` for the pair,
        truncated/padded to ``max_length=128``. Only ``input_ids`` is kept; the
        other fields of the tokenizer output are discarded.

    Raises:
        ValueError: If the tokenizer fails on this row.
    """
    try:
        tokens = tokenizer(
            row['previous_sentence'],
            row['current_sentence'],
            add_special_tokens=True,
            max_length=128,
            truncation=True,
            padding='max_length',
        )
    except Exception as e:
        # Returning [] here would make the id lists ragged and only fail later,
        # inside torch.tensor(), without saying which row was bad. Fail now.
        row_label = getattr(row, 'name', '?')
        raise ValueError(f"Error tokenizing pair at row {row_label}: {e}") from e
    # Without return_tensors the tokenizer already returns plain Python lists
    return tokens['input_ids']

def prepare_data(df):
    """Tokenize every row of ``df`` and wrap ids and labels in a DataLoader.

    Each row is passed through ``tokenize_pair``; the resulting id lists and
    the ``'label'`` column are converted to ``torch.long`` tensors and paired in
    a ``TensorDataset``. The returned loader uses ``BATCH_SIZE``, shuffles, and
    pins memory when CUDA is available.
    """
    encoded_rows = df.apply(tokenize_pair, axis=1).tolist()
    features = torch.tensor(encoded_rows, dtype=torch.long)
    targets = torch.tensor(df['label'].tolist(), dtype=torch.long)

    return DataLoader(
        TensorDataset(features, targets),
        batch_size=BATCH_SIZE,
        shuffle=True,
        pin_memory=torch.cuda.is_available(),
    )

# One DataLoader per split, each built by prepare_data
train_loader = prepare_data(train_df)
val_loader = prepare_data(val_df)
test_loader = prepare_data(test_df)

# AdamW plus the "linear" schedule with zero warmup steps; the schedule spans
# one step per training batch across all epochs
optimizer = AdamW(
    model.parameters(),
    lr=lr,
    weight_decay=weight_decay,
    no_deprecation_warning=True,
)
total_steps = EPOCHS * len(train_loader)
scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

def train():
    """Fine-tune the global ``model`` for ``EPOCHS`` epochs.

    For every batch of ``train_loader`` this runs a forward/backward pass,
    clips the gradient norm to 1.0 and steps ``optimizer`` and ``scheduler``.
    After each epoch it prints the mean batch loss, saves the model's
    ``state_dict`` to ``./scibert_checkpoint_epoch_<n>.pth`` and calls
    ``evaluate(val_loader)``.
    """
    for epoch in range(EPOCHS):
        # evaluate() at the end of the previous epoch puts the model in eval
        # mode, so training mode has to be re-enabled at the start of every
        # epoch, not just once before the loop.
        model.train()
        print(f"Epoch {epoch + 1}/{EPOCHS}")
        progress_bar = tqdm(train_loader, desc='Training', leave=False)
        total_loss = 0

        for batch in progress_bar:
            input_ids, labels = [x.to(device) for x in batch]

            optimizer.zero_grad()
            # Passing labels makes the model return the loss alongside the logits
            outputs = model(input_ids=input_ids, labels=labels)
            loss = outputs.loss
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            total_loss += loss.item()
            progress_bar.set_postfix(loss=loss.item())

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1} Loss: {avg_loss:.4f}")

        # Save checkpoint after each epoch
        checkpoint_path = f'./scibert_checkpoint_epoch_{epoch + 1}.pth'
        torch.save(model.state_dict(), checkpoint_path)
        print(f"Saved checkpoint: {checkpoint_path}")

        evaluate(val_loader)

def evaluate(dataloader):
    """Run the global ``model`` over ``dataloader`` and print its metrics.

    The model is switched to eval mode and run without gradient tracking. The
    predicted class is the argmax of the logits; the confusion matrix and the
    accuracy (as a percentage) are printed. Nothing is returned.
    """
    model.eval()
    y_pred, y_true = [], []

    with torch.no_grad():
        for token_ids, targets in dataloader:
            token_ids = token_ids.to(device)
            targets = targets.to(device)

            scores = model(token_ids).logits
            y_pred.extend(scores.argmax(dim=1).cpu().tolist())
            y_true.extend(targets.cpu().tolist())

    matrix = confusion_matrix(y_true, y_pred)
    acc = accuracy_score(y_true, y_pred)

    print("\nConfusion Matrix:")
    print(matrix)
    print(f"Accuracy: {acc * 100:.2f}%")

# Run the full training loop; train() also saves a checkpoint and evaluates
# on val_loader after every epoch
train()

# Final evaluation on the held-out test split
print("Evaluating on the Test Dataset:")
evaluate(test_loader)

# Save the weights (state_dict only) as they stand after the last epoch
final_model_path = './scibert_fine_tuned_model.pth'
torch.save(model.state_dict(), final_model_path)
print(f"Final model saved successfully at: {final_model_path}")


In [None]:
# distilbert

import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AdamW, get_scheduler
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Seed PyTorch's global RNG so torch-side randomness (e.g. the shuffle=True
# DataLoaders built later in this cell) is repeatable, like the seeded split below
SEED = 42
torch.manual_seed(SEED)

# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize DistilBERT model (2 output labels) and tokenizer
model_name = "distilbert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir='./local_model')
model.to(device)

# Training parameters
lr = 2e-5
weight_decay = 0.01
EPOCHS = 10
BATCH_SIZE = 32

# Load the dataset
training_data_path = "training_data.csv"
df = pd.read_csv(training_data_path)

# Handle missing values: a missing previous sentence becomes the placeholder
# 'N/A'; rows without a current sentence or a label are dropped
df['previous_sentence'] = df['previous_sentence'].fillna('N/A')
df.dropna(subset=['current_sentence', 'label'], inplace=True)
df['label'] = df['label'].astype(int)

# Split the data into train (80%), validation (10%), and test (10%)
train_val_df, test_df = train_test_split(df, test_size=0.1, random_state=SEED)
train_df, val_df = train_test_split(train_val_df, test_size=0.111, random_state=SEED)

print(f"Train size: {len(train_df)}, Validation size: {len(val_df)}, Test size: {len(test_df)}")

def tokenize_pair(row):
    """Encode one (previous_sentence, current_sentence) pair as a list of token ids.

    Args:
        row: Record exposing ``'previous_sentence'`` and ``'current_sentence'``
            (a DataFrame row when called through ``df.apply(..., axis=1)``).

    Returns:
        The ``input_ids`` produced by the global ``tokenizer`` for the pair,
        truncated/padded to ``max_length=128``. Only ``input_ids`` is kept; the
        other fields of the tokenizer output are discarded.

    Raises:
        ValueError: If the tokenizer fails on this row.
    """
    try:
        tokens = tokenizer(
            row['previous_sentence'],
            row['current_sentence'],
            add_special_tokens=True,
            max_length=128,
            truncation=True,
            padding='max_length',
        )
    except Exception as e:
        # Returning [] here would make the id lists ragged and only fail later,
        # inside torch.tensor(), without saying which row was bad. Fail now.
        row_label = getattr(row, 'name', '?')
        raise ValueError(f"Error tokenizing pair at row {row_label}: {e}") from e
    # Without return_tensors the tokenizer already returns plain Python lists
    return tokens['input_ids']

def prepare_data(df):
    """Tokenize every row of ``df`` and wrap ids and labels in a DataLoader.

    Each row is passed through ``tokenize_pair``; the resulting id lists and
    the ``'label'`` column are converted to ``torch.long`` tensors and paired in
    a ``TensorDataset``. The returned loader uses ``BATCH_SIZE``, shuffles, and
    pins memory when CUDA is available.
    """
    encoded_rows = df.apply(tokenize_pair, axis=1).tolist()
    features = torch.tensor(encoded_rows, dtype=torch.long)
    targets = torch.tensor(df['label'].tolist(), dtype=torch.long)

    return DataLoader(
        TensorDataset(features, targets),
        batch_size=BATCH_SIZE,
        shuffle=True,
        pin_memory=torch.cuda.is_available(),
    )

# One DataLoader per split, each built by prepare_data
train_loader = prepare_data(train_df)
val_loader = prepare_data(val_df)
test_loader = prepare_data(test_df)

# AdamW plus the "linear" schedule with zero warmup steps; the schedule spans
# one step per training batch across all epochs
optimizer = AdamW(
    model.parameters(),
    lr=lr,
    weight_decay=weight_decay,
    no_deprecation_warning=True,
)
total_steps = EPOCHS * len(train_loader)
scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

def train():
    """Fine-tune the global ``model`` for ``EPOCHS`` epochs.

    For every batch of ``train_loader`` this runs a forward/backward pass,
    clips the gradient norm to 1.0 and steps ``optimizer`` and ``scheduler``.
    After each epoch it prints the mean batch loss, saves the model's
    ``state_dict`` to ``./distilbert_checkpoint_epoch_<n>.pth`` and calls
    ``evaluate(val_loader)``.
    """
    for epoch in range(EPOCHS):
        # evaluate() at the end of the previous epoch puts the model in eval
        # mode, so training mode has to be re-enabled at the start of every
        # epoch, not just once before the loop.
        model.train()
        print(f"Epoch {epoch + 1}/{EPOCHS}")
        progress_bar = tqdm(train_loader, desc='Training', leave=False)
        total_loss = 0

        for batch in progress_bar:
            input_ids, labels = [x.to(device) for x in batch]

            optimizer.zero_grad()
            # Passing labels makes the model return the loss alongside the logits
            outputs = model(input_ids=input_ids, labels=labels)
            loss = outputs.loss
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            total_loss += loss.item()
            progress_bar.set_postfix(loss=loss.item())

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1} Loss: {avg_loss:.4f}")

        # Save checkpoint after each epoch
        checkpoint_path = f'./distilbert_checkpoint_epoch_{epoch + 1}.pth'
        torch.save(model.state_dict(), checkpoint_path)
        print(f"Saved checkpoint: {checkpoint_path}")

        evaluate(val_loader)

def evaluate(dataloader):
    """Run the global ``model`` over ``dataloader`` and print its metrics.

    The model is switched to eval mode and run without gradient tracking. The
    predicted class is the argmax of the logits; the confusion matrix and the
    accuracy (as a percentage) are printed. Nothing is returned.
    """
    model.eval()
    y_pred, y_true = [], []

    with torch.no_grad():
        for token_ids, targets in dataloader:
            token_ids = token_ids.to(device)
            targets = targets.to(device)

            scores = model(token_ids).logits
            y_pred.extend(scores.argmax(dim=1).cpu().tolist())
            y_true.extend(targets.cpu().tolist())

    matrix = confusion_matrix(y_true, y_pred)
    acc = accuracy_score(y_true, y_pred)

    print("\nConfusion Matrix:")
    print(matrix)
    print(f"Accuracy: {acc * 100:.2f}%")

# Run the full training loop; train() also saves a checkpoint and evaluates
# on val_loader after every epoch
train()

# Final evaluation on the held-out test split
print("Evaluating on the Test Dataset:")
evaluate(test_loader)

# Save the weights (state_dict only) as they stand after the last epoch
final_model_path = './distilbert_fine_tuned_model.pth'
torch.save(model.state_dict(), final_model_path)
print(f"Final model saved successfully at: {final_model_path}")


In [None]:
# roberta

import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AdamW, get_scheduler
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Seed PyTorch's global RNG so torch-side randomness (e.g. the shuffle=True
# DataLoaders built later in this cell) is repeatable, like the seeded split below
SEED = 42
torch.manual_seed(SEED)

# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize RoBERTa model (2 output labels) and tokenizer
model_name = "roberta-base"  # You can also use 'roberta-large'
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir='./local_model')
model.to(device)

# Training parameters
lr = 2e-5
weight_decay = 0.01
EPOCHS = 10
BATCH_SIZE = 32

# Load the dataset
training_data_path = "training_data.csv"
df = pd.read_csv(training_data_path)

# Handle missing values: a missing previous sentence becomes the placeholder
# 'N/A'; rows without a current sentence or a label are dropped
df['previous_sentence'] = df['previous_sentence'].fillna('N/A')
df.dropna(subset=['current_sentence', 'label'], inplace=True)
df['label'] = df['label'].astype(int)

# Split the data into train (80%), validation (10%), and test (10%)
train_val_df, test_df = train_test_split(df, test_size=0.1, random_state=SEED)
train_df, val_df = train_test_split(train_val_df, test_size=0.111, random_state=SEED)

print(f"Train size: {len(train_df)}, Validation size: {len(val_df)}, Test size: {len(test_df)}")

def tokenize_pair(row):
    """Encode one (previous_sentence, current_sentence) pair as a list of token ids.

    Args:
        row: Record exposing ``'previous_sentence'`` and ``'current_sentence'``
            (a DataFrame row when called through ``df.apply(..., axis=1)``).

    Returns:
        The ``input_ids`` produced by the global ``tokenizer`` for the pair,
        truncated/padded to ``max_length=128``. Only ``input_ids`` is kept; the
        other fields of the tokenizer output are discarded.

    Raises:
        ValueError: If the tokenizer fails on this row.
    """
    try:
        tokens = tokenizer(
            row['previous_sentence'],
            row['current_sentence'],
            add_special_tokens=True,
            max_length=128,
            truncation=True,
            padding='max_length',
        )
    except Exception as e:
        # Returning [] here would make the id lists ragged and only fail later,
        # inside torch.tensor(), without saying which row was bad. Fail now.
        row_label = getattr(row, 'name', '?')
        raise ValueError(f"Error tokenizing pair at row {row_label}: {e}") from e
    # Without return_tensors the tokenizer already returns plain Python lists
    return tokens['input_ids']

def prepare_data(df):
    """Tokenize every row of ``df`` and wrap ids and labels in a DataLoader.

    Each row is passed through ``tokenize_pair``; the resulting id lists and
    the ``'label'`` column are converted to ``torch.long`` tensors and paired in
    a ``TensorDataset``. The returned loader uses ``BATCH_SIZE``, shuffles, and
    pins memory when CUDA is available.
    """
    encoded_rows = df.apply(tokenize_pair, axis=1).tolist()
    features = torch.tensor(encoded_rows, dtype=torch.long)
    targets = torch.tensor(df['label'].tolist(), dtype=torch.long)

    return DataLoader(
        TensorDataset(features, targets),
        batch_size=BATCH_SIZE,
        shuffle=True,
        pin_memory=torch.cuda.is_available(),
    )

# One DataLoader per split, each built by prepare_data
train_loader = prepare_data(train_df)
val_loader = prepare_data(val_df)
test_loader = prepare_data(test_df)

# AdamW plus the "linear" schedule with zero warmup steps; the schedule spans
# one step per training batch across all epochs
optimizer = AdamW(
    model.parameters(),
    lr=lr,
    weight_decay=weight_decay,
    no_deprecation_warning=True,
)
total_steps = EPOCHS * len(train_loader)
scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

def train():
    """Fine-tune the global ``model`` for ``EPOCHS`` epochs.

    For every batch of ``train_loader`` this runs a forward/backward pass,
    clips the gradient norm to 1.0 and steps ``optimizer`` and ``scheduler``.
    After each epoch it prints the mean batch loss, saves the model's
    ``state_dict`` to ``./roberta_checkpoint_epoch_<n>.pth`` and calls
    ``evaluate(val_loader)``.
    """
    for epoch in range(EPOCHS):
        # evaluate() at the end of the previous epoch puts the model in eval
        # mode, so training mode has to be re-enabled at the start of every
        # epoch, not just once before the loop.
        model.train()
        print(f"Epoch {epoch + 1}/{EPOCHS}")
        progress_bar = tqdm(train_loader, desc='Training', leave=False)
        total_loss = 0

        for batch in progress_bar:
            input_ids, labels = [x.to(device) for x in batch]

            optimizer.zero_grad()
            # Passing labels makes the model return the loss alongside the logits
            outputs = model(input_ids=input_ids, labels=labels)
            loss = outputs.loss
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            total_loss += loss.item()
            progress_bar.set_postfix(loss=loss.item())

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1} Loss: {avg_loss:.4f}")

        # Save checkpoint after each epoch
        checkpoint_path = f'./roberta_checkpoint_epoch_{epoch + 1}.pth'
        torch.save(model.state_dict(), checkpoint_path)
        print(f"Saved checkpoint: {checkpoint_path}")

        evaluate(val_loader)

def evaluate(dataloader):
    """Run the global ``model`` over ``dataloader`` and print its metrics.

    The model is switched to eval mode and run without gradient tracking. The
    predicted class is the argmax of the logits; the confusion matrix and the
    accuracy (as a percentage) are printed. Nothing is returned.
    """
    model.eval()
    y_pred, y_true = [], []

    with torch.no_grad():
        for token_ids, targets in dataloader:
            token_ids = token_ids.to(device)
            targets = targets.to(device)

            scores = model(token_ids).logits
            y_pred.extend(scores.argmax(dim=1).cpu().tolist())
            y_true.extend(targets.cpu().tolist())

    matrix = confusion_matrix(y_true, y_pred)
    acc = accuracy_score(y_true, y_pred)

    print("\nConfusion Matrix:")
    print(matrix)
    print(f"Accuracy: {acc * 100:.2f}%")

# Run the full training loop; train() also saves a checkpoint and evaluates
# on val_loader after every epoch
train()

# Final evaluation on the held-out test split
print("Evaluating on the Test Dataset:")
evaluate(test_loader)

# Save the weights (state_dict only) as they stand after the last epoch
final_model_path = './roberta_fine_tuned_model.pth'
torch.save(model.state_dict(), final_model_path)
print(f"Final model saved successfully at: {final_model_path}")


In [None]:
# biobert

import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AdamW, get_scheduler
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Seed PyTorch's global RNG so torch-side randomness (e.g. the shuffle=True
# DataLoaders built later in this cell) is repeatable, like the seeded split below
SEED = 42
torch.manual_seed(SEED)

# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize BioBERT model (2 output labels) and tokenizer
model_name = "dmis-lab/biobert-base-cased-v1.1"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir='./local_model')
model.to(device)

# Training parameters
lr = 2e-5
weight_decay = 0.01
EPOCHS = 10
BATCH_SIZE = 32

# Load the dataset
training_data_path = "training_data.csv"
df = pd.read_csv(training_data_path)

# Handle missing values: a missing previous sentence becomes the placeholder
# 'N/A'; rows without a current sentence or a label are dropped
df['previous_sentence'] = df['previous_sentence'].fillna('N/A')
df.dropna(subset=['current_sentence', 'label'], inplace=True)
df['label'] = df['label'].astype(int)

# Split the data into train (80%), validation (10%), and test (10%)
train_val_df, test_df = train_test_split(df, test_size=0.1, random_state=SEED)
train_df, val_df = train_test_split(train_val_df, test_size=0.111, random_state=SEED)

print(f"Train size: {len(train_df)}, Validation size: {len(val_df)}, Test size: {len(test_df)}")

def tokenize_pair(row):
    """Encode one (previous_sentence, current_sentence) pair as a list of token ids.

    Args:
        row: Record exposing ``'previous_sentence'`` and ``'current_sentence'``
            (a DataFrame row when called through ``df.apply(..., axis=1)``).

    Returns:
        The ``input_ids`` produced by the global ``tokenizer`` for the pair,
        truncated/padded to ``max_length=128``. Only ``input_ids`` is kept; the
        other fields of the tokenizer output are discarded.

    Raises:
        ValueError: If the tokenizer fails on this row.
    """
    try:
        tokens = tokenizer(
            row['previous_sentence'],
            row['current_sentence'],
            add_special_tokens=True,
            max_length=128,
            truncation=True,
            padding='max_length',
        )
    except Exception as e:
        # Returning [] here would make the id lists ragged and only fail later,
        # inside torch.tensor(), without saying which row was bad. Fail now.
        row_label = getattr(row, 'name', '?')
        raise ValueError(f"Error tokenizing pair at row {row_label}: {e}") from e
    # Without return_tensors the tokenizer already returns plain Python lists
    return tokens['input_ids']

def prepare_data(df):
    """Tokenize every row of ``df`` and wrap ids and labels in a DataLoader.

    Each row is passed through ``tokenize_pair``; the resulting id lists and
    the ``'label'`` column are converted to ``torch.long`` tensors and paired in
    a ``TensorDataset``. The returned loader uses ``BATCH_SIZE``, shuffles, and
    pins memory when CUDA is available.
    """
    encoded_rows = df.apply(tokenize_pair, axis=1).tolist()
    features = torch.tensor(encoded_rows, dtype=torch.long)
    targets = torch.tensor(df['label'].tolist(), dtype=torch.long)

    return DataLoader(
        TensorDataset(features, targets),
        batch_size=BATCH_SIZE,
        shuffle=True,
        pin_memory=torch.cuda.is_available(),
    )

# One DataLoader per split, each built by prepare_data
train_loader = prepare_data(train_df)
val_loader = prepare_data(val_df)
test_loader = prepare_data(test_df)

# AdamW plus the "linear" schedule with zero warmup steps; the schedule spans
# one step per training batch across all epochs
optimizer = AdamW(
    model.parameters(),
    lr=lr,
    weight_decay=weight_decay,
    no_deprecation_warning=True,
)
total_steps = EPOCHS * len(train_loader)
scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

def train():
    """Fine-tune the global ``model`` for ``EPOCHS`` epochs.

    For every batch of ``train_loader`` this runs a forward/backward pass,
    clips the gradient norm to 1.0 and steps ``optimizer`` and ``scheduler``.
    After each epoch it prints the mean batch loss, saves the model's
    ``state_dict`` to ``./biobert_checkpoint_epoch_<n>.pth`` and calls
    ``evaluate(val_loader)``.
    """
    for epoch in range(EPOCHS):
        # evaluate() at the end of the previous epoch puts the model in eval
        # mode, so training mode has to be re-enabled at the start of every
        # epoch, not just once before the loop.
        model.train()
        print(f"Epoch {epoch + 1}/{EPOCHS}")
        progress_bar = tqdm(train_loader, desc='Training', leave=False)
        total_loss = 0

        for batch in progress_bar:
            input_ids, labels = [x.to(device) for x in batch]

            optimizer.zero_grad()
            # Passing labels makes the model return the loss alongside the logits
            outputs = model(input_ids=input_ids, labels=labels)
            loss = outputs.loss
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            total_loss += loss.item()
            progress_bar.set_postfix(loss=loss.item())

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1} Loss: {avg_loss:.4f}")

        # Save checkpoint after each epoch
        checkpoint_path = f'./biobert_checkpoint_epoch_{epoch + 1}.pth'
        torch.save(model.state_dict(), checkpoint_path)
        print(f"Saved checkpoint: {checkpoint_path}")

        evaluate(val_loader)

def evaluate(dataloader):
    """Run the global ``model`` over ``dataloader`` and print its metrics.

    The model is switched to eval mode and run without gradient tracking. The
    predicted class is the argmax of the logits; the confusion matrix and the
    accuracy (as a percentage) are printed. Nothing is returned.
    """
    model.eval()
    y_pred, y_true = [], []

    with torch.no_grad():
        for token_ids, targets in dataloader:
            token_ids = token_ids.to(device)
            targets = targets.to(device)

            scores = model(token_ids).logits
            y_pred.extend(scores.argmax(dim=1).cpu().tolist())
            y_true.extend(targets.cpu().tolist())

    matrix = confusion_matrix(y_true, y_pred)
    acc = accuracy_score(y_true, y_pred)

    print("\nConfusion Matrix:")
    print(matrix)
    print(f"Accuracy: {acc * 100:.2f}%")

# Run the full training loop; train() also saves a checkpoint and evaluates
# on val_loader after every epoch
train()

# Final evaluation on the held-out test split
print("Evaluating on the Test Dataset:")
evaluate(test_loader)

# Save the weights (state_dict only) as they stand after the last epoch
final_model_path = './biobert_fine_tuned_model.pth'
torch.save(model.state_dict(), final_model_path)
print(f"Final model saved successfully at: {final_model_path}")


In [None]:
# DeBERTa

import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AdamW, get_scheduler
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Seed PyTorch's global RNG so torch-side randomness (e.g. the shuffle=True
# DataLoaders built later in this cell) is repeatable, like the seeded split below
SEED = 42
torch.manual_seed(SEED)

# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize DeBERTa model (2 output labels) and tokenizer
model_name = "microsoft/deberta-v3-base"  # You can use 'microsoft/deberta-v2-xlarge' if needed
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir='./local_model')
model.to(device)

# Training parameters
lr = 2e-5
weight_decay = 0.01
EPOCHS = 10
BATCH_SIZE = 32

# Load the dataset
training_data_path = "training_data.csv"
df = pd.read_csv(training_data_path)

# Handle missing values: a missing previous sentence becomes the placeholder
# 'N/A'; rows without a current sentence or a label are dropped
df['previous_sentence'] = df['previous_sentence'].fillna('N/A')
df.dropna(subset=['current_sentence', 'label'], inplace=True)
df['label'] = df['label'].astype(int)

# Split the data into train (80%), validation (10%), and test (10%)
train_val_df, test_df = train_test_split(df, test_size=0.1, random_state=SEED)
train_df, val_df = train_test_split(train_val_df, test_size=0.111, random_state=SEED)

print(f"Train size: {len(train_df)}, Validation size: {len(val_df)}, Test size: {len(test_df)}")

def tokenize_pair(row):
    """Encode one (previous_sentence, current_sentence) pair as a list of token ids.

    Args:
        row: Record exposing ``'previous_sentence'`` and ``'current_sentence'``
            (a DataFrame row when called through ``df.apply(..., axis=1)``).

    Returns:
        The ``input_ids`` produced by the global ``tokenizer`` for the pair,
        truncated/padded to ``max_length=128``. Only ``input_ids`` is kept; the
        other fields of the tokenizer output are discarded.

    Raises:
        ValueError: If the tokenizer fails on this row.
    """
    try:
        tokens = tokenizer(
            row['previous_sentence'],
            row['current_sentence'],
            add_special_tokens=True,
            max_length=128,
            truncation=True,
            padding='max_length',
        )
    except Exception as e:
        # Returning [] here would make the id lists ragged and only fail later,
        # inside torch.tensor(), without saying which row was bad. Fail now.
        row_label = getattr(row, 'name', '?')
        raise ValueError(f"Error tokenizing pair at row {row_label}: {e}") from e
    # Without return_tensors the tokenizer already returns plain Python lists
    return tokens['input_ids']

def prepare_data(df):
    """Tokenize every row of ``df`` and wrap ids and labels in a DataLoader.

    Each row is passed through ``tokenize_pair``; the resulting id lists and
    the ``'label'`` column are converted to ``torch.long`` tensors and paired in
    a ``TensorDataset``. The returned loader uses ``BATCH_SIZE``, shuffles, and
    pins memory when CUDA is available.
    """
    encoded_rows = df.apply(tokenize_pair, axis=1).tolist()
    features = torch.tensor(encoded_rows, dtype=torch.long)
    targets = torch.tensor(df['label'].tolist(), dtype=torch.long)

    return DataLoader(
        TensorDataset(features, targets),
        batch_size=BATCH_SIZE,
        shuffle=True,
        pin_memory=torch.cuda.is_available(),
    )

# One DataLoader per split, each built by prepare_data
train_loader = prepare_data(train_df)
val_loader = prepare_data(val_df)
test_loader = prepare_data(test_df)

# AdamW plus the "linear" schedule with zero warmup steps; the schedule spans
# one step per training batch across all epochs
optimizer = AdamW(
    model.parameters(),
    lr=lr,
    weight_decay=weight_decay,
    no_deprecation_warning=True,
)
total_steps = EPOCHS * len(train_loader)
scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

def train():
    """Fine-tune the global ``model`` for ``EPOCHS`` epochs.

    For every batch of ``train_loader`` this runs a forward/backward pass,
    clips the gradient norm to 1.0 and steps ``optimizer`` and ``scheduler``.
    After each epoch it prints the mean batch loss, saves the model's
    ``state_dict`` to ``./deberta_checkpoint_epoch_<n>.pth`` and calls
    ``evaluate(val_loader)``.
    """
    for epoch in range(EPOCHS):
        # evaluate() at the end of the previous epoch puts the model in eval
        # mode, so training mode has to be re-enabled at the start of every
        # epoch, not just once before the loop.
        model.train()
        print(f"Epoch {epoch + 1}/{EPOCHS}")
        progress_bar = tqdm(train_loader, desc='Training', leave=False)
        total_loss = 0

        for batch in progress_bar:
            input_ids, labels = [x.to(device) for x in batch]

            optimizer.zero_grad()
            # Passing labels makes the model return the loss alongside the logits
            outputs = model(input_ids=input_ids, labels=labels)
            loss = outputs.loss
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            total_loss += loss.item()
            progress_bar.set_postfix(loss=loss.item())

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1} Loss: {avg_loss:.4f}")

        # Save checkpoint after each epoch
        checkpoint_path = f'./deberta_checkpoint_epoch_{epoch + 1}.pth'
        torch.save(model.state_dict(), checkpoint_path)
        print(f"Saved checkpoint: {checkpoint_path}")

        evaluate(val_loader)

def evaluate(dataloader):
    """Run the global ``model`` over ``dataloader`` and print its metrics.

    The model is switched to eval mode and run without gradient tracking. The
    predicted class is the argmax of the logits; the confusion matrix and the
    accuracy (as a percentage) are printed. Nothing is returned.
    """
    model.eval()
    y_pred, y_true = [], []

    with torch.no_grad():
        for token_ids, targets in dataloader:
            token_ids = token_ids.to(device)
            targets = targets.to(device)

            scores = model(token_ids).logits
            y_pred.extend(scores.argmax(dim=1).cpu().tolist())
            y_true.extend(targets.cpu().tolist())

    matrix = confusion_matrix(y_true, y_pred)
    acc = accuracy_score(y_true, y_pred)

    print("\nConfusion Matrix:")
    print(matrix)
    print(f"Accuracy: {acc * 100:.2f}%")

# Run the full training loop; train() also saves a checkpoint and evaluates
# on val_loader after every epoch
train()

# Final evaluation on the held-out test split
print("Evaluating on the Test Dataset:")
evaluate(test_loader)

# Save the weights (state_dict only) as they stand after the last epoch
final_model_path = './deberta_fine_tuned_model.pth'
torch.save(model.state_dict(), final_model_path)
print(f"Final model saved successfully at: {final_model_path}")


### After selecting SciBERT as the model, the next section retrains the model (SciBERT) with LIME. 

In [None]:
# Import necessary libraries
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AdamW, get_scheduler
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from lime.lime_text import LimeTextExplainer
import numpy as np

# Seed PyTorch's global RNG so torch-side randomness (e.g. the shuffle=True
# DataLoaders built later in this cell) is repeatable, like the seeded split below
SEED = 42
torch.manual_seed(SEED)

# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize SciBERT model (2 output labels) and tokenizer
model_name = "allenai/scibert_scivocab_uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir='./local_model')
model.to(device)

# Training parameters (this retraining run uses 4 epochs)
lr = 2e-5
weight_decay = 0.01
EPOCHS = 4
BATCH_SIZE = 32

# Load the dataset
training_data_path = "training_data.csv"
df = pd.read_csv(training_data_path)

# Handle missing values: a missing previous sentence becomes the placeholder
# 'N/A'; rows without a current sentence or a label are dropped
df['previous_sentence'] = df['previous_sentence'].fillna('N/A')
df.dropna(subset=['current_sentence', 'label'], inplace=True)
df['label'] = df['label'].astype(int)

# Split the data into train (80%), validation (10%), and test (10%)
train_val_df, test_df = train_test_split(df, test_size=0.1, random_state=SEED)
train_df, val_df = train_test_split(train_val_df, test_size=0.111, random_state=SEED)

print(f"Train size: {len(train_df)}, Validation size: {len(val_df)}, Test size: {len(test_df)}")

def tokenize_pair(row):
    """Encode one (previous_sentence, current_sentence) pair as a list of token ids.

    Args:
        row: Record exposing ``'previous_sentence'`` and ``'current_sentence'``
            (a DataFrame row when called through ``df.apply(..., axis=1)``).

    Returns:
        The ``input_ids`` produced by the global ``tokenizer`` for the pair,
        truncated/padded to ``max_length=128``. Only ``input_ids`` is kept; the
        other fields of the tokenizer output are discarded.

    Raises:
        ValueError: If the tokenizer fails on this row.
    """
    try:
        tokens = tokenizer(
            row['previous_sentence'],
            row['current_sentence'],
            add_special_tokens=True,
            max_length=128,
            truncation=True,
            padding='max_length',
        )
    except Exception as e:
        # Returning [] here would make the id lists ragged and only fail later,
        # inside torch.tensor(), without saying which row was bad. Fail now.
        row_label = getattr(row, 'name', '?')
        raise ValueError(f"Error tokenizing pair at row {row_label}: {e}") from e
    # Without return_tensors the tokenizer already returns plain Python lists
    return tokens['input_ids']

def prepare_data(df, shuffle=True):
    """Tokenize every row of ``df`` and wrap the result in a DataLoader.

    Args:
        df: DataFrame providing the 'previous_sentence' and 'current_sentence'
            columns read by ``tokenize_pair`` and an integer 'label' column.
        shuffle: Whether the loader reshuffles the data every epoch. Defaults
            to True (the previous hard-coded behaviour); pass False for
            validation/test loaders, where ordering is irrelevant.

    Returns:
        A DataLoader yielding ``(input_ids, labels)`` batches of BATCH_SIZE rows.

    Raises:
        ValueError: If ``tokenize_pair`` failed for one or more rows.
    """
    input_ids = df.apply(tokenize_pair, axis=1).tolist()
    labels = df['label'].tolist()

    # tokenize_pair reports a failure by returning an empty list. Left alone,
    # that produces a ragged nested list and an opaque error from torch.tensor,
    # so fail here with the offending row labels instead.
    failed_rows = [idx for idx, ids in zip(df.index, input_ids) if not ids]
    if failed_rows:
        raise ValueError(
            f"Tokenization failed for {len(failed_rows)} row(s); "
            f"first offending index labels: {failed_rows[:10]}"
        )

    input_ids_tensor = torch.tensor(input_ids, dtype=torch.long)
    labels_tensor = torch.tensor(labels, dtype=torch.long)

    dataset = TensorDataset(input_ids_tensor, labels_tensor)
    return DataLoader(
        dataset,
        batch_size=BATCH_SIZE,
        shuffle=shuffle,
        # Pinned host memory only helps when batches are copied to a GPU.
        pin_memory=torch.cuda.is_available(),
    )

# Build one DataLoader per split (same order as before: train, validation, test)
train_loader, val_loader, test_loader = [
    prepare_data(split_df) for split_df in (train_df, val_df, test_df)
]

# Optimizer over all model parameters.
# NOTE(review): AdamW imported from transformers is reported as deprecated in
# favour of torch.optim.AdamW — confirm against the installed version.
optimizer = AdamW(
    model.parameters(),
    lr=lr,
    weight_decay=weight_decay,
)

# Linear decay with no warm-up, spanning one scheduler step per training batch
total_steps = EPOCHS * len(train_loader)
scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

def train():
    """Fine-tune the global ``model`` on ``train_loader`` for ``EPOCHS`` epochs.

    After every epoch the mean training loss is printed, the weights are saved
    to ``./scibert_LIME_checkpoint_epoch_<n>.pth`` and ``evaluate(val_loader)``
    prints validation metrics. Relies on the module-level ``optimizer`` and
    ``scheduler``; returns nothing.
    """
    for epoch in range(EPOCHS):
        # Re-enter training mode at the start of *every* epoch. evaluate()
        # switches the model to eval mode, so a single model.train() before the
        # loop would leave every epoch after the first running in eval mode
        # (e.g. with dropout disabled).
        model.train()

        print(f"Epoch {epoch + 1}/{EPOCHS}")
        progress_bar = tqdm(train_loader, desc='Training', leave=False)
        total_loss = 0

        for batch in progress_bar:
            input_ids, labels = [x.to(device) for x in batch]

            optimizer.zero_grad()
            # NOTE(review): only input_ids are passed, so the padded batches
            # reach the model without an attention mask — confirm this is
            # intended; changing it must be mirrored in evaluate().
            outputs = model(input_ids=input_ids, labels=labels)
            loss = outputs.loss
            loss.backward()

            # Clip the global gradient norm to 1.0 before the update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()  # the schedule advances once per batch

            total_loss += loss.item()
            progress_bar.set_postfix(loss=loss.item())

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1} Loss: {avg_loss:.4f}")

        # Save checkpoint after each epoch
        checkpoint_path = f'./scibert_LIME_checkpoint_epoch_{epoch + 1}.pth'
        torch.save(model.state_dict(), checkpoint_path)
        print(f"Saved checkpoint: {checkpoint_path}")

        evaluate(val_loader)

def evaluate(dataloader):
    """Print the confusion matrix and accuracy of the global ``model`` on ``dataloader``.

    Args:
        dataloader: Iterable of ``(input_ids, labels)`` tensor batches.

    The model is put in eval mode for the duration of the call and its previous
    train/eval mode is restored afterwards, so calling this between training
    epochs does not silently leave the model in eval mode. Returns nothing.
    """
    was_training = model.training
    model.eval()
    predictions, true_labels = [], []

    try:
        with torch.no_grad():
            for batch in dataloader:
                input_ids, batch_labels = [x.to(device) for x in batch]

                outputs = model(input_ids)
                logits = outputs.logits
                batch_predictions = torch.argmax(logits, dim=1).cpu().tolist()

                predictions.extend(batch_predictions)
                true_labels.extend(batch_labels.cpu().tolist())
    finally:
        # Hand the model back in whatever mode the caller had it in.
        model.train(was_training)

    # Pin the label set so the matrix always has one row/column per class,
    # even if a class is absent from both the labels and the predictions.
    class_ids = list(range(model.config.num_labels))
    cm = confusion_matrix(true_labels, predictions, labels=class_ids)
    accuracy = accuracy_score(true_labels, predictions)

    print("\nConfusion Matrix:")
    print(cm)
    print(f"Accuracy: {accuracy * 100:.2f}%")

# Run fine-tuning; train() reports validation metrics at the end of each epoch
train()

# Score the fine-tuned weights once on the held-out test split
print("Evaluating on the Test Dataset:")
evaluate(test_loader)

# Persist the weights only (state_dict), then confirm where they were written
final_model_path = './scibert_fine_tuned_model_LIME.pth'
final_weights = model.state_dict()
torch.save(final_weights, final_model_path)
print(f"Final model saved successfully at: {final_model_path}")

# LIME Integration
def predict_proba(texts, batch_size=None):
    """Return class probabilities from the global ``model`` for raw texts.

    This is the classifier function handed to LIME, which calls it with the
    whole set of perturbed texts at once. The texts are therefore pushed
    through the model in mini-batches rather than one oversized forward pass.

    Args:
        texts: A single string or an iterable of strings.
        batch_size: Number of texts per forward pass. Defaults to BATCH_SIZE.

    Returns:
        A numpy array with one row per input text, holding the softmax of the
        model's logits over the classes.

    NOTE(review): each text is encoded as a single sequence and passed with
    the tokenizer's attention mask, while train()/evaluate() feed
    (previous_sentence, current_sentence) pair ids without a mask. The
    explained inputs therefore differ in format from the training inputs —
    confirm whether the explanation should be computed on sentence pairs.
    """
    # Accept a lone string as well as any iterable (list, tuple, numpy array).
    texts = [texts] if isinstance(texts, str) else list(texts)
    chunk_size = batch_size or BATCH_SIZE

    model.eval()
    if not texts:
        return np.empty((0, model.config.num_labels), dtype=np.float32)

    prob_chunks = []
    with torch.no_grad():
        for start in range(0, len(texts), chunk_size):
            tokens = tokenizer(
                texts[start:start + chunk_size],
                padding=True,
                truncation=True,
                max_length=128,
                return_tensors='pt'
            ).to(device)

            outputs = model(**tokens)
            probs = torch.nn.functional.softmax(outputs.logits, dim=1)
            prob_chunks.append(probs.cpu().numpy())

    return np.concatenate(prob_chunks, axis=0)

# Initialize LIME explainer.
# random_state fixes the random perturbations LIME samples, so re-running the
# cell reproduces the same explanation instead of a slightly different one.
explainer = LimeTextExplainer(
    class_names=['No Citation Needed', 'Citation Needed'],
    random_state=42,
)

# Sample a sentence for explanation
# NOTE(review): only the current sentence is explained; the model was trained
# on (previous_sentence, current_sentence) pairs — confirm this is intended.
sample_text = test_df['current_sentence'].iloc[0]  # Choose any test sentence

# Generate explanation
explanation = explainer.explain_instance(
    sample_text,  # The text instance to explain
    predict_proba,  # Prediction function
    num_features=10,  # Number of words to highlight in explanation
    num_samples=500  # Number of perturbations to generate
)

# Display explanation
explanation.show_in_notebook(text=True)
