## Imports and Libraries

In [1]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from tqdm import tqdm
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
from transformers import DataCollatorWithPadding

## Dataset Loading and Preprocessing

In [2]:
# Read the tab-separated sentence-pair file. It has no header row, so the
# three column names are supplied explicitly.
DATA_PATH = r"/kaggle/input/mit-plagairism-detection-dataset/train_snli.txt"
COLUMN_NAMES = ['sentence1', 'sentence2', 'label']

df = pd.read_csv(DATA_PATH, delimiter='\t', header=None, names=COLUMN_NAMES)

# Preview the first rows
df.head()

Unnamed: 0,sentence1,sentence2,label
0,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette.",0
1,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",1
2,Children smiling and waving at camera,There are children present,1
3,Children smiling and waving at camera,The kids are frowning,0
4,A boy is jumping on skateboard in the middle o...,The boy skates down the sidewalk.,0


In [3]:
# (rows, columns) of the freshly loaded frame
df.shape

(367373, 3)

In [4]:
# Column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367373 entries, 0 to 367372
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   sentence1  367373 non-null  object
 1   sentence2  367369 non-null  object
 2   label      367373 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 8.4+ MB


In [5]:
# Number of missing values per column
df.isna().sum()

sentence1    0
sentence2    4
label        0
dtype: int64

In [6]:
# Discard every row that has a missing value in any column
# (rebinding instead of mutating in place).
df = df.dropna()

In [7]:
# Class balance of the target column
df['label'].value_counts()

label
0    183964
1    183405
Name: count, dtype: int64

In [8]:
# Hold out 20% of the rows for testing, then carve 10% of the remainder off
# as a validation set. Fixed seeds keep both splits reproducible.

from sklearn.model_selection import train_test_split

train_val_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_val_df, test_size=0.1, random_state=42)

# (rows, columns) of each partition
train_df.shape, val_df.shape, test_df.shape

((264505, 3), (29390, 3), (73474, 3))

# Load the model and tokenizer from the Hugging Face Hub

In [9]:
# Fetch the SmolLM-135M tokenizer and checkpoint; the model is instantiated
# as a sequence classifier with two output labels.
MODEL_NAME = "HuggingFaceTB/SmolLM-135M"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

tokenizer_config.json:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/801k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/831 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/724 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/538M [00:00<?, ?B/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at HuggingFaceTB/SmolLM-135M and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Define a custom dataset class for handling input data

class PlagiarismDataset(Dataset):
    """
    Map-style dataset that turns rows of a sentence-pair dataframe into model inputs.

    Attributes:
    -----------
    df : pandas.DataFrame
        Source rows; each row is read through its 'sentence1', 'sentence2' and 'label' fields.
    tokenizer : transformers.PreTrainedTokenizer
        Callable used to encode each sentence pair.
    max_length : int
        Length every encoded pair is padded or truncated to (default is 128).

    Methods:
    --------
    __len__():
        Number of rows in the dataframe.
    __getitem__(index):
        Encoded 'input_ids', 'attention_mask' and 'label' tensors for one row.
    """

    def __init__(self, df, tokenizer, max_length=128):
        """
        Store the dataframe, tokenizer and target sequence length.

        Parameters:
        -----------
        df : pandas.DataFrame
            The dataframe containing the input data.
        tokenizer : transformers.PreTrainedTokenizer
            The tokenizer used to encode the sentence pairs.
        max_length : int, optional
            Length each tokenized pair is padded/truncated to (default is 128).
        """
        self.df, self.tokenizer, self.max_length = df, tokenizer, max_length

    def __len__(self):
        """
        Number of samples, i.e. the number of rows in the dataframe.

        Returns:
        --------
        int
            The number of samples in the dataset.
        """
        return len(self.df)

    def __getitem__(self, index):
        """
        Encode the sentence pair stored at positional row `index`.

        Parameters:
        -----------
        index : int
            Positional index of the row to retrieve.

        Returns:
        --------
        dict
            'input_ids' and 'attention_mask' tensors with the batch axis removed,
            plus 'label' as a scalar long tensor.
        """
        record = self.df.iloc[index]

        # Missing sentences become empty strings; anything else is coerced to str
        first, second = (
            "" if pd.isna(record[column]) else str(record[column])
            for column in ('sentence1', 'sentence2')
        )

        encoding_options = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )
        encoded = self.tokenizer(first, second, **encoding_options)

        # The encoded tensors carry a leading batch axis; squeeze it away
        sample = {
            key: encoded[key].squeeze(0)
            for key in ('input_ids', 'attention_mask')
        }
        sample['label'] = torch.tensor(record['label'], dtype=torch.long)
        return sample




# You can also use transformer's DataCollatorWithPadding 
def collate_fn(batch):
    """
    Merge individual PlagiarismDataset samples into one batch for a DataLoader.

    Each field is stacked along a new leading batch axis, so all samples must
    already share the same shape per field.

    Parameters:
    -----------
    batch : list of dict
        Samples, each holding 'input_ids', 'attention_mask' and 'label' tensors.

    Returns:
    --------
    dict
        Batched tensors under the same three keys: 'input_ids', 'attention_mask', 'label'.
    """
    return {
        field: torch.stack([sample[field] for sample in batch])
        for field in ('input_ids', 'attention_mask', 'label')
    }



# Register '[PAD]' as the tokenizer's padding token
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Resize the embedding matrix to the tokenizer's new vocabulary size
model.resize_token_embeddings(len(tokenizer))

# Settings shared by all three splits
MAX_LENGTH = 192
BATCH_SIZE = 16

train_set = PlagiarismDataset(train_df, tokenizer, max_length=MAX_LENGTH)
valid_set = PlagiarismDataset(val_df, tokenizer, max_length=MAX_LENGTH)
test_set = PlagiarismDataset(test_df, tokenizer, max_length=MAX_LENGTH)

# Only the training loader shuffles; validation and test keep their order
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
valid_loader = DataLoader(valid_set, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

# Report which token and id are used for padding
print("Padding Token:", tokenizer.pad_token)
print("Padding Token ID:", tokenizer.pad_token_id)

Padding Token: [PAD]
Padding Token ID: 49152


In [11]:
# Set the pad token id in the model's config so it matches the tokenizer's padding id.
# NOTE(review): per the transformers docs, LlamaForSequenceClassification uses
# config.pad_token_id to find the last non-padding token of each row — confirm
# for the installed transformers version.

model.config.pad_token_id = tokenizer.pad_token_id

# Training and Evaluation Functions

In [12]:
# Device setup

# Prefer the GPU when one is visible, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = model.to(device)

# Wrap the model in DataParallel exactly once. Without this guard, re-running
# the cell nests DataParallel inside DataParallel; `model.module` is then
# itself a wrapper, and the save cell's `model.module.save_pretrained(...)`
# fails because the wrapper has no such method.
if not isinstance(model, torch.nn.DataParallel):
    model = torch.nn.DataParallel(model)



def train_model(model, train_loader, val_loader, optimizer, loss_fn, epochs=3):
    """
    Trains the given model and reports training loss and validation accuracy after every epoch.

    Parameters:
    -----------
    model : torch.nn.Module
        The model to be trained. Its forward pass must accept `input_ids` and
        `attention_mask` and return an object exposing `.logits`.
    train_loader : DataLoader
        DataLoader for the training dataset; batches are dicts with
        'input_ids', 'attention_mask' and 'label'.
    val_loader : DataLoader
        DataLoader for the validation dataset.
    optimizer : torch.optim.Optimizer
        The optimizer to be used for training.
    loss_fn : torch.nn.Module
        The loss function, called as `loss_fn(logits, labels)`.
    epochs : int, optional
        The number of epochs to train the model (default is 3).

    Returns:
    --------
    None
    """
    # Send batches to wherever the model's weights live instead of relying on
    # a notebook-level `device` variable being defined and in sync.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    for epoch in range(epochs):
        total_loss = 0.0
        model.train()  # evaluate_model() leaves the model in eval mode
        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
            input_ids = batch['input_ids'].to(target_device)
            attention_mask = batch['attention_mask'].to(target_device)
            labels = batch['label'].to(target_device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs.logits, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{epochs}, Training Loss: {avg_loss:.4f}")

        # Evaluate on the validation set.
        # `evaluate_model` is looked up at call time. A later cell rebinds that
        # name to a variant returning (accuracy, report); accept both shapes so
        # re-running training after that cell does not fail in the format
        # string below once a full epoch has already been spent.
        val_result = evaluate_model(model, val_loader)
        val_accuracy = val_result[0] if isinstance(val_result, tuple) else val_result
        print(f"Epoch {epoch+1}/{epochs}, Validation Accuracy: {val_accuracy:.4f}")
    print("Training complete!")



def evaluate_model(model, data_loader):
    """
    Computes the classification accuracy of the model over a data loader.

    The model is switched to evaluation mode and left in it; callers that go
    on training must call `model.train()` again.

    Parameters:
    -----------
    model : torch.nn.Module
        The model to be evaluated. Its forward pass must accept `input_ids`
        and `attention_mask` and return an object exposing `.logits`.
    data_loader : DataLoader
        Iterable of batches, each a dict with 'input_ids', 'attention_mask' and 'label'.

    Returns:
    --------
    float
        Fraction of samples whose argmax prediction equals the label.
    """
    model.eval()  # Ensure the model is in evaluation mode

    # Send batches to wherever the model's weights live instead of relying on
    # a notebook-level `device` variable being defined and in sync.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    correct = 0
    total = 0
    with torch.no_grad():  # Disable gradient tracking
        for batch in data_loader:
            input_ids = batch['input_ids'].to(target_device)
            attention_mask = batch['attention_mask'].to(target_device)
            labels = batch['label'].to(target_device)

            outputs = model(input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=1)
            correct += (predictions == labels).sum().item()
            total += labels.size(0)

    accuracy = correct / total
    return accuracy

# Training Configs
- Optimizer: AdamW
- Loss function: Cross-Entropy Loss

In [13]:
# Optimisation setup: device selection, AdamW optimiser and loss criterion
LEARNING_RATE = 2e-5

device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

model.to(device)

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
loss_fn = CrossEntropyLoss()

Using device: cuda


In [14]:
# Fine-tune for three epochs; validation accuracy is printed after each one
NUM_EPOCHS = 3

train_model(
    model,
    train_loader,
    valid_loader,
    optimizer,
    loss_fn,
    epochs=NUM_EPOCHS,
)

Training Epoch 1: 100%|██████████| 16532/16532 [2:13:36<00:00,  2.06it/s]
We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


Epoch 1/3, Training Loss: 0.1334
Epoch 1/3, Validation Accuracy: 0.9612


Training Epoch 2: 100%|██████████| 16532/16532 [2:13:27<00:00,  2.06it/s]


Epoch 2/3, Training Loss: 0.0606
Epoch 2/3, Validation Accuracy: 0.9630


Training Epoch 3: 100%|██████████| 16532/16532 [2:13:26<00:00,  2.06it/s]


Epoch 3/3, Training Loss: 0.0254
Epoch 3/3, Validation Accuracy: 0.9605
Training complete!


# Model Evaluation on test set

In [15]:
from sklearn.metrics import classification_report


# Evaluate the model
# I'm defining a new evaluation function with metrics such as f1, precision, recall.
def evaluate_model(model, data_loader):
    """
    Evaluate the model on a dataset, printing accuracy and a classification report.

    NOTE: this definition replaces the accuracy-only `evaluate_model` defined
    earlier in the notebook and returns a tuple instead of a float.

    Args:
    - model: Model to evaluate. Its forward pass must accept `input_ids` and
      `attention_mask` and return an object exposing `.logits`.
    - data_loader: Iterable of batches, each a dict with 'input_ids',
      'attention_mask' and 'label'.

    Returns:
    - (accuracy, report): the accuracy as a float and the text produced by
      `classification_report` for the collected labels and predictions.
    """
    model.eval()  # Set model to evaluation mode

    # Run on the device that holds the model's weights rather than a
    # hard-coded "cuda", which fails on CPU-only machines.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    correct = 0  # Track correct predictions
    total = 0  # Track total samples

    all_labels = []
    all_predictions = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(target_device)
            attention_mask = batch['attention_mask'].to(target_device)
            labels = batch['label'].to(target_device)

            outputs = model(input_ids, attention_mask=attention_mask)  # Forward pass
            predictions = torch.argmax(outputs.logits, dim=1)  # Get predictions

            all_labels.extend(labels.cpu().tolist())  # Collect true labels
            all_predictions.extend(predictions.cpu().tolist())  # Collect predictions

            correct += (predictions == labels).sum().item()  # Count correct predictions
            total += labels.size(0)

    # Compute accuracy
    accuracy = correct / total
    print(f"Accuracy: {accuracy:.4f}")

    # Generate classification report
    report = classification_report(all_labels, all_predictions)
    print("\nClassification Report:\n", report)

    return accuracy, report

# Score the test split; the returned (accuracy, report) tuple is also echoed as the cell output
evaluate_model(model, test_loader)

Accuracy: 0.9620

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.97      0.96     36586
           1       0.97      0.96      0.96     36888

    accuracy                           0.96     73474
   macro avg       0.96      0.96      0.96     73474
weighted avg       0.96      0.96      0.96     73474



(0.9619729428097014,
 '              precision    recall  f1-score   support\n\n           0       0.96      0.97      0.96     36586\n           1       0.97      0.96      0.96     36888\n\n    accuracy                           0.96     73474\n   macro avg       0.96      0.96      0.96     73474\nweighted avg       0.96      0.96      0.96     73474\n')

# Saving model 

In [16]:
# Directory the fine-tuned artefacts are written to
save_directory = "/kaggle/working/"

# Use the object's `.module` attribute when it has one (as the DataParallel
# wrapper does); otherwise save the model object itself.
model_to_save = getattr(model, "module", model)

# Write the fine-tuned model and the tokenizer into the same directory
model_to_save.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

print(f"Model and tokenizer saved to {save_directory}")

Model and tokenizer saved to /kaggle/working/
