<a href="https://colab.research.google.com/github/Mehtavaishali/AI-Plagiarism-Checker/blob/main/plagiarism_using_sbert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

DATA CLEANING AND PREPROCESSING

In [1]:
import pandas as pd
import re

#  Load the Dataset
def load_dataset(file_path):
    """Read a tab-separated text file into a DataFrame.

    Each line is stripped of surrounding whitespace and split on tab
    characters. Only lines that yield exactly three fields are kept; every
    other line (blank, too few or too many fields) is skipped silently.

    Parameters
    ----------
    file_path : path to a UTF-8 encoded text file.

    Returns
    -------
    pandas.DataFrame with string columns 'premise', 'hypothesis', 'label',
    one row per accepted line, in file order.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        # Lazily split every line, then keep only the well-formed triples.
        split_lines = (raw_line.strip().split('\t') for raw_line in handle)
        rows = [fields for fields in split_lines if len(fields) == 3]
    return pd.DataFrame(rows, columns=['premise', 'hypothesis', 'label'])

Clean individual text

In [2]:
def clean_text(text):
    """
    Normalize a string for comparison.

    The text is lower-cased, every character other than a-z, 0-9 or
    whitespace is deleted (not replaced by a space), runs of whitespace are
    collapsed to a single space, and leading/trailing whitespace is trimmed.
    """
    lowered = text.lower()
    # Deleting (rather than spacing out) punctuation joins hyphenated words:
    # "well-known" becomes "wellknown".
    alnum_only = re.sub(r'[^a-z0-9\s]', '', lowered)
    single_spaced = re.sub(r'\s+', ' ', alnum_only)
    return single_spaced.strip()

Preprocess entire DataFrame

In [3]:
def preprocess_dataset(df):
    """
    Return a cleaned copy of `df`; the input frame is left untouched.

    The 'premise' and 'hypothesis' columns are passed through clean_text,
    the 'label' column is cast to int, and fully duplicated rows are dropped
    (the surviving rows keep their original index labels).
    """
    cleaned = df.copy()
    for text_column in ('premise', 'hypothesis'):
        cleaned[text_column] = cleaned[text_column].apply(clean_text)
    cleaned['label'] = cleaned['label'].astype(int)
    # Duplicates are detected after cleaning, so rows that differed only in
    # case or punctuation collapse into one.
    return cleaned.drop_duplicates()

Save cleaned dataset

In [4]:
def save_cleaned_dataset(df, output_path):
    """Write `df` to `output_path` as CSV (index column omitted) and print a confirmation."""
    df.to_csv(output_path, index=False)
    confirmation = f"Cleaned dataset saved to: {output_path}"
    print(confirmation)

In [5]:
# Input (tab-separated text) and output (CSV) locations, both relative to the
# notebook's working directory.
file_path = 'train_snli.txt'
output_path = 'cleaned_plagiarism_dataset.csv'

# Run the cleaning pipeline end to end: load the raw rows, normalize them,
# then write the result to disk for the modelling cells to read.
df_raw = load_dataset(file_path)
df_cleaned = preprocess_dataset(df_raw)
save_cleaned_dataset(df_cleaned, output_path)

Cleaned dataset saved to: cleaned_plagiarism_dataset.csv


Import the libraries and open the cleaned data


In [6]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
# Load the cleaned CSV and discard every row that contains a missing value
# (for example, text that was empty in the CSV and therefore read back as NaN).
df = pd.read_csv("cleaned_plagiarism_dataset.csv").dropna()


Extract text and labels

In [7]:
# Pull the three columns out of the frame as plain Python lists, in row order.
premises, hypotheses, labels = (
    df[column].tolist() for column in ("premise", "hypothesis", "label")
)


Load SBERT model

In [8]:
# Identifier of the pretrained sentence-embedding checkpoint to load.
SBERT_MODEL_NAME = "paraphrase-MiniLM-L6-v2"
sbert_model = SentenceTransformer(SBERT_MODEL_NAME)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Encoding sentences

In [None]:
# Both sentence lists are encoded with identical options: no tensor
# conversion, and a progress bar because the full dataset takes a while.
encode_options = dict(convert_to_tensor=False, show_progress_bar=True)
premise_embeddings = sbert_model.encode(premises, **encode_options)
hypothesis_embeddings = sbert_model.encode(hypotheses, **encode_options)


Batches:   0%|          | 0/11462 [00:00<?, ?it/s]

Create enhanced features: concatenation of both embeddings, their absolute difference, and their element-wise product

In [None]:
def combine_features(p, h):
    """Build the feature vector for one (premise, hypothesis) embedding pair.

    Parameters
    ----------
    p, h : numpy arrays of identical shape (the two sentence embeddings).

    Returns
    -------
    numpy array laid out as [p, h, |p - h|, p * h], concatenated along the
    first axis, so a pair of length-d vectors yields a length-4d vector.
    """
    return np.concatenate([p, h, np.abs(p - h), p * h])


print("Building enhanced feature vectors...")
# One combined vector per sentence pair. Going through combine_features keeps
# the layout defined in a single place, so inference code can reuse it.
features = [
    combine_features(p, h)
    for p, h in zip(premise_embeddings, hypothesis_embeddings)
]


Split dataset

In [None]:
# Hold out 20% of the pairs for testing; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)


Normalize features

In [None]:
# Learn the per-feature mean and variance from the training split only, then
# apply that same transform to both splits so no test statistics leak in.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

Train Logistic Regression

In [None]:
classifier = LogisticRegression(max_iter=2000, solver='liblinear', C=1.0)
classifier.fit(X_train, y_train)


Predict and evaluate

In [None]:
# Predict on train set
y_train_pred = classifier.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)

# Predict on test set
y_test_pred = classifier.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)

# Print results
print(f"\n Training Accuracy: {train_accuracy * 100:.2f}%")
print(f"Testing Accuracy: {test_accuracy * 100:.2f}%")

In [None]:
print("\n Classification Report on Test Set:\n", classification_report(y_test, y_test_pred))y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
# results
print(f"\n SBERT Classifier Accuracy (Optimized): {accuracy * 100:.2f}%")
print("\n Classification Report:\n", classification_report(y_test, y_pred))

Example Usage

In [None]:
sample_premise = "The student submitted the same assignment as another student."
sample_hypothesis = "The assignment submitted was identical to a peer's work."

# Encode and prepare features.
# The training sentences were normalized with clean_text before encoding, so
# the sample sentences go through the same normalization.
p_embed = sbert_model.encode(clean_text(sample_premise))
h_embed = sbert_model.encode(clean_text(sample_hypothesis))
# Same layout as the training features: [p, h, |p - h|, p * h]. Built inline
# so this cell does not depend on a helper being defined elsewhere.
sample_feature = np.concatenate(
    [p_embed, h_embed, np.abs(p_embed - h_embed), p_embed * h_embed]
)
# Apply the scaler fitted on the training features (expects a 2-D input).
sample_scaled = scaler.transform([sample_feature])

# Predict
sample_pred = classifier.predict(sample_scaled)[0]
print(f"\n Sample Prediction (1 = Plagiarized, 0 = Not): {sample_pred}")