<a href="https://colab.research.google.com/github/Mehtavaishali/AI-Plagiarism-Checker/blob/main/project(svm%2BTF-IDF).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

Data Cleaning

In [2]:
def clean_text(text):
    """Normalise a raw sentence into lowercase, punctuation-free text.

    The input is first coerced with ``str``, so non-string values are cleaned
    through their string form.

    Steps, in order:
        1. lowercase everything;
        2. delete any run of non-whitespace that begins at "http" or "www";
        3. delete "@" mentions and runs of digits;
        4. delete every character listed in ``string.punctuation``;
        5. collapse whitespace runs to one space and trim both ends.

    Returns:
        str: The cleaned text (may be empty).
    """
    lowered = str(text).lower()
    without_urls = re.sub(r"http\S+|www\S+", "", lowered)
    without_mentions_or_digits = re.sub(r"@\w+|\d+", "", without_urls)

    # Punctuation is removed outright rather than replaced by a space, so a
    # hyphenated word such as "well-known" collapses to "wellknown".
    punctuation_remover = str.maketrans("", "", string.punctuation)
    without_punctuation = without_mentions_or_digits.translate(punctuation_remover)

    return re.sub(r"\s+", " ", without_punctuation).strip()

Load and preprocess data

In [3]:
def load_and_prepare_data(filepath, sample_size=10000):
    """Read a tab-separated sentence-pair file and return a cleaned sample.

    The file is parsed without a header row into the columns 'sentence1',
    'sentence2' and 'label'. Rows with any missing value are dropped, only
    rows labelled 0 or 1 are kept, and the label column is cast to int.
    At most ``sample_size`` rows are then drawn at random with a fixed seed
    (42), the index is reset, and both sentence columns are passed through
    ``clean_text``.

    Args:
        filepath: Path of the tab-separated input file.
        sample_size: Upper bound on the number of rows returned.

    Returns:
        pandas.DataFrame: Columns 'sentence1', 'sentence2', 'label' with a
        fresh 0..n-1 index.
    """
    column_names = ['sentence1', 'sentence2', 'label']
    raw_pairs = pd.read_csv(filepath, sep='\t', header=None, names=column_names)

    complete_pairs = raw_pairs.dropna()
    has_binary_label = complete_pairs['label'].isin([0, 1])
    binary_pairs = complete_pairs[has_binary_label]
    binary_pairs = binary_pairs.assign(label=binary_pairs['label'].astype(int))

    # Never ask for more rows than exist, otherwise sampling would fail.
    n_rows = min(sample_size, len(binary_pairs))
    sampled_pairs = binary_pairs.sample(n_rows, random_state=42).reset_index(drop=True)

    for sentence_column in ('sentence1', 'sentence2'):
        sampled_pairs[sentence_column] = sampled_pairs[sentence_column].apply(clean_text)
    return sampled_pairs

Feature extraction using absolute difference of TF-IDF vectors

In [4]:
def extract_features(df):
    """Build TF-IDF difference features for a frame of sentence pairs.

    One TfidfVectorizer (up to 5000 features, unigrams and bigrams, English
    stop words removed, sublinear term frequency) is fitted on every sentence
    from both columns. Each column is then transformed with it, and a pair's
    feature row is the element-wise absolute difference of its two vectors.

    Note that the vectorizer is fitted on all rows of ``df``; any train/test
    split applied afterwards shares this vocabulary and IDF weighting.

    Args:
        df: DataFrame with 'sentence1', 'sentence2' and 'label' columns.

    Returns:
        tuple: ``(feature_matrix, labels, vectorizer)`` where ``labels`` is
        ``df['label'].values`` and ``vectorizer`` is the fitted instance.
    """
    first_sentences = df['sentence1']
    second_sentences = df['sentence2']

    tfidf_settings = {
        'max_features': 5000,
        'ngram_range': (1, 2),
        'stop_words': 'english',
        'sublinear_tf': True,
    }
    vectorizer = TfidfVectorizer(**tfidf_settings)

    # Fit once on the combined corpus so both columns share one vocabulary.
    vectorizer.fit(first_sentences.tolist() + second_sentences.tolist())

    tfidf_first = vectorizer.transform(first_sentences)
    tfidf_second = vectorizer.transform(second_sentences)
    pair_differences = np.abs(tfidf_first - tfidf_second)

    labels = df['label'].values
    return pair_differences, labels, vectorizer

Train and evaluate SVM model

In [5]:
def train_and_evaluate(X, y):
    """Fit a linear SVM on an 80/20 split and print its accuracy figures.

    The data is split with ``test_size=0.2`` and a fixed seed (42). A linear
    ``SVC`` with probability estimates enabled is fitted on the training part.
    Training accuracy, testing accuracy and a classification report for the
    test part are printed.

    Args:
        X: Feature matrix, one row per sample.
        y: Labels aligned with the rows of ``X``.

    Returns:
        The fitted SVC model.
    """
    features_train, features_test, labels_train, labels_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    classifier = SVC(kernel='linear', probability=True, random_state=42)
    classifier.fit(features_train, labels_train)

    predicted_train = classifier.predict(features_train)
    predicted_test = classifier.predict(features_test)

    train_accuracy = accuracy_score(labels_train, predicted_train)
    test_accuracy = accuracy_score(labels_test, predicted_test)
    report = classification_report(labels_test, predicted_test)

    print(f" Training Accuracy: {train_accuracy * 100:.2f}%")
    print(f" Testing Accuracy : {test_accuracy * 100:.2f}%")
    print("\n Classification Report:\n", report)

    return classifier

Predict on new sentence pairs

In [6]:
def predict_pair(model, vectorizer, sentence1, sentence2):
    """Classify one sentence pair and print the verdict with its confidence.

    Both sentences are cleaned with ``clean_text`` and transformed with the
    fitted ``vectorizer``; the model input is the element-wise absolute
    difference of the two vectors. A predicted label of 1 is reported as
    "Plagiarised", anything else as "Not Plagiarised".

    Args:
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
            Its ``classes_`` attribute, when present, is used to locate the
            probability column of the predicted label.
        vectorizer: Fitted vectorizer exposing ``transform``.
        sentence1: First raw sentence.
        sentence2: Second raw sentence.

    Returns:
        None. The result is printed.
    """
    s1 = clean_text(sentence1)
    s2 = clean_text(sentence2)
    v1 = vectorizer.transform([s1])
    v2 = vectorizer.transform([s2])
    features = np.abs(v1 - v2)
    prediction = model.predict(features)[0]

    # predict_proba columns are ordered like model.classes_, not by raw label
    # value, so look the predicted label up there instead of using the label
    # itself as an index (which is only right when labels are exactly 0..k-1).
    known_classes = list(getattr(model, "classes_", []))
    if prediction in known_classes:
        column = known_classes.index(prediction)
    else:
        # No usable classes_: keep the previous label-as-index behaviour.
        column = prediction
    # NOTE: scikit-learn documents that SVC(probability=True) estimates can be
    # inconsistent with predict(), so this value may fall below 50%.
    prob = model.predict_proba(features)[0][column]

    label = "Plagiarised" if prediction == 1 else "Not Plagiarised"
    print(f"\n Prediction: {label} (Confidence: {prob * 100:.2f}%)")

Example Usage

In [7]:
if __name__ == "__main__":
    # End-to-end run of the pipeline: load data, build features, train and
    # evaluate the classifier, then classify one hand-written sentence pair.
    # The names bound here live at notebook/module scope.

    # Load and process the dataset
    # 'train_snli.txt' is a relative path, so it is resolved against the
    # current working directory; at most 10000 rows are sampled.
    df = load_and_prepare_data('train_snli.txt', sample_size=10000)

    # Extract features
    # X: absolute TF-IDF differences per pair; y: labels; the fitted
    # vectorizer is kept so new sentences can be transformed the same way.
    X, y, tfidf_vectorizer = extract_features(df)

    # Train and evaluate the model
    # Prints training/testing accuracy and a classification report, and
    # returns the fitted model.
    trained_model = train_and_evaluate(X, y)

    # Test with new sentence pair
    test_sentence1 = "The student wrote the assignment in her own words."
    test_sentence2 = "The assignment was completed in a unique way by the student."
    predict_pair(trained_model, tfidf_vectorizer, test_sentence1, test_sentence2)

 Training Accuracy: 86.20%
 Testing Accuracy : 68.35%

 Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.61      0.65       979
           1       0.67      0.75      0.71      1021

    accuracy                           0.68      2000
   macro avg       0.69      0.68      0.68      2000
weighted avg       0.69      0.68      0.68      2000


 Prediction: Plagiarised (Confidence: 67.49%)
