In [3]:
pip install faiss-cpu



In [4]:
import os
import logging
import pandas as pd
import re
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

In [5]:
# Configure logging: show INFO-and-above records as "<time> - <level> - <message>"
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

In [6]:
# Parameters
# Minimum cosine similarity at which a pair of documents is reported as plagiarism
SIMILARITY_THRESHOLD = 0.9
# Name of the SentenceTransformer model used to embed the documents
MODEL_NAME = 'sentence-transformers/all-mpnet-base-v2'

In [7]:
# Preprocess text by removing punctuation, numbers, and converting to lowercase
def preprocess_text(text):
    """Normalize a document string before it is embedded.

    Every character that is neither a word character nor whitespace is
    removed, then all digits are removed, and the result is lowercased and
    stripped of leading/trailing whitespace. Underscores count as word
    characters for ``\\w`` and are therefore kept.

    Args:
        text: The raw document text. Missing values (``None``, NaN, ``pd.NA``),
            such as those pandas produces for empty CSV cells, are treated as
            an empty document; any other non-string value is converted with
            ``str()`` first.

    Returns:
        The cleaned, lowercased string (``""`` for missing input).
    """
    if not isinstance(text, str):
        # re.sub raises TypeError on non-strings, so handle them up front
        # instead of letting one bad cell abort the caller.
        if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
            return ""
        text = str(text)

    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    return text.lower().strip()  # Convert to lowercase and strip whitespace

In [8]:
# Load the dataset from a CSV file
def load_dataset(file_path):
    """Read the documents CSV and return it with the text column preprocessed.

    The CSV must contain a 'filename' column and a 'text' column. Rows whose
    'text' cell is missing are skipped (with a warning) rather than causing
    the whole load to fail; the remaining text is converted to ``str`` and
    passed through ``preprocess_text``.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A DataFrame with the preprocessed 'text' column, or ``None`` if the
        file does not exist, a required column is missing, or reading or
        preprocessing raises. Every failure is logged at ERROR level.
    """
    if not os.path.exists(file_path):
        logging.error(f"Dataset file {file_path} does not exist.")
        return None
    try:
        dataset = pd.read_csv(file_path)
        # Check if the required columns exist
        if not {'filename', 'text'}.issubset(dataset.columns):
            logging.error("Dataset must have 'filename' and 'text' columns.")
            return None

        # Empty CSV cells are read as NaN; drop those rows instead of letting
        # a single one abort the whole load inside the regex preprocessing.
        has_text = dataset['text'].notna()
        missing_count = int((~has_text).sum())
        if missing_count:
            logging.warning(f"Skipping {missing_count} row(s) with missing text.")
            dataset = dataset[has_text].reset_index(drop=True)

        # astype(str) guards against columns pandas parsed as numeric
        dataset['text'] = dataset['text'].astype(str).apply(preprocess_text)
        return dataset
    except Exception as e:
        # Deliberate best-effort: callers treat None as "nothing to process"
        logging.error(f"Error loading dataset: {e}")
        return None

In [9]:
# Find plagiarism using FAISS for fast similarity search
def find_plagiarism_faiss(filenames, embeddings, threshold=None):
    """Return every pair of documents whose cosine similarity reaches the threshold.

    The embeddings are L2-normalized and added to a flat inner-product FAISS
    index, so the inner product returned by the search is the cosine
    similarity. Each document is searched against all documents, which is
    quadratic in the number of documents.

    Args:
        filenames: Document names; ``filenames[i]`` names row ``i`` of
            ``embeddings``.
        embeddings: 2-D array-like with one embedding per row. It is copied
            before normalization, so the caller's array is left unmodified.
        threshold: Minimum similarity for a pair to be reported. Defaults to
            the module-level ``SIMILARITY_THRESHOLD`` when ``None``.

    Returns:
        A set of ``(file_a, file_b, similarity)`` tuples in which ``file_a``
        comes before ``file_b`` in ``filenames`` and ``similarity`` is a
        Python float. The set is empty when fewer than two documents are given.
    """
    doc_count = len(filenames)
    if doc_count < 2:
        # Nothing to compare; also avoids indexing shape[1] of an empty array.
        return set()
    if threshold is None:
        threshold = SIMILARITY_THRESHOLD

    # FAISS needs C-contiguous float32 input and normalize_L2 works in place,
    # so operate on a private copy rather than on the caller's array.
    vectors = np.array(embeddings, dtype=np.float32, order='C')
    faiss.normalize_L2(vectors)  # Normalize embeddings for cosine similarity

    index = faiss.IndexFlatIP(vectors.shape[1])  # Inner-product index
    index.add(vectors)

    # Perform similarity search for all documents
    scores, indices = index.search(vectors, doc_count)

    # Keep only the (query, neighbor-slot) positions that reach the threshold
    hit_rows, hit_cols = np.nonzero(scores >= threshold)

    plagiarism_results = set()
    for i, col in zip(hit_rows.tolist(), hit_cols.tolist()):
        j = int(indices[i, col])
        # i < j reports each unordered pair once, skips self-matches, and
        # skips FAISS's -1 padding for missing neighbors.
        if i < j:
            plagiarism_results.add((filenames[i], filenames[j], float(scores[i, col])))
    return plagiarism_results

In [10]:
# Save plagiarism results to a CSV file
def save_results_to_csv(results, filename="plagiarism_results.csv"):
    """Write the detected pairs to a CSV file.

    Rows are written from the highest similarity to the lowest, with ties
    broken by the two file names, so the output is the same on every run even
    though ``results`` is usually an unordered set. Scores are rounded to two
    decimals.

    Args:
        results: Iterable of ``(file_a, file_b, similarity)`` tuples.
        filename: Destination path of the CSV file.
    """
    ordered = sorted(results, key=lambda row: (-row[2], row[0], row[1]))
    rows = [(file_a, file_b, round(float(score), 2)) for file_a, file_b, score in ordered]
    df = pd.DataFrame(rows, columns=["File A", "File B", "Similarity Score"])
    df.to_csv(filename, index=False)  # Save results to a CSV file
    logging.info(f"Results saved to {filename}.")

In [11]:
# Print plagiarism results in a readable format
def print_results(results):
    """Print one line per detected pair, highest similarity first.

    Args:
        results: Iterable of ``(file_a, file_b, similarity)`` tuples.
    """
    print("\nPlagiarism Results:")
    ranked = sorted(results, key=lambda x: x[2], reverse=True)
    for entry in ranked:
        first, second, score = entry
        print(f"{first} and {second} have a similarity score of {score:.2f}")

In [12]:
# Main function to run the plagiarism detection
def main(dataset_path="plagiarism_dataset.csv"):
    """Run the full pipeline: load, embed, compare, then print and save.

    Args:
        dataset_path: Path of the CSV file holding the 'filename' and 'text'
            columns. Defaults to "plagiarism_dataset.csv".

    Returns:
        None. Returns early without output if the dataset could not be
        loaded. Otherwise the results are printed and written with
        ``save_results_to_csv``'s default file name.
    """
    dataset = load_dataset(dataset_path)  # Load the dataset
    if dataset is None:
        return
    filenames = dataset['filename'].tolist()  # Get the list of filenames
    documents = dataset['text'].tolist()  # Get the list of document texts

    if len(documents) < 2:
        # No pair can exist, so skip loading the model and encoding.
        logging.warning("Need at least two documents to compare; found %d.", len(documents))
        plagiarism_results = set()
    else:
        model = SentenceTransformer(MODEL_NAME)  # Load the pre-trained model
        embeddings = model.encode(documents, convert_to_tensor=False)  # Encode documents into embeddings
        plagiarism_results = find_plagiarism_faiss(filenames, embeddings)  # Detect plagiarism

    print_results(plagiarism_results)  # Print the results
    save_results_to_csv(plagiarism_results)  # Save the results to a CSV file

# Run the pipeline when this code is executed directly (as a notebook cell or
# a script), but not when it is imported as a module.
if __name__ == "__main__":
    main()


Plagiarism Results:
doc1.txt and doc2.txt have a similarity score of 0.97
doc3.txt and doc4.txt have a similarity score of 0.94
