# Importing Libraries

In [75]:
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Preprocessing Data

In [76]:
_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return NLTK's English stop-word list as a frozenset, loading it once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        # Loaded lazily so importing this module does not touch the corpus,
        # and cached so it is not rebuilt for every document.
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


def preprocess_text(text):
    """Normalize raw text into a space-separated string of content tokens.

    Steps, in order: lowercase, delete ASCII punctuation, delete digit runs,
    strip surrounding whitespace, tokenize with NLTK's ``word_tokenize``,
    and drop English stop words.

    Args:
        text: The raw document text.

    Returns:
        The remaining tokens joined by single spaces.
    """
    text = text.lower()
    # Punctuation is deleted, not replaced by a space, so words separated
    # only by punctuation are joined (e.g. "well-known" -> "wellknown").
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)  # Remove every run of digits
    text = text.strip()
    tokens = word_tokenize(text)  # Break the text into small pieces (tokens)
    stop_words = _english_stop_words()
    return ' '.join(word for word in tokens if word not in stop_words)

# Load Files

In [77]:
def load_files(directory):
    """Read and preprocess every ``.txt`` file directly inside *directory*.

    Only regular files whose names end in ``.txt`` are considered; a
    sub-directory that happens to be named ``something.txt`` is skipped
    instead of being passed to ``open``. Names are sorted case-insensitively
    so the result does not depend on the arbitrary order of ``os.listdir``.

    Args:
        directory: Path of the folder to scan (not recursive).

    Returns:
        A ``(files, contents)`` tuple of two parallel lists: the file names
        and the corresponding text after ``preprocess_text``.

    Raises:
        OSError: If the directory cannot be listed or a file cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    files = sorted(
        (
            name
            for name in os.listdir(directory)
            if name.endswith('.txt')
            and os.path.isfile(os.path.join(directory, name))
        ),
        # Case-insensitive order; the raw name breaks ties deterministically.
        key=lambda name: (name.casefold(), name),
    )
    contents = []
    for name in files:
        with open(os.path.join(directory, name), 'r', encoding='utf-8') as f:
            contents.append(preprocess_text(f.read()))
    return files, contents

# Computing cosine similarity

In [78]:
def compute_similarity(contents):
    """Compute pairwise TF-IDF cosine similarity between documents.

    Args:
        contents: Iterable of preprocessed document strings.

    Returns:
        A square NumPy array whose ``[i, j]`` entry is the cosine similarity
        of documents ``i`` and ``j`` as a percentage rounded to two
        decimals. With no documents, an empty ``(0, 0)`` array is returned.
    """
    contents = list(contents)
    if not contents:
        # TfidfVectorizer cannot be fitted on zero documents; return an
        # empty matrix so callers get an empty result instead of an error.
        return np.empty((0, 0))

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(contents)
    similarity_matrix = cosine_similarity(tfidf_matrix)
    # Convert the similarity scores to percentages, rounded to 2 decimals.
    return np.round(similarity_matrix * 100, 2)

In [79]:
def generate_results(files, similarity_matrix):
    """Tabulate the similarity score of every unordered pair of documents.

    Args:
        files: Sequence of document names, in the same order as the rows
            and columns of *similarity_matrix*.
        similarity_matrix: 2-D array indexed as ``[i, j]`` holding the score
            between document ``i`` and document ``j``.

    Returns:
        A DataFrame with columns ``Document A``, ``Document B`` and
        ``Similarity Score``, one row per pair ``(i, j)`` with ``i < j``.
    """
    count = len(files)
    # Walk the strict upper triangle so each pair appears exactly once.
    rows = [
        (files[a], files[b], similarity_matrix[a, b])
        for a in range(count)
        for b in range(a + 1, count)
    ]
    column_names = ['Document A', 'Document B', 'Similarity Score']
    return pd.DataFrame(rows, columns=column_names)

# Checking similarity score for all text files in the directory

In [80]:
def main(directory):
    """Run the whole pipeline on the text files found in *directory*.

    Loads and preprocesses the files, computes their pairwise similarity,
    and returns the pairwise scores as the table built by
    ``generate_results``.
    """
    names, texts = load_files(directory)
    scores = compute_similarity(texts)
    return generate_results(names, scores)

# Script entry point: compare every .txt file in the configured folder and
# print the table of pairwise similarity scores.
if __name__ == "__main__":
    directory = 'C:\\Mukta\\PlagarismChecker_project'  # Replace with your directory
    results = main(directory)
    print(results)

   Document A      Document B  Similarity Score
0    doc1.txt        doc2.txt             42.16
1    doc1.txt        doc3.txt             34.02
2    doc1.txt        doc4.txt             33.38
3    doc1.txt        doc5.txt             85.80
4    doc1.txt  What's new.txt              5.90
5    doc2.txt        doc3.txt             18.41
6    doc2.txt        doc4.txt             17.96
7    doc2.txt        doc5.txt             36.17
8    doc2.txt  What's new.txt              4.11
9    doc3.txt        doc4.txt              9.56
10   doc3.txt        doc5.txt             35.72
11   doc3.txt  What's new.txt              5.43
12   doc4.txt        doc5.txt             28.64
13   doc4.txt  What's new.txt              1.93
14   doc5.txt  What's new.txt              7.12
