In [2]:
import os
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# One-pass scanner. Alternatives are tried in this order at every position:
#   block   - triple-quoted text ('''...''' or """..."""), may span lines
#   string  - ordinary single-line string literal (escapes allowed)
#   comment - '#' up to the end of the line
# Matching string literals explicitly is what lets a '#' inside a string
# survive instead of being mistaken for the start of a comment.
_CODE_TOKEN_RE = re.compile(
    r"(?P<block>'''.*?'''|\"\"\".*?\"\"\")"
    r"|(?P<string>'(?:\\.|[^'\\\n])*'|\"(?:\\.|[^\"\\\n])*\")"
    r"|(?P<comment>#[^\n]*)",
    flags=re.DOTALL,
)


def preprocess_code(code):
    """Strip Python-style comments and import lines from source text.

    Removed:
      * triple-quoted blocks (single- or double-quoted), including multi-line ones;
      * '#' comments, from the '#' to the end of that line;
      * lines that start (after optional whitespace) with 'import ' or 'from '.

    Ordinary string literals are kept verbatim, so a '#' inside a string
    (for example a URL fragment) no longer truncates the line.

    Only Python comment syntax is recognised; '//' and '/* */' comments are
    left untouched.

    Args:
        code: Source text to clean.

    Returns:
        The cleaned text. Line breaks that followed removed comments or
        imports are kept.
    """
    # Keep ordinary string literals as-is; drop triple-quoted blocks and comments.
    code = _CODE_TOKEN_RE.sub(lambda match: match.group('string') or '', code)

    # Remove import statements
    code = re.sub(r'^\s*(import|from)\s+[^\n]+', '', code, flags=re.MULTILINE)

    return code

# Collect every '.txt' submission stored in the 'code_data' directory.
# os.listdir gives no ordering guarantee, so sort the names to keep the
# file list (and everything derived from it) stable between runs.
student_files = sorted(doc for doc in os.listdir('code_data') if doc.endswith('.txt'))

# Read and preprocess each submission; the 'with' block closes every file
# handle as soon as its contents have been read.
student_code = []
for _file in student_files:
    with open(os.path.join('code_data', _file), encoding='utf-8', errors='ignore') as handle:
        student_code.append(preprocess_code(handle.read()))

def vectorize(Text):
    """Turn a collection of documents into dense TF-IDF vectors.

    A TfidfVectorizer is fitted on ``Text`` with a token pattern that accepts
    any run of word characters, so single-character tokens such as ``i`` or
    ``x`` are counted too.

    Args:
        Text: Iterable of document strings.

    Returns:
        The fitted TF-IDF matrix converted to a dense array, one row per
        document.
    """
    tfidf = TfidfVectorizer(token_pattern=r'\b\w+\b')
    term_matrix = tfidf.fit_transform(Text)
    return term_matrix.toarray()

def similarity(doc1, doc2):
    """Return the pairwise cosine-similarity matrix of two vectors.

    Both vectors are stacked into one two-row input, so the result is a
    2x2 matrix; the score between ``doc1`` and ``doc2`` sits at ``[0][1]``.
    """
    stacked = [doc1, doc2]
    return cosine_similarity(stacked)

# Accumulator for flagged (file_a, file_b, score) tuples
plagiarism_results = set()

# One TF-IDF vector per submission, paired with the file it came from
vectors = vectorize(student_code)
s_vectors = [(file_name, vector) for file_name, vector in zip(student_files, vectors)]

def check_plagiarism(threshold=0.5):
    """Flag pairs of submissions whose cosine similarity exceeds ``threshold``.

    Reads the module-level ``s_vectors`` list of ``(file_name, vector)``
    tuples and compares every unordered pair exactly once.

    Args:
        threshold: Minimum similarity (exclusive) for a pair to be reported.
            Defaults to 0.5.

    Returns:
        The module-level ``plagiarism_results`` set. It is emptied at the
        start of each call and refilled with ``(name_a, name_b, score)``
        tuples, where the two names are in sorted order.
    """
    # Start clean so repeated calls (e.g. with another threshold) never mix
    # results from an earlier run.
    plagiarism_results.clear()

    for index, (student_a, text_vector_a) in enumerate(s_vectors):
        # Only look at entries after the current one: each unordered pair is
        # scored once instead of twice.
        for student_b, text_vector_b in s_vectors[index + 1:]:
            if student_a == student_b:
                continue
            sim_score = similarity(text_vector_a, text_vector_b)[0][1]
            if sim_score > threshold:
                first, second = sorted((student_a, student_b))
                plagiarism_results.add((first, second, sim_score))
    return plagiarism_results

# Run the check and print every flagged pair (score above the default 0.5).
# The results come back as a set, whose iteration order is not stable, so
# sort them: highest similarity first, ties broken by file names.
for file_a, file_b, score in sorted(check_plagiarism(), key=lambda row: (-row[2], row[0], row[1])):
    print(f'{file_a} vs {file_b}: Similarity Score: {score:.2f}')


bottomup.txt vs helper functions.txt: Similarity Score: 0.67
Basic merge implementation.txt vs helper functions.txt: Similarity Score: 0.57


In [6]:
# Sanity check: run preprocess_code on a JavaScript merge sort.
# preprocess_code only recognises Python-style comments ('#' and
# triple-quoted blocks), so the '// Example usage' line below is expected
# to pass through unchanged.
preprocess_code('''function mergeSort(arr) {
    if (arr.length <= 1) {
        return arr;
    }

    const mid = Math.floor(arr.length / 2);
    const left = mergeSort(arr.slice(0, mid));
    const right = mergeSort(arr.slice(mid));

    return merge(left, right);
}

function merge(left, right) {
    let result = [];
    let leftIndex = 0;
    let rightIndex = 0;

    while (leftIndex < left.length && rightIndex < right.length) {
        if (left[leftIndex] < right[rightIndex]) {
            result.push(left[leftIndex]);
            leftIndex++;
        } else {
            result.push(right[rightIndex]);
            rightIndex++;
        }
    }

    return result.concat(left.slice(leftIndex)).concat(right.slice(rightIndex));
}

// Example usage
console.log(mergeSort([34, 7, 23, 32, 5, 62]));
''')


'function mergeSort(arr) {\n    if (arr.length <= 1) {\n        return arr;\n    }\n\n    const mid = Math.floor(arr.length / 2);\n    const left = mergeSort(arr.slice(0, mid));\n    const right = mergeSort(arr.slice(mid));\n\n    return merge(left, right);\n}\n\nfunction merge(left, right) {\n    let result = [];\n    let leftIndex = 0;\n    let rightIndex = 0;\n\n    while (leftIndex < left.length && rightIndex < right.length) {\n        if (left[leftIndex] < right[rightIndex]) {\n            result.push(left[leftIndex]);\n            leftIndex++;\n        } else {\n            result.push(right[rightIndex]);\n            rightIndex++;\n        }\n    }\n\n    return result.concat(left.slice(leftIndex)).concat(right.slice(rightIndex));\n}\n\n// Example usage\nconsole.log(mergeSort([34, 7, 23, 32, 5, 62]));\n'