In [20]:
import pandas as pd

# Read the training essays from disk
train_path = r'train.csv'
df = pd.read_csv(train_path)

# Show which columns are available, then the first few rows of the key ones
print(df.columns)
print(df.loc[:, ["essay_id","full_text","score"]].head())

# Drop every essay whose text contains the "PROPER_NAME" token, then renumber
# the remaining rows 0..n-1 so label-based lookups (df.loc[0, ...]) stay valid
df = df.loc[~df['full_text'].str.contains("PROPER_NAME", na=False)].reset_index(drop=True)


Index(['essay_id', 'full_text', 'score'], dtype='object')
  essay_id                                          full_text  score
0  000d118  Many people have car where they live. The thin...      3
1  000fe60  I am a scientist at NASA that is discussing th...      3
2  001ab80  People always wish they had the same technolog...      4
3  001bdc0  We all heard about Venus, the planet without a...      4
4  002ba53  Dear, State Senator\n\nThis is a letter to arg...      3


# Use the cell below to preview an essay of your choice

In [21]:
# Fetch the row labelled 0 once, then show its id and the first 500 characters of its text
preview_row = df.loc[0]
print("Essay ID:", preview_row['essay_id'])
print("Essay Preview:\n", preview_row['full_text'][:500])


Essay ID: 000d118
Essay Preview:
 Many people have car where they live. The thing they don't know is that when you use a car alot of thing can happen like you can get in accidet or the smoke that the car has is bad to breath on if someone is walk but in VAUBAN,Germany they dont have that proble because 70 percent of vauban's families do not own cars,and 57 percent sold a car to move there. Street parkig ,driveways and home garages are forbidden on the outskirts of freiburd that near the French and Swiss borders. You probaly won'


## Method 1: Bag-of-Words Cosine Similarity

In [22]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import heapq

In [None]:
# Re-read the training set, keeping only the id, text and score columns
df = pd.read_csv(r'train.csv').loc[:, ['essay_id', 'full_text', 'score']]

# Discard any essay whose text contains the "PROPER_NAME" token
has_placeholder = df['full_text'].str.contains("PROPER_NAME", na=False)
df = df.loc[~has_placeholder]

# Uncomment the next line to cap the run at the first 1000 essays for quick testing.
#df = df.head(1000).reset_index(drop=True)

In [24]:
# Build bag-of-words count features from the essay texts, ignoring
# scikit-learn's built-in English stop-word list.
vectorizer = CountVectorizer(stop_words='english')
# Learn the vocabulary from every essay and encode each essay as one row of term counts.
X = vectorizer.fit_transform(df['full_text'])  # Sparse matrix (num_essays x vocab_size)

KeyboardInterrupt: 

In [None]:
# Cosine similarity between every pair of essay vectors (rows of X).
# NOTE(review): the result appears to be a dense (num_essays x num_essays) array,
# so memory grows quadratically with the number of essays — confirm it fits in RAM
# before running on the full dataset.
similarity_matrix = cosine_similarity(X)

# Set diagonal (self-similarity) to 0 so we don't pick them.
# The pair search that follows only visits j > i, so this is a safety net
# rather than a requirement.
np.fill_diagonal(similarity_matrix, 0)

In [None]:
# Find the 5 most similar essay pairs (i < j), excluding near-exact duplicates.
#
# top_5_heap is a min-heap of (similarity, i, j) tuples, so top_5_heap[0] is
# always the weakest pair currently kept. Pairs are offered to the heap in the
# same (i, j) order as a plain nested loop over the upper triangle, but each
# row is pre-filtered with one vectorised comparison so the Python-level loop
# only touches entries that can actually change the heap. The resulting heap
# is identical to the one the element-by-element scan would build.
top_5_heap = []
n_essays = similarity_matrix.shape[0]

for i in range(n_essays - 1):
    # Similarities between essay i and every later essay; restricting to
    # columns > i visits each unordered pair exactly once.
    row = similarity_matrix[i, i + 1:]

    # Skip near-exact duplicates (similarity >= 0.999).
    keep = ~(row >= 0.999)

    if len(top_5_heap) == 5:
        # Once the heap is full, heappushpop leaves it untouched for any entry
        # that scores below the current minimum, so drop those up front.
        # ">=" (not ">") matters: on an equal score the later (i, j) pair
        # compares larger as a tuple and does replace the heap root.
        keep &= row >= top_5_heap[0][0]

    for offset in np.flatnonzero(keep):
        j = i + 1 + int(offset)  # column index back in the full matrix
        sim = row[offset]
        if len(top_5_heap) < 5:
            heapq.heappush(top_5_heap, (sim, i, j))
        else:
            heapq.heappushpop(top_5_heap, (sim, i, j))


In [None]:
# Order the collected pairs from most to least similar
top_5 = sorted(top_5_heap, key=lambda pair: pair[0], reverse=True)

# Report each pair: both essay ids, the score, and a one-line preview of each text
for sim, i, j in top_5:
    first_essay = df.iloc[i]
    second_essay = df.iloc[j]
    print(f"\nEssay {first_essay['essay_id']} vs Essay {second_essay['essay_id']}")
    print(f"Similarity Score: {sim:.4f}")
    # Previews are the first 150 characters with newlines flattened to spaces
    print("Essay 1 Preview:", first_essay['full_text'][:150].replace('\n', ' '))
    print("Essay 2 Preview:", second_essay['full_text'][:150].replace('\n', ' '))


Essay 077377d vs Essay 0c8f97b
Similarity Score: 0.8253
Essay 1 Preview: The Electoral College is a process, not a place. The meaning of Electoral College is stated in Source 1: What is Electoral College and in paragraph 2.
Essay 2 Preview: The Electoral College is not a place, but a process. It was established by the founding fathers in the Constitution. The college consists of electors,

Essay 07a14a5 vs Essay 0fbd5ba
Similarity Score: 0.8211
Essay 1 Preview: The author's claim of studying Venus is a worthy pursuit because Venus is closely related to Earth, Venus has a enviroment that is similar to Earth, a
Essay 2 Preview: Many look up into the sky and see Venus, one of the brightest points in the night sky. While it looks like a star, it is actually a planet. In fact, i

Essay 0425a6e vs Essay 098d10a
Similarity Score: 0.8076
Essay 1 Preview: In this article the author is talking about going to Venus. The author explains the reasons why we should explore this planet. The author me