In [2]:
import pandas as pd

# Read the training essays from disk
train_path = r'train.csv'
df = pd.read_csv(train_path)

# Show the available columns and the first few rows of the key fields
print(df.columns)
preview_columns = ["essay_id", "full_text", "score"]
print(df[preview_columns].head())

# Drop essays whose text contains the "PROPER_NAME" placeholder
# (na=False treats missing text as "no placeholder", so those rows are kept),
# then renumber the rows 0..n-1 so label-based lookups stay contiguous.
has_placeholder = df['full_text'].str.contains("PROPER_NAME", na=False)
df = df[~has_placeholder].reset_index(drop=True)


Index(['essay_id', 'full_text', 'score'], dtype='object')
  essay_id                                          full_text  score
0  000d118  Many people have car where they live. The thin...      3
1  000fe60  I am a scientist at NASA that is discussing th...      3
2  001ab80  People always wish they had the same technolog...      4
3  001bdc0  We all heard about Venus, the planet without a...      4
4  002ba53  Dear, State Senator\r\n\r\nThis is a letter to...      3


# Use this to preview an essay of choice

In [3]:
# Fetch the row labelled 0 once, then show its id and the first 500 characters
selected_essay = df.loc[0]
print("Essay ID:", selected_essay['essay_id'])
print("Essay Preview:\n", selected_essay['full_text'][:500])


Essay ID: 000d118
Essay Preview:
 Many people have car where they live. The thing they don't know is that when you use a car alot of thing can happen like you can get in accidet or the smoke that the car has is bad to breath on if someone is walk but in VAUBAN,Germany they dont have that proble because 70 percent of vauban's families do not own cars,and 57 percent sold a car to move there. Street parkig ,driveways and home garages are forbidden on the outskirts of freiburd that near the French and Swiss borders. You probaly won'


## Method 1

In [4]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import heapq

In [5]:
# Re-read the training set, keeping only the columns this method uses
essay_columns = ['essay_id', 'full_text', 'score']
df = pd.read_csv(r'train.csv')[essay_columns]
# Drop essays whose text contains the "PROPER_NAME" placeholder
placeholder_mask = df['full_text'].str.contains("PROPER_NAME", na=False)
df = df[~placeholder_mask]
# Uncomment the next line to cap the run at 1000 essays for testing.
#df = df.head(1000).reset_index(drop=True)

In [6]:
# Bag-of-words counts, ignoring scikit-learn's built-in English stop words
vectorizer = CountVectorizer(stop_words='english')
essay_texts = df['full_text']
# Sparse matrix of shape (num_essays, vocab_size)
X = vectorizer.fit_transform(essay_texts)

In [7]:
# Cosine similarity between every pair of essay count vectors
similarity_matrix = cosine_similarity(X)

# Every essay is trivially identical to itself; zero the diagonal so a
# self-pair can never be selected as a top match.
np.fill_diagonal(similarity_matrix, val=0)

In [8]:
# Find the most similar essay pairs (i < j), skipping near-exact duplicates.
#
# top_5_heap is a min-heap of (similarity, i, j) tuples holding the best
# TOP_K pairs seen so far, where i and j are row positions in
# similarity_matrix.
#
# Instead of visiting every pair in a Python loop (quadratic in the number of
# essays), each row is pre-filtered with NumPy so that only the few entries
# that could still reach the overall top TOP_K are pushed onto the heap.
TOP_K = 5                    # number of pairs to keep
DUPLICATE_THRESHOLD = 0.999  # similarities at or above this are skipped

top_5_heap = []
num_essays = similarity_matrix.shape[0]

for i in range(num_essays - 1):
    # Only look right of the diagonal so each unordered pair is seen once.
    row_tail = similarity_matrix[i, i + 1:]

    # Offsets (relative to column i + 1) of pairs that are not near-duplicates.
    candidate_offsets = np.flatnonzero(row_tail < DUPLICATE_THRESHOLD)
    if candidate_offsets.size == 0:
        continue
    candidate_sims = row_tail[candidate_offsets]

    if candidate_sims.size > TOP_K:
        # A pair can only be in the overall top TOP_K if it is among the
        # TOP_K best of its own row. Keep everything tied with or above the
        # row's TOP_K-th largest score so ties are never dropped early.
        kth_largest = np.partition(candidate_sims, -TOP_K)[-TOP_K]
        keep = candidate_sims >= kth_largest
        candidate_offsets = candidate_offsets[keep]
        candidate_sims = candidate_sims[keep]

    for sim, offset in zip(candidate_sims, candidate_offsets):
        entry = (sim, i, i + 1 + int(offset))
        if len(top_5_heap) < TOP_K:
            heapq.heappush(top_5_heap, entry)
        else:
            # Push the new pair and evict the current smallest in one step.
            heapq.heappushpop(top_5_heap, entry)


In [9]:
def _single_line_preview(text, limit=150):
    """Return the first `limit` characters of `text` with line breaks replaced by spaces."""
    snippet = text[:limit]
    # The essays contain Windows-style "\r\n" line breaks. Replacing only "\n"
    # leaves a bare "\r" in the printed line, which moves the cursor back to
    # column 0 and overwrites the "Essay N Preview:" label.
    return snippet.replace('\r\n', ' ').replace('\r', ' ').replace('\n', ' ')


# Order the kept pairs from most to least similar
top_5 = sorted(top_5_heap, key=lambda x: x[0], reverse=True)

for sim, i, j in top_5:
    # i and j are row positions, so look the essays up positionally
    essay_1 = df.iloc[i]
    essay_2 = df.iloc[j]
    print(f"\nEssay {essay_1['essay_id']} vs Essay {essay_2['essay_id']}")
    print(f"Similarity Score: {sim:.4f}")
    print("Essay 1 Preview:", _single_line_preview(essay_1['full_text']))
    print("Essay 2 Preview:", _single_line_preview(essay_2['full_text']))


Essay 29aa983 vs Essay 6d25307
Similarity Score: 0.9219
 the challege of expor of the articleenus with the technology that we haave
 Whould you send someone to explore venus with even if we don't have the technology necessary?The author of the

Essay 7f55753 vs Essay 9985008
Similarity Score: 0.8890
 The founding fathers etablished ss, not a place...
Essay 2 Preview: In the first source its explain what a Electoral College is which is not a place it's a process the finding fathers established it in the Constitution

Essay 4d0c575 vs Essay e026924
Similarity Score: 0.8873
Essay 1 Preview: Dear state senator i am writing to you because i would like to try a different way to select the president by using the popular. most states have a "w
 What are the chances in favor of keeping the electoral college or changing to election by popular vote for the president of the unite

Essay 7cdf8b2 vs Essay 84a1b1a
Similarity Score: 0.8864
 Today I am going to write about how electoral colleges work.