In [17]:
import pandas as pd

# Load the training data
train_path = r'nlp_project_train.csv'
df = pd.read_csv(train_path)

# Quick peek at the data
print(df.columns)
print(df[["essay_id","full_text","score"]].head())

#Remove placeholder essays like "PROPER_NAME"
df = df[~df['full_text'].str.contains("PROPER_NAME", na=False)]

# Reset index for safety
df.reset_index(drop=True, inplace=True)


Index(['essay_id', 'full_text', 'score'], dtype='object')
  essay_id                                          full_text  score
0  000d118  Many people have car where they live. The thin...      3
1  000fe60  I am a scientist at NASA that is discussing th...      3
2  001ab80  People always wish they had the same technolog...      4
3  001bdc0  We all heard about Venus, the planet without a...      4
4  002ba53  Dear, State Senator\r\n\r\nThis is a letter to...      3


# Use this to preview an essay of choice

In [18]:
print("Essay ID:", df.loc[0, 'essay_id'])
print("Essay Preview:\n", df.loc[0, 'full_text'][:500])


Essay ID: 000d118
Essay Preview:
 Many people have car where they live. The thing they don't know is that when you use a car alot of thing can happen like you can get in accidet or the smoke that the car has is bad to breath on if someone is walk but in VAUBAN,Germany they dont have that proble because 70 percent of vauban's families do not own cars,and 57 percent sold a car to move there. Street parkig ,driveways and home garages are forbidden on the outskirts of freiburd that near the French and Swiss borders. You probaly won'


## Method 1

In [29]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import time
from IPython.display import display, Markdown
from itertools import combinations
from joblib import Parallel, delayed
from tqdm import tqdm

In [30]:
# Load and filter the first 1000 essays
df = pd.read_csv("nlp_project_train.csv")
df = df[['essay_id', 'full_text', 'score']]
df = df[~df['full_text'].str.contains("PROPER_NAME", na=False)].reset_index(drop=True)
df = df.head(1200)


In [31]:
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['full_text'])


In [32]:
def simple_word_overlap(text1, text2):
    words1 = set(text1.lower().split())
    words2 = set(text2.lower().split())
    return len(words1 & words2) / len(words1 | words2) if words1 and words2 else 0.0

In [33]:
def compare_pair(i, j):
    cosine_score = cosine_similarity(X[i], X[j])[0][0]
    if cosine_score >= 0.999:
        return None
    overlap_score = simple_word_overlap(df.iloc[i]['full_text'], df.iloc[j]['full_text'])
    return (cosine_score, overlap_score, i, j)

In [34]:
from tqdm import tqdm

all_pairs = []
interval = 50000
start_time = time.time()

# Generate all unique pairs of essay indices
pairs = list(combinations(range(X.shape[0]), 2))

# Run comparisons in parallel with real-time progress
results = Parallel(n_jobs=-1, prefer='processes')(
    delayed(compare_pair)(i, j) for i, j in tqdm(pairs, desc="Comparing Pairs")
)

# Process results and track inline progress
for count, result in enumerate(results):
    if result:
        all_pairs.append(result)
    if (count + 1) % interval == 0:
        elapsed = time.time() - start_time
        mins, secs = divmod(elapsed, 60)
        print(f"[Progress] Compared {count + 1:,} pairs — {int(mins)}m {int(secs)}s", flush=True)

# Post-process timing summary
end_time = time.time()
total_time = end_time - start_time
total_comparisons = len(pairs)
comparisons_per_sec = total_comparisons / total_time
avg_time_per_pair = total_time / total_comparisons

# Display summary
print("\n=== Comparison Summary ===")
print(f"Total Comparisons: {total_comparisons:,}")
print(f"Total Time: {total_time:.2f} seconds")
print(f"Comparisons per Second: {comparisons_per_sec:,.2f}")
print(f"Average Time per Comparison: {avg_time_per_pair * 1000:.4f} ms")



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[Progress] Compared 50,000 pairs — 1m 40s
[Progress] Compared 100,000 pairs — 1m 40s
[Progress] Compared 150,000 pairs — 1m 40s
[Progress] Compared 200,000 pairs — 1m 40s
[Progress] Compared 250,000 pairs — 1m 40s
[Progress] Compared 300,000 pairs — 1m 40s
[Progress] Compared 350,000 pairs — 1m 40s
[Progress] Compared 400,000 pairs — 1m 40s
[Progress] Compared 450,000 pairs — 1m 40s
[Progress] Compared 500,000 pairs — 1m 40s
[Progress] Compared 550,000 pairs — 1m 40s
[Progress] Compared 600,000 pairs — 1m 40s
[Progress] Compared 650,000 pairs — 1m 40s
[Progress] Compared 700,000 pairs — 1m 40s

=== Comparison Summary ===
Total Comparisons: 719,400
Total Time: 100.47 seconds
Comparisons per Second: 7,160.00
Average Time per Comparison: 0.1397 ms


In [35]:
# Rank and extract
top_5 = sorted(all_pairs, key=lambda x: x[0], reverse=True)[:5]
false_pos = sorted(all_pairs, key=lambda x: x[0] - x[1], reverse=True)[:2]
false_neg = sorted(all_pairs, key=lambda x: x[1] - x[0], reverse=True)[:2]


In [None]:
def display_result(title, pairs, df, label1='Cosine', label2='Overlap'):
    display(Markdown(f"## {title}"))
    for entry in pairs:
        if len(entry) != 4:
            display(Markdown("Skipping entry due to incorrect format."))
            continue
        sim, overlap, i, j = entry
        try:
            display(Markdown(f"**Essay {df.iloc[i]['essay_id']} vs {df.iloc[j]['essay_id']}**"))
            display(Markdown(f"- {label1}: `{sim:.4f}`"))
            display(Markdown(f"- {label2}: `{overlap:.4f}`"))
            display(Markdown(f"**Essay 1 Preview**: {df.iloc[i]['full_text'][:150].replace(chr(10), ' ')}"))
            display(Markdown(f"**Essay 2 Preview**: {df.iloc[j]['full_text'][:150].replace(chr(10), ' ')}"))
            display(Markdown("---"))
        except Exception as e:
            display(Markdown(f"Error displaying pair: {e}"))


In [37]:
display_result("Top 5 Most Similar Essays", top_5, df)
display_result("False Positives (High Cosine, Low Overlap)", false_pos, df)
display_result("False Negatives (High Overlap, Low Cosine)", false_neg, df, label1='Overlap', label2='Cosine')


## Top 5 Most Similar Essays

**Essay 077377d vs 0c8f97b**

- Cosine: `0.8253`

- Overlap: `0.2197`

**Essay 1 Preview**: The Electoral College is a process, not a place. The meaning of Electoral College is stated in Source 1: What is Electoral College and in paragraph 2.

**Essay 2 Preview**: The Electoral College is not a place, but a process. It was established by the founding fathers in the Constitution. The college consists of electors,

---

**Essay 07a14a5 vs 0fbd5ba**

- Cosine: `0.8211`

- Overlap: `0.2388`

**Essay 1 Preview**: The author's claim of studying Venus is a worthy pursuit because Venus is closely related to Earth, Venus has a enviroment that is similar to Earth, a

**Essay 2 Preview**: Many look up into the sky and see Venus, one of the brightest points in the night sky. While it looks like a star, it is actually a planet. In fact, i

---

**Essay 0fb0f34 vs 10bc1b7**

- Cosine: `0.8124`

- Overlap: `0.4055`

**Essay 1 Preview**: This story is talking about venus in the challenge trying to exploring it its like its hard trying to exploring it because it got all these things wro

**Essay 2 Preview**: The challenge of Exploring Venus. Venus is the most like planet ou there Venus sometimes called the "Evening Star," is one of the brightest pointes of

---

**Essay 0425a6e vs 098d10a**

- Cosine: `0.8076`

- Overlap: `0.2267`

**Essay 1 Preview**: In this article the author is talking about going to Venus. The author explains the reasons why we should explore this planet. The author mentions tha

**Essay 2 Preview**: The planet Venus may be one of our most valuable assets to understand our planet, but Venus has some dangers we cannot ignore. In "The Challenge of Ex

---

**Essay 051c46a vs 0870f09**

- Cosine: `0.8044`

- Overlap: `0.2353`

**Essay 1 Preview**: Dear senator,  I have done research and I would like to change the Electoral College to the popular vote for the President of the United States. Sou

**Essay 2 Preview**: Changing the election by popluar vote for the President of the United States would be a better option than keeping the Electoral College. Although the

---

## False Positives (High Cosine, Low Overlap)

**Essay 05e1eae vs 065c240**

- Cosine: `0.7530`

- Overlap: `0.1121`

**Essay 1 Preview**: The Venus are not safe, there are the reasion why the NASA are challenging the Venus. The author idea was that studing the planet Venus are to know ev

**Essay 2 Preview**: Outer space is a mystery within itself, but the addition of the planets and stars are just an expansion to those mysteries. The article "The Challenge

---

**Essay 05e1eae vs 0fbd5ba**

- Cosine: `0.7237`

- Overlap: `0.1102`

**Essay 1 Preview**: The Venus are not safe, there are the reasion why the NASA are challenging the Venus. The author idea was that studing the planet Venus are to know ev

**Essay 2 Preview**: Many look up into the sky and see Venus, one of the brightest points in the night sky. While it looks like a star, it is actually a planet. In fact, i

---

## False Negatives (High Overlap, Low Cosine)

**Essay 00aa6de vs 068ad55**

- Overlap: `0.0322`

- Cosine: `0.2051`

**Essay 1 Preview**: This system could be very benificial in classrooms for many reasons. One reason being this system can literally tell what all of your emotions are an 

**Essay 2 Preview**: Changing to election by popular vote for the president is a wonderful idea. By doing this you persuade more people to go out and vote. The people of t

---

**Essay 0b095ea vs 10642eb**

- Overlap: `0.0136`

- Cosine: `0.1847`

**Essay 1 Preview**: He supports his idea with a lot of facts. he also uses a lot of deytails whitch is good because we get a lil bit more of an idea. i get what the athou

**Essay 2 Preview**: I know this wasn't created by aliens in many ways. It is just a naturally created landform that looks like face. If it was created by aliens then NASA

---

### The part below is a way of testing the Cosine similarity and the actual amount of word overlap between the top 5 pairs of essays. This helps give a look to see if the model is actually working well or not.

In [38]:
#This model was inspired by chatGPT and online resources to test the accuracy of the
# word by word model above. I will use this again later to continue
#Testing further models following the same parameters.
'''
def simple_word_overlap(text1, text2):
    words1 = set(text1.lower().split())
    words2 = set(text2.lower().split())
    if not words1 or not words2:
        return 0.0
    return len(words1 & words2) / len(words1 | words2)

threshold_cosine = 0.85
threshold_overlap = 0.3

for sim, i, j in top_5:
    essay1 = df.iloc[i]['full_text']
    essay2 = df.iloc[j]['full_text']
    overlap_score = simple_word_overlap(essay1, essay2)

    print(f"\nEssay {df.iloc[i]['essay_id']} vs Essay {df.iloc[j]['essay_id']}")
    print(f"Cosine Similarity: {sim:.4f}")
    print(f"Word Overlap Score: {overlap_score:.4f}")

    if sim >= threshold_cosine and overlap_score < threshold_overlap:
        print("Flagged as a potential FALSE POSITIVE (high cosine, low overlap)")
'''

'\ndef simple_word_overlap(text1, text2):\n    words1 = set(text1.lower().split())\n    words2 = set(text2.lower().split())\n    if not words1 or not words2:\n        return 0.0\n    return len(words1 & words2) / len(words1 | words2)\n\nthreshold_cosine = 0.85\nthreshold_overlap = 0.3\n\nfor sim, i, j in top_5:\n    essay1 = df.iloc[i][\'full_text\']\n    essay2 = df.iloc[j][\'full_text\']\n    overlap_score = simple_word_overlap(essay1, essay2)\n\n    print(f"\nEssay {df.iloc[i][\'essay_id\']} vs Essay {df.iloc[j][\'essay_id\']}")\n    print(f"Cosine Similarity: {sim:.4f}")\n    print(f"Word Overlap Score: {overlap_score:.4f}")\n\n    if sim >= threshold_cosine and overlap_score < threshold_overlap:\n        print("Flagged as a potential FALSE POSITIVE (high cosine, low overlap)")\n'