In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from itertools import combinations
from joblib import Parallel, delayed
import time
from tqdm import tqdm
from IPython.display import display, Markdown


In [2]:
df = pd.read_csv("nlp_project_train.csv")
df = df[['essay_id', 'full_text', 'score']]
df = df[~df['full_text'].str.contains("PROPER_NAME", na=False)].reset_index(drop=True)
df = df.head(1000)  # adjust as needed


In [3]:
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['full_text'])


In [4]:
def simple_word_overlap(text1, text2):
    words1 = set(text1.lower().split())
    words2 = set(text2.lower().split())
    return len(words1 & words2) / len(words1 | words2) if words1 and words2 else 0.0

def compare_pair(i, j):
    cosine_score = cosine_similarity(X[i], X[j])[0][0]
    if cosine_score >= 0.999:
        return None
    overlap_score = simple_word_overlap(df.iloc[i]['full_text'], df.iloc[j]['full_text'])
    return (cosine_score, overlap_score, i, j)


In [5]:
all_pairs = []
interval = 50000
start_time = time.time()
pairs = list(combinations(range(X.shape[0]), 2))

results = Parallel(n_jobs=-1, prefer='processes')(
    delayed(compare_pair)(i, j) for i, j in tqdm(pairs, desc="Comparing Pairs")
)

for count, result in enumerate(results):
    if result:
        all_pairs.append(result)
    if (count + 1) % interval == 0:
        elapsed = time.time() - start_time
        mins, secs = divmod(elapsed, 60)
        print(f"[Progress] Compared {count + 1:,} pairs — {int(mins)}m {int(secs)}s", flush=True)

end_time = time.time()
total_time = end_time - start_time
total_comparisons = len(pairs)
comparisons_per_sec = total_comparisons / total_time
avg_time_per_pair = total_time / total_comparisons

print("\n=== Comparison Summary ===")
print(f"Total Comparisons: {total_comparisons:,}")
print(f"Total Time: {total_time:.2f} seconds")
print(f"Comparisons per Second: {comparisons_per_sec:,.2f}")
print(f"Average Time per Comparison: {avg_time_per_pair * 1000:.4f} ms")


Comparing Pairs: 100%|██████████| 499500/499500 [00:41<00:00, 12051.08it/s]


[Progress] Compared 50,000 pairs — 0m 42s
[Progress] Compared 100,000 pairs — 0m 42s
[Progress] Compared 150,000 pairs — 0m 42s
[Progress] Compared 200,000 pairs — 0m 42s
[Progress] Compared 250,000 pairs — 0m 42s
[Progress] Compared 300,000 pairs — 0m 42s
[Progress] Compared 350,000 pairs — 0m 42s
[Progress] Compared 400,000 pairs — 0m 42s
[Progress] Compared 450,000 pairs — 0m 42s

=== Comparison Summary ===
Total Comparisons: 499,500
Total Time: 42.63 seconds
Comparisons per Second: 11,718.13
Average Time per Comparison: 0.0853 ms


In [6]:
top_5 = sorted(all_pairs, key=lambda x: x[0], reverse=True)[:5]
false_pos = sorted(all_pairs, key=lambda x: x[0] - x[1], reverse=True)[:2]
false_neg = sorted(all_pairs, key=lambda x: x[1] - x[0], reverse=True)[:2]


In [7]:
def display_result(title, pairs, df, label1='Cosine', label2='Overlap'):
    display(Markdown(f"## {title}"))
    for entry in pairs:
        if len(entry) != 4:
            display(Markdown("Skipping entry due to incorrect format."))
            continue
        sim, overlap, i, j = entry
        try:
            display(Markdown(f"**Essay {df.iloc[i]['essay_id']} vs {df.iloc[j]['essay_id']}**"))
            display(Markdown(f"- {label1}: `{sim:.4f}`"))
            display(Markdown(f"- {label2}: `{overlap:.4f}`"))
            display(Markdown(f"**Essay 1 Preview**: {df.iloc[i]['full_text'][:150].replace(chr(10), ' ')}"))
            display(Markdown(f"**Essay 2 Preview**: {df.iloc[j]['full_text'][:150].replace(chr(10), ' ')}"))
            display(Markdown("---"))
        except Exception as e:
            display(Markdown(f"Error displaying pair: {e}"))


In [8]:
display_result("Top 5 Most Similar Essays (TF-IDF)", top_5, df)
display_result("False Positives (High Cosine, Low Overlap)", false_pos, df)
display_result("False Negatives (High Overlap, Low Cosine)", false_neg, df, label1="Overlap", label2="Cosine")


## Top 5 Most Similar Essays (TF-IDF)

**Essay 051c46a vs 0870f09**

- Cosine: `0.7412`

- Overlap: `0.2353`

**Essay 1 Preview**: Dear senator,  I have done research and I would like to change the Electoral College to the popular vote for the President of the United States. Sou

**Essay 2 Preview**: Changing the election by popluar vote for the President of the United States would be a better option than keeping the Electoral College. Although the

---

**Essay 077377d vs 0c8f97b**

- Cosine: `0.7346`

- Overlap: `0.2197`

**Essay 1 Preview**: The Electoral College is a process, not a place. The meaning of Electoral College is stated in Source 1: What is Electoral College and in paragraph 2.

**Essay 2 Preview**: The Electoral College is not a place, but a process. It was established by the founding fathers in the Constitution. The college consists of electors,

---

**Essay 02d481d vs 0f81127**

- Cosine: `0.7324`

- Overlap: `0.2057`

**Essay 1 Preview**: Dear, Senator  The  Electoral college has been around for centuries and as time changes, things start to evolve and grow along with the time perio

**Essay 2 Preview**: It is often said that "change is good." This saying is one way to describe my feelings towards the Electoral College. The sources "What Is the Elector

---

**Essay 0bf39e6 vs 0c8f97b**

- Cosine: `0.7185`

- Overlap: `0.2768`

**Essay 1 Preview**: The electoral college is a process established in the constitution as a compromise between election of the president by a popular vote of qualified ci

**Essay 2 Preview**: The Electoral College is not a place, but a process. It was established by the founding fathers in the Constitution. The college consists of electors,

---

**Essay 0c61969 vs 0f7dda8**

- Cosine: `0.7043`

- Overlap: `0.2570`

**Essay 1 Preview**: I dissagree with the fact that the state wants to keep favor of electoral college. I think they should change the way the whole election is set up by 

**Essay 2 Preview**: I think that we dont need to keep the Electoral College because it is not a good way for people to vote.  I think how we should vote is by a majorit

---

## False Positives (High Cosine, Low Overlap)

**Essay 02d481d vs 0f81127**

- Cosine: `0.7324`

- Overlap: `0.2057`

**Essay 1 Preview**: Dear, Senator  The  Electoral college has been around for centuries and as time changes, things start to evolve and grow along with the time perio

**Essay 2 Preview**: It is often said that "change is good." This saying is one way to describe my feelings towards the Electoral College. The sources "What Is the Elector

---

**Essay 077377d vs 0c8f97b**

- Cosine: `0.7346`

- Overlap: `0.2197`

**Essay 1 Preview**: The Electoral College is a process, not a place. The meaning of Electoral College is stated in Source 1: What is Electoral College and in paragraph 2.

**Essay 2 Preview**: The Electoral College is not a place, but a process. It was established by the founding fathers in the Constitution. The college consists of electors,

---

## False Negatives (High Overlap, Low Cosine)

**Essay 04f1fc3 vs 09b7da8**

- Overlap: `0.0227`

- Cosine: `0.2166`

**Essay 1 Preview**: First,If i was a scientist at NASA talking to others about them truly believing the Face wasn't created by aliens. I would tell them aliens are not re

**Essay 2 Preview**: No they should not use the technology in class rooms because thats a invashion of pricicy. And they dont need to know every little thing going on in o

---

**Essay 05508b4 vs 0fd322d**

- Overlap: `0.0322`

- Cosine: `0.2217`

**Essay 1 Preview**: The author is very convinceing he didnt just say whats good nor did he say all what is bad.  From the text he made the evidce of how Venus has made 

**Essay 2 Preview**: So you believe that the face on Mars was alien made. There are very many people that belive it is alien made to. There are also many people that belie

---