In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from itertools import combinations
from joblib import Parallel, delayed
import time
from tqdm import tqdm
from IPython.display import display, Markdown


In [None]:
df = pd.read_csv("nlp_project_train.csv")
df = df[['essay_id', 'full_text', 'score']]
df = df[~df['full_text'].str.contains("PROPER_NAME", na=False)].reset_index(drop=True)
df = df.head(1500)  # adjust as needed


In [3]:
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['full_text'])


In [4]:
def simple_word_overlap(text1, text2):
    words1 = set(text1.lower().split())
    words2 = set(text2.lower().split())
    return len(words1 & words2) / len(words1 | words2) if words1 and words2 else 0.0

def compare_pair(i, j):
    cosine_score = cosine_similarity(X[i], X[j])[0][0]
    if cosine_score >= 0.999:
        return None
    overlap_score = simple_word_overlap(df.iloc[i]['full_text'], df.iloc[j]['full_text'])
    return (cosine_score, overlap_score, i, j)


In [5]:
all_pairs = []
interval = 50000
start_time = time.time()
pairs = list(combinations(range(X.shape[0]), 2))

results = Parallel(n_jobs=-1, prefer='processes')(
    delayed(compare_pair)(i, j) for i, j in tqdm(pairs, desc="Comparing Pairs")
)

for count, result in enumerate(results):
    if result:
        all_pairs.append(result)
    if (count + 1) % interval == 0:
        elapsed = time.time() - start_time
        mins, secs = divmod(elapsed, 60)
        print(f"[Progress] Compared {count + 1:,} pairs — {int(mins)}m {int(secs)}s", flush=True)

end_time = time.time()
total_time = end_time - start_time
total_comparisons = len(pairs)
comparisons_per_sec = total_comparisons / total_time
avg_time_per_pair = total_time / total_comparisons

print("\n=== Comparison Summary ===")
print(f"Total Comparisons: {total_comparisons:,}")
print(f"Total Time: {total_time:.2f} seconds")
print(f"Comparisons per Second: {comparisons_per_sec:,.2f}")
print(f"Average Time per Comparison: {avg_time_per_pair * 1000:.4f} ms")


Comparing Pairs:   0%|          | 4592/12497500 [01:20<60:30:54, 57.35it/s]

KeyboardInterrupt: 

In [None]:
top_5 = sorted(all_pairs, key=lambda x: x[0], reverse=True)[:5]
false_pos = sorted(all_pairs, key=lambda x: x[0] - x[1], reverse=True)[:2]
false_neg = sorted(all_pairs, key=lambda x: x[1] - x[0], reverse=True)[:2]


In [None]:
def display_result(title, pairs, df, label1='Cosine', label2='Overlap'):
    display(Markdown(f"## {title}"))
    for entry in pairs:
        if len(entry) != 4:
            display(Markdown("⚠️ Skipping entry due to incorrect format."))
            continue
        sim, overlap, i, j = entry
        try:
            display(Markdown(f"**Essay {df.iloc[i]['essay_id']} vs {df.iloc[j]['essay_id']}**"))
            display(Markdown(f"- {label1}: `{sim:.4f}`"))
            display(Markdown(f"- {label2}: `{overlap:.4f}`"))
            display(Markdown(f"**Essay 1 Preview**: {df.iloc[i]['full_text'][:150].replace(chr(10), ' ')}"))
            display(Markdown(f"**Essay 2 Preview**: {df.iloc[j]['full_text'][:150].replace(chr(10), ' ')}"))
            display(Markdown("---"))
        except Exception as e:
            display(Markdown(f"⚠️ Error displaying pair: {e}"))


In [None]:
display_result("Top 5 Most Similar Essays (TF-IDF)", top_5, df)
display_result("False Positives (High Cosine, Low Overlap)", false_pos, df)
display_result("False Negatives (High Overlap, Low Cosine)", false_neg, df, label1="Overlap", label2="Cosine")


## Top 5 Most Similar Essays (TF-IDF)

**Essay 0bf39e6 vs 175419b**

- Cosine: `0.7739`

- Overlap: `0.3221`

**Essay 1 Preview**: The electoral college is a process established in the constitution as a compromise between election of the president by a popular vote of qualified ci

**Essay 2 Preview**: Today, there are many arguments between keeping our governments way of electing a new president. Many people have turned their back on the Electoral C

---

**Essay 051c46a vs 0870f09**

- Cosine: `0.7421`

- Overlap: `0.2353`

**Essay 1 Preview**: Dear senator,  I have done research and I would like to change the Electoral College to the popular vote for the President of the United States. Sou

**Essay 2 Preview**: Changing the election by popluar vote for the President of the United States would be a better option than keeping the Electoral College. Although the

---

**Essay 077377d vs 0c8f97b**

- Cosine: `0.7337`

- Overlap: `0.2197`

**Essay 1 Preview**: The Electoral College is a process, not a place. The meaning of Electoral College is stated in Source 1: What is Electoral College and in paragraph 2.

**Essay 2 Preview**: The Electoral College is not a place, but a process. It was established by the founding fathers in the Constitution. The college consists of electors,

---

**Essay 02d481d vs 0f81127**

- Cosine: `0.7311`

- Overlap: `0.2057`

**Essay 1 Preview**: Dear, Senator  The  Electoral college has been around for centuries and as time changes, things start to evolve and grow along with the time perio

**Essay 2 Preview**: It is often said that "change is good." This saying is one way to describe my feelings towards the Electoral College. The sources "What Is the Elector

---

**Essay 0c8f97b vs 175419b**

- Cosine: `0.7233`

- Overlap: `0.2521`

**Essay 1 Preview**: The Electoral College is not a place, but a process. It was established by the founding fathers in the Constitution. The college consists of electors,

**Essay 2 Preview**: Today, there are many arguments between keeping our governments way of electing a new president. Many people have turned their back on the Electoral C

---

## False Positives (High Cosine, Low Overlap)

**Essay 111b844 vs 11bc37a**

- Cosine: `0.6801`

- Overlap: `0.1081`

**Essay 1 Preview**: More companies are pouring in more and more money trying to make the perfect driveless cars but pursuing the driveless car is a negative thing for car

**Essay 2 Preview**: Driverless cars can have a positive aspect on society. Driveless cars can improve the risk of injuring someone . Also driveless cars can improve the r

---

**Essay 11bc37a vs 17a36e4**

- Cosine: `0.6763`

- Overlap: `0.1351`

**Essay 1 Preview**: Driverless cars can have a positive aspect on society. Driveless cars can improve the risk of injuring someone . Also driveless cars can improve the r

**Essay 2 Preview**: In the article "Driveless Cars Are Coming," the author presents both postivie and negative aspects of driveless cars. The author uses many good argume

---

## False Negatives (High Overlap, Low Cosine)

**Essay 0a7ef89 vs 1747469**

- Overlap: `0.0151`

- Cosine: `0.2244`

**Essay 1 Preview**: Being a Seagoing Cowboy sounds like fun. You should be one because you would be able to go places you've never been to before, see things you've never

**Essay 2 Preview**: The electoral college been around for a long time. Some of the states don't like it, because most states doesn't even see the campaign. Sometimes peop

---

**Essay 12a4ba5 vs 1534fbf**

- Overlap: `0.0134`

- Cosine: `0.2083`

**Essay 1 Preview**: this is my stance for am i agaist or for using this tecnology to read students' emotional expressions. my anserw is yes we should supoort it. for futh

**Essay 2 Preview**: Driverless cars are and should be the future. We wouldn't have to worry about that one guy driving red lights and aciddently smashing into another per

---