In [7]:
import pandas as pd

# Read the training essays from disk
train_path = r'nlp_project_train.csv'
df = pd.read_csv(train_path)

# Show the available columns and the first few rows
print(df.columns)
print(df.loc[:, ["essay_id", "full_text", "score"]].head())

# Drop every essay whose text contains the "PROPER_NAME" placeholder, then
# renumber the rows from 0 so label-based lookups such as df.loc[0, ...] work
df = df.loc[~df['full_text'].str.contains("PROPER_NAME", na=False)].reset_index(drop=True)


Index(['essay_id', 'full_text', 'score'], dtype='object')
  essay_id                                          full_text  score
0  000d118  Many people have car where they live. The thin...      3
1  000fe60  I am a scientist at NASA that is discussing th...      3
2  001ab80  People always wish they had the same technolog...      4
3  001bdc0  We all heard about Venus, the planet without a...      4
4  002ba53  Dear, State Senator\n\nThis is a letter to arg...      3


# Use this to preview an essay of choice

In [8]:
# Show the ID and the first 500 characters of the essay at index label 0
print(f"Essay ID: {df.loc[0, 'essay_id']}")
print(f"Essay Preview:\n {df.loc[0, 'full_text'][:500]}")


Essay ID: 000d118
Essay Preview:
 Many people have car where they live. The thing they don't know is that when you use a car alot of thing can happen like you can get in accidet or the smoke that the car has is bad to breath on if someone is walk but in VAUBAN,Germany they dont have that proble because 70 percent of vauban's families do not own cars,and 57 percent sold a car to move there. Street parkig ,driveways and home garages are forbidden on the outskirts of freiburd that near the French and Swiss borders. You probaly won'


## Method 1

In [9]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import heapq

In [16]:
# Read the training CSV and keep only the three columns used below
df = pd.read_csv(r'nlp_project_train.csv').loc[:, ['essay_id', 'full_text', 'score']]

# Discard essays whose text contains the "PROPER_NAME" placeholder
df = df.loc[~df['full_text'].str.contains("PROPER_NAME", na=False)]

# Uncomment the next line to cap the data at 1000 essays for quick testing.
#df = df.head(1000).reset_index(drop=True)

In [11]:
# Bag-of-words term counts with scikit-learn's built-in English stop-word list removed.
vectorizer = CountVectorizer(stop_words='english')
# Learn the vocabulary from the essay texts and count terms in one pass.
# Row order follows the positional order of df, which is why later cells
# look essays up with df.iloc rather than df.loc.
X = vectorizer.fit_transform(df['full_text'])  # Sparse matrix (num_essays x vocab_size)

In [12]:
# Cosine similarity between every pair of rows of X (dense result)
similarity_matrix = cosine_similarity(X, dense_output=True)

# Zero out each essay's similarity with itself so it is never selected
similarity_matrix[np.diag_indices_from(similarity_matrix)] = 0

In [13]:
# Min-heap of (similarity, row, col) triples; it never grows beyond five
# entries, so it always holds the five best pairs seen so far.
top_5_heap = []
n_essays = similarity_matrix.shape[0]

for i in range(n_essays):
    # Only look right of the diagonal so each unordered pair is visited once.
    for j, sim in enumerate(similarity_matrix[i, i + 1:], start=i + 1):
        if sim >= 0.999:  # Skip near-exact duplicates
            continue
        candidate = (sim, i, j)
        if len(top_5_heap) >= 5:
            # Heap is full: insert the candidate and evict the smallest entry.
            heapq.heappushpop(top_5_heap, candidate)
        else:
            heapq.heappush(top_5_heap, candidate)


In [14]:
# Order the retained pairs from most to least similar (stable sort on the score only)
top_5 = list(top_5_heap)
top_5.sort(key=lambda entry: entry[0], reverse=True)

for sim, i, j in top_5:
    # i and j are positional row numbers, hence iloc
    essay_a = df.iloc[i]
    essay_b = df.iloc[j]
    print(f"\nEssay {essay_a['essay_id']} vs Essay {essay_b['essay_id']}")
    print(f"Similarity Score: {sim:.4f}")
    print("Essay 1 Preview:", essay_a['full_text'][:150].replace('\n', ' '))
    print("Essay 2 Preview:", essay_b['full_text'][:150].replace('\n', ' '))


Essay 29aa983 vs Essay 6d25307
Similarity Score: 0.9219
Essay 1 Preview: A new hom  whould you send someone to explore venus with the technology that we haave  right now? the author of the article  the challege of exploring
Essay 2 Preview: Benefits of Researching a New planet  Whould you send someone to explore venus with even if we don't have the technology necessary?The author of the a

Essay 7f55753 vs Essay 9985008
Similarity Score: 0.8890
Essay 1 Preview: teacher asks:  What is an electoral college?  my answer:  The electoral college is a process, not a place...  The founding fathers etablished it in th
Essay 2 Preview: In the first source its explain what a Electoral College is which is not a place it's a process the finding fathers established it in the Constitution

Essay 4d0c575 vs Essay e026924
Similarity Score: 0.8873
Essay 1 Preview: Dear state senator i am writing to you because i would like to try a different way to select the president by using the popular. most states

### This part prints a full essay so that it can easily be pasted into an online text-comparison tool.

In [15]:
essay_id = '6d25307'  # <- target essay ID

# Select the row(s) whose ID equals the target
row = df.loc[df['essay_id'] == essay_id]

# Report an unknown ID; otherwise print the complete text of the first match
if row.empty:
    print(f"No essay found with ID {essay_id}")
else:
    print("Essay ID:", essay_id)
    print("\nFull Essay Text:\n")
    print(row.iloc[0]['full_text'])


Essay ID: 6d25307

Full Essay Text:

Benefits of Researching a New planet

Whould you send someone to explore venus with even if we don't have the technology necessary?The author of the article "The Challenge of Exploring Venus" has made some good point about why we should send if not but machines to explore Venus.The author believes that it would be beneficial for us to explor Venus using the technology that we have right now and the new technology that it has recently been invented for example:The mechamical computers,The new Idea that NASA has come up to and considering Venus as a moving planet.

First of all the author explains the old use of old technology called mechanical computer.This computers played an important role during the 1940s Word War two by making calculations using gears and levers, no electronics were needed for this computers. The author explains why the computer could be very useful to explore Venus by saying that this computer are very powerful,flexible, and qui

In [None]:
# This is a model made by ChatGPT to test the similarity of
# two provided essays. It is not meant to be used to find the
# actual scores for the methods needed for the project; it is just a
# secondary testing method to check my scores.
from sentence_transformers import SentenceTransformer, util
import pandas as pd

# Load your CSV
# NOTE: this rebinds `df` to the raw file contents; the "PROPER_NAME" filter
# applied in the earlier cells is not re-applied here.
df = pd.read_csv(r'nlp_project_train.csv')

# Load the transformer model once (globally)
# compare_essays_by_id reads this module-level `model` rather than taking it
# as an argument, so this line must run before the function is called.
model = SentenceTransformer('all-MiniLM-L6-v2')

def compare_essays_by_id(id1, id2, df, id_col="essay_id", text_col="full_text"):
    """Print the sentence-embedding cosine similarity of two essays.

    Uses the module-level ``model`` and ``util`` names, which must already be
    defined/imported when the function is called.

    Args:
        id1: ID of the first essay.
        id2: ID of the second essay.
        df: DataFrame holding the essays.
        id_col: Name of the column containing essay IDs. Defaults to
            ``"essay_id"``, the column name used by the training CSV.
        text_col: Name of the column containing essay text. Defaults to
            ``"full_text"``, the column name used by the training CSV.

    Returns:
        An error message string when either ID is not present in ``df``;
        otherwise ``None`` (the comparison is printed, not returned).
    """
    # Lookup essays by ID
    essay1_row = df[df[id_col] == id1]
    essay2_row = df[df[id_col] == id2]

    # Bail out before touching the model if either essay is missing
    if essay1_row.empty or essay2_row.empty:
        return f"One or both Essay IDs not found: {id1}, {id2}"

    # If an ID matches several rows, the first match is used
    essay1 = essay1_row[text_col].values[0]
    essay2 = essay2_row[text_col].values[0]

    # Encode both essays together, then compare the two embeddings
    embeddings = model.encode([essay1, essay2])
    score = util.cos_sim(embeddings[0], embeddings[1]).item()

    # Text up to the first period serves as a short preview of each essay
    first_sentence1 = essay1.strip().split('.')[0]
    first_sentence2 = essay2.strip().split('.')[0]

    # Output
    print(f"Essay {id1} vs Essay {id2}")
    print(f"Similarity Score: {score:.4f}")
    print(f"Essay 1: {first_sentence1}")
    print(f"Essay 2: {first_sentence2}")

# Cross-check the pair that Method 1 ranked first (word-count cosine similarity 0.9219)
compare_essays_by_id("29aa983", "6d25307", df)


^C
Note: you may need to restart the kernel to use updated packages.


ModuleNotFoundError: No module named 'sentence_transformers'