In [1]:
!pip install -U google-cloud-storage




In [2]:
from google.colab import auth
auth.authenticate_user()


In [7]:
from google.colab import auth
from google.cloud import storage

auth.authenticate_user()

project_id = 'essay-scoring-455902'
bucket_name = 'nlp-eassay-bucket-2025'

client = storage.Client(project=project_id)
bucket = client.get_bucket(bucket_name)

blob = bucket.blob('nlp_project_train.csv')
blob.download_to_filename('nlp_project_train.csv')  # This saves the file locally in Colab


In [8]:
import pandas as pd

# Load the training data
train_path = r'nlp_project_train.csv'
df = pd.read_csv(train_path)

# Quick peek at the data
print(df.columns)
print(df[["essay_id","full_text","score"]].head())

#Remove placeholder essays like "PROPER_NAME"
df = df[~df['full_text'].str.contains("PROPER_NAME", na=False)]

# Reset index for safety
df.reset_index(drop=True, inplace=True)


Index(['essay_id', 'full_text', 'score'], dtype='object')
  essay_id                                          full_text  score
0  000d118  Many people have car where they live. The thin...      3
1  000fe60  I am a scientist at NASA that is discussing th...      3
2  001ab80  People always wish they had the same technolog...      4
3  001bdc0  We all heard about Venus, the planet without a...      4
4  002ba53  Dear, State Senator\r\n\r\nThis is a letter to...      3


# Use this to preview an essay of choice

In [9]:
print("Essay ID:", df.loc[0, 'essay_id'])
print("Essay Preview:\n", df.loc[0, 'full_text'][:500])


Essay ID: 000d118
Essay Preview:
 Many people have car where they live. The thing they don't know is that when you use a car alot of thing can happen like you can get in accidet or the smoke that the car has is bad to breath on if someone is walk but in VAUBAN,Germany they dont have that proble because 70 percent of vauban's families do not own cars,and 57 percent sold a car to move there. Street parkig ,driveways and home garages are forbidden on the outskirts of freiburd that near the French and Swiss borders. You probaly won'


## Method 1

In [16]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import heapq
import time

In [17]:
# Load the training data
df = pd.read_csv(r'nlp_project_train.csv').head(5000)
df = df[['essay_id', 'full_text', 'score']]
df = df[~df['full_text'].str.contains("PROPER_NAME", na=False)]


In [18]:
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['full_text'])  # Sparse matrix (num_essays x vocab_size)

In [19]:
similarity_matrix = cosine_similarity(X)

# Set diagonal (self-similarity) to 0 so we don't pick them
np.fill_diagonal(similarity_matrix, 0)

In [20]:
# Simple word overlap function
def simple_word_overlap(text1, text2):
    words1 = set(text1.lower().split())
    words2 = set(text2.lower().split())
    if not words1 or not words2:
        return 0.0
    return len(words1 & words2) / len(words1 | words2)

In [21]:
# Initialize heaps
top_5_heap = []
false_positive_heap = []
false_negative_heap = []
count = 0
interval = 1000
start_time = time.time()

# Pairwise comparison loop
for i in range(X.shape[0]):
    for j in range(i + 1, X.shape[0]):
        cosine_score = cosine_similarity(X[i], X[j])[0][0]
        count += 1

        if count % interval == 0:
            elapsed_time = time.time() - start_time
            mins, secs = divmod(elapsed_time, 60)
            print(f"Processed {count} pairs - elsaped time {mins}:{secs}")

        if cosine_score >= 0.999:
            continue  # Skip near-duplicates

        # Top 5 by cosine similarity
        if len(top_5_heap) < 5:
            heapq.heappush(top_5_heap, (cosine_score, i, j))
        else:
            heapq.heappushpop(top_5_heap, (cosine_score, i, j))

        # Real-time validation: compute overlap
        overlap_score = simple_word_overlap(df.iloc[i]['full_text'], df.iloc[j]['full_text'])

        # False Positive: high cosine, low overlap
        if cosine_score >= 0.85 and overlap_score < 0.3:
            heapq.heappush(false_positive_heap, (cosine_score, i, j, overlap_score))
            false_positive_heap = sorted(false_positive_heap, reverse=True)[:2]

        # False Negative: low cosine, high overlap
        if cosine_score < 0.4 and overlap_score >= 0.6:
            heapq.heappush(false_negative_heap, (overlap_score, i, j, cosine_score))
            false_negative_heap = sorted(false_negative_heap, reverse=True)[:2]

KeyboardInterrupt: 

In [None]:
# Display results
print("TOP 5 SIMILAR ESSAY PAIRS")
top_5 = sorted(top_5_heap, key=lambda x: x[0], reverse=True)
for sim, i, j in top_5:
    print(f"\nEssay {df.iloc[i]['essay_id']} vs Essay {df.iloc[j]['essay_id']}")
    print(f"Cosine Similarity: {sim:.4f}")
    print("Essay 1 Preview:", df.iloc[i]['full_text'][:150].replace('\n', ' '))
    print("Essay 2 Preview:", df.iloc[j]['full_text'][:150].replace('\n', ' '))

print("\nPOTENTIAL FALSE POSITIVES")
for sim, i, j, overlap in false_positive_heap:
    print(f"\nEssay {df.iloc[i]['essay_id']} vs Essay {df.iloc[j]['essay_id']}")
    print(f"Cosine: {sim:.4f}, Overlap: {overlap:.4f}")
    print("Essay 1 Preview:", df.iloc[i]['full_text'][:150].replace('\n', ' '))
    print("Essay 2 Preview:", df.iloc[j]['full_text'][:150].replace('\n', ' '))

print("\nPOTENTIAL FALSE NEGATIVES")
for overlap, i, j, cosine in false_negative_heap:
    print(f"\nEssay {df.iloc[i]['essay_id']} vs Essay {df.iloc[j]['essay_id']}")
    print(f"Overlap: {overlap:.4f}, Cosine: {cosine:.4f}")
    print("Essay 1 Preview:", df.iloc[i]['full_text'][:150].replace('\n', ' '))
    print("Essay 2 Preview:", df.iloc[j]['full_text'][:150].replace('\n', ' '))

TOP 5 SIMILAR ESSAY PAIRS

Essay 0036253 vs Essay e35f6ff
Cosine Similarity: 0.7500
Essay 1 Preview: The challenge of exploring Venus  This storie is about the challeng of exploring Venus. The auhor talked how venus is closest planet in earth. The aut
Essay 2 Preview: The author does not support his idea well that studiying Venus is a worthy pursuit despite the dangers. The author talks about people being facinated 

Essay 0036253 vs Essay 0c0463c
Cosine Similarity: 0.7353
Essay 1 Preview: The challenge of exploring Venus  This storie is about the challeng of exploring Venus. The auhor talked how venus is closest planet in earth. The aut
Essay 2 Preview: Studying Venus seems almost impossible with all the risks, but the author of "The Challenge of Exploring Venus" suggests that Venus is a worthy pursui

Essay 0036253 vs Essay 1284ac4
Cosine Similarity: 0.7326
Essay 1 Preview: The challenge of exploring Venus  This storie is about the challeng of exploring Venus. The auhor talked how ve

### The part below is a way of testing the Cosine similarity and the actual amount of word overlap between the top 5 pairs of essays. This helps give a look to see if the model is actually working well or not.

In [None]:
#This model was inspired by chatGPT and online resources to test the accuracy of the
# word by word model above. I will use this again later to continue
#Testing further models following the same parameters.
'''
def simple_word_overlap(text1, text2):
    words1 = set(text1.lower().split())
    words2 = set(text2.lower().split())
    if not words1 or not words2:
        return 0.0
    return len(words1 & words2) / len(words1 | words2)

threshold_cosine = 0.85
threshold_overlap = 0.3

for sim, i, j in top_5:
    essay1 = df.iloc[i]['full_text']
    essay2 = df.iloc[j]['full_text']
    overlap_score = simple_word_overlap(essay1, essay2)

    print(f"\nEssay {df.iloc[i]['essay_id']} vs Essay {df.iloc[j]['essay_id']}")
    print(f"Cosine Similarity: {sim:.4f}")
    print(f"Word Overlap Score: {overlap_score:.4f}")

    if sim >= threshold_cosine and overlap_score < threshold_overlap:
        print("Flagged as a potential FALSE POSITIVE (high cosine, low overlap)")
'''


Essay 29139ff vs Essay 6b51c4c
Cosine Similarity: 0.9219
Word Overlap Score: 0.1319
Flagged as a potential FALSE POSITIVE (high cosine, low overlap)

Essay 7d34dd4 vs Essay 9706cf0
Cosine Similarity: 0.8890
Word Overlap Score: 0.0866
Flagged as a potential FALSE POSITIVE (high cosine, low overlap)

Essay 4b79d77 vs Essay dc225ea
Cosine Similarity: 0.8873
Word Overlap Score: 0.1399
Flagged as a potential FALSE POSITIVE (high cosine, low overlap)

Essay 7b69af5 vs Essay 8291f47
Cosine Similarity: 0.8864
Word Overlap Score: 0.1788
Flagged as a potential FALSE POSITIVE (high cosine, low overlap)

Essay 2805912 vs Essay 8291f47
Cosine Similarity: 0.8773
Word Overlap Score: 0.1000
Flagged as a potential FALSE POSITIVE (high cosine, low overlap)
