In [6]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
import time
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

In [7]:
# Preprocess text requirements: fetch the NLTK data packages that the
# preprocessing step relies on (the English stopword list and the WordNet
# data used by the lemmatizer). The calls are kept as two plain statements so
# the value of the last one remains the cell's displayed output.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lukec\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lukec\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
# Seed NumPy's global random state so any NumPy-based randomness later in the
# notebook is reproducible. The value is named so other cells can reuse it.
SEED = 42
np.random.seed(SEED)

In [9]:
# Read the training CSV and pull out the two columns used by the notebook:
# the raw essay text and its score (scores 1-6).
data = pd.read_csv('nlp_project_train.csv')
essays, scores = data['full_text'], data['score']

In [10]:
# Lazily built NLTK resources shared by every preprocess_text call, so the
# stopword set and lemmatizer are constructed once instead of once per essay.
_PREPROCESS_CACHE = {}


def _default_stop_words():
    """Return the English NLTK stopword set, building it on first use only."""
    if 'stop_words' not in _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['stop_words'] = frozenset(stopwords.words('english'))
    return _PREPROCESS_CACHE['stop_words']


def _default_lemmatizer():
    """Return a shared WordNetLemmatizer, creating it on first use only."""
    if 'lemmatizer' not in _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_CACHE['lemmatizer']


def preprocess_text(text, stop_words=None, lemmatizer=None):
    """Turn one raw essay string into a flat list of cleaned tokens.

    Steps, in order: lowercase the text, delete every character that is not an
    ASCII letter or whitespace, split on whitespace, drop stopwords, and
    lemmatize the remaining words.

    Parameters
    ----------
    text : str
        The raw essay text.
    stop_words : container of str, optional
        Words to discard (tested with ``in``). Defaults to NLTK's English
        stopword list.
    lemmatizer : object with a ``lemmatize(word)`` method, optional
        Defaults to a shared ``WordNetLemmatizer``.

    Returns
    -------
    list of str
        The tokens of this single essay. Applying the function to every essay
        yields the list of token lists that ``word_overlap`` is called with.
    """
    if stop_words is None:
        stop_words = _default_stop_words()
    if lemmatizer is None:
        lemmatizer = _default_lemmatizer()

    # Characters are deleted (not replaced by a space), so "don't" -> "dont".
    cleaned = re.sub(r'[^a-zA-Z\s]', '', text.lower())

    # Stopwords are filtered before lemmatization, matching the original order.
    return [lemmatizer.lemmatize(w) for w in cleaned.split() if w not in stop_words]
# One token list per essay, in the same order as `essays`.
processed_essays = list(map(preprocess_text, essays))

# Method 1: Pure Word Frequency

In [None]:
from collections import defaultdict

def word_overlap(essay1, essay2):
    """Return the word-frequency overlap between two tokenized essays.

    The score is ``2 * shared / (len1 + len2)``, where ``shared`` is the sum,
    over words present in both essays, of the smaller of the two occurrence
    counts, and ``len1`` / ``len2`` are the total token counts. The factor 2
    scales the result to the range 0.0-1.0 (identical essays score 1.0)
    instead of 0.0-0.5.

    Parameters
    ----------
    essay1, essay2 : iterable of hashable
        The tokens of each essay (e.g. lists of words).

    Returns
    -------
    float
        Overlap in [0.0, 1.0]; 0.0 when both essays are empty.
    """
    counts1 = Counter(essay1)
    counts2 = Counter(essay2)

    # Counter intersection keeps, per word, the minimum of the two counts.
    shared = sum((counts1 & counts2).values())

    # Totals come from the counters so one-shot iterables are handled too.
    total_words = sum(counts1.values()) + sum(counts2.values())

    if total_words == 0:
        # Both essays are empty: avoid dividing by zero.
        return 0.0
    return (2 * shared) / total_words


In [None]:
# Test of functionality: score the first two essays and time the call.
# perf_counter() is used instead of time() because it is a high-resolution
# clock intended for measuring short durations.
start_time = time.perf_counter()
similarity_score = word_overlap(processed_essays[0], processed_essays[1])
end_time = time.perf_counter()
print(f"Time taken for word overlap similarity: {end_time - start_time:.4f} seconds")
print(f"Word overlap similarity: {100 * similarity_score:.2f}%")

Time taken for word overlap similarity: 0.0050 seconds
Word overlap similarity: 9.60%


In [None]:
# Pairwise similarity matrix. Only the strict upper triangle is filled; the
# diagonal and the lower triangle keep their initial zeros.
num_essays = len(processed_essays)
similarity_matrix = np.zeros(shape=(num_essays, num_essays))

for row, left_essay in enumerate(processed_essays):
    # Visit only the columns to the right of the diagonal (col > row).
    for col in range(row + 1, num_essays):
        similarity_matrix[row, col] = word_overlap(left_essay, processed_essays[col])
# pd.DataFrame(similarity_matrix).to_csv('similarity_matrix.csv', index=False)