In [28]:
%pip install numpy nltk

Note: you may need to restart the kernel to use updated packages.


In [5]:
import os
import re
import numpy as np
from nltk.stem import PorterStemmer
import math

# Load the stopwords (one per line).
# The encoding is given explicitly so the result does not depend on the
# platform's default locale; the corpus files are read as UTF-8 as well.
with open('stopwords.txt', 'r', encoding='utf-8') as f:
    # Strip surrounding whitespace and drop blank lines: tokens produced by
    # str.split() never contain whitespace, so an unstripped entry such as
    # "the " could never match anything.
    stop_words = {line.strip() for line in f if line.strip()}

# Shared stemmer instance used by process_document().
ps = PorterStemmer()

def process_document(text):
    """Turn raw document text into a list of stemmed tokens.

    Steps, in order: delete every run of digits, delete ASCII punctuation,
    split on whitespace, lowercase each word, discard words found in the
    module-level ``stop_words`` set, discard words of length 1 or less, and
    finally stem what remains with the module-level stemmer ``ps``.

    Args:
        text: The document contents as a single string.

    Returns:
        A list of stemmed tokens in their original order (duplicates kept).
    """
    # Punctuation is deleted rather than replaced by a space, so e.g.
    # "well-known" becomes the single token "wellknown".
    punctuation_table = str.maketrans('', '', '''!"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~''')
    cleaned = re.sub(r'\d+', '', text).translate(punctuation_table)

    stems = []
    for raw_word in cleaned.split():
        word = raw_word.lower()
        # The stopword check runs on the lowercased, unstemmed word.
        if word in stop_words:
            continue
        if len(word) <= 1:
            continue
        stems.append(ps.stem(word))
    return stems


In [3]:
# Document frequency: term -> number of documents that contain it at least once.
document_frequency = {}

# Iterate through all the .txt files in the dataset directory
for filename in os.listdir('./data'):
    if not filename.endswith('.txt'):
        continue
    filepath = os.path.join('./data', filename)
    with open(filepath, 'r', encoding='utf-8') as f:
        text = f.read()
    tokens = process_document(text)

    # Going through a set counts each term at most once per document.
    unique_tokens = set(tokens)
    for token in unique_tokens:
        document_frequency[token] = document_frequency.get(token, 0) + 1

# (term, df) pairs sorted by term in ascending order; the 1-based position in
# this list is the term's index in dictionary.txt.
sorted_terms = sorted(document_frequency.items(), key=lambda x: x[0])

# Save "<index>\t<term>\t<df>" per line.
# The documents are decoded as UTF-8, so terms may contain non-ASCII
# characters; writing with an explicit UTF-8 encoding avoids a
# UnicodeEncodeError on platforms whose default encoding cannot represent them.
with open('dictionary.txt', 'w', encoding='utf-8') as f:
    for index, (term, df) in enumerate(sorted_terms, start=1):
        f.write(f"{index}\t{term}\t{df}\n")

In [4]:
# Build a term index dictionary for easy lookup: term -> 1-based position in
# the alphabetically sorted term list.
term_index = {term: index for index, (term, df) in enumerate(sorted_terms, start=1)}

# The collection consists of the .txt files only. N is derived from that same
# filtered list so stray entries in ./data (checkpoint folders, hidden files,
# ...) cannot inflate the idf, and it is computed once instead of once per term.
doc_filenames = [name for name in os.listdir('./data') if name.endswith('.txt')]
N = len(doc_filenames)

# Make sure the destination exists before the first vector is written.
os.makedirs('./output', exist_ok=True)

# Now compute the tf-idf vectors for each document
for filename in doc_filenames:
    filepath = os.path.join('./data', filename)
    with open(filepath, 'r', encoding='utf-8') as f:
        text = f.read()
    tokens = process_document(text)

    # Compute term frequency (raw count of each token in this document)
    tf = {}
    for token in tokens:
        tf[token] = tf.get(token, 0) + 1

    # Dense vector with one slot per dictionary term
    tfidf_vector = np.zeros(len(term_index))

    for term, freq in tf.items():
        if term in term_index:
            tf_t = freq
            df_t = document_frequency[term]
            idf_t = math.log10(N / df_t)
            tfidf_t = tf_t * idf_t
            tfidf_vector[term_index[term] - 1] = tfidf_t  # -1 because indices start from 1

    # Normalize the tf-idf vector to unit length
    norm = np.linalg.norm(tfidf_vector)
    if norm > 0:
        tfidf_vector_unit = tfidf_vector / norm
    else:
        tfidf_vector_unit = tfidf_vector  # all-zero vector: avoid division by zero

    # Sparse representation: (1-based term index, weight) for positive weights only.
    # Terms that occur in every document have idf 0 and are dropped here.
    non_zero_entries = [(index + 1, tfidf) for index, tfidf in enumerate(tfidf_vector_unit) if tfidf > 0]

    # Save the tf-idf unit vector to a file
    doc_id = os.path.splitext(filename)[0]  # Assuming filename is 'DocID.txt'
    with open(f'./output/{doc_id}.txt', 'w') as f:
        f.write(f"{len(non_zero_entries)}\n")  # Header line: number of entries that follow
        for index, tfidf in non_zero_entries:
            # Weights are rounded to 3 decimal places on disk.
            f.write(f"{index}\t{tfidf:.3f}\n")


In [31]:
def cosine(docx, docy):
    """Return the cosine similarity of two documents' stored tf-idf vectors.

    Each document is read from ``./output/<doc_id>.txt``. The first line of
    the file is skipped (it is a header); every following non-blank line is
    parsed as ``<term index> <weight>`` separated by whitespace.

    The stored vectors are expected to be unit length already (they are
    normalized before being written, then rounded to 3 decimals), so the
    similarity is simply their dot product; no re-normalization is done here.

    The vectors are handled sparsely, so the function does not depend on any
    notebook-level state such as the size of the term dictionary.

    Args:
        docx: Document id (file name without ``.txt``) of the first document.
        docy: Document id of the second document.

    Returns:
        The dot product of the two vectors as a numpy float; 0.0 when the
        documents share no terms.

    Raises:
        FileNotFoundError: If either vector file does not exist.
        ValueError: If a non-blank data line is not ``<int> <float>``.
    """
    def load(doc_id):
        # Read one vector file into a {term index: weight} mapping.
        with open(f'./output/{doc_id}.txt', 'r') as f:
            lines = f.readlines()
        weights = {}
        for line in lines[1:]:  # lines[0] is the header line
            fields = line.split()
            if not fields:
                continue  # tolerate blank / trailing empty lines
            index, tfidf = fields
            weights[int(index)] = float(tfidf)
        return weights

    weights_x = load(docx)
    weights_y = load(docy)

    # Only indices present in both vectors contribute to the dot product.
    shared = sorted(weights_x.keys() & weights_y.keys())
    cosine_similarity = np.dot(
        [weights_x[index] for index in shared],
        [weights_y[index] for index in shared],
    )

    return cosine_similarity

# Example usage: compare the stored vectors of documents '1' and '2'
# (read by cosine() from ./output/1.txt and ./output/2.txt).
docx, docy = '1', '2'
similarity = cosine(docx, docy)
print(f'Cosine similarity between {docx} and {docy}: {similarity:.3f}')

Cosine similarity between 1 and 2: 0.195
