In [8]:
from nltk.tokenize import wordpunct_tokenize
from string import punctuation
from typing import List

from nltk.corpus import stopwords
# python -m nltk.downloader stopwords -> run this in your console once to get the stopwords

def preprocess_text(text: str) -> List[str]:
    # tokenize text
    tokens = wordpunct_tokenize(text.lower())

    # remove punctuation
    tokens = [t for t in tokens if t not in punctuation]

    # remove stopwords
    stop_words = stopwords.words("english")
    tokens = [t for t in tokens if t not in stop_words]

    return tokens

In [10]:
from collections import Counter
import math


def calculate_tf(word_counts, total_words):
    # Calculate Term Frequency (TF)
    tf = {}
    for word, count in word_counts.items():
        tf[word] = count / total_words
    return tf

def calculate_idf(word_counts, num_documents):
    # Calculate Inverse Document Frequency (IDF)
    idf = {}
    for word, count in word_counts.items():
        idf[word] = math.log((1 + num_documents) / (1 + count))
    return idf

def create_tf_idf(texts):
    # Count the frequency of each word in the corpus and total number of words
    word_counts = Counter()
    total_words = 0
    for text in texts:
        # Preprocess the text
        words = preprocess_text(text)
        
        # Update word counts and total number of words
        word_counts.update(words)
        total_words += len(words)
    
    # Create sorted vocabulary
    vocabulary = sorted(word_counts.keys())
    
    # Calculate TF-IDF for each document
    tf_idf_vectors = []
    num_documents = len(texts)
    for text in texts:
        # Preprocess the text
        words = preprocess_text(text)
        
        # Calculate TF for the document
        tf = calculate_tf(Counter(words), len(words))
        
        # Calculate IDF based on word counts across all documents
        idf = calculate_idf(word_counts, num_documents)
        
        # Calculate TF-IDF for the document
        tf_idf_vector = {}
        for word in vocabulary:
            tf_idf_vector[word] = round(tf.get(word, 0) * idf[word], 2)
        
        # Sort the IFIDF vector based on the vocabulary order
        sorted_tfidf_vector = [tf_idf_vector[word] for word in vocabulary]
        
        # Append the BoW vector to the list
        tf_idf_vectors.append(sorted_tfidf_vector)
    
    return vocabulary, tf_idf_vectors

# Example texts
texts = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Create TF-IDF vectors
vocabulary, tf_idf_vectors = create_tf_idf(texts)

# Print vocabulary
print("Vocabulary:")
print(vocabulary)

# Print TF-IDF vectors
print("\nTF-IDF Vectors:")
for i, tf_idf_vector in enumerate(tf_idf_vectors):
    print(f"Document {i + 1}: {tf_idf_vector}")


Vocabulary:
['document', 'first', 'one', 'second', 'third']

TF-IDF Vectors:
Document 1: [0.0, 0.26, 0.0, 0.0, 0.0]
Document 2: [0.0, 0.0, 0.0, 0.31, 0.0]
Document 3: [0.0, 0.0, 0.46, 0.0, 0.46]
Document 4: [0.0, 0.26, 0.0, 0.0, 0.0]


**Task:** Find some documents and apply this to it.  

**Instructions:**

- Find the closest matching documents.

TODO: Finalize this!