In [1]:
import numpy as np

def tokenize(text):
    """
    Turn a string into a list of lowercase tokens.

    The text is lowercased first and then broken apart on runs of
    whitespace; nothing else (e.g. punctuation) is stripped.
    """
    lowered = text.lower()
    tokens = lowered.split()
    return tokens


def build_vocab(tokenized_docs):
    """
    Collect the distinct terms that occur across all tokenized documents.

    Returns:
      - vocab: sorted list of unique terms
      - term_index: dict mapping each term -> its position in vocab
    """
    # Flatten every document into one set so repeated terms collapse.
    unique_terms = {term for doc in tokenized_docs for term in doc}

    # Sorting pins the column order independent of set iteration order.
    vocab = sorted(unique_terms)
    term_index = dict(zip(vocab, range(len(vocab))))
    return vocab, term_index


def compute_tf_idf(corpus):
    """
    Manually compute a TF-IDF matrix for a collection of raw text documents.

    Each document is tokenized with `tokenize`, the vocabulary is built with
    `build_vocab`, and the weights are computed as

        TF(d, t)     = count(d, t) / sum_t' count(d, t')
        IDF(t)       = 1 + log((1 + N) / (1 + DF(t)))
        TF-IDF(d, t) = TF(d, t) * IDF(t)

    where N is the number of documents and DF(t) is the number of documents
    containing term t. A document with no tokens yields an all-zero row.

    Args:
      - corpus: iterable of document strings

    Returns:
      - tf_idf: 2D numpy array of shape (n_docs, n_terms)
      - vocab: list of terms corresponding to columns of tf_idf
    """
    # 1. Tokenize each document
    tokenized_docs = [tokenize(doc) for doc in corpus]

    # 2. Build vocabulary and term -> index mapping
    vocab, term_index = build_vocab(tokenized_docs)

    # Count documents from the materialized list so that `corpus` may be any
    # iterable (e.g. a generator), not only something that supports len().
    n_docs = len(tokenized_docs)
    n_terms = len(vocab)

    # 3. Build raw term-count matrix (documents x terms)
    #    counts[d, t] = how many times term t appears in document d
    counts = np.zeros((n_docs, n_terms), dtype=np.float64)

    for d, doc in enumerate(tokenized_docs):
        for term in doc:
            counts[d, term_index[term]] += 1

    # 4. Compute Term Frequency (TF)
    #    TF(d, t) = count(d, t) / sum_t' count(d, t')
    doc_lengths = counts.sum(axis=1, keepdims=True)  # shape (n_docs, 1)

    # Skip the division for empty documents. `where=` alone leaves the
    # skipped positions uninitialized, so a zero-filled `out` buffer is
    # supplied to guarantee those rows are exactly 0 rather than garbage.
    tf = np.divide(
        counts,
        doc_lengths,
        out=np.zeros_like(counts),
        where=doc_lengths != 0,
    )

    # 5. Compute Document Frequency (DF) for each term
    #    DF(t) = number of documents where term t appears at least once
    df = np.count_nonzero(counts > 0, axis=0)  # shape (n_terms,)

    # 6. Compute Inverse Document Frequency (IDF)
    #    IDF(t) = 1 + log( (1 + N) / (1 + DF(t)) )
    #    Adding 1 to numerator and denominator avoids division by zero and
    #    log(0); the leading 1 keeps terms present in every document from
    #    being zeroed out.
    idf = 1.0 + np.log((1.0 + n_docs) / (1.0 + df))

    # 7. Combine TF and IDF
    #    TF-IDF(d, t) = TF(d, t) * IDF(t)
    #    Broadcasting: idf has shape (n_terms,), will be applied to each row.
    tf_idf = tf * idf

    return tf_idf, vocab


In [2]:
# Toy corpus: each string is one document.
corpus = [
    "this is a sample",
    "this is another example example",
    "one more sample example",
]

tf_idf_matrix, vocabulary = compute_tf_idf(corpus)

# Print each heading followed by its value on the next line.
print("Vocabulary (columns):", vocabulary, sep="\n")
print("\nTF-IDF matrix (rows = docs, cols = terms):", tf_idf_matrix, sep="\n")

Vocabulary (columns):
['a', 'another', 'example', 'is', 'more', 'one', 'sample', 'this']

TF-IDF matrix (rows = docs, cols = terms):
[[0.4232868  0.         0.         0.32192052 0.         0.
  0.32192052 0.32192052]
 [0.         0.33862944 0.51507283 0.25753641 0.         0.
  0.         0.25753641]
 [0.         0.         0.32192052 0.         0.4232868  0.4232868
  0.32192052 0.        ]]
