In [94]:
from typing import List, Dict, Tuple, Set
from nltk.tokenize import word_tokenize
import numpy as np

In [95]:
def create_counts(text: List) -> Tuple[List[List[str]], Set[str], int, Dict[str, int]]:
    """Tokenize a corpus and build its vocabulary and word-to-column index.

    Every document is split with ``word_tokenize``; tokens that are not purely
    alphabetic (``str.isalpha``) are discarded and the remaining ones are
    lowercased.

    Args:
        text: Iterable of raw document strings.

    Returns:
        A 4-tuple ``(sentences, word_set, total_documents, index_dict)``:

        * ``sentences`` - one list of cleaned tokens per input document (a
          document with no alphabetic tokens yields an empty list).
        * ``word_set`` - the set of distinct cleaned tokens in the corpus.
        * ``total_documents`` - number of documents in ``text``.
        * ``index_dict`` - maps every word of ``word_set`` to a unique index
          in ``range(len(word_set))``, assigned in sorted word order.
    """
    sentences = [
        [token.lower() for token in word_tokenize(sent) if token.isalpha()]
        for sent in text
    ]
    word_set = {word for tokens in sentences for word in tokens}

    # Iteration order of a set of strings can differ between interpreter runs
    # (string hash randomization), so indices are assigned from the sorted
    # vocabulary to keep vector columns reproducible.
    index_dict = {word: position for position, word in enumerate(sorted(word_set))}

    return sentences, word_set, len(sentences), index_dict

In [96]:
def word_document_frequency(sentences: List, word_set: set) -> Dict[str, int]:
    """Count, for every vocabulary word, how many documents contain it.

    Args:
        sentences: Tokenized documents, one list of tokens per document.
        word_set: Vocabulary whose document frequencies are wanted.

    Returns:
        Dict mapping each word of ``word_set`` to the number of documents in
        ``sentences`` that contain it at least once. Words that occur in no
        document are present with a count of 0.
    """
    word_count = {word: 0 for word in word_set}

    for sent in sentences:
        # set(sent) collapses repeats, so a document contributes at most 1 to
        # each word; this also avoids rescanning every document per word.
        for word in set(sent):
            if word in word_count:
                word_count[word] += 1

    return word_count

In [97]:
def term_freq(word: str, document: List) -> float:
    """Return the relative frequency of ``word`` within ``document``.

    Args:
        word: Token to look for.
        document: List of tokens making up one document.

    Returns:
        Occurrences of ``word`` divided by the number of tokens in
        ``document``; 0.0 when ``document`` is empty.
    """
    # An empty document (e.g. one with no alphabetic tokens) has no terms;
    # report 0.0 instead of dividing by zero.
    if not document:
        return 0.0
    return document.count(word) / len(document)

In [98]:
def inversed_term_freq(word: str, n: int, wdf: Dict) -> float:
    """Return the inverse document frequency ``log(n / df + 1)`` of ``word``.

    Args:
        word: Token whose IDF is wanted.
        n: Total number of documents in the corpus.
        wdf: Mapping from word to the number of documents containing it.

    Returns:
        ``log(n / df + 1)`` where ``df`` is the document frequency of
        ``word``. A word that is missing from ``wdf`` (or stored with a count
        of 0) is treated as if it occurred in one document.
    """
    # Clamp to 1: a missing/zero document frequency would otherwise be used as
    # a divisor and raise ZeroDivisionError. Results for df >= 1 are unchanged.
    document_frequency = max(wdf.get(word, 0), 1)
    return np.log(n / document_frequency + 1)

In [99]:
def tf_idf(
        sentence: List,
        index_dict: Dict,
        total_documents: int,
        wdf: Dict,
        vector_shape: int
    ) -> np.ndarray:
    """Build the TF-IDF vector of a single tokenized document.

    Args:
        sentence: List of tokens of the document.
        index_dict: Maps each vocabulary word to its position in the vector.
        total_documents: Number of documents in the corpus.
        wdf: Maps each word to the number of documents containing it.
        vector_shape: Length of the returned vector.

    Returns:
        1-D float array of length ``vector_shape``. The entry at
        ``index_dict[word]`` holds ``term_freq * inversed_term_freq`` for every
        word of ``sentence`` found in ``index_dict``; all other entries are 0.
    """
    tf_idf_vec = np.zeros((vector_shape, ))

    # dict.fromkeys removes repeated tokens (keeping first-seen order) so each
    # distinct word is scored once; repeats would only rewrite the same value.
    for word in dict.fromkeys(sentence):
        position = index_dict.get(word)
        if position is None:
            # Word has no column in the vector (not in the vocabulary): skip it
            # rather than failing with KeyError.
            continue
        tf = term_freq(word, sentence)
        idf = inversed_term_freq(word, total_documents, wdf)
        tf_idf_vec[position] = tf * idf

    return tf_idf_vec

In [100]:
def tf_idf_vectors(text: List) -> np.ndarray:
    """Compute the TF-IDF matrix of a corpus of raw documents.

    Args:
        text: List of raw document strings.

    Returns:
        2-D float array of shape ``(number of documents, vocabulary size)``;
        row ``i`` is the TF-IDF vector of ``text[i]``. An empty corpus yields
        an array of shape ``(0, 0)``.
    """
    sentences, word_set, total_documents, index_dict = create_counts(text)
    wdf = word_document_frequency(sentences, word_set)
    vector_shape = len(word_set)

    vectors = [
        tf_idf(sent, index_dict, total_documents, wdf, vector_shape)
        for sent in sentences
    ]

    # The explicit reshape keeps the result 2-D even when there are no
    # documents (np.array([]) alone would be 1-D with shape (0,)).
    return np.array(vectors, dtype=float).reshape(len(sentences), vector_shape)

In [101]:
# Toy corpus: four short documents, one string each.
text = [
    "This is the first document this.",
    "This one is the second one.",
    "And this is the third.",
    "Is it the first document?",
]

# TF-IDF vectors for the corpus, one per document in the order of `text`.
vectors = tf_idf_vectors(text)