In [19]:
from nltk.tokenize import word_tokenize
import numpy as np

In [20]:
sample_text = ['''Topic sentences are similar to mini thesis statements. Like a thesis statement', 'a topic sentence has a specific main point. Whereas the thesis is the main point of the essay, the topic sentence is the main point of the paragraph.               Like the thesis statement, a topic sentence has a unifying function. 
But a thesis statement or topic sentence alone doesn’t guarantee unity.', 'An essay is unified if all the paragraphs relate to the thesis,'whereas a paragraph is unified if all the sentences relate to the topic sentence.''']

In [21]:
sentences = []
word_set = []

for sent in sample_text:
    words = [word.lower() for word in word_tokenize(sent) if word.isalpha()]
    sentences.append(words)
    for word in words:
        if word not in word_set:
            word_set.append(word)
# Set of words
word_set = set(word_set)
# total documents in our corpus
total_docs = len(sample_text)
print('Total documents: ', total_docs)
print('Total words: ', len(word_set))

Total documents:  1
Total words:  36


In [22]:
word_index = {}
for i, word in enumerate(word_set):
    word_index[word] = i

In [23]:
def count_dict(sentences):
    count_dict = {}
    for word in word_set:
        count_dict[word] = 0
    for sent in sentences:
        for word in sent:
            count_dict[word] += 1
    return count_dict
word_count = count_dict(sentences)
print(word_count)

{'sentences': 2, 'essay': 2, 't': 1, 'doesn': 1, 'relate': 2, 'are': 1, 'mini': 1, 'a': 7, 'if': 2, 'to': 3, 'topic': 6, 'alone': 1, 'guarantee': 1, 'paragraph': 2, 'paragraphs': 1, 'main': 3, 'statements': 1, 'or': 1, 'has': 2, 'thesis': 6, 'point': 3, 'all': 2, 'function': 1, 'the': 11, 'specific': 1, 'unified': 2, 'similar': 1, 'sentence': 5, 'whereas': 1, 'like': 2, 'statement': 3, 'unity': 1, 'unifying': 1, 'is': 4, 'but': 1, 'of': 2}


In [24]:
def term_frequency(document, word):
    N = len(document)
    occurance = len([token for token in document if token == word])
    return occurance / N

In [25]:
def inverse_document_frequency(word):
    try:
        word_occurance = word_count[word] + 1
    except:
        word_occurance = 1
    return np.log(total_docs / word_occurance)

In [26]:
def tf_idf(sentence):
    vec = np.zeros((len(word_set),))
    for word in sentence:
        tf = term_frequency(sentence, word)
        idf = inverse_document_frequency(word)
        vec[word_index[word]] = tf * idf
    return vec

In [27]:
vectors = []
for sent in sentences:
    vectors.append(tf_idf(sent))

print(vectors)

[array([-0.02525545, -0.02525545, -0.00796721, -0.00796721, -0.02525545,
       -0.00796721, -0.00796721, -0.16731139, -0.02525545, -0.04780325,
       -0.1342007 , -0.00796721, -0.00796721, -0.02525545, -0.00796721,
       -0.04780325, -0.00796721, -0.00796721, -0.02525545, -0.1342007 ,
       -0.04780325, -0.02525545, -0.00796721, -0.3141836 , -0.00796721,
       -0.02525545, -0.00796721, -0.10297468, -0.00796721, -0.02525545,
       -0.04780325, -0.00796721, -0.00796721, -0.07399715, -0.00796721,
       -0.02525545])]
