In [67]:
# for log values
import math

In [42]:
# Toy corpus: every string in this list is treated as one document ("story").
stories = [
    "The cat sat on the mat. The cat is fluffy.",
    "A dog barked at the mailman. The dog loves to play fetch.",
    "Birds sing in the morning. The cat watches the birds."
]

In [45]:
def calculate_tf(story):
    """Return the term frequency (raw occurrence count) of each word in a story.

    The text is lower-cased and every '.' is removed before it is split on
    whitespace, so 'The' and 'the' share one entry. Only full stops are
    stripped; any other punctuation stays attached to its word.

    Args:
        story: the text of a single story.

    Returns:
        dict mapping each word to the number of times it occurs, with keys
        in order of first appearance.
    """
    # normalise once up front so casing / full stops never produce separate keys
    tokens = story.lower().replace('.', '').split()

    counts = {}
    for token in tokens:
        # .get() yields 0 the first time a token is seen
        counts[token] = counts.get(token, 0) + 1

    return counts

In [46]:
# quick check: word counts for the first story only
calculate_tf(stories[0])

{'the': 3, 'cat': 2, 'sat': 1, 'on': 1, 'mat': 1, 'is': 1, 'fluffy': 1}

In [105]:
def calculate_idf(stories):
    """Compute the inverse document frequency (IDF) of every word in the corpus.

    For each word:  idf = ln(number of stories / number of stories containing the word)

    A word that occurs in every story therefore scores 0.0, and the fewer
    stories a word occurs in, the higher its score. Stories are tokenised
    by lower-casing, removing '.', and splitting on whitespace.

    Args:
        stories: list of story strings.

    Returns:
        dict mapping each distinct word to its IDF (natural log). An empty
        list yields an empty dict.
    """
    n_stories = len(stories)

    # document frequency: in how many stories does each word show up at least once?
    doc_freq = {}
    for text in stories:
        # set() collapses repeats, so a story contributes at most 1 per word
        for token in set(text.lower().replace('.', '').split()):
            doc_freq[token] = doc_freq.get(token, 0) + 1

    # every stored count is >= 1, so the division below is always defined
    return {token: math.log(n_stories / seen_in) for token, seen_in in doc_freq.items()}

In [106]:
# IDF of every distinct word across the whole corpus
calculate_idf(stories)

{'the': 0.0,
 'on': 1.0986122886681098,
 'fluffy': 1.0986122886681098,
 'is': 1.0986122886681098,
 'sat': 1.0986122886681098,
 'cat': 0.4054651081081644,
 'mat': 1.0986122886681098,
 'fetch': 1.0986122886681098,
 'barked': 1.0986122886681098,
 'play': 1.0986122886681098,
 'mailman': 1.0986122886681098,
 'to': 1.0986122886681098,
 'at': 1.0986122886681098,
 'dog': 1.0986122886681098,
 'a': 1.0986122886681098,
 'loves': 1.0986122886681098,
 'in': 1.0986122886681098,
 'birds': 1.0986122886681098,
 'sing': 1.0986122886681098,
 'watches': 1.0986122886681098,
 'morning': 1.0986122886681098}

In [101]:
def calculate_tfidf(stories):
    """Compute a TF-IDF score for every word of every story.

    The score of a word in a story is its raw count in that story (from
    calculate_tf) multiplied by its IDF over the whole list (from
    calculate_idf).

    Args:
        stories: list of story strings.

    Returns:
        list with one dict per story, in the same order as ``stories``,
        mapping each word of that story to its TF-IDF score.
    """
    # per-story word counts, in corpus order
    per_story_counts = [calculate_tf(story) for story in stories]

    # IDF is a corpus-level quantity, so it is computed once and shared
    idf_values = calculate_idf(stories)

    return [
        {
            # a word without an IDF entry falls back to a score of 0
            word: (count * idf_values[word] if word in idf_values else 0)
            for word, count in counts.items()
        }
        for counts in per_story_counts
    ]

In [102]:
# one dict of TF-IDF scores per story, in the same order as `stories`
calculate_tfidf(stories)

[{'the': 0.0,
  'cat': 0.8109302162163288,
  'sat': 1.0986122886681098,
  'on': 1.0986122886681098,
  'mat': 1.0986122886681098,
  'is': 1.0986122886681098,
  'fluffy': 1.0986122886681098},
 {'a': 1.0986122886681098,
  'dog': 2.1972245773362196,
  'barked': 1.0986122886681098,
  'at': 1.0986122886681098,
  'the': 0.0,
  'mailman': 1.0986122886681098,
  'loves': 1.0986122886681098,
  'to': 1.0986122886681098,
  'play': 1.0986122886681098,
  'fetch': 1.0986122886681098},
 {'birds': 2.1972245773362196,
  'sing': 1.0986122886681098,
  'in': 1.0986122886681098,
  'the': 0.0,
  'morning': 1.0986122886681098,
  'cat': 0.4054651081081644,
  'watches': 1.0986122886681098}]

In [96]:
# Same corpus through scikit-learn's TfidfVectorizer, for comparison.
# NOTE(review): the values are not expected to equal a plain count * ln(N / df)
# score. Per the scikit-learn docs, TfidfVectorizer by default smooths the IDF,
# L2-normalises each row, and uses a token pattern that drops one-character
# tokens -- confirm against the installed version.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

tfidf_matrix = vectorizer.fit_transform(stories)

tfidf_matrix

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 23 stored elements and shape (3, 20)>

In [97]:
# The vocabulary learned by the vectorizer: the words (features) that the
# columns of tfidf_matrix represent.
# The one-letter word 'a' from the second story does not appear in the output
# below -- presumably dropped by the vectorizer's default tokenisation; verify.
feature_names = vectorizer.get_feature_names_out()
feature_names

array(['at', 'barked', 'birds', 'cat', 'dog', 'fetch', 'fluffy', 'in',
       'is', 'loves', 'mailman', 'mat', 'morning', 'on', 'play', 'sat',
       'sing', 'the', 'to', 'watches'], dtype=object)