In [None]:
import numpy as np
import re
import math
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus: each entry is one ball of commentary in the form
# "<bowler> to <batter>, <delivery>, <speed> km/h, <outcome>".
commentary = [
    "Bumrah to Kohli, Yorker, 145 km/h, smashed for a Four!",
    "Shami to Smith, Bouncer, 140 km/h, single taken.",
    "Starc to Rohit, Full Toss, 147 km/h, Six!",
    "Bumrah to Rohit, Yorker, 145 km/h, defended solidly.",
]

def preprocess(text):
    """Lowercase *text*, drop punctuation, and split it into tokens.

    Every character that is not ``a-z``, ``0-9`` or whitespace is deleted
    (not replaced by a space), so ``"km/h"`` becomes the single token
    ``"kmh"``. Returns the whitespace-separated tokens as a list.
    """
    lowered = text.lower()
    cleaned = re.sub(r'[^a-z0-9\s]', '', lowered)
    tokens = cleaned.split()
    return tokens

# One token list per commentary line, in the same order as `commentary`.
tokenized_docs = list(map(preprocess, commentary))

def compute_tf(doc):
    """Return the relative term frequency of every token in *doc*.

    *doc* is a sequence of tokens. The result maps each distinct token to
    ``occurrences / len(doc)``. An empty document yields an empty dict.
    """
    doc_length = len(doc)
    frequencies = {}
    for token, occurrences in Counter(doc).items():
        frequencies[token] = occurrences / doc_length
    return frequencies

# Per-document term-frequency dicts, aligned with `tokenized_docs`.
tf_documents = list(map(compute_tf, tokenized_docs))

def compute_idf(doc_list):
    """Return the smoothed inverse document frequency of every token.

    *doc_list* is a sequence of tokenized documents (each a sequence of
    hashable tokens). For a token that appears in ``df`` of the ``N``
    documents the value is::

        log((N + 1) / (df + 1)) + 1

    The ``+ 1`` terms inside the logarithm keep the ratio finite, and the
    trailing ``+ 1`` keeps a token that occurs in every document at weight
    1 rather than 0. An empty *doc_list* yields an empty dict.
    """
    N = len(doc_list)

    # Count each token once per document. Going through set(doc) makes this
    # a single linear pass instead of scanning every document list again
    # for every vocabulary word.
    doc_counts = Counter(word for doc in doc_list for word in set(doc))

    return {
        word: math.log((N + 1) / (doc_count + 1)) + 1
        for word, doc_count in doc_counts.items()
    }

# Corpus-wide IDF weight for every token that appears in `tokenized_docs`.
idf_values = compute_idf(tokenized_docs)

def compute_tfidf(tf_docs, idf_values):
    """Weight each document's term frequencies by the corpus IDF.

    *tf_docs* is a sequence of ``{token: tf}`` dicts and *idf_values* a
    ``{token: idf}`` dict. Returns a list with one ``{token: tf * idf}``
    dict per input document, in the same order. A token missing from
    *idf_values* raises ``KeyError``.
    """
    return [
        {token: weight * idf_values[token] for token, weight in term_freqs.items()}
        for term_freqs in tf_docs
    ]

# Hand-rolled TF-IDF: one {token: weight} dict per commentary line.
# NOTE(review): within the visible cell this result is never printed.
tfidf_documents = compute_tfidf(tf_documents, idf_values)

# Library TF-IDF fitted on the raw (un-preprocessed) strings, presumably as
# a reference to compare the manual computation against. Its vocabulary is
# not the same as preprocess(): the printed feature names contain 'km',
# whereas preprocess() turns "km/h" into 'kmh' and keeps the token 'a'.
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(commentary)

# Show the learned vocabulary and the dense document-by-term weight matrix.
print("\nFeature Names:", vectorizer.get_feature_names_out())
print("TF-IDF Matrix:\n", X_tfidf.toarray())


Feature Names: ['140' '145' '147' 'bouncer' 'bumrah' 'defended' 'for' 'four' 'full' 'km'
 'kohli' 'rohit' 'shami' 'single' 'six' 'smashed' 'smith' 'solidly'
 'starc' 'taken' 'to' 'toss' 'yorker']
TF-IDF Matrix:
 [[0.         0.31141802 0.         0.         0.31141802 0.
  0.39499427 0.39499427 0.         0.2061244  0.39499427 0.
  0.         0.         0.         0.39499427 0.         0.
  0.         0.         0.2061244  0.         0.31141802]
 [0.39089239 0.         0.         0.39089239 0.         0.
  0.         0.         0.         0.20398387 0.         0.
  0.39089239 0.39089239 0.         0.         0.39089239 0.
  0.         0.39089239 0.20398387 0.         0.        ]
 [0.         0.         0.4027079  0.         0.         0.
  0.         0.         0.4027079  0.21014969 0.         0.31749953
  0.         0.         0.4027079  0.         0.         0.
  0.4027079  0.         0.21014969 0.4027079  0.        ]
 [0.         0.3515001  0.         0.         0.3515001  0.445833