In [34]:
import pandas as pd
import numpy as np
import math
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
# --- Load the commentary dataset -------------------------------------------
file_path = 'IPL_Match_Highlights_Commentary.csv'
df = pd.read_csv(file_path)

# Header names are stripped of surrounding whitespace before the column
# lookup below, so a padded header still matches 'Commentary'.
df.columns = df.columns.str.strip()
print("Columns in dataset:", df.columns)

# Only the 'Commentary' column is used; rows where it is missing are dropped
# and the remaining values are kept as a plain Python list.
if 'Commentary' in df.columns:
    text_data = df['Commentary'].dropna().tolist()
else:
    raise KeyError("Column 'Commentary' not found in the dataset")
def preprocess(text):
    """Lower-case *text* and drop every character that is neither
    alphanumeric nor whitespace.

    Removed characters are not replaced by a space, so words joined by
    punctuation are fused (e.g. "micro-six" becomes "microsix").
    Whitespace characters are kept exactly as they appear.
    """
    lowered = text.lower()
    kept_chars = filter(lambda ch: ch.isalnum() or ch.isspace(), lowered)
    return ''.join(kept_chars)
# Normalised copy of every commentary entry, in the same order as text_data.
processed_texts = list(map(preprocess, text_data))
def compute_tf(text):
    """Return the term frequencies of a whitespace-tokenised document.

    Each distinct token maps to (occurrences of the token) / (total number
    of tokens in *text*). A document with no tokens yields an empty dict.
    """
    tokens = text.split()
    n_tokens = len(tokens)
    frequencies = {}
    for token, occurrences in Counter(tokens).items():
        frequencies[token] = occurrences / n_tokens
    return frequencies
# --- Inverse document frequency ---------------------------------------------
document_count = len(processed_texts)

# Document frequency: each word is counted at most once per document, which
# is why every document's tokens go through set() before being tallied.
df_counts = Counter(
    token
    for document in processed_texts
    for token in set(document.split())
)

# idf(word) = ln(N / (df(word) + 1)) + 1, with N the number of documents.
idf = {
    term: math.log(document_count / (doc_freq + 1)) + 1
    for term, doc_freq in df_counts.items()
}
# One {word: tf * idf} mapping per document, in corpus order. Only words that
# occur in the document appear in its mapping.
corpus_tfidf = [
    {term: freq * idf[term] for term, freq in compute_tf(document).items()}
    for document in processed_texts
]
# --- Dense document-by-word matrix ------------------------------------------
# Column order follows the key order of `idf`.
words = list(idf)
word_to_index = dict(zip(words, range(len(words))))

# Start from all zeros; a cell stays 0.0 when the word is absent from the
# document, and is otherwise overwritten with that word's TF-IDF score.
tfidf_matrix = np.zeros((document_count, len(words)))
for row, doc_scores in enumerate(corpus_tfidf):
    for term, score in doc_scores.items():
        col = word_to_index[term]
        tfidf_matrix[row, col] = score

tfidf_df = pd.DataFrame(tfidf_matrix, columns=words)
print("\nTF-IDF from scratch:")
print(tfidf_df.head())
# --- Reference result from scikit-learn -------------------------------------
# TfidfVectorizer is used with its default settings on the same preprocessed
# texts, so its output can be viewed next to the from-scratch table.
vectorizer = TfidfVectorizer()
sparse_scores = vectorizer.fit_transform(processed_texts)

# Convert to a dense array and label the columns with the fitted vocabulary.
sklearn_tfidf_matrix = sparse_scores.toarray()
feature_names = vectorizer.get_feature_names_out()
sklearn_tfidf_df = pd.DataFrame(sklearn_tfidf_matrix, columns=feature_names)

print("\nTF-IDF using scikit-learn:")
print(sklearn_tfidf_df.head())


Columns in dataset: Index(['Match_id', 'Team', 'Over_num', 'Commentary', 'batsman', 'score'], dtype='object')

TF-IDF from scratch:
        for       and        it   picked      over        up    couple  \
0  0.046452  0.136059  0.065347  0.12351  0.049643  0.064053  0.134495   
1  0.000000  0.043539  0.052277  0.00000  0.039715  0.051242  0.000000   
2  0.000000  0.051832  0.031117  0.00000  0.000000  0.061003  0.000000   
3  0.000000  0.031099  0.000000  0.00000  0.000000  0.000000  0.000000   
4  0.029969  0.035112  0.021080  0.00000  0.032028  0.041324  0.000000   

    mandeep        on  boundary  ...  wrath  uhoh  latent  microsix  whodve  \
0  0.406116  0.047905  0.084598  ...    0.0   0.0     0.0       0.0     0.0   
1  0.216595  0.000000  0.000000  ...    0.0   0.0     0.0       0.0     0.0   
2  0.000000  0.000000  0.000000  ...    0.0   0.0     0.0       0.0     0.0   
3  0.000000  0.000000  0.000000  ...    0.0   0.0     0.0       0.0     0.0   
4  0.000000  0.000000  0.000