In [5]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer





In [6]:
# Load the dataset
df = pd.read_csv("IPL_Match_Highlights_Commentary.csv", encoding="latin-1")

In [7]:
# Extract relevant column
text_data = df['Commentary'].dropna().str.lower()

In [8]:

# Preprocessing: Tokenization and Removing Special Characters
def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters
    words = text.split()
    return words

In [9]:

# Compute Term Frequency (TF)
def compute_tf(corpus):
    tf_values = []
    for doc in corpus:
        word_count = Counter(doc)
        total_words = len(doc)
        tf_values.append({word: count / total_words for word, count in word_count.items()})
    return tf_values

In [10]:
# Compute Inverse Document Frequency (IDF)
def compute_idf(corpus):
    num_docs = len(corpus)
    word_set = set(word for doc in corpus for word in doc)
    idf_values = {}
    for word in word_set:
        containing_docs = sum(1 for doc in corpus if word in doc)
        idf_values[word] = np.log((num_docs + 1) / (containing_docs + 1)) + 1
    return idf_values

In [11]:
# Compute TF-IDF
def compute_tfidf(tf_values, idf_values):
    tfidf_values = []
    for tf in tf_values:
        tfidf_values.append({word: tf[word] * idf_values[word] for word in tf})
    return tfidf_values


In [12]:
# Apply preprocessing
processed_corpus = text_data.apply(preprocess_text).tolist()

# Compute TF, IDF, and TF-IDF
tf_values = compute_tf(processed_corpus)
idf_values = compute_idf(processed_corpus)
tfidf_values = compute_tfidf(tf_values, idf_values)


In [13]:
# Convert to DataFrame for better visualization
tfidf_df = pd.DataFrame.from_records(tfidf_values).fillna(0)

# Display TF-IDF values
print("TF-IDF values (computed from scratch):")
print(tfidf_df.head())

TF-IDF values (computed from scratch):
      nehra        to   mandeep      four     first  boundary       for  \
0  0.159658  0.050000  0.406122  0.043215  0.097068    0.0846  0.046455   
1  0.255452  0.080000  0.216598  0.034572  0.077654    0.0000  0.000000   
2  0.000000  0.071429  0.000000  0.041157  0.000000    0.0000  0.000000   
3  0.364932  0.028571  0.000000  0.049388  0.000000    0.0000  0.000000   
4  0.206010  0.064516  0.000000  0.055761  0.000000    0.0000  0.029971   

        and       rcb      full  ...  uhoh  latent  wrath  microsix  whodve  \
0  0.136070  0.114943  0.072331  ...   0.0     0.0    0.0       0.0     0.0   
1  0.043542  0.000000  0.000000  ...   0.0     0.0    0.0       0.0     0.0   
2  0.051836  0.000000  0.000000  ...   0.0     0.0    0.0       0.0     0.0   
3  0.031102  0.000000  0.082664  ...   0.0     0.0    0.0       0.0     0.0   
4  0.035115  0.000000  0.000000  ...   0.0     0.0    0.0       0.0     0.0   

   outunorthodox  paddlepulls  expr

In [14]:
# Verify using scikit-learn's TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()
sklearn_tfidf = tfidf_vectorizer.fit_transform(text_data)
sklearn_tfidf_df = pd.DataFrame(sklearn_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

print("\nTF-IDF values using scikit-learn:")
print(sklearn_tfidf_df.head())



TF-IDF values using scikit-learn:
   000   07   10  100  1000  100kph  100ks  100th  101  101kph  ...  zipping  \
0  0.0  0.0  0.0  0.0   0.0     0.0    0.0    0.0  0.0     0.0  ...      0.0   
1  0.0  0.0  0.0  0.0   0.0     0.0    0.0    0.0  0.0     0.0  ...      0.0   
2  0.0  0.0  0.0  0.0   0.0     0.0    0.0    0.0  0.0     0.0  ...      0.0   
3  0.0  0.0  0.0  0.0   0.0     0.0    0.0    0.0  0.0     0.0  ...      0.0   
4  0.0  0.0  0.0  0.0   0.0     0.0    0.0    0.0  0.0     0.0  ...      0.0   

   zips  zone  zones  zoning  zoomed  zoomer  zooming  zooms  zoots  
0   0.0   0.0    0.0     0.0     0.0     0.0      0.0    0.0    0.0  
1   0.0   0.0    0.0     0.0     0.0     0.0      0.0    0.0    0.0  
2   0.0   0.0    0.0     0.0     0.0     0.0      0.0    0.0    0.0  
3   0.0   0.0    0.0     0.0     0.0     0.0      0.0    0.0    0.0  
4   0.0   0.0    0.0     0.0     0.0     0.0      0.0    0.0    0.0  

[5 rows x 9412 columns]
