Importing the module 

In [2]:
import pandas as pd
import string 
import re
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords
import spacy 

Loading the csv file

In [3]:
commentary_df = pd.read_csv('rcb_commentary.csv')
commentary_df.drop(columns=['match_id','teams','over','timestamp'],inplace=True)
commentary_df.head()

Unnamed: 0,commentary
0,"Johnson to Livingstone, FOUR, emphatic win for..."
1,"Johnson to Livingstone, SIX, Livingstone is in..."
2,"Arora to Livingstone, 1 run, length delivery o..."
3,"Arora to Livingstone, no run, slower-ball on a..."
4,"Arora to Livingstone, FOUR, Livingstone is off..."


Lowercasing

In [4]:
commentary_df['commentary']=commentary_df['commentary'].str.lower()
commentary_df.head()

Unnamed: 0,commentary
0,"johnson to livingstone, four, emphatic win for..."
1,"johnson to livingstone, six, livingstone is in..."
2,"arora to livingstone, 1 run, length delivery o..."
3,"arora to livingstone, no run, slower-ball on a..."
4,"arora to livingstone, four, livingstone is off..."


REMOVE PUNCTUATION

In [5]:
def remove_punctuation(text):
    pattern = re.compile(f"[{re.escape(string.punctuation)}]")
    return pattern.sub(r'',text)

commentary_df['commentary'] = commentary_df['commentary'].apply(remove_punctuation)
commentary_df.head()

Unnamed: 0,commentary
0,johnson to livingstone four emphatic win for r...
1,johnson to livingstone six livingstone is in a...
2,arora to livingstone 1 run length delivery out...
3,arora to livingstone no run slowerball on a le...
4,arora to livingstone four livingstone is off t...


SPELLING CORRECTION (No need for spelling correction as it will just butcher the indian names )

REMOVING STOPWORDS

In [6]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
     - ------------------------------------- 0.5/12.8 MB 644.1 kB/s eta 0:00:20
     - ------------------------------------- 0.5/12.8 MB 644.1 kB/s eta 0:00:20
     - ------------------------------------- 0.5/12.8 MB 644.1 kB/s eta 0:00:20
     --- ----------------------------------- 1.0/12.8 MB 774.0 kB/s eta 0:00:16
     --- ----------------------------------- 1.0/12.8 MB 774.0 kB/s eta 0:00:16
     --- ----------------------------------- 1.3/12.8 MB 789.0 kB/s eta 0:00:15
     --- ----------------------------------- 1.3/12.8 MB

In [7]:
nlp = spacy.load("en_core_web_sm")

def removing_stopwords(text):
    doc = nlp(text)
    filtered_tokens = [token.text for token in doc if not token.is_stop and not token.is_punct]
    return " ".join(filtered_tokens)

commentary_df['commentary']=commentary_df['commentary'].apply(removing_stopwords)
commentary_df.head()

Unnamed: 0,commentary
0,johnson livingstone emphatic win rcb hammered ...
1,johnson livingstone livingstone mood kill game...
2,arora livingstone 1 run length delivery outsid...
3,arora livingstone run slowerball length outsid...
4,arora livingstone livingstone mark boundary sp...


STEMMING

In [10]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
nlp = spacy.load('en_core_web_sm')

def apply_stemming(text):
    
    tokenize_value = nlp(text)
    
    stemmed_words =  [stemmer.stem(token.text) for token in tokenize_value]
    stemmed_text = ' '.join(stemmed_words)
    return stemmed_text

commentary_df['commentary'] = commentary_df['commentary'].apply(apply_stemming)
commentary_df.head()

Unnamed: 0,commentary
0,johnson livingston emphat win rcb hammer kkr d...
1,johnson livingston livingston mood kill game i...
2,arora livingston 1 run length deliveri outsid ...
3,arora livingston run slowerb length outsid liv...
4,arora livingston livingston mark boundari spor...


LEMMATIZATION

In [11]:
nlp = spacy.load("en_core_web_sm")

def lemmatize_text(text):
    doc = nlp(text)
    lemmas = [token.lemma_ for token in doc if not token.is_punct and not token.is_space ]
    return " ".join(lemmas)

commentary_df['commentary'] = commentary_df['commentary'].apply(lemmatize_text)
commentary_df.head()

Unnamed: 0,commentary
0,johnson livingston emphat win rcb hammer kkr d...
1,johnson livingston livingston mood kill game i...
2,arora livingston 1 run length deliveri outsid ...
3,arora livingston run slowerb length outsid liv...
4,arora livingston livingston mark boundari spor...


Explicitly stating all the batters name as NER is not very effective in this scenario. 

In [12]:
rcb_batters_2025 = [
    "virat kohli",
    "rajat patidar",
    "phil salt",
    "jitesh sharma",
    "mayank agarwal",
    "tim david",
    "jacob bethell",
    "devdutt padikkal",
    "liam livingstone",
    "krunal pandya",
    "romario shepherd"
]


Lookup Based Approach 

In [13]:
from spacy.matcher import PhraseMatcher

# Create patterns for PhraseMatcher
patterns = [nlp.make_doc(name) for name in rcb_batters_2025]
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
matcher.add("RCB_BATTERS", patterns)


Matching the created patterns 

In [14]:
def match_batters(text):
    doc = nlp(text)
    matches = matcher(doc)
    return [doc[start:end].text for match_id, start, end in matches]

commentary_df['commentary']=commentary_df['commentary'].apply(match_batters)