Importing the modules

In [22]:
import pandas as pd
import string 
import re
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords
import spacy 

Loading the CSV file

In [6]:
# Read the commentary export and discard the four metadata columns in a
# single expression, so the frame bound to `commentary_df` is never seen
# in a half-processed state.
commentary_df = pd.read_csv('rcb_commentary.csv').drop(
    columns=['match_id', 'teams', 'over', 'timestamp']
)
commentary_df.head()

Unnamed: 0,commentary
0,"Johnson to Livingstone, FOUR, emphatic win for..."
1,"Johnson to Livingstone, SIX, Livingstone is in..."
2,"Arora to Livingstone, 1 run, length delivery o..."
3,"Arora to Livingstone, no run, slower-ball on a..."
4,"Arora to Livingstone, FOUR, Livingstone is off..."


Lowercasing

In [13]:
# Normalise case: overwrite the commentary column with its lower-cased form.
commentary_df['commentary'] = (
    commentary_df['commentary']
    .str
    .lower()
)
commentary_df.head()

Unnamed: 0,commentary
0,"johnson to livingstone, four, emphatic win for..."
1,"johnson to livingstone, six, livingstone is in..."
2,"arora to livingstone, 1 run, length delivery o..."
3,"arora to livingstone, no run, slower-ball on a..."
4,"arora to livingstone, four, livingstone is off..."


REMOVE PUNCTUATION

In [14]:
# Built once when the cell runs instead of on every call: a character class
# matching any single character from string.punctuation.
_PUNCTUATION_RE = re.compile(f"[{re.escape(string.punctuation)}]")


def remove_punctuation(text):
    """Return ``text`` with every ``string.punctuation`` character deleted.

    Characters are removed, not replaced by a space, so a hyphenated word is
    fused into one token (``"slower-ball"`` becomes ``"slowerball"``).

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        ``text`` without ASCII punctuation; all other characters, including
        whitespace, are left as they were.
    """
    return _PUNCTUATION_RE.sub('', text)

# Strip punctuation from every commentary row, then preview the result.
commentary_df['commentary'] = (
    commentary_df['commentary']
    .apply(remove_punctuation)
)
commentary_df.head()

Unnamed: 0,commentary
0,johnson to livingstone four emphatic win for r...
1,johnson to livingstone six livingstone is in a...
2,arora to livingstone 1 run length delivery out...
3,arora to livingstone no run slowerball on a le...
4,arora to livingstone four livingstone is off t...


SPELLING CORRECTION

In [15]:
from textblob import TextBlob 

def correct_spelling(text, protected_words=None):
    """Spell-correct ``text`` with TextBlob, optionally shielding some words.

    The corrector does not know player or team names: in this notebook's
    output "livingstone" is rewritten to "livingston" and "arora" to "aroma".
    Pass such words in ``protected_words`` to keep them verbatim.

    Parameters
    ----------
    text : str
        The string to correct.
    protected_words : iterable of str, optional
        Words that must not be altered. They are compared case-insensitively
        against the whitespace-separated tokens of ``text``. When omitted or
        empty, the whole text is corrected in one pass, exactly as before.

    Returns
    -------
    str
        The corrected text. Whitespace is preserved in both modes.
    """
    if not protected_words:
        return TextBlob(text).correct().string

    shielded = {word.lower() for word in protected_words}
    # The capturing group makes re.split keep the whitespace runs, so the
    # original spacing can be reproduced when the pieces are joined again.
    pieces = re.split(r'(\s+)', text)
    corrected = [
        piece
        if (not piece or piece.isspace() or piece.lower() in shielded)
        else TextBlob(piece).correct().string
        for piece in pieces
    ]
    return ''.join(corrected)

# Run the spelling corrector over every commentary row, then preview.
commentary_df['commentary'] = (
    commentary_df['commentary']
    .apply(correct_spelling)
)
commentary_df.head()

Unnamed: 0,commentary
0,johnson to livingston four emphatic win for ri...
1,johnson to livingston six livingston is in a m...
2,aroma to livingston 1 run length delivery outs...
3,aroma to livingston no run slowerball on a len...
4,aroma to livingston four livingston is off the...


REMOVING STOPWORDS

In [25]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     - -------------------------------------- 0.5/12.8 MB 5.6 MB/s eta 0:00:03
     -------- ------------------------------- 2.6/12.8 MB 10.1 MB/s eta 0:00:02
     ---------------- ----------------------- 5.2/12.8 MB 11.0 MB/s eta 0:00:01
     ----------------------- ---------------- 7.6/12.8 MB 11.5 MB/s eta 0:00:01
     ------------------------------- ------- 10.2/12.8 MB 11.4 MB/s eta 0:00:01
     ----------------------------------- --- 11.5/12.8 MB 10.6 MB/s eta 0:00:01
     --------------------------------------  12.6/12.8 MB 10.7 MB/s eta 0:00:01
     ---------------------------------------- 12.8/12.8 MB 9.8 MB/s eta 0:00:00
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
[38;5;2m✔ Download and

In [26]:
# Load the small English spaCy pipeline.
nlp = spacy.load("en_core_web_sm")


def removing_stopwords(text):
    """Drop stop-word and punctuation tokens from ``text``.

    Only the tokenizer is run (``nlp.make_doc``). ``is_stop`` and
    ``is_punct`` are lexical flags available on every token straight after
    tokenization, so running the remaining pipeline components on each row
    added cost without changing the result.

    Caveat visible in this notebook's output: the stop list removes number
    words and negations, so "four", "six" and the "no" of "no run" disappear
    from the commentary.

    Parameters
    ----------
    text : str
        The string to filter.

    Returns
    -------
    str
        The surviving token texts joined by single spaces.
    """
    doc = nlp.make_doc(text)
    kept_tokens = [
        token.text
        for token in doc
        if not (token.is_stop or token.is_punct)
    ]
    return " ".join(kept_tokens)


commentary_df['commentary'] = commentary_df['commentary'].apply(removing_stopwords)
commentary_df.head()

Unnamed: 0,commentary
0,johnson livingston emphatic win rib hammered d...
1,johnson livingston livingston mood kill game i...
2,aroma livingston 1 run length delivery outside...
3,aroma livingston run slowerball length outside...
4,aroma livingston livingston mark boundary spor...


STEMMING

In [27]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
# `nlp` is the pipeline already loaded in the stop-word removal cell;
# loading the same en_core_web_sm model again here was redundant.


def apply_stemming(text):
    """Reduce every token of ``text`` to its Porter stem.

    spaCy is used only to split the text into tokens, so just the tokenizer
    is run (``nlp.make_doc``) rather than the full pipeline.

    Parameters
    ----------
    text : str
        The string to stem.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces.
    """
    tokens = nlp.make_doc(text)
    return ' '.join(stemmer.stem(token.text) for token in tokens)


commentary_df['commentary'] = commentary_df['commentary'].apply(apply_stemming)

LEMMATIZATION

In [28]:
# Reuses the `nlp` pipeline loaded in the stop-word removal cell instead of
# loading en_core_web_sm a further time.


def lemmatize_text(text):
    """Replace each token of ``text`` with its spaCy lemma.

    The full pipeline is run here (``nlp(text)``), because lemmas are
    assigned by pipeline components rather than by the tokenizer.
    Punctuation and whitespace tokens are skipped.

    NOTE(review): in this notebook the commentary column has already been
    Porter-stemmed when this cell runs, so the lemmatizer receives stems
    rather than dictionary words. Confirm that this ordering is intended.

    Parameters
    ----------
    text : str
        The string to lemmatize.

    Returns
    -------
    str
        The lemmas joined by single spaces.
    """
    doc = nlp(text)
    lemmas = [
        token.lemma_
        for token in doc
        if not (token.is_punct or token.is_space)
    ]
    return " ".join(lemmas)


commentary_df['commentary'] = commentary_df['commentary'].apply(lemmatize_text)