Importing

In [1]:
import io
import os
from math import sqrt

import gensim
import nltk
import pandas as pd
from gensim.summarization.summarizer import summarize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error

Adding commentary data

In [2]:
# Open Colab's file-upload dialog and keep whatever it returns in `uploaded`.
# NOTE(review): the recorded output of this cell shows the upload being saved
# as "IPL_Match_Highlights_Commentary (1).csv", i.e. a file with the original
# name already existed in the runtime - confirm which copy later cells should read.
from google.colab import files
uploaded = files.upload()

Saving IPL_Match_Highlights_Commentary.csv to IPL_Match_Highlights_Commentary (1).csv


In [3]:
# Load the csv file.
# Colab stores a re-uploaded file under a new name ("... (1).csv"), so reading a
# hard-coded path can silently pick up a stale copy. Prefer the bytes returned by
# the upload dialog and fall back to the on-disk file when nothing matching was
# uploaded (e.g. when the notebook runs outside Colab).
commentary_csv = "IPL_Match_Highlights_Commentary.csv"
uploaded_files = globals().get("uploaded") or {}
uploaded_commentary = next(
    (name for name in uploaded_files if name.startswith("IPL_Match_Highlights_Commentary")),
    None,
)
if uploaded_commentary is not None:
    df = pd.read_csv(io.BytesIO(uploaded_files[uploaded_commentary]))
else:
    df = pd.read_csv(commentary_csv)

Preprocessing of data.

In [4]:
# Fetch the Punkt tokenizer data (needed by word_tokenize in the preprocessing cell)
nltk.download(info_or_id='punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
# Fetch the 'omw-1.4' (Open Multilingual WordNet) data package
nltk.download(info_or_id='omw-1.4')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [6]:
# Treat missing comments as empty text, then convert everything to lowercase.
# (astype(str) on its own would turn NaN into the literal word "nan".)
df["comment"] = df["comment"].fillna("").astype(str).str.lower()

# Remove punctuation. The pattern is a regular expression, so regex=True must be
# explicit: pandas >= 2.0 defaults to literal matching, which would leave every
# punctuation mark in place. The raw string avoids the invalid "\w" escape warning.
df["comment"] = df["comment"].str.replace(r'[^\w\s]', '', regex=True)

# Remove stop words
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
df["comment"] = df["comment"].apply(
    lambda text: ' '.join(word for word in text.split() if word not in stop_words)
)

# Tokenize words
df["comment"] = df["comment"].apply(word_tokenize)

# Lemmatize words
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
df["comment"] = df["comment"].apply(
    lambda tokens: [lemmatizer.lemmatize(token) for token in tokens]
)

# Combine the tokenized words into a single string
df["comment"] = df["comment"].apply(" ".join)

# Create a list of all the preprocessed comments
preprocessed_comments = df["comment"].tolist()

  df["comment"] = df["comment"].astype(str).str.replace('[^\w\s]','')
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
# Keep the first 80% of the comment list for training and the remainder for testing
split_ratio = 0.8
train_size = int(split_ratio * len(preprocessed_comments))
train_comments, test_comments = (
    preprocessed_comments[:train_size],
    preprocessed_comments[train_size:],
)

# Fit TF-IDF on the training comments only, then encode both splits with it
vectorizer = TfidfVectorizer()
train_vectors = vectorizer.fit_transform(train_comments)
test_vectors = vectorizer.transform(test_comments)


In [8]:
# Train the Word2Vec model on the preprocessed comments.
# Word2Vec expects every sentence as a list of tokens; a plain string would be
# iterated character by character and produce a vocabulary of single letters,
# so each comment is split back into its words first.
tokenized_comments = [comment.split() for comment in preprocessed_comments]
model = gensim.models.Word2Vec(tokenized_comments, size=1000, window=5, min_count=5, workers=4)




In [9]:
# Train a small demonstration model on hand-written commentary sentences.
# All four sentences go into ONE model: training two models back to back on the
# same variable discards the first one, so its words ('perfect', 'six') would
# never reach the vocabulary.
# NOTE(review): this rebinds `model`, replacing the model trained on the
# commentary data in the previous cell - confirm that is intended.
sentences = [
    ['southee', 'bowls', 'a', 'perfect', 'delivery'],
    ['batsman', 'hits', 'southee', 'for', 'a', 'six'],
    ['gayle', 'bowls', 'a', 'underway', 'delivery'],
    ['batsman', 'hits', 'southee', 'for', 'a', 'four'],
]
model = gensim.models.Word2Vec(sentences, min_count=1, size=100)



Uploading the player names file.

In [10]:
# Open Colab's file-upload dialog again, this time for the player names file;
# the result rebinds `uploaded`.
# NOTE(review): the recorded output shows the upload being saved as
# "names (1).csv", i.e. a "names.csv" already existed in the runtime - confirm
# which copy the reading cell should use.
from google.colab import files
uploaded = files.upload()

Saving names.csv to names (1).csv


Adding player names to model vocabulary.

In [11]:
import csv

In [12]:
# Read player names (first column of every row) from the csv file.
# Colab stores a re-uploaded file under a new name ("names (1).csv"), so prefer
# the bytes returned by the upload dialog and fall back to the on-disk
# 'names.csv' when nothing matching was uploaded.
uploaded_files = globals().get('uploaded') or {}
uploaded_names = next((name for name in uploaded_files if name.startswith('names')), None)
if uploaded_names is not None:
    names_source = io.StringIO(uploaded_files[uploaded_names].decode('utf-8'), newline='')
else:
    # newline='' lets the csv module handle line endings itself
    names_source = open('names.csv', 'r', newline='')

with names_source as csvfile:
    csvreader = csv.reader(csvfile)
    # csv.reader yields [] for blank lines; skip them instead of failing on row[0]
    player_names = [row[0] for row in csvreader if row]

In [13]:
# Offer the player names to the model's existing vocabulary as one extra "sentence"
model.build_vocab(
    [player_names],
    update=True,
)

Training the model on text8 corpus.

In [14]:
import gensim.downloader as api

In [15]:
# Fetch the text8 corpus through gensim's downloader API
corpus = api.load(name='text8')

In [16]:
# Extend the model's existing vocabulary with the words of the text8 corpus
model.build_vocab(
    corpus,
    update=True,
)

In [17]:
# Continue training on text8, reusing the example count and epoch setting
# already stored on the model
model.train(
    corpus,
    epochs=model.epochs,
    total_examples=model.corpus_count,
)

(64096283, 85026035)

Training the model on wikipedia corpus.

In [21]:
!pip install wikipedia


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11695 sha256=8a1d88f71aa161346c6a5a04d24f2bbe491340292957cc78a5b7bf165caf3c83
  Stored in directory: /root/.cache/pip/wheels/07/93/05/72c05349177dca2e0ba31a33ba4f7907606f7ddef303517c6a
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0


In [29]:
import logging
import wikipedia
from gensim.corpora import WikiCorpus
from gensim.utils import simple_preprocess
import requests
import bz2

In [30]:
# Configure the root logger: INFO level and above, timestamped message format
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)


This code downloads a large file (over 16 GB in size) from the internet, so it may take several hours to complete, depending on your connection speed and the load on the Wikimedia servers. Note that downloading a file of this size consumes a significant amount of bandwidth and can slow the connection for other users on the same network.


In [31]:
# Download a Wikipedia dump file in XML format
wiki_dump_file = 'enwiki-latest-pages-articles.xml.bz2'
url = 'https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2'

# The dump is very large, so skip the download when a complete copy is already
# on disk (re-running the notebook should not fetch it again).
if not os.path.exists(wiki_dump_file):
    # Write to a temporary name first so an interrupted transfer is never
    # mistaken for a complete dump on the next run.
    partial_dump_file = wiki_dump_file + '.part'
    # timeout bounds the connect / per-read wait, not the whole transfer
    with requests.get(url, stream=True, timeout=60) as r:
        # Fail loudly on 4xx/5xx instead of saving an HTML error page as the dump
        r.raise_for_status()
        with open(partial_dump_file, 'wb') as f:
            # 1 MiB chunks: 1 KiB chunks mean millions of tiny writes for a file this size
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    f.write(chunk)
    os.replace(partial_dump_file, wiki_dump_file)

In [None]:
# Use WikiCorpus to extract text from the Wikipedia dump.
# NOTE(review): dictionary={} is presumably passed so WikiCorpus does not scan
# the whole dump to build its own dictionary - confirm against the gensim docs.
wiki_corpus = WikiCorpus(wiki_dump_file, dictionary={})

# Holding every article in a Python list needs far more RAM than a Colab runtime
# has, so the extracted tokens are streamed once to a text file (one article per
# line) and then read back lazily. The file is reused on later runs.
wiki_text_file = 'enwiki-latest-pages-articles.txt'
if not os.path.exists(wiki_text_file):
    # Temporary name first, so an interrupted extraction is not reused as complete
    partial_text_file = wiki_text_file + '.part'
    with open(partial_text_file, 'w', encoding='utf-8') as text_out:
        for tokens in wiki_corpus.get_texts():
            text_out.write(' '.join(tokens) + '\n')
    os.replace(partial_text_file, wiki_text_file)

# Restartable iterable that yields one list of tokens per line of the file
sentences = gensim.models.word2vec.LineSentence(wiki_text_file)

In [None]:
# Preprocess the text data
class _PreprocessedSentences:
    """Restartable iterable that normalises each tokenised text on the fly.

    Each item of ``token_lists`` is a list of tokens, while ``simple_preprocess``
    expects a single string (handing it a list raises TypeError), so the tokens
    are joined back into text before being preprocessed. Nothing is materialised
    up front, and the object can be iterated several times.
    """

    def __init__(self, token_lists):
        self._token_lists = token_lists

    def __iter__(self):
        for tokens in self._token_lists:
            yield simple_preprocess(" ".join(tokens))


preprocessed_sentences = _PreprocessedSentences(sentences)


In [None]:
# Fit a new Word2Vec model on the preprocessed sentences
# (minimum word count 5, four worker threads)
model = gensim.models.Word2Vec(
    preprocessed_sentences,
    workers=4,
    min_count=5,
)

Function to generate summary.

In [None]:
# Define a function to generate summaries for a given match
def generate_summary(match_id):
    """Return an extractive summary of one match's commentary.

    Collects every preprocessed comment whose "Match_id" equals ``match_id``
    and runs gensim's ``summarize`` over them with ``ratio=0.2``.

    Args:
        match_id: Value compared against ``df["Match_id"]``.

    Returns:
        str: The summary text. When the match has fewer than two non-empty
        comments there is nothing to rank, so those comments are returned
        unchanged (an empty string if there are none).
    """
    # Get all the preprocessed comments for the given match, ignoring blank ones
    comments = [
        comment
        for comment in df.loc[df["Match_id"] == match_id, "comment"].tolist()
        if comment.strip()
    ]

    # No Word2Vec lookup is done here: the summary does not use the vectors, and
    # model.wv[...] raises KeyError for any out-of-vocabulary token.

    # summarize() needs more than one sentence and raises ValueError otherwise
    if len(comments) < 2:
        return " ".join(comments)

    # Preprocessing removes punctuation, so a space-joined text would look like
    # one giant sentence. One comment per line keeps each comment a separate
    # sentence for the TextRank summarizer.
    summary = summarize("\n".join(comments), ratio=0.2)

    return summary

Generate summary for each match.

In [None]:
# Summarise every distinct match found in the rows from label `train_size` onward
test_match_ids = df.loc[train_size:, "Match_id"].unique()
predicted_summaries = [generate_summary(match_id) for match_id in test_match_ids]

print(predicted_summaries)


Performance measurement.

In [None]:
# Evaluate the model using (root) mean squared error.
# mean_squared_error only accepts numeric arrays of identical shape, so raw
# strings cannot be compared directly, and the references must be per match
# (like the predictions), not per comment. Both the full commentary and the
# summary of every test match are therefore encoded with the TF-IDF vectorizer
# fitted earlier before the error is computed.
# NOTE(review): this measures how much TF-IDF content the summary keeps from
# its source commentary; it is a proxy, not a summary-quality score - confirm
# whether reference summaries (and e.g. ROUGE) are available instead.
test_match_ids = df.loc[train_size:, "Match_id"].unique()
actual_summaries = [
    " ".join(df.loc[df["Match_id"] == match_id, "comment"])
    for match_id in test_match_ids
]
assert len(actual_summaries) == len(predicted_summaries), (
    "predicted_summaries must hold exactly one summary per test match"
)

# Dense arrays: mean_squared_error does not take sparse matrices
actual_vectors = vectorizer.transform(actual_summaries).toarray()
predicted_vectors = vectorizer.transform(predicted_summaries).toarray()

mse = mean_squared_error(actual_vectors, predicted_vectors)
rmse = sqrt(mse)
print("Root Mean Squared Error:", rmse)