In [None]:
from transformers import BertTokenizer, BertForMaskedLM
from transformers import pipeline
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
# Fetch the NLTK resources used below: tokenizer models, the English stop-word
# list, WordNet (for the lemmatizer) and the VADER sentiment lexicon.
nltk.download('punkt')
# Newer NLTK releases load the tokenizer from 'punkt_tab' instead of 'punkt';
# requesting both keeps word_tokenize working across versions.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('vader_lexicon')
# Load pre-trained BERT model and tokenizer
# NOTE(review): `tokenizer` and `model` are not referenced by any cell visible
# here — confirm they are still needed before paying for the download.
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForMaskedLM.from_pretrained(model_name)

# Define the summarization pipeline
# The checkpoint and revision are pinned explicitly (these are the values the
# unpinned call resolved to) so results do not change if the library's default
# summarization model changes.
summarization_pipeline = pipeline(
    "summarization",
    model="sshleifer/distilbart-cnn-12-6",
    revision="a4f8f3e",
)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

In [None]:
def summarize_over(over):
    """Summarise one over's commentary into a short piece of text.

    Args:
        over: Iterable of commentary strings for a single over. The strings
            are joined with single spaces before being summarised.

    Returns:
        The ``summary_text`` produced by ``summarization_pipeline``, or an
        empty string when the over contains no text at all.
    """
    over_text = " ".join(over)

    # There is nothing to summarise, so skip the model call entirely.
    if not over_text.strip():
        return ""

    # truncation=True clips input that exceeds the model's maximum length
    # instead of letting the forward pass fail on an unusually long over.
    result = summarization_pipeline(
        over_text,
        max_length=100,
        min_length=10,
        do_sample=False,
        truncation=True,
    )
    return result[0]['summary_text']

def highlights(text_lines, n):
    """Summarise the ``n`` highest-scoring overs.

    Args:
        text_lines: List of overs, each a list of commentary strings.
        n: How many of the top-scoring overs to keep.

    Returns:
        Dict mapping 1-based over number to its summary, with entries inserted
        in descending order of interesting score.
    """
    scores_by_over = calculate_interesting_scores(text_lines)
    ranked = sorted(scores_by_over.items(), key=lambda pair: pair[1], reverse=True)

    # Over numbers are 1-based, hence the -1 when indexing back into text_lines.
    return {
        over_number: summarize_over(text_lines[over_number - 1])
        for over_number, _ in ranked[:n]
    }

def _min_max_normalize(scores):
    """Rescale a ``{key: number}`` mapping linearly so values span 0 to 1.

    When every value is identical (which includes the single-entry case) the
    range is zero; every key is then mapped to 0.0 rather than dividing by zero.
    """
    lowest = min(scores.values())
    spread = max(scores.values()) - lowest
    if spread == 0:
        return {key: 0.0 for key in scores}
    return {key: (value - lowest) / spread for key, value in scores.items()}


def calculate_interesting_scores(text_lines):
    """Score each over by combining a sentiment score and a keyword score.

    For every over two raw scores are computed:

    * the VADER ``compound`` polarity of the over's joined commentary, and
    * the number of Word2Vec neighbours returned for the cricket seed terms
      that occur in the over's preprocessed commentary.

    Each raw score is min-max normalised across all overs and the two
    normalised values are added.

    Args:
        text_lines: List of overs, each a list of commentary strings.

    Returns:
        Dict mapping 1-based over number to its combined score. An empty
        input yields an empty dict.
    """
    # Nothing to score; return before touching any NLTK resource.
    if not text_lines:
        return {}

    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    # Seed vocabulary whose presence marks an over as eventful.
    cricket_related_terms = ["six", "four", "boundary", "wicket", "catch", "stump"]

    def preprocess_text(text_list):
        """Lower-case, tokenize and lemmatize each string, dropping stop words
        and tokens that are not purely alphanumeric."""
        preprocessed_text = []
        for text in text_list:
            tokens = word_tokenize(text.lower())
            tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalnum() and token not in stop_words]
            preprocessed_text.append(tokens)
        return preprocessed_text

    def wordscore(text_lines):
        """Return ``{over_number: neighbour_count}`` for every over."""
        scores = {}
        for i, sublist in enumerate(text_lines):
            preprocessed_commentary = preprocess_text(sublist)

            # Word2Vec cannot build a vocabulary from an over that has no
            # tokens left after preprocessing; such an over scores zero.
            if not any(preprocessed_commentary):
                scores[i + 1] = 0
                continue

            # A fresh model is trained per over, on that over's sentences only.
            word2vec_model = Word2Vec(preprocessed_commentary, vector_size=100, window=5, min_count=1, workers=4)
            related_words = {}
            for term in cricket_related_terms:
                if term in word2vec_model.wv.key_to_index:
                    similar_words = word2vec_model.wv.most_similar(term, topn=5)
                    related_words[term] = [word for word, _ in similar_words]
            # NOTE(review): only the number of neighbours is counted, not how
            # similar they are, so the score appears to depend mainly on which
            # seed terms occur in the over — confirm this is intended.
            count_related_words = sum(len(words) for words in related_words.values())
            scores[i + 1] = count_related_words
        return scores

    def sentimentscore(text_lines):
        """Return ``{over_number: compound_polarity}`` for every over."""
        sia = SentimentIntensityAnalyzer()
        over_sentiment_scores = {}
        for over_number, over in enumerate(text_lines, start=1):
            over_text = ' '.join(over)
            sentiment_score = sia.polarity_scores(over_text)["compound"]
            over_sentiment_scores[over_number] = sentiment_score
        return over_sentiment_scores

    sentiment_scores = sentimentscore(text_lines)
    word_scores = wordscore(text_lines)

    # Put both signals on the same 0-to-1 scale before adding them, so that
    # neither dominates purely because of its units.
    normalized_sentiments = _min_max_normalize(sentiment_scores)
    normalized_word_scores = _min_max_normalize(word_scores)

    interesting_scores = {over: normalized_sentiments[over] + normalized_word_scores[over]
                          for over in sentiment_scores.keys()}

    return interesting_scores

def v_and_s(text_lines, n):
    """Print the ``n`` most interesting overs with their summaries, best first.

    Args:
        text_lines: List of overs, each a list of commentary strings.
        n: Number of top-scoring overs to print.

    Returns:
        None. The result is written to standard output.
    """
    # highlights() scores every over itself and inserts its results in
    # descending score order, and dicts preserve insertion order. Scoring the
    # overs again here just to re-sort them would repeat the per-over Word2Vec
    # training without changing the printed order.
    highlighted_overs = highlights(text_lines, n)
    print("Highlighted Overs:")
    for over, summary in highlighted_overs.items():
        print(f"Over {over}: {summary}\n")

In [None]:
# Sample ball-by-ball commentary, grouped by over: each inner list holds the
# commentary strings for one over (six per over here). The scoring functions
# number overs from 1, so text_lines[i] is reported as over i + 1.
text_lines = [
    ["on the pads to start from Amir, no swing, worked down to fine leg to get off the mark.",
    "drifts down leg this time, no swing whatsoever. Warner misses his flick",
    "off the outside half and that races away. Better channel, outside off on a good length. Warner prods and gets it down to third man. Yasir can't cut it off from point",
    "leg side-ish again, 140 kph. Amir looking for swing but there is none there. Off the pad to fine leg",
    "much better. 139 kph, coming back in on off, pushed to mid-on",
    "138 kph, nice channel. Around off, full. Renshaw watches the ball onto his bat, dabbed towards point",
    ],
    [
     "nicely bowled. Full around off at 130 kph, Warner blocks to mid-off",
    "short and wide, poor ball. Warner goes back and slaps the ball through cover for four. Easy as ... Not much swing, just timed",
    "superb batting. Good length outside off, sits up for Warner to stay back and punch through cover with just a short-arm jab. Signs of a good surface and a batsman in sublime touch",
    "no bother says Warner. Gets a good length delivery outside off, Warner square-punches through extra cover. Yasir Shah gets across and saves one",
    "nicely bowled. 132 kph, full and straight. Off the outside half to gully",
    "nice end, 132 kph outside off, no stroke offered",
    ],
    [
        "beauty. Some swing at last. Goes wide of the crease, bowls it full just outside off. Warner is lulled into playing but the ball pitches and moves away to beat the bat. Superb",
    "141 kph, tight around off. Defended into the off side",
    "oh no. This has been an awful start from Pakistan. Full outside off from Amir, stay there. Warner drives straight to Misbah at mid-off. The ball goes through Misbah's legs and for four",
    "around off on a full length, defended to mid-on",
    "more runs. On Warner's pads this time and Warner says thanks with a flick through midwicket. Not even three overs done and we can already see who is the alpha in this series",
    "tight around off at 143 kph, blocked",
    ],
    [
        "angled away outside off, left alone",
    "pushed wider outside off, waaaaay too wide to draw a stroke, left alone",
    "136 kph, full outside off, driven to mid-off",
    "134 kph, yorker outside off, jammed out",
    "angled away again, left alone",
    "full and wide, another leave, Some control, a maiden",
    ],
    [
        "140 kph, on the pads, worked to fine leg for a couple. Warner hurries back for two, makes it with time to spare",
    "excellent change-up. The most impressive factor was the direction, right at Warner's neck. He ducks out of the line, has a smile on his face after getting himself off the pitch",
    "like this length, short again at off. Warner sways out this time",
    "prodded into the off side, gets one towards cover",
    "139 kph, back of a length outside off. Renshaw bails his bat out. A good leave",
    "139 kph, good line around off, defended back",
    ],
    ["lovely batting. Full and wide outside off, Warner reaches out for it and lifts the ball over extra cover. Cover is there? No problem. I'll go over the top. It's the first day of a Test match.",
    "good length on off, 137 kph, defended into the off side",
    "134 kph, short and wide. Quite a bad ball to Warner. He chops one back towards and over the stumps. All safe for Warner",
    "nicely bowled. No swing but at least the line is good. Full, straight, blocked",
    "finds the gap for four more. Warner is in some touch. Short ball on leg, Warner takes it on, gets on top of the bounce and plays the ball in the gap at backward square leg. Bisects the field.",
    "racing away. He's 38 now. Six overs bowled. Good length outside off, a chopped drive-punch through extra cover.",
    ],
    [
     "full and straight, flicked through square leg for four more. Good from Renshaw, keeping his own amid some fantastic batting",
    "141 kph, excellent single. Full on off, dabbed towards extra cover for one. Warner will want the strike and Renshaw gives it",
    "143 kph, Amir goes wide of the crease, good length on off, pushed towards cover",
    "slower ball, 126 kph, Warner drives to wide mid-off. Amir looking to get Warner splicing a drive in the air and for that, the <b>length on that ball was fantastic</b>. Good thinking",
    "bouncer on off, Renshaw avoids with a duck",
    "140 kph, full outside off, played down towards gully",
    ],
    [
        "137 kph, good length on off, no swing. Dead straight, defended",
    "134 kph, blocked to deep point for one",
    "back of a length on off, Renshaw gets on top of the bounce to defend",
    "good length outside off, left alone",
    "pushed fuller and slightly closer to off. Sixth stump. Renshaw shoulders arms",
    "another leave outside off",
    ],
    [
        "four more. Back of a length on off, sits up for Warner to swat in front of square. There is a man deep there, he doesn't care. So much confidence.",
    "back of a length outside off, Warner lets that go with a sway. Whaaat?",
    "back of a length on off, dabbed towards gully, who has to dive to save some runs",
    "crashed behind point. Not wide enough to cut. Warner stays inside the line, opens the face and guides his cut for four. That's something Warner has developed, he can even cut straight balls, saw it first in the IPL this year",
    "full and wide this time. Warner's feet are not close to the ball, the ball flies past the edge. The first loose shot of the day",
    "brings this one back in on off, blocked to mid-off",
    ],
    [
        "142 kph, Renshaw lets this one go",
    "141 kph, tight around off, defended into the off side",
    "full and wide outside off, no stroke offered",
    "good length, wide outside off. Left alone",
    "another one that is left alone outside off",
    "left alone, a maiden"
]
]

In [None]:
# Number of top-scoring overs to summarise and print.
n = 5
# Prints a "Highlighted Overs:" header followed by one summary per selected over.
v_and_s(text_lines, n)

Highlighted Overs:
Over 6:  Warner takes it on, gets on top of the bounce and plays the ball in the gap at backward square leg . He chops one back towards and over the stumps . Warner is 38 now. lovely batting .

Over 7:  Renshaw gives it 143 kph, Amir goes wide of the crease, good length on off, pushed towards cover slower ball . Warner drives to wide mid-off for four more . Amir looking to get Warner splicing a drive in the air and for that, the length on that ball was fantastic .

Over 2:  Yasir Shah gets across and saves one nicely bowled . Warner square-punches through extra cover with just a short-arm jab . Warner goes back and slaps the ball through cover for four . Not much swing, just timed superb batting .

Over 3:  Warner drives straight to Misbah at mid-off with a flick through mid-on . Amir bowls full outside off at 143 kph . Warner drives for four around off on a full length full length .

Over 5:  The most impressive factor was the direction right at Warner's neck . 140 