In [None]:
!pip install nltk

In [17]:
import pandas as pd
import numpy as np
import spacy
from spacy.lang.en import English
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
from termcolor import colored
import random
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Load spaCy English model (download if not already installed)
nlp = English()

# Load the CSV data
df = pd.read_csv("/kaggle/input/tennis-csv/tennis.csv")

# Preprocess the text data
def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stop words
    tokens = [token for token in tokens if token not in stopwords.words('english')]
    # Convert tokens back to a string
    return ' '.join(tokens)

df['article_text'] = df['article_text'].apply(preprocess_text)

# Tokenize sentences
sentences = []
for s in df['article_text']:
    sentences.append(sent_tokenize(s))

# Flatten the list of sentences
sentences = [y for x in sentences for y in x]

# Load word embeddings (replace with your actual path and file)
word_embeddings = {}
with open('/kaggle/input/d/nithinreddy90/glove6b/glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = coefs

# Function to create sentence vectors (handles zero-length sentences)
def create_sentence_vector(sentence: str):
    if len(sentence.split()) == 0:
        return np.zeros((300,))
    else:
        word_vectors = [word_embeddings.get(word, np.zeros((300,))) for word in sentence.split()]
        return np.sum(word_vectors, axis=0) / (len(word_vectors) + 0.001)  # Avoid division by zero

# Create sentence vectors
sentence_vectors = [create_sentence_vector(sentence) for sentence in sentences]

# Calculate cosine similarity matrix
sim_mat = cosine_similarity(sentence_vectors)

# Create a network graph from the similarity matrix
G = nx.from_numpy_array(sim_mat)

# Calculate PageRank scores for sentences
scores = nx.pagerank(G)

# Rank sentences based on PageRank scores
ranked_sentences = sorted(
    ((score, sentence) for score, sentence in zip(scores.values(), sentences)),
    reverse=True)

# Print ranked sentences
print("Ranked sentences (higher score indicates more relevant):")
for score, sentence in ranked_sentences:
    print(f"- Score: {score:.4f}, Sentence: {sentence}")

# Print a random article and its summary
i = random.randint(0, len(df))

print(colored("ARTICLE".center(50), 'yellow'))
print('\n')
print(colored(df['article_text'][i], 'blue'))
print('\n')
print(colored("SUMMARY:".center(50), 'green'))
print('\n')
print(colored(' '.join([sentence for score, sentence in ranked_sentences[:5]]), 'cyan')) 

Ranked sentences (higher score indicates more relevant):
- Score: 0.0093, Sentence: `` Clearly make life difficult , still I 6-2 , 3-1 , break points , things could ended quickly today , even though I n't best serve percentage stats .
- Score: 0.0092, Sentence: Speaking Swiss Indoors tournament play Sundays final Romanian qualifier Marius Copil , world number three said given impossibly short time frame make decision , opted commitment .
- Score: 0.0091, Sentence: Major players feel big event late November combined one January Australian Open mean much tennis little rest .
- Score: 0.0091, Sentence: The Spaniard broke Anderson twice second n't get another chance South African 's serve final set .
- Score: 0.0091, Sentence: `` I felt like best weeks I get know players I playing Fed Cup weeks Olympic weeks , necessarily tournaments .
- Score: 0.0091, Sentence: Federer easier time previous match Medvedev , three-setter Shanghai two weeks ago .
- Score: 0.0090, Sentence: When I 'm courts I