In [None]:
import re
import string
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
# NOTE: gensim.summarization was removed in gensim 4.0, so this import needs gensim<4.
from gensim.summarization.bm25 import BM25
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
def _clean_document(text):
  """Normalise one scraped article to lowercase ASCII text without handles, punctuation or digits."""
  text = re.sub(r'[^\x00-\x7F]+', ' ', text)   # runs of non-ASCII characters -> one space
  text = re.sub(r'@\w+', '', text)             # drop @handles ('\w', not a literal 'w')
  text = text.lower()
  text = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', text)
  text = re.sub(r'[0-9]', '', text)
  text = re.sub(r'\s{2,}', ' ', text)          # collapse the gaps left by the removals above
  return text


def retrieve_docs_and_clean():
  """Scrape the NDTV FIFA World Cup 2022 news listing and return the cleaned article texts.

  The listing page is fetched, every headline link inside the ``lst-pg_hd``
  container is followed (with ``?page=all`` appended), the ``<p>`` texts of each
  article body are joined with spaces, and the result is normalised by
  ``_clean_document``.

  Returns:
    list[str]: one cleaned string per article that exposed the expected body
    container, in listing order.

  Raises:
    requests.HTTPError: if the listing page answers with an error status.
    RuntimeError: if the listing page does not contain the headline container.
  """
  site_root = 'https://sports.ndtv.com/'
  timeout = 30  # seconds; requests waits indefinitely when no timeout is given

  listing = requests.get('https://sports.ndtv.com/fifa-world-cup-2022/news', timeout=timeout)
  listing.raise_for_status()
  soup = BeautifulSoup(listing.content, 'html.parser')

  headline_box = soup.find('div', {'class': 'lst-pg_hd'})
  if headline_box is None:
    raise RuntimeError("Listing page has no 'lst-pg_hd' container; the page layout may have changed")

  links = []
  for anchor in headline_box.find_all('a', {'class': 'lst-pg_ttl'}):
    href = anchor.get('href')
    if href:
      # urljoin copes with both '/path' and absolute hrefs without doubling the slash.
      links.append(urljoin(site_root, href) + '?page=all')

  # Retrieving the paragraphs
  documents = []
  for url in links:
    page = requests.get(url, timeout=timeout)
    article = BeautifulSoup(page.content, 'html.parser')
    body = article.find('div', {'class': 'sp-cn pg-str-com js-ad-section'})
    if body is None:
      # No article container on this page: skip it instead of failing on None.
      continue
    documents.append(' '.join(p.text for p in body.find_all('p')))

  # Cleaning the paragraphs
  return [_clean_document(d) for d in documents]


In [None]:
# Scrape and normalise the article corpus (performs network requests).
docs = retrieve_docs_and_clean()

# Fit TF-IDF weights on the corpus.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(docs)

# Term-document view: transpose so each row is a vocabulary term and each
# column is one article.
df = pd.DataFrame(
  data=X.toarray().T,
  index=vectorizer.get_feature_names_out(),
)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
abandoned,0.0,0.0,0.0,0.0,0.091826,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
about,0.068687,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
above,0.0,0.022844,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ac,0.0,0.0,0.0,0.049312,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
according,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.053263,0.081978,0.0,0.0,0.0


In [None]:
# (rows, columns) of the term-document frame: rows are vocabulary terms
# (the index is the vectorizer's feature names), columns are articles.
df.shape

(1572, 18)

## ***Cosine Similarity***

In [None]:
def get_similar_articles(q, df, top_k=10):
  """Rank the articles by cosine similarity between their TF-IDF vectors and a query.

  Relies on the module-level ``vectorizer`` (to embed the query) and ``docs``
  (to print the article text).

  Args:
    q: query string.
    df: term-document frame with one row per vocabulary term and one column
      per article; column position ``k`` must correspond to ``docs[k]``.
    top_k: maximum number of matches to print/return; ``None`` means no limit.

  Returns:
    list[tuple[int, float]]: ``(article position, cosine similarity)`` for the
    articles with non-zero similarity, best first (at most ``top_k`` entries).
  """
  print("Query:", q)
  print("The following are articles with the highest cosine similarity")
  print()

  term_doc = df.to_numpy(dtype=float)
  q_vec = np.asarray(vectorizer.transform([q]).toarray(), dtype=float).reshape(term_doc.shape[0])

  # Score every article column at once instead of only the first ten.
  dots = q_vec @ term_doc
  # cos = dot / (|d| * |q|): the denominator is the *product* of both norms.
  denom = np.linalg.norm(term_doc, axis=0) * np.linalg.norm(q_vec)
  # A zero norm (empty article or out-of-vocabulary query) means similarity 0, not NaN.
  sims = np.divide(dots, denom, out=np.zeros_like(dots), where=denom > 0)

  ranked = sorted(enumerate(sims.tolist()), key = lambda kv: kv[1], reverse = True)
  hits = [(k, v) for k, v in ranked if v != 0.0][:top_k]

  for k, v in hits:
    print("Cosine Similarity Value:", v)
    print(docs[k])
    print()

  return hits

q1, q2, q3 = 'messi', 'qatar', 'argentina'

# Run every sample query through the cosine-similarity ranking and print a
# horizontal rule after each result set.
for sample_query in (q1, q2, q3):
  get_similar_articles(sample_query, df)
  print('-' * 100)

Query: messi
The following are articles with the highest cosine similarity

Cosine Similarity Value: 0.1403526492196313
lionel messi finally scored a goal in the knockout rounds of the world cup on saturday as he inspired argentina to a win over australia that sets up a mouthwatering quarter final showdown with the netherlands who proved too strong for the united states earlier the argentina captain marked his th career appearance with his th goal to open the scoring in the first half at doha s ahmad bin ali stadium it was a classy finish from a player appearing at his fifth world cup but who had never previously found the net in a knockout tie at the tournament he is looking to win for the first time at the age of window rrcode window rrcode rrcode push function function v d o ai ai d createelement script ai defer true ai async true ai src v location protocol o d head appendchild ai window document a vdo ai core v ndtv vdo ai js that reflex god level messipic twitter com eynsqvqw stev

## ***BM25***

In [None]:
def simple_tok(sent:str):
  """Tokenise `sent` by splitting on runs of whitespace; returns the list of tokens."""
  tokens = sent.split()
  return tokens
def similar_articles_bm25(query, top_k=10, average_idf=100):
  """Rank the articles in the module-level ``docs`` against a query using BM25.

  Relies on the module-level ``docs`` corpus, ``simple_tok`` tokenizer and the
  imported ``BM25`` class.

  Args:
    query: query string; it is lower-cased before tokenising because the
      corpus texts are lower-cased during cleaning.
    top_k: maximum number of matches to print/return; ``None`` means no limit.
    average_idf: value forwarded to ``BM25.get_scores``.
      NOTE(review): 100 is the notebook's original constant, not a value
      derived from the corpus, and the ``average_idf`` keyword presumably
      only exists in older gensim 3.x releases — confirm against the
      installed version.

  Returns:
    list[tuple[int, float]]: ``(article position, BM25 score)`` for the
    articles with a non-zero score, best first (at most ``top_k`` entries).
  """
  print("Query:", query)
  print("The following are the articles with the highest BM25 scores corresponding to the query: ")
  print()

  tok_corpus = [simple_tok(s) for s in docs]
  # Match the corpus casing so that e.g. 'Messi' finds 'messi'.
  query_tokens = simple_tok(query.lower())

  bm25 = BM25(tok_corpus)
  scores = bm25.get_scores(query_tokens, average_idf = average_idf)

  order = sorted(range(len(scores)), key = lambda i: scores[i], reverse = True)
  # Articles scoring exactly 0 share no term with the query, so they are not results.
  hits = [(i, scores[i]) for i in order if scores[i] != 0][:top_k]

  for rank, (doc_idx, _score) in enumerate(hits, start = 1):
    print(f"rank {rank}: {docs[doc_idx]}")
    print()

  return hits


q1, q2, q3 = 'messi', 'qatar', 'argentina'

# Run every sample query through the BM25 ranking and print a horizontal rule
# after each result set.
for sample_query in (q1, q2, q3):
  similar_articles_bm25(sample_query)
  print('-' * 100)
 

Query: messi
The following are the articles with the highest BM25 scores corresponding to the query: 

rank 1: lionel messi produced a moment of trademark quality to score the opener in a win over australia and send argentina into the quarter finals of the world cup on saturday messi s th goal in his th career appearance but first in the knockout rounds of the world cup helped set up a last eight clash with the unbeaten netherlands on friday julian alvarez scored the other in the th minute after some calamitous australian defending before enzo fernandez s th minute own goal set up an unexpectedly nervy finale now one of the best players of all time is seeking to win the tournament for the first time in what will surely be his final world cup window rrcode window rrcode rrcode push function function v d o ai ai d createelement script ai defer true ai async true ai src v location protocol o d head appendchild ai window document a vdo ai core v ndtv vdo ai js messi s classy first half goa