# Create A Simple Search Engine Using Python 
## Utilize TF-IDF and Cosine Similarity to retrieve articles similar to a query

Information retrieval is an important task today. You may be wondering how a system can retrieve the articles we want from a query. Here are the steps:
1. Extract documents from the Internet (either by web scraping or by collecting them manually)
2. Clean the documents to make the retrieval much easier
3. Create a Term-Document Matrix with TF-IDF weighting
4. Write your queries and convert each one to a vector (based on TF-IDF)
5. Calculate the cosine similarity between the query and the document and repeat the process on each document.
6. Finally, show the most similar documents


In [2]:
import requests
from bs4 import BeautifulSoup

In [3]:
import re
import string

In [4]:
def _clean_document(text):
    """Normalise one raw article into lowercase text made of ASCII letters and spaces."""
    # Every run of non-ASCII characters becomes a single space.
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    # Drop @mentions entirely.
    text = re.sub(r'@\w+', '', text)
    text = text.lower()
    # Punctuation becomes a space so the words on either side stay separate.
    text = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', text)
    # Digits are removed outright (no space is inserted in their place).
    text = re.sub(r'[0-9]', '', text)
    # Collapse runs of two or more whitespace characters into one space.
    return re.sub(r'\s{2,}', ' ', text)


def retrieve_docs_and_clean(timeout=10):
    """Download the NDTV FIFA World Cup 2022 news articles and clean their text.

    The listing page is fetched first, every article linked from it is then
    downloaded, and the text of all ``<p>`` elements of each article is joined
    into one string and normalised.

    Args:
        timeout: Seconds to wait on each HTTP request before giving up, so a
            stalled server cannot hang the notebook cell indefinitely.

    Returns:
        A list with one cleaned string per article, in listing order.

    Raises:
        AttributeError: If the listing page has no ``div`` with class
            ``lst-pg_hd`` (``soup.find`` then returns ``None``).
        requests.RequestException: If a request fails or times out.
    """
    # Imported locally so the function does not rely on another notebook cell.
    from urllib.parse import urljoin

    site_root = 'https://sports.ndtv.com/'
    r = requests.get('https://sports.ndtv.com/fifa-world-cup-2022/news', timeout=timeout)
    soup = BeautifulSoup(r.content, 'html.parser')

    # NOTE: the selectors below are tied to the markup of the listing page and
    # must be adapted if the URL above (or the site's layout) changes.
    listing = soup.find('div', {'class': 'lst-pg_hd'})
    # urljoin copes with relative, root-relative and absolute hrefs alike,
    # whereas plain concatenation only works for relative ones.
    links = [
        urljoin(site_root, anchor['href']) + '?page=all'
        for anchor in listing.find_all('a', {'class': 'lst-pg_ttl'})
    ]

    # Retrieve paragraphs: one string per article.
    documents = []
    for url in links:
        page = requests.get(url, timeout=timeout)
        article = BeautifulSoup(page.content, 'html.parser')
        documents.append(' '.join(p.text for p in article.find_all('p')))

    # Clean paragraphs.
    return [_clean_document(document) for document in documents]

In [5]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
# Scrape the articles and clean them; `docs` holds one cleaned string per article.
docs = retrieve_docs_and_clean()

# Create Term-Document Matrix with TF-IDF weighting
# `vectorizer` is kept as a global because get_similar_articles() reuses it
# to turn a query into a vector.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(docs)

# Create a DataFrame
# X is transposed so that rows are vocabulary terms (used as the index)
# and each column is one document.
df = pd.DataFrame(X.T.toarray(), index=vectorizer.get_feature_names_out())
print(df.head())
print(df.shape)

                 0         1    2    3         4    5    6    7         8   \
abandoned  0.000000  0.000000  0.0  0.0  0.000000  0.0  0.0  0.0  0.000000   
ability    0.085854  0.000000  0.0  0.0  0.000000  0.0  0.0  0.0  0.000000   
about      0.052833  0.020413  0.0  0.0  0.000000  0.0  0.0  0.0  0.056845   
above      0.000000  0.000000  0.0  0.0  0.000000  0.0  0.0  0.0  0.000000   
absent     0.000000  0.000000  0.0  0.0  0.002453  0.0  0.0  0.0  0.000000   

                 9         10        11   12   13   14        15   16   17  
abandoned  0.000000  0.000000  0.000000  0.0  0.0  0.0  0.103158  0.0  0.0  
ability    0.065939  0.000000  0.000000  0.0  0.0  0.0  0.000000  0.0  0.0  
about      0.000000  0.054923  0.000000  0.0  0.0  0.0  0.000000  0.0  0.0  
above      0.000000  0.000000  0.026193  0.0  0.0  0.0  0.000000  0.0  0.0  
absent     0.000000  0.000000  0.000000  0.0  0.0  0.0  0.000000  0.0  0.0  
(1734, 18)


In [7]:
# NOTE(review): this cell repeats the previous one statement for statement and
# therefore downloads every article a second time; only the final display
# differs (rich `df.head()` output instead of print).
docs = retrieve_docs_and_clean()
# Create Term-Document Matrix with TF-IDF weighting
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(docs)

# Create a DataFrame
# Rows are vocabulary terms, columns are documents (X is transposed).
df = pd.DataFrame(X.T.toarray(), index=vectorizer.get_feature_names_out())
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
abandoned,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.103158,0.0,0.0
ability,0.085854,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.065939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
about,0.052833,0.020413,0.0,0.0,0.0,0.0,0.0,0.0,0.056845,0.0,0.054923,0.0,0.0,0.0,0.0,0.0,0.0,0.0
above,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.026193,0.0,0.0,0.0,0.0,0.0,0.0
absent,0.0,0.0,0.0,0.0,0.002453,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
def get_similar_articles(q, df):
    """Print every article whose cosine similarity with the query is non-zero.

    Reads two module-level names: ``vectorizer`` (the fitted TF-IDF vectorizer,
    used to embed the query) and ``docs`` (the cleaned article texts, printed
    as results).

    Args:
        q: The query string.
        df: Term-document matrix with one row per vocabulary term and one
            column per document; column position ``k`` is assumed to
            correspond to ``docs[k]``.

    Returns:
        None. Results are printed in descending order of similarity.
    """
    print("query:", q)
    print("The following are articles with the highest cosine similarity values: ")
    q_vec = vectorizer.transform([q]).toarray().reshape(df.shape[0],)
    q_norm = np.linalg.norm(q_vec)

    sim = {}
    # Score every document column, not just a fixed number of them.
    for i in range(df.shape[1]):
        doc_vec = df.iloc[:, i].to_numpy()
        # Cosine similarity divides by the PRODUCT of both norms.
        denom = np.linalg.norm(doc_vec) * q_norm
        # A zero norm means the query or the document has no known terms;
        # treat that as "no similarity" instead of dividing by zero.
        sim[i] = np.dot(doc_vec, q_vec) / denom if denom else 0.0

    sim_sorted = sorted(sim.items(), key=lambda x: x[1], reverse=True)

    for k, v in sim_sorted:
        if v != 0.0:
            print("Similarity Values:", v)
            print(docs[k])
            print()

In [9]:
# Sample single-word queries.
q1 = 'barcelona'
q2 = 'spain'
q3 = 'argentina'

# Run each query against the TF-IDF matrix, printing a dashed rule
# between consecutive result sets.
get_similar_articles(q1, df)
print('-'*100)
get_similar_articles(q2, df)
print('-'*100)
get_similar_articles(q3, df)

query: barcelona
The following are articles with the highest cosine similarity values: 
Similarity Values: 0.004397362878104314
poland captain robert lewandowski refused to confirm if he had played his last ever game at the world cup after his side were knocked out of the tournament in qatar in a last defeat by france on sunday barcelona striker lewandowski scored a late consolation from the penalty spot for a poland side who were outclassed by the fearsome french attack in doha he will be almost by the time the next world cup comes around in north america in but he suggested that issues beyond his physical condition were more likely to see him end his international career physically i m not afraid of this but we have so many different things outside of football whether your happiness is still there and what s going on around so it s tough to say now admitted the former bayern munich striker from the sporting side i m not afraid but there are different things that altogether can decide

# BM25

In [13]:
pip install rank_bm25

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank-bm25
Successfully installed rank-bm25-0.2.2
Note: you may need to restart the kernel to use updated packages.


In [10]:
from rank_bm25 import BM25Okapi

In [11]:
# Tokenise every cleaned document on whitespace. split() without an argument
# is used instead of split(" "): the cleaner only collapses runs of two or more
# whitespace characters, so a lone newline/tab or a leading/trailing space can
# survive, and split(" ") would turn those into fused or empty-string tokens.
tokenized_corpus = [doc.split() for doc in docs]
# Build the BM25 index once; rank_bm25() reads this module-level object.
bm25 = BM25Okapi(tokenized_corpus)
def rank_bm25(docs, query, top_n=3):
    """Print the best-scoring articles for a query according to BM25.

    Scores come from the module-level ``bm25`` index, so ``docs`` must be the
    same list (in the same order) that the index was built from.

    Args:
        docs: The cleaned article texts; ``docs[i]`` is printed for hit ``i``.
        query: The query string. It is lowercased and split on whitespace
            because the corpus was lowercased during cleaning.
        top_n: Maximum number of articles to print (default 3).

    Returns:
        None. Results are printed in descending score order, followed by a
        dashed separator line.
    """
    print("query:", query)
    # The header and label texts mirror the TF-IDF cell, although the values
    # printed here are BM25 scores returned by bm25.get_scores().
    print("The following are articles with the highest cosine similarity values: ")
    # split() (rather than split(" ")) avoids empty tokens on repeated spaces.
    tokenized_query = query.lower().split()
    doc_scores = bm25.get_scores(tokenized_query)
    # Keep only documents with a non-zero score, remembering their index.
    rang = {i: score for i, score in enumerate(doc_scores) if score != 0.}
    rang_sorted = sorted(rang.items(), key=lambda x: x[1], reverse=True)
    for i, j in rang_sorted[:top_n]:
        print("Similarity Values:", j)
        print(docs[i])
        print()
    print('-'*100)

In [12]:
# Same sample queries as in the TF-IDF section, now ranked with BM25.
q1 = 'barcelona'
q2 = 'spain'
q3 = 'argentina'
# rank_bm25 prints its own dashed separator after each result set.
rank_bm25(docs,q1)
rank_bm25(docs,q2)
rank_bm25(docs,q3)

query: barcelona
The following are articles with the highest cosine similarity values: 
Similarity Values: 2.541120194103686
poland captain robert lewandowski refused to confirm if he had played his last ever game at the world cup after his side were knocked out of the tournament in qatar in a last defeat by france on sunday barcelona striker lewandowski scored a late consolation from the penalty spot for a poland side who were outclassed by the fearsome french attack in doha he will be almost by the time the next world cup comes around in north america in but he suggested that issues beyond his physical condition were more likely to see him end his international career physically i m not afraid of this but we have so many different things outside of football whether your happiness is still there and what s going on around so it s tough to say now admitted the former bayern munich striker from the sporting side i m not afraid but there are different things that altogether can decide wh