# Create A Simple Search Engine Using Python 
## Utilize TF-IDF and Cosine Similarity to retrieve similar articles with query

Information Retrieval right now is an important task. Probably you're wondering, how does the system can retrieve articles that we want using a query? Here are the steps,
1. Extract documents from the Internet (It could be Web Scraping or extract manually)
2. Clean the documents to make the retrieval much easier
3. Create a Term-Document Matrix with TF-IDF weighting
4. Write your queries and convert it as vector (based on TF-IDF)
5. Calculate the cosine similarity between the query and the document and repeat the process on each document.
6. Finally, show the document


In [1]:
import requests
from bs4 import BeautifulSoup

In [4]:
import re
import string

In [5]:
def retrieve_docs_and_clean():
    r = requests.get('https://sports.ndtv.com/fifa-world-cup-2022/news')
    soup = BeautifulSoup(r.content, 'html.parser')
    #THE FOLLOWING CODE NEED TO BE MODIFIED TO SUITE FOR THE ABOVE URL
    link = []
    for i in soup.find_all('a',attrs={'href':re.compile('^https')}):
        i['href'] = i['href'] + '?page=all'
        link.append(i['href'])
    # Retrieve Paragraphs
    documents = []
    for i in link:
        r = requests.get(i)
        soup = BeautifulSoup(r.content, 'html.parser')
        sen = []
        for i in soup.find_all('p'):
            sen.append(i.text)
        documents.append(' '.join(sen))
    # Clean Paragraphs
    documents_clean = []
    for d in documents:
        document_test = re.sub(r'[^\x00-\x7F]+', ' ', d)
        document_test = re.sub(r'@\w+', '', document_test)
        document_test = document_test.lower()
        document_test = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', document_test)
        document_test = re.sub(r'[0-9]', '', document_test)
        document_test = re.sub(r'\s{2,}', ' ', document_test)
        documents_clean.append(document_test)
    return documents_clean    

In [8]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
docs = retrieve_docs_and_clean()

# Create Term-Document Matrix with TF-IDF weighting
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(docs)

# Create a DataFrame
df = pd.DataFrame(X.T.toarray(), index=vectorizer.get_feature_names_out())
print(df.head())
print(df.shape)

             0         1    2    3    4    5    6    7    8    9   ...   64  \
aaron  0.000000  0.000000  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0   
aasif  0.000000  0.000000  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0   
aayan  0.000000  0.000000  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0   
aaye   0.032442  0.000000  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0   
aayi   0.029597  0.015279  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0   

        65   66   67   68   69   70   71   72   73  
aaron  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
aasif  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
aayan  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
aaye   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
aayi   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  

[5 rows x 74 columns]
(3075, 74)


In [11]:
docs = retrieve_docs_and_clean()
# Create Term-Document Matrix with TF-IDF weighting
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(docs)

# Create a DataFrame
df = pd.DataFrame(X.T.toarray(), index=vectorizer.get_feature_names_out())
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,64,65,66,67,68,69,70,71,72,73
aaron,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aasif,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aayan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aaye,0.03244,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aayi,0.029596,0.015254,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
def get_similar_articles(q, df):
    print("query:", q)
    print("The following are articles with the highest cosine similarity values: ")
    q = [q]
    q_vec = vectorizer.transform(q).toarray().reshape(df.shape[0],)
    sim = {}
    for i in range(10):
        sim[i] = np.dot(df.loc[:, i].values, q_vec) / np.linalg.norm(df.loc[:, i]) * np.linalg.norm(q_vec)
  
    sim_sorted = sorted(sim.items(), key=lambda x: x[1], reverse=True)
  
    for k, v in sim_sorted:
        if v != 0.0:
            print("Similarity Values:", v)
            print(docs[k])
            print()

In [18]:
q1 = 'barcelona'
q2 = 'spain'
q3 = 'argentina'

get_similar_articles(q1, df)
print('-'*100)
get_similar_articles(q2, df)
print('-'*100)
get_similar_articles(q3, df)

query: barcelona
The following are articles with the highest cosine similarity values: 
Similarity Values: nan
 

----------------------------------------------------------------------------------------------------
query: spain
The following are articles with the highest cosine similarity values: 
Similarity Values: nan
 

----------------------------------------------------------------------------------------------------
query: argentina
The following are articles with the highest cosine similarity values: 
Similarity Values: nan
 



  sim[i] = np.dot(df.loc[:, i].values, q_vec) / np.linalg.norm(df.loc[:, i]) * np.linalg.norm(q_vec)


# BM25

In [13]:
pip install rank_bm25

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank-bm25
Successfully installed rank-bm25-0.2.2
Note: you may need to restart the kernel to use updated packages.


In [14]:
from rank_bm25 import BM25Okapi

In [15]:
tokenized_corpus = [doc.split(" ") for doc in docs]
bm25 = BM25Okapi(tokenized_corpus)
def rank_bm25(docs,query):
    print("query:", query)
    print("The following are articles with the highest cosine similarity values: ")
    tokenized_query = query.split(" ")
    doc_scores = bm25.get_scores(tokenized_query)
    rang={}
    for i in range (len(doc_scores)):
        if doc_scores[i]!=0.:
            rang[i]=doc_scores[i]
    rang_sorted = sorted(rang.items(), key=lambda x: x[1], reverse=True)
    for i,j in rang_sorted[:3]:
        print("Similarity Values:", j)
        print(docs[i])
        print()
    print('-'*100)

In [16]:
q1 = 'barcelona'
q2 = 'spain'
q3 = 'argentina'
rank_bm25(docs,q1)
rank_bm25(docs,q2)
rank_bm25(docs,q3)

query: barcelona
The following are articles with the highest cosine similarity values: 
----------------------------------------------------------------------------------------------------
query: spain
The following are articles with the highest cosine similarity values: 
Similarity Values: 3.583882837776834
a record six asian teams will attempt against the odds to emulate south korea s historic run to the semi finals as the world cup returns to the continent in qatar no asian team has managed to match the stunning achievements years ago of guus hiddink s vibrant korean side who reached the last four on home soil after dumping out portugal italy and spain hosts qatar japan south korea saudi arabia iran and australia all from the asian football confederation will dream in the coming weeks of making a similar impact but they have their work cut out the koreans have only reached the knock out round once since and their chances have been dealt a blow by a facial injury to striker son heung