https://towardsdatascience.com/create-a-simple-search-engine-using-python-412587619ff5

## Libraries

In [41]:
import os
import re
import string

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer


## Scrape documents

In [2]:
# Download the Kompas football landing page. The timeout keeps a stalled
# connection from hanging the cell indefinitely.
r = requests.get('https://bola.kompas.com/', timeout=30)
# Stop on an HTTP error rather than parsing an error page
r.raise_for_status()
# Create an object to parse the HTML format
soup = BeautifulSoup(r.content, 'html.parser')
# Locate the block that lists the most popular news (Fig. 1)
most_wrap = soup.find('div', {'class': 'most__wrap'})
if most_wrap is None:
    raise RuntimeError("Could not find the 'most__wrap' block; the page layout may have changed")
# Collect every article URL with '?page=all' appended (presumably the
# single-page view of the article). href=True skips anchors that carry no
# link instead of raising KeyError on them.
link = [a['href'] + '?page=all' for a in most_wrap.find_all('a', href=True)]

In [4]:
# For each link, retrieve its paragraphs, join them into one string, and save it to documents (Fig. 2)
documents = []
for url in link:
    # Request the article; the timeout stops one stalled page from hanging the loop
    r = requests.get(url, timeout=30)
    if not r.ok:
        print(f"Skipping {url}: HTTP {r.status_code}")
        continue

    # Initialize BeautifulSoup object to parse the content
    soup = BeautifulSoup(r.content, 'html.parser')

    # Article body container; a page without it has no text to index, so it
    # is skipped instead of aborting the whole scrape with an AttributeError
    content = soup.find('div', {'class': 'read__content'})
    if content is None:
        print("Skipping (no 'read__content' block):", url)
        continue

    # Combine all paragraphs of the article into one string. A distinct loop
    # name is used so the outer loop variable is no longer shadowed.
    documents.append(' '.join(p.text for p in content.find_all('p')))

## Save files

In [9]:
# Write each article to its own file, numbered from 1. Mode 'w' overwrites,
# so re-running this cell does not append a second copy of the text to files
# left over from an earlier run. The `with` block closes the file itself.
for i, document in enumerate(documents, start=1):
    with open(f'document_{i}.txt', 'w', encoding="utf-8") as f:
        f.write(document)

## Read files

In [27]:
# Load every .txt file in the working directory as one document.
# - endswith('.txt') avoids picking up names that merely contain "txt"
# - sorted() makes the document order reproducible between runs
# - utf-8 matches the encoding the files are written with when saved
documents = []
for file in sorted(os.listdir()):
    if file.endswith('.txt'):
        with open(file, 'r', encoding='utf-8') as f:
            documents.append(f.read())

In [33]:
# Tokenize each document with NLTK's word_tokenize (one token list per document)
token_docs = [word_tokenize(text) for text in documents]

In [81]:
# Base stop-word list: NLTK's English stop words.
# NOTE(review): the scraped articles look Indonesian — confirm English is the intended list.
stop_words = stopwords.words('english')

In [82]:
# Keep 'in' and 'to' as searchable terms. Rebuilding the list (instead of
# calling .remove) means re-running this cell does not raise ValueError once
# the words are already gone.
stop_words = [word for word in stop_words if word not in ('in', 'to')]

# Also treat these punctuation tokens as stop words; the membership check
# keeps repeated runs from appending duplicates.
for symbol in [".", ",", "'", "-", "_", ":", "(", ")", "&"]:
    if symbol not in stop_words:
        stop_words.append(symbol)

In [93]:
# Drop stop words from every tokenized document, keeping the token order.
# The result is one list of remaining tokens per document.
documents = [
    [term for term in tokens if term not in stop_words]
    for tokens in token_docs
]

In [5]:
# Normalize every document into lowercase ASCII letters separated by single
# spaces, ready for TF-IDF vectorization.
documents_clean = []
for d in documents:
    # The stop-word cell stores each document as a list of tokens, while
    # re.sub only accepts strings — join token lists back into text first.
    document_test = d if isinstance(d, str) else ' '.join(d)
    # Replace runs of non-ASCII characters with a space
    document_test = re.sub(r'[^\x00-\x7F]+', ' ', document_test)
    # Remove mentions (@username)
    document_test = re.sub(r'@\w+', '', document_test)
    # Lowercase the document
    document_test = document_test.lower()
    # Replace punctuation with a space
    document_test = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', document_test)
    # Remove digits
    document_test = re.sub(r'[0-9]', '', document_test)
    # Collapse runs of whitespace into a single space
    document_test = re.sub(r'\s{2,}', ' ', document_test)
    documents_clean.append(document_test)

In [18]:
# Instantiate a TfidfVectorizer object
vectorizer = TfidfVectorizer()
# It fits the data and transform it as a vector
X = vectorizer.fit_transform(documents_clean)
# Convert the X as transposed matrix
X = X.T.toarray()
# Create a DataFrame and set the vocabulary as the index
df = pd.DataFrame(X, index=vectorizer.get_feature_names_out())
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
aaron,0.026937,0.0,0.0,0.256663,0.0,0.0,0.0,0.032908,0.0,0.0
abu,0.0,0.0,0.0,0.0,0.0,0.0,0.102258,0.0,0.0,0.0
ac,0.0,0.0,0.0,0.0,0.234746,0.0,0.0,0.0,0.0,0.0
ada,0.0,0.034354,0.0,0.032599,0.0,0.101841,0.033808,0.0,0.0,0.0
adalah,0.0,0.027891,0.027162,0.026466,0.0,0.02067,0.027448,0.0,0.033925,0.0


In [22]:
def get_similar_articles(q, df):
  """Print the documents most similar to a query, best match first.

  The query is vectorized with the notebook-level ``vectorizer`` and compared
  with every column of ``df`` using cosine similarity,
  ``dot(d, q) / (||d|| * ||q||)``. Documents whose similarity is zero are
  not printed.

  Parameters
  ----------
  q : str
      The search query.
  df : pandas.DataFrame
      Term-by-document matrix: one row per vocabulary term, one column per
      document. Column labels are used to index ``documents_clean``.

  Returns
  -------
  None
      Results are printed to stdout.
  """
  print("query:", q)
  print("Berikut artikel dengan nilai cosine similarity tertinggi: ")
  # Convert the query into a vector over the same vocabulary as df's rows
  q_vec = vectorizer.transform([q]).toarray().reshape(df.shape[0],)
  q_norm = np.linalg.norm(q_vec)
  sim = {}
  # Score every document column, however many there are
  for i in df.columns:
    doc_vec = df.loc[:, i].values
    # The denominator is the PRODUCT of both norms; it must be parenthesized,
    # otherwise the score is multiplied by the query norm instead of divided.
    denom = np.linalg.norm(doc_vec) * q_norm
    # A zero norm means an empty document or a query with no known terms:
    # report no similarity instead of dividing by zero (which yields nan).
    sim[i] = np.dot(doc_vec, q_vec) / denom if denom > 0 else 0.0

  # Sort the values, highest similarity first
  sim_sorted = sorted(sim.items(), key=lambda x: x[1], reverse=True)
  # Print the articles and their similarity values
  for k, v in sim_sorted:
    if v != 0.0:
      print("Nilai Similaritas:", v)
      print(documents_clean[k])
      print()

In [23]:
# Add The Query
q1 = 'Real'
# Call the function
get_similar_articles(q1, df)

query: Real
Berikut artikel dengan nilai cosine similarity tertinggi: 
q_vec [0. 0. 0. ... 0. 0. 0.]
