https://towardsdatascience.com/create-a-simple-search-engine-using-python-412587619ff5

## Libraries

In [101]:
import requests
from bs4 import BeautifulSoup
import os
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from natsort import natsorted
from nltk.stem import PorterStemmer
# import re
# import string
# from sklearn.feature_extraction.text import TfidfVectorizer
# import pandas as pd
# import numpy as np


## Scrap documents

In [2]:
# Make a request to the website
r = requests.get('https://bola.kompas.com/')
# Create an object to parse the HTML format
soup = BeautifulSoup(r.content, 'html.parser')
# Retrieve all popular news links (Fig. 1)
link = []
for i in soup.find('div', {'class':'most__wrap'}).find_all('a'):
    i['href'] = i['href'] + '?page=all'
    link.append(i['href'])

In [4]:
# For each link, we retrieve paragraphs from it, combine each paragraph as one string, and save it to documents (Fig. 2)
documents = []
for i in link:
    # Make a request to the link
    r = requests.get(i)
  
    # Initialize BeautifulSoup object to parse the content 
    soup = BeautifulSoup(r.content, 'html.parser')
  
    # Retrieve all paragraphs and combine it as one
    sen = []
    for i in soup.find('div', {'class':'read__content'}).find_all('p'):
        sen.append(i.text)

    # Add the combined paragraphs to documents
    documents.append(' '.join(sen))

## Save files

In [9]:
for i in range(len(documents)):
    with open(f'document_{i+1}.txt', 'a', encoding="utf-8") as f:
        f.write(documents[i])
        f.close

## Read files

In [27]:
documents = []
for file in os.listdir():
    if 'txt' in file:
        with open(file, 'r', encoding='latin1') as f:
            documents.append(f.read())

# First Phase $:-$

## Apply tokenization

In [33]:
token_docs = []
for document in documents:
    token_docs.append(word_tokenize(document))

## Stop words

In [95]:
stop_words = stopwords.words('english')

#### Remove in , to from stop words
#### Add some extra punctuation

In [82]:
stop_words.remove('in')
stop_words.remove('to')

stop_words.extend([".", ",", "'", "-", "_", ":", "(", ")", "&"])

In [93]:
documents = []
for token in token_docs:
    each_token = []
    for term in token:
        if term not in stop_words:
            each_token.append(term)
    documents.append(each_token)

# Second phase $:-$

### Implement function to do all steps in first phase

In [102]:
def preprocessing(doc):
    token_docs = word_tokenize(doc)
    stop_words = stopwords.words('english')
    stop_words.remove('in')
    stop_words.remove('to')
    stop_words.extend([".", ",", "'", "-", "_", ":", "(", ")", "&"])
    prepared_doc = []
    for term in token:
        if term not in stop_words:
            prepared_doc.append(term)
    return prepared_doc


In [103]:
# Initialize the stemmer.
stemmer = PorterStemmer()
 
# Initialize the file no.
fileno = 0
 
# Initialize the dictionary.
pos_index = {}
 
# Initialize the file mapping (fileno -> file name).
file_map = {}

In [108]:
# Open files.
file_names = natsorted(os.listdir("Articles"))

# For every file.
for file_name in file_names:

    # Read file contents.
    with open(f'Articles/{file_name}', 'r', encoding='latin1') as f:
        stuff = f.read()
    # This is the list of words in order of the text.
    # We need to preserve the order because we require positions.
    # 'preprocessing' function does some basic punctuation removal,
    # stopword removal etc.
    final_token_list = preprocessing(stuff)

    # For position and term in the tokens.
    for pos, term in enumerate(final_token_list):
            
        # First stem the term.
        term = stemmer.stem(term)

        # If term already exists in the positional index dictionary.
        if term in pos_index:
                
            # Increment total freq by 1.
            pos_index[term][0] = pos_index[term][0] + 1
                
            # Check if the term has existed in that DocID before.
            if fileno in pos_index[term][1]:
                pos_index[term][1][fileno].append(pos)
                    
            else:
                pos_index[term][1][fileno] = [pos]

        # If term does not exist in the positional index dictionary
        # (first encounter).
        else:
                
            # Initialize the list.
            pos_index[term] = []
            # The total frequency is 1.
            pos_index[term].append(1)
            # The postings list is initially empty.
            pos_index[term].append({})     
            # Add doc ID to postings list.
            pos_index[term][1][fileno] = [pos]

    # Map the file no. to the file name.
    file_map[fileno] = "Articles/" + file_name

    # Increment the file no. counter for document ID mapping             
    fileno += 1

In [118]:
# Sample positional index to test the code.
sample_pos_idx = pos_index["timna"]
print("Positional Index")
print(sample_pos_idx)

Positional Index
[380, {0: [1, 23, 37, 49, 65, 112, 150, 207, 250, 280, 298, 316, 435, 496, 566, 576, 606, 622, 646], 1: [1, 23, 37, 49, 65, 112, 150, 207, 250, 280, 298, 316, 435, 496, 566, 576, 606, 622, 646], 2: [1, 23, 37, 49, 65, 112, 150, 207, 250, 280, 298, 316, 435, 496, 566, 576, 606, 622, 646], 3: [1, 23, 37, 49, 65, 112, 150, 207, 250, 280, 298, 316, 435, 496, 566, 576, 606, 622, 646], 4: [1, 23, 37, 49, 65, 112, 150, 207, 250, 280, 298, 316, 435, 496, 566, 576, 606, 622, 646], 5: [1, 23, 37, 49, 65, 112, 150, 207, 250, 280, 298, 316, 435, 496, 566, 576, 606, 622, 646], 6: [1, 23, 37, 49, 65, 112, 150, 207, 250, 280, 298, 316, 435, 496, 566, 576, 606, 622, 646], 7: [1, 23, 37, 49, 65, 112, 150, 207, 250, 280, 298, 316, 435, 496, 566, 576, 606, 622, 646], 8: [1, 23, 37, 49, 65, 112, 150, 207, 250, 280, 298, 316, 435, 496, 566, 576, 606, 622, 646], 9: [1, 23, 37, 49, 65, 112, 150, 207, 250, 280, 298, 316, 435, 496, 566, 576, 606, 622, 646], 10: [1, 23, 37, 49, 65, 112, 150, 20

In [123]:
file_list = sample_pos_idx[1]
print("\tFilename,\t\t\t\t\t\t [Positions]")
for fileno, positions in file_list.items():
    print(file_map[fileno], positions)

	Filename,						 [Positions]
Articles/document_1.txt [1, 23, 37, 49, 65, 112, 150, 207, 250, 280, 298, 316, 435, 496, 566, 576, 606, 622, 646]
Articles/document_2.txt [1, 23, 37, 49, 65, 112, 150, 207, 250, 280, 298, 316, 435, 496, 566, 576, 606, 622, 646]
Articles/document_3.txt [1, 23, 37, 49, 65, 112, 150, 207, 250, 280, 298, 316, 435, 496, 566, 576, 606, 622, 646]
Articles/document_4.txt [1, 23, 37, 49, 65, 112, 150, 207, 250, 280, 298, 316, 435, 496, 566, 576, 606, 622, 646]
Articles/document_5.txt [1, 23, 37, 49, 65, 112, 150, 207, 250, 280, 298, 316, 435, 496, 566, 576, 606, 622, 646]
Articles/document_6.txt [1, 23, 37, 49, 65, 112, 150, 207, 250, 280, 298, 316, 435, 496, 566, 576, 606, 622, 646]
Articles/document_7.txt [1, 23, 37, 49, 65, 112, 150, 207, 250, 280, 298, 316, 435, 496, 566, 576, 606, 622, 646]
Articles/document_8.txt [1, 23, 37, 49, 65, 112, 150, 207, 250, 280, 298, 316, 435, 496, 566, 576, 606, 622, 646]
Articles/document_9.txt [1, 23, 37, 49, 65, 112, 150, 207, 