https://towardsdatascience.com/create-a-simple-search-engine-using-python-412587619ff5

## Libraries

In [1]:
import requests
from bs4 import BeautifulSoup
import os
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from natsort import natsorted
from nltk.stem import PorterStemmer
# import re
# import string
# from sklearn.feature_extraction.text import TfidfVectorizer
# import pandas as pd
# import numpy as np


## Read files

In [105]:
documents = []
for file in os.listdir('Articles/'):
    if 'txt' in file:
        with open('Articles/'+file, 'r', encoding='latin1') as f:
            documents.append(f.read())

In [106]:
len(documents)

10

# First Phase $:-$

## Apply tokenization

In [107]:
token_docs = []
for document in documents:
    token_docs.append(word_tokenize(document))

## Stop words

In [108]:
stop_words = stopwords.words('english')

#### Remove in , to from stop words
#### Add some extra punctuation

In [109]:
stop_words.remove('in')
stop_words.remove('to')

stop_words.extend([".", ",", "'", "-", "_", ":", "(", ")", "&"])

In [110]:
documents = []
for token in token_docs:
    each_token = []
    for term in token:
        if term not in stop_words:
            each_token.append(term)
    documents.append(each_token)

In [111]:
documents[0][:10]

['Stephen',
 'Curry',
 'scored',
 '24',
 'points',
 'Golden',
 'State',
 'Warriors',
 'continued',
 'fine']

# Second phase $:-$

### Implement function to do all steps in first phase

In [4]:
def preprocessing(doc):
    token_docs = word_tokenize(doc)
    stop_words = stopwords.words('english')
    stop_words.remove('in')
    stop_words.remove('to')
    stop_words.extend([".", ",", "'", "-", "_", ":", "(", ")", "&", "/", "\\"])
    prepared_doc = []
    for term in token_docs:
        if term not in stop_words:
            prepared_doc.append(term)
    return prepared_doc


In [15]:
# Initialize the stemmer.
stemmer = PorterStemmer()
 
# Initialize the file no.
fileno = 0
 
# Initialize the dictionary.
pos_index = {}
 
# Initialize the file mapping (fileno -> file name).
file_map = {}

In [16]:
# Open files.
file_names = natsorted(os.listdir("Articles"))
print(file_names)
# For every file.
for file_name in file_names:

    # Read file contents.
    with open(f'Articles/{file_name}', 'r', encoding='latin1') as f:
        stuff = f.read()
    # This is the list of words in order of the text.
    # We need to preserve the order because we require positions.
    # 'preprocessing' function does some basic punctuation removal,
    # stopword removal etc.
    final_token_list = preprocessing(stuff)

    # For position and term in the tokens.
    for pos, term in enumerate(final_token_list):
        # print(pos, '-->' ,term)
        # First stem the term.
        term = stemmer.stem(term)
        # print(term)
        # If term already exists in the positional index dictionary.
        if term in pos_index:
                
            # Increment total freq by 1.
            pos_index[term][0] = pos_index[term][0] + 1
                
            # Check if the term has existed in that DocID before.
            if fileno in pos_index[term][1]:
                pos_index[term][1][fileno].append(pos)
                    
            else:
                pos_index[term][1][fileno] = [pos]

        # If term does not exist in the positional index dictionary
        # (first encounter).
        else:
                
            # Initialize the list.
            pos_index[term] = []
            # The total frequency is 1.
            pos_index[term].append(1)
            # The postings list is initially empty.
            pos_index[term].append({})     
            # Add doc ID to postings list.
            pos_index[term][1][fileno] = [pos]

    # Map the file no. to the file name.
    file_map[fileno] = "test/" + file_name

    # Increment the file no. counter for document ID mapping             
    fileno += 1

['doc1.txt', 'doc2.txt', 'doc3.txt', 'doc4.txt', 'doc5.txt', 'doc6.txt', 'doc7.txt', 'doc8.txt', 'doc9.txt', 'doc10.txt']


### displays each term 

In [20]:
pos_index

{'stephen': [1, {0: [0]}],
 'curri': [2, {0: [1, 18]}],
 'score': [21,
  {0: [2, 28, 67, 128],
   1: [354, 506, 511, 553, 598, 782, 965],
   3: [105],
   5: [1126],
   6: [406, 456, 556, 671],
   7: [58, 136, 811, 814]}],
 '24': [2, {0: [3], 7: [956]}],
 'point': [10,
  {0: [4, 35, 130], 5: [1106, 1622], 7: [158, 184], 8: [334, 502], 9: [108]}],
 'golden': [2, {0: [5], 5: [1478]}],
 'state': [8, {0: [6], 2: [195, 473], 4: [79], 6: [64, 449, 490], 7: [221]}],
 'warrior': [2, {0: [7, 41]}],
 'continu': [7,
  {0: [8], 1: [751], 2: [427], 5: [1425], 6: [385], 7: [636, 840]}],
 'fine': [2, {0: [9], 3: [341]}],
 'form': [4, {0: [10], 5: [461], 7: [479, 857]}],
 'home': [20,
  {0: [11, 43, 133],
   1: [228, 1177],
   2: [58, 69, 78, 280],
   5: [249, 1166, 1701],
   6: [283, 681, 746],
   7: [39],
   8: [28, 717],
   9: [118, 995]}],
 '111-101': [1, {0: [12]}],
 'victori': [9,
  {0: [13, 59, 134], 2: [231], 5: [669, 868], 6: [707], 7: [127], 8: [782]}],
 'new': [18,
  {0: [14, 60],
   1: [408

### Allow users to write phrase query 

In [17]:
test_term = 'sport'
test_pos_index = pos_index[test_term]
print('term is', test_term, 'and the count is', test_pos_index[0])
for doc in test_pos_index[1]:
    print(doc, ':', test_pos_index[1][doc])

term is sport and the count is 19
1 : [44, 72]
2 : [94, 158, 461]
3 : [54, 262]
5 : [7]
6 : [256, 319]
7 : [89, 106]
8 : [201, 649, 878]
9 : [387, 480, 1167, 1179]
