https://towardsdatascience.com/create-a-simple-search-engine-using-python-412587619ff5

## Libraries

In [1]:
import requests
from bs4 import BeautifulSoup
import os
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from natsort import natsorted
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer

import pandas as pd
import numpy as np

## Read files

In [2]:
def read_files(file):
    """Return the contents of a text article stored in the ``Articles`` folder.

    Parameters
    ----------
    file : str
        File name (not a path) relative to the ``Articles`` directory.

    Returns
    -------
    str or None
        The file contents decoded as latin-1, or ``None`` when ``file`` does
        not have a ``.txt`` extension.
    """
    # Check the extension rather than a substring: "'txt' in file" would also
    # accept names such as "txt_notes.md" that merely contain those letters.
    if not file.endswith('.txt'):
        return None
    with open(f'Articles/{file}', 'r', encoding='latin1') as f:
        return f.read()

In [3]:
# Load every article as one raw string.
# os.listdir() returns names in arbitrary order, so sort them naturally
# (doc1, doc2, ..., doc10) to keep document numbers reproducible.
documents = []
for file in natsorted(os.listdir('Articles')):
    text = read_files(file)
    # read_files() returns None for names it does not treat as text files;
    # skip those so the tokenizer never receives None.
    if text is not None:
        documents.append(text)

In [4]:
# Sanity check: number of entries collected in `documents`.
len(documents)

10

# First Phase $:-$

## Apply tokenization

In [5]:
# Split each raw document into a list of word tokens (one list per document).
token_docs = [word_tokenize(document) for document in documents]

## Stop words

In [6]:
# English stop-word list from NLTK's stopwords corpus.
stop_words = stopwords.words('english')

#### Remove `in` and `to` from the stop words so they stay searchable
#### Add some extra punctuation tokens to the stop words

In [7]:
# Build the customised stop-word list from scratch so this cell can be re-run
# safely: list.remove() raises ValueError once 'in'/'to' are already gone, and
# extend() would append the punctuation tokens again on every run.
kept_terms = {'in', 'to'}
punctuation_tokens = [".", ",", "'", "-", "_", ":", "(", ")", "&"]

stop_words = [word for word in stopwords.words('english') if word not in kept_terms]
stop_words.extend(punctuation_tokens)

In [8]:
# Drop stop words and punctuation tokens from every tokenized document.
# NOTE: `documents` is rebound here from raw strings to lists of tokens.
# Membership is tested against a set so each lookup is O(1) instead of a
# scan over the whole stop-word list.
stop_word_set = set(stop_words)
documents = [
    [term for term in tokens if term not in stop_word_set]
    for tokens in token_docs
]

In [9]:
# Peek at the first ten remaining tokens of the first document.
documents[0][:10]

['Stephen',
 'Curry',
 'scored',
 '24',
 'points',
 'Golden',
 'State',
 'Warriors',
 'continued',
 'fine']

# Second phase $:-$

### Implement function to do all steps in first phase

In [10]:
def preprocessing(doc):
    """Tokenize ``doc`` and drop stop words and punctuation tokens.

    Combines the first-phase steps in one call: NLTK word tokenization
    followed by removal of English stop words (``'in'`` and ``'to'`` are
    deliberately kept) and of a fixed list of punctuation tokens.

    Parameters
    ----------
    doc : str
        Raw document text.

    Returns
    -------
    list of str
        The remaining tokens in their original order; the order matters
        because the positional index is built from token positions.
    """
    punctuation_tokens = [".", ",", "'", "-", "_", ":", "(", ")", "&", "/", "\\", "]", "[", "''", '""', "' '", '" "']
    # A set gives O(1) membership tests instead of scanning a list for every
    # token of the document.
    excluded = (set(stopwords.words('english')) - {'in', 'to'}) | set(punctuation_tokens)
    return [term for term in word_tokenize(doc) if term not in excluded]


In [11]:
# Positional index: stemmed term -> [total frequency, {file no.: [positions]}].
pos_index = dict()

# Lookup table from file no. to file name.
file_map = dict()

# Running document ID; the first file gets number 0.
fileno = 0

# Porter stemmer applied to every term before it is indexed.
stemmer = PorterStemmer()

In [12]:
# Open files (natural sort so that doc2 comes before doc10).
file_names = natsorted(os.listdir("Articles"))
print(file_names)

# Reset the index state here so that re-running this cell rebuilds the index
# from scratch instead of counting every occurrence a second time.
fileno = 0
pos_index = {}
file_map = {}

# For every file.
for file_name in file_names:

    # Read file contents.
    with open(f'Articles/{file_name}', 'r', encoding='latin1') as f:
        stuff = f.read()

    # This is the list of words in order of the text.
    # We need to preserve the order because we require positions.
    # 'preprocessing' function does some basic punctuation removal,
    # stopword removal etc.
    final_token_list = preprocessing(stuff)

    # For position and term in the tokens.
    for pos, term in enumerate(final_token_list):
        # First stem the term.
        term = stemmer.stem(term)

        # Each entry is [total frequency, {file no.: [positions]}];
        # setdefault creates it on the first encounter of the term.
        entry = pos_index.setdefault(term, [0, {}])
        # Increment total freq by 1.
        entry[0] += 1
        # Record the position under this document's postings list.
        entry[1].setdefault(fileno, []).append(pos)

    # Map the file no. to the file's path. The files are read from
    # 'Articles/', so record that directory (the previous "test/" prefix
    # pointed at a folder this notebook never uses).
    file_map[fileno] = "Articles/" + file_name

    # Increment the file no. counter for document ID mapping
    fileno += 1

['doc1.txt', 'doc2.txt', 'doc3.txt', 'doc4.txt', 'doc5.txt', 'doc6.txt', 'doc7.txt', 'doc8.txt', 'doc9.txt', 'doc10.txt']


### Display the positional index (each term with its total count and per-document positions)

In [13]:
# Display the whole positional index: term -> [total frequency, {file no.: [positions]}].
pos_index

{'stephen': [1, {0: [0]}],
 'curri': [2, {0: [1, 18]}],
 'score': [21,
  {0: [2, 28, 67, 128],
   1: [353, 505, 510, 552, 597, 781, 964],
   3: [105],
   5: [1107],
   6: [403, 453, 553, 668],
   7: [58, 136, 800, 803]}],
 '24': [2, {0: [3], 7: [942]}],
 'point': [10,
  {0: [4, 35, 130], 5: [1087, 1594], 7: [158, 184], 8: [327, 489], 9: [108]}],
 'golden': [2, {0: [5], 5: [1453]}],
 'state': [8, {0: [6], 2: [189, 462], 4: [79], 6: [64, 446, 487], 7: [220]}],
 'warrior': [2, {0: [7, 41]}],
 'continu': [7,
  {0: [8], 1: [750], 2: [416], 5: [1400], 6: [382], 7: [627, 828]}],
 'fine': [2, {0: [9], 3: [334]}],
 'form': [4, {0: [10], 5: [452], 7: [472, 845]}],
 'home': [20,
  {0: [11, 43, 133],
   1: [228, 1176],
   2: [58, 67, 76, 273],
   5: [245, 1147, 1672],
   6: [281, 678, 743],
   7: [39],
   8: [28, 696],
   9: [118, 983]}],
 '111-101': [1, {0: [12]}],
 'victori': [9,
  {0: [13, 59, 134], 2: [225], 5: [659, 853], 6: [704], 7: [127], 8: [757]}],
 'new': [18,
  {0: [14, 60],
   1: [407

### Allow users to look up a term in the positional index

In [16]:
# Look up one term in the positional index and list where it occurs.
# Index keys are Porter-stemmed (which also lower-cases them), so the query
# must be normalised the same way or words like "victory" are never found.
test_term = stemmer.stem(input().strip())
test_pos_index = pos_index.get(test_term)

if test_pos_index is None:
    # Unknown term: report it instead of failing with a KeyError.
    print('term', test_term, 'was not found in the index')
else:
    print('term', test_term, 'and the count is', test_pos_index[0])
    # One line per document: file no. followed by the token positions.
    for doc in test_pos_index[1]:
        print(doc, ':', test_pos_index[1][doc])

term sport and the count is 19
1 : [44, 72]
2 : [91, 152, 450]
3 : [54, 258]
5 : [7]
6 : [254, 316]
7 : [89, 106]
8 : [196, 634, 853]
9 : [381, 472, 1152, 1162]


# Third phase $:-$

In [35]:
# Rebuild `documents` as preprocessed text, one space-joined string per article.
# os.listdir() returns names in arbitrary order, so sort naturally to keep
# document number k consistent with the natsorted order used for the
# positional index.
documents = []
for file in natsorted(os.listdir('Articles')):
    raw_text = read_files(file)
    # read_files() returns None for names it does not treat as text files;
    # skip those instead of passing None to the tokenizer.
    if raw_text is None:
        continue
    documents.append(" ".join(preprocessing(raw_text)))

In [46]:
# Learn the vocabulary and compute the TF-IDF weights of every document.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

# Transpose to a dense array with one row per term and one column per document.
X = tfidf_matrix.T.toarray()

# Wrap it in a DataFrame whose index is the vocabulary.
vocabulary = vectorizer.get_feature_names_out()
df = pd.DataFrame(data=X, index=vocabulary)

In [47]:
# Display the TF-IDF table (rows: vocabulary terms, columns: document numbers).
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
00,0.000000,0.000000,0.000000,0.0,0.047634,0.0,0.000000,0.0,0.000000,0.000000
000,0.000000,0.014215,0.000000,0.0,0.000000,0.0,0.009581,0.0,0.033897,0.000000
02,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.022788,0.000000
10,0.095845,0.045398,0.046976,0.0,0.000000,0.0,0.007650,0.0,0.013532,0.000000
100,0.000000,0.000000,0.016812,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.017453
...,...,...,...,...,...,...,...,...,...,...
youngsters,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.012882,0.0,0.000000,0.000000
youth,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.012882,0.0,0.000000,0.000000
zealand,0.000000,0.000000,0.059331,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000
zeist,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.012882,0.0,0.000000,0.000000


In [48]:
def get_similar_articles(q, df):
    """Print the documents most similar to query ``q``, best match first.

    The query is vectorized with the module-level ``vectorizer`` and compared
    with every column of ``df`` using cosine similarity. Documents whose
    similarity is zero are not printed. The text of each match is taken from
    the module-level ``documents`` list, indexed by column position.

    Parameters
    ----------
    q : str
        Free-text query.
    df : pandas.DataFrame
        Term-by-document weight matrix: one row per vocabulary term (in the
        vectorizer's feature order) and one column per document.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    print("query:", q)
    print("The articles with the highest cosine similarity values: ")

    # Convert the query into a vector over the same vocabulary as df's rows.
    q_vec = vectorizer.transform([q]).toarray().reshape(df.shape[0],)
    q_norm = np.linalg.norm(q_vec)

    # Calculate the similarity for every document column (not a fixed 10).
    sim = {}
    for i in range(df.shape[1]):
        doc_vec = df.iloc[:, i].to_numpy()
        # Cosine similarity divides by the PRODUCT of both norms; the
        # parentheses matter because "a / b * c" evaluates as "(a / b) * c".
        denom = np.linalg.norm(doc_vec) * q_norm
        # A zero norm means the query or document has no known terms.
        sim[i] = np.dot(doc_vec, q_vec) / denom if denom else 0.0

    # Sort the values, highest similarity first.
    sim_sorted = sorted(sim.items(), key=lambda x: x[1], reverse=True)

    # Print the articles and their similarity values.
    for k, v in sim_sorted:
        if v != 0.0:
            print("Nilai Similaritas:", v)
            print(documents[k])
            print()

In [49]:
# Add the query.
q1 = 'Real Madrid the best club'
# Call the function.
get_similar_articles(q1, df)

query: Real Madrid the best club
The articles with the highest cosine similarity values: 
Nilai Similaritas: 0.11818219197208413
France 's Kylian Mbappe breakout star 2018 World Cup make name Qatar 2022 ? The 22nd edition tournament features players ever total 832 across 32 teams Among many established global superstars plenty less familiar faces looking to shine football 's biggest stage Here BBC Sport 's TV radio World Cup commentators Guy Mowbray John Murray Vicki Sparks pick 10 players outside Premier League worth watching winter Who win World Cup ? BBC Sport pundits make predictions Check full World Cup schedule Day-by-day fixture TV guide 1 Daichi Kamada Japan Age 26 Position Midfield Club Eintracht Frankfurt Germany Japan 's Daichi Kamada Kamada dubbed 'the Liberator Arsenal in cheeky Wikipedia edit two goals Frankfurt Emirates Stadium in 2019 spelt end Unai Emery 's stint Gunners boss The entry changed soon afterwards performance nickname recalled whenever linked move to Englan