## Libraries

In [1]:
import requests
from bs4 import BeautifulSoup
import os
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from natsort import natsorted
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer

import pandas as pd
import numpy as np

## Read files

In [2]:
def read_files(file):
    if 'txt' in file:
        with open(f'Articles/'+file, 'r', encoding='latin1') as f:
            return f.read()

In [3]:
documents = []
for file in os.listdir('Articles'):
    documents.append(read_files(file))

In [4]:
len(documents)

10

# First Phase $:-$

## Apply tokenization

In [5]:
token_docs = []
for document in documents:
    token_docs.append(word_tokenize(document))

## Stop words

In [6]:
stop_words = stopwords.words('english')

#### Remove in , to from stop words
#### Add some extra punctuation

In [7]:
stop_words.remove('in')
stop_words.remove('to')

stop_words.extend([".", ",", "'", "-", "_", ":", "(", ")", "&"])

In [8]:
documents = []
for token in token_docs:
    each_token = []
    for term in token:
        if term not in stop_words:
            each_token.append(term)
    documents.append(each_token)

In [9]:
documents[0][:10]

['antony', 'brutus', 'caeser', 'cleopatra', 'mercy', 'worser']

# Second phase $:-$

### Implement function to do all steps in first phase

In [10]:
def preprocessing(doc):
    token_docs = word_tokenize(doc)
    stop_words = stopwords.words('english')
    stop_words.remove('in')
    stop_words.remove('to')
    stop_words.extend([".", ",", "'", "-", "_", ":", "(", ")", "&", "/", "\\", "]", "[", "''", '""', "' '", '" "'])
    prepared_doc = []
    for term in token_docs:
        if term not in stop_words:
            prepared_doc.append(term)
    return prepared_doc


In [11]:
# Initialize the stemmer.
stemmer = PorterStemmer()
 
# Initialize the file no.
fileno = 0
 
# Initialize the dictionary.
pos_index = {}
 
# Initialize the file mapping (fileno -> file name).
file_map = {}

In [12]:
# Open files.
file_names = natsorted(os.listdir("Articles"))
print(file_names)
# For every file.
for file_name in file_names:

    # Read file contents.
    with open(f'Articles/{file_name}', 'r', encoding='latin1') as f:
        stuff = f.read()
    # This is the list of words in order of the text.
    # We need to preserve the order because we require positions.
    # 'preprocessing' function does some basic punctuation removal,
    # stopword removal etc.
    final_token_list = preprocessing(stuff)

    # For position and term in the tokens.
    for pos, term in enumerate(final_token_list):
        # print(pos, '-->' ,term)
        # First stem the term.
        term = stemmer.stem(term)
        # print(term)
        # If term already exists in the positional index dictionary.
        if term in pos_index:
                
            # Increment total freq by 1.
            pos_index[term][0] = pos_index[term][0] + 1
                
            # Check if the term has existed in that DocID before.
            if fileno in pos_index[term][1]:
                pos_index[term][1][fileno].append(pos)
                    
            else:
                pos_index[term][1][fileno] = [pos]

        # If term does not exist in the positional index dictionary
        # (first encounter).
        else:
                
            # Initialize the list.
            pos_index[term] = []
            # The total frequency is 1.
            pos_index[term].append(1)
            # The postings list is initially empty.
            pos_index[term].append({})     
            # Add doc ID to postings list.
            pos_index[term][1][fileno] = [pos]

    # Map the file no. to the file name.
    file_map[fileno] = "test/" + file_name

    # Increment the file no. counter for document ID mapping             
    fileno += 1

['1.txt', '2.txt', '3.txt', '4.txt', '5.txt', '6.txt', '7.txt', '8.txt', '9.txt', '10.txt']


### displays each term 

In [13]:
pos_index

{'antoni': [3, {0: [0], 1: [0], 5: [0]}],
 'brutu': [3, {0: [1], 1: [1], 3: [0]}],
 'caeser': [5, {0: [2], 1: [2], 3: [1], 4: [0], 5: [1]}],
 'cleopatra': [1, {0: [3]}],
 'merci': [5, {0: [4], 2: [0], 3: [2], 4: [1], 5: [2]}],
 'worser': [4, {0: [5], 2: [1], 3: [3], 4: [2]}],
 'calpurnia': [1, {1: [3]}],
 'angel': [3, {6: [0], 7: [0], 8: [0]}],
 'fool': [4, {6: [1], 7: [1], 8: [1], 9: [0]}],
 'fear': [3, {6: [2], 7: [2], 9: [1]}],
 'in': [4, {6: [3], 7: [3], 8: [2], 9: [2]}],
 'rush': [4, {6: [4], 7: [4], 8: [3], 9: [3]}],
 'to': [4, {6: [5], 7: [5], 8: [4], 9: [4]}],
 'tread': [4, {6: [6], 7: [6], 8: [5], 9: [5]}]}

### Allow users to write phrase query 

In [14]:
test_term = input()
test_pos_index = pos_index[test_term]
print('term', test_term, 'and the count is', test_pos_index[0])
for doc in test_pos_index[1]:
    print(doc, ':', test_pos_index[1][doc])

term fool and the count is 4
6 : [1]
7 : [1]
8 : [1]
9 : [0]


# Third phase $:-$

In [15]:
documents = []
for file in os.listdir('Articles'):
    documents.append(" ".join(preprocessing(read_files(file))))

In [16]:
# Instantiate a TfidfVectorizer object
vectorizer = TfidfVectorizer()
# It fits the data and transform it as a vector
X = vectorizer.fit_transform(documents)
# Convert the X as transposed matrix
X = X.T.toarray()
# Create a DataFrame and set the vocabulary as the index
df = pd.DataFrame(X, index=vectorizer.get_feature_names_out())

In [17]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
angels,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.409883,0.409883,0.449365
antony,0.412627,0.0,0.474292,0.0,0.0,0.0,0.662993,0.0,0.0,0.0
brutus,0.412627,0.0,0.474292,0.0,0.571154,0.0,0.0,0.0,0.0,0.0
caeser,0.329457,0.0,0.378692,0.0,0.45603,0.555563,0.529358,0.0,0.0,0.0
calpurnia,0.0,0.0,0.637721,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cleopatra,0.554808,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fear,0.0,0.449365,0.0,0.0,0.0,0.0,0.0,0.409883,0.409883,0.0
fools,0.0,0.399518,0.0,0.0,0.0,0.0,0.0,0.364415,0.364415,0.399518
in,0.0,0.399518,0.0,0.0,0.0,0.0,0.0,0.364415,0.364415,0.399518
mercy,0.329457,0.0,0.0,0.668165,0.45603,0.555563,0.529358,0.0,0.0,0.0


In [24]:
def get_similar_articles(q, df):
    print("query:", q)
    # Convert the query become a vector
    q = [q]
    q_vec = vectorizer.transform(q).toarray().reshape(df.shape[0],)
    sim = {}  # Calculate the similarity
    for i in range(10):
        sim[i] = np.dot(df.loc[:, i].values, q_vec) / np.linalg.norm(df.loc[:, i]) * np.linalg.norm(q_vec)
    
    # Sort the values 
    sim_sorted = sorted(sim.items(), key=lambda x: x[1], reverse=True)
    # # Print the articles and their similarity values
    # for k, v in sim_sorted:
    #     if v != 0.0:
    #         print("Similarity value:", v)
    #         print("The article is:", documents[sim_sorted[k]])

    print("Similarity value:", sim_sorted[0][1])
    print("The article is:", documents[sim_sorted[0][0]])

In [25]:
q1 = 'fools fear'# Call the function
get_similar_articles(q1, df)

query: fools fear
Similarity value: 0.6012844514534148
The article is: fools fear in rush to tread
