## Libraries

In [262]:
import os
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from natsort import natsorted
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer

import pandas as pd
import numpy as np

import math

## Read files

In [263]:
def read_files(file):
    """Return the text of ``Articles/<file>`` when it is a ``.txt`` file.

    Parameters
    ----------
    file : str
        File name relative to the ``Articles`` directory.

    Returns
    -------
    str or None
        The file contents decoded as latin-1, or ``None`` when ``file``
        does not end in ``.txt``.
    """
    # Check the extension only: a substring test such as ``'txt' in file``
    # would also accept names like ``txt_notes.md``.
    if not file.endswith('.txt'):
        return None
    with open(os.path.join('Articles', file), 'r', encoding='latin1') as f:
        return f.read()

In [264]:
# Load every article in natural file-name order (1.txt, 2.txt, ..., 10.txt);
# os.listdir() alone returns the names in arbitrary order.
# Only .txt files are kept: read_files() returns None for anything else, and
# a None entry would crash word_tokenize() in the tokenization step.
documents = [
    read_files(file)
    for file in natsorted(os.listdir('Articles'))
    if file.endswith('.txt')
]

In [265]:
# Sanity check: number of documents loaded.
len(documents)

10

# First Phase $:-$

## Apply tokenization

In [266]:
# Tokenize each raw document into its list of word tokens.
token_docs = [word_tokenize(raw_text) for raw_text in documents]

## Stop words

In [267]:
# Start from the English stop-word list shipped with the NLTK corpus reader.
stop_words = stopwords.words('english')

#### Remove `in` and `to` from the stop words
#### Add some extra punctuation

In [268]:
# Keep 'in' and 'to' as index terms by dropping them from the stop-word list.
# The membership guard makes this cell safe to re-run: a bare list.remove()
# raises ValueError once the word is already gone.
for kept_word in ('in', 'to'):
    if kept_word in stop_words:
        stop_words.remove(kept_word)

# Treat punctuation tokens as stop words too, without appending duplicates
# when the cell is executed more than once.
for symbol in [".", ",", "'", "-", "_", ":", "(", ")", "&"]:
    if symbol not in stop_words:
        stop_words.append(symbol)

In [269]:
# Drop stop words (and the punctuation added to the list) from every
# tokenized document, keeping the remaining tokens in their original order.
# A set gives constant-time membership tests; scanning the stop-word list
# once per token is needlessly slow.
stop_word_set = set(stop_words)
documents = [
    [term for term in tokens if term not in stop_word_set]
    for tokens in token_docs
]

In [270]:
# Peek at (up to) the first ten filtered tokens of the first document.
documents[0][:10]

['antony', 'brutus', 'caeser', 'cleopatra', 'mercy', 'worser']

# Second phase $:-$

### Implement a function that performs all the steps of the first phase

In [271]:
def preprocessing(doc):
    """Tokenize ``doc`` and drop stop words and punctuation tokens.

    The English NLTK stop-word list is used, except that 'in', 'to' and
    'where' are kept as terms. A fixed set of punctuation tokens is removed
    as well. Tokens are compared exactly as produced by ``word_tokenize``
    (no case folding is applied).

    Parameters
    ----------
    doc : str
        Raw document text.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    # A set makes each membership test O(1) instead of a scan of the list.
    excluded = set(stopwords.words('english'))
    excluded -= {'in', 'to', 'where'}
    excluded.update([
        ".", ",", "'", "-", "_", ":", "(", ")", "&", "/", "\\", "]", "[",
        "''", '""', "' '", '" "',
        # word_tokenize rewrites an opening double quote as `` and a closing
        # one as ''; filtering only '' would leave the opening marker behind.
        "``",
    ])
    return [term for term in word_tokenize(doc) if term not in excluded]


In [272]:
# Porter stemmer applied to every term before it is indexed.
stemmer = PorterStemmer()
 
# Document ID counter; the first file gets ID 1.
fileno = 1
 
# Positional index: term -> [total frequency, {doc ID: [positions]}].
pos_index = {}
 
# File mapping: doc ID (fileno) -> file name.
file_map = {}

In [273]:
# Build the positional index: term -> [total frequency, {doc ID: [positions]}].
#
# The index state is (re)initialised in this cell so that it is idempotent:
# otherwise running the cell a second time doubles every frequency and keeps
# numbering documents from where the previous run stopped.
fileno = 1
pos_index = {}
file_map = {}

# Natural sort keeps 10.txt after 9.txt, so doc IDs follow the file numbers.
file_names = natsorted(os.listdir("Articles"))
print(file_names)

for file_name in file_names:
    # Read file contents.
    with open(f'Articles/{file_name}', 'r', encoding='latin1') as f:
        stuff = f.read()

    # Tokens stay in text order because their offsets are stored as positions.
    # Note that positions index the token list *after* stop-word removal.
    final_token_list = preprocessing(stuff)

    for pos, term in enumerate(final_token_list):
        # Index the stemmed form of the term.
        term = stemmer.stem(term)

        # The first sighting creates [0, {}]; later ones reuse the same entry.
        entry = pos_index.setdefault(term, [0, {}])
        # Total frequency across all documents.
        entry[0] += 1
        # Append this position to the term's postings for the current doc ID.
        entry[1].setdefault(fileno, []).append(pos)

    # Map the doc ID to the path it was read from (the files live in Articles/).
    file_map[fileno] = "Articles/" + file_name

    # Next document gets the next ID.
    fileno += 1

['1.txt', '2.txt', '3.txt', '4.txt', '5.txt', '6.txt', '7.txt', '8.txt', '9.txt', '10.txt']


### Display the positional index of each term

In [274]:
# Show the whole index: term -> [total frequency, {doc ID: [positions]}].
pos_index

{'antoni': [3, {1: [0], 2: [0], 6: [0]}],
 'brutu': [3, {1: [1], 2: [1], 4: [0]}],
 'caeser': [5, {1: [2], 2: [2], 4: [1], 5: [0], 6: [1]}],
 'cleopatra': [1, {1: [3]}],
 'merci': [5, {1: [4], 3: [0], 4: [2], 5: [1], 6: [2]}],
 'worser': [4, {1: [5], 3: [1], 4: [3], 5: [2]}],
 'calpurnia': [1, {2: [3]}],
 'angel': [3, {7: [0], 8: [0], 9: [0]}],
 'fool': [4, {7: [1], 8: [1], 9: [1], 10: [0]}],
 'fear': [3, {7: [2], 8: [2], 10: [1]}],
 'in': [4, {7: [3], 8: [3], 9: [2], 10: [2]}],
 'rush': [4, {7: [4], 8: [4], 9: [3], 10: [3]}],
 'to': [4, {7: [5], 8: [5], 9: [4], 10: [4]}],
 'tread': [4, {7: [6], 8: [6], 9: [5], 10: [5]}],
 'where': [4, {7: [7], 8: [7], 9: [6], 10: [6]}]}

### Allow users to query a term (single-term lookup; phrase matching is not implemented yet)

In [275]:
# Read one query term and show its postings from the positional index.
test_term = input().strip()

# The index keys are Porter stems, so the query is stemmed the same way;
# otherwise a word such as 'brutus' (indexed as 'brutu') is never found.
test_pos_index = pos_index.get(stemmer.stem(test_term))

if test_pos_index is None:
    # An unknown term is reported with a zero count instead of a KeyError.
    print('Term :', test_term, '\nCount :', 0)
else:
    print('Term :', test_term, '\nCount :', test_pos_index[0])
    # One line per document: doc ID followed by the term's positions in it.
    for doc, positions in test_pos_index[1].items():
        print(doc, ':', positions)

Term : worser 
Count : 4
1 : [5]
3 : [1]
4 : [3]
5 : [2]


# Third phase $:-$

In [293]:
# Rebuild the corpus as one pre-processed, space-joined string per article.
# The file names come from the directory listing, in the same natural order
# used when building the positional index, instead of a hard-coded
# range(1, 11); the corpus is therefore not tied to exactly ten files.
files = [name for name in natsorted(os.listdir('Articles')) if name.endswith('.txt')]
documents = [" ".join(preprocessing(read_files(name))) for name in files]

In [304]:
# Vocabulary: the set of distinct terms over all pre-processed documents.
all_terms = {term for doc in documents for term in doc.split()}

## Term Frequency
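$$ tf(t, d) = \frac{\text{number of times term } t \text{ appears in document } d}{\text{total number of terms in document } d} $$

Note: `get_tf` below returns the raw count (the numerator) and does not divide by the document length.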

In [309]:
def get_tf(document, vocabulary=None):
    """Count how often each vocabulary term occurs in ``document``.

    Parameters
    ----------
    document : str
        Whitespace-separated terms.
    vocabulary : iterable of str, optional
        Terms to count. Defaults to the notebook-level ``all_terms``.

    Returns
    -------
    dict
        ``{term: raw count}`` with one key per vocabulary term, in the
        iteration order of the vocabulary. Terms absent from the document
        map to 0.
    """
    if vocabulary is None:
        vocabulary = all_terms
    wordDict = dict.fromkeys(vocabulary, 0)
    for word in document.split():
        # Words outside the vocabulary are ignored rather than raising
        # KeyError, so text that was not part of the corpus can be counted too.
        if word in wordDict:
            wordDict[word] += 1
    return wordDict

In [312]:
# Term-frequency matrix: one row per vocabulary term, one column per document
# (doc1, doc2, ...). Each document is counted once, and the column labels are
# derived from the actual number of documents rather than a fixed range(1, 11).
tf = pd.DataFrame({
    'doc' + str(n): pd.Series(get_tf(doc))
    for n, doc in enumerate(documents, start=1)
})

In [313]:
# Display the term-frequency matrix (terms x documents).
tf

Unnamed: 0,doc1,doc2,doc3,doc4,doc5,doc6,doc7,doc8,doc9,doc10
fear,0,0,0,0,0,0,1,1,0,1
angels,0,0,0,0,0,0,1,1,1,0
caeser,1,1,0,1,1,1,0,0,0,0
tread,0,0,0,0,0,0,1,1,1,1
fools,0,0,0,0,0,0,1,1,1,1
brutus,1,1,0,1,0,0,0,0,0,0
in,0,0,0,0,0,0,1,1,1,1
antony,1,1,0,0,0,1,0,0,0,0
to,0,0,0,0,0,0,1,1,1,1
rush,0,0,0,0,0,0,1,1,1,1


## Inverse Document Frequency
$$ idf(t) = \log_{10}\left(\frac{\text{number of documents in the corpus}}{\text{number of documents in the corpus that contain term } t}\right) $$

In [319]:
# Document frequency (df) and inverse document frequency (idf) per term.
# df counts the documents in which a term occurs at least once; summing the
# raw counts would over-count any term repeated inside a single document.
# The corpus size is read from the tf matrix instead of a hard-coded 10.
n_docs = tf.shape[1]
doc_freq = (tf > 0).sum(axis=1)

# Built in one vectorized step so both columns get numeric dtypes and the
# index is inherited from tf.
tdf = pd.DataFrame({'df': doc_freq, 'idf': np.log10(n_docs / doc_freq)})

In [320]:
# Display df and idf for every term.
tdf

Unnamed: 0,df,idf
fear,3,0.522879
angels,3,0.522879
caeser,5,0.30103
tread,4,0.39794
fools,4,0.39794
brutus,3,0.522879
in,4,0.39794
antony,3,0.522879
to,4,0.39794
rush,4,0.39794


## TF.IDF

In [326]:
# Weight each term's row of tf by that term's idf (axis=0 aligns on the term index).
tf_idf = tf.multiply(tdf['idf'], axis=0)

In [327]:
# Display the tf-idf matrix (terms x documents).
tf_idf

Unnamed: 0,doc1,doc2,doc3,doc4,doc5,doc6,doc7,doc8,doc9,doc10
fear,0.0,0.0,0.0,0.0,0.0,0.0,0.522879,0.522879,0.0,0.522879
angels,0.0,0.0,0.0,0.0,0.0,0.0,0.522879,0.522879,0.522879,0.0
caeser,0.30103,0.30103,0.0,0.30103,0.30103,0.30103,0.0,0.0,0.0,0.0
tread,0.0,0.0,0.0,0.0,0.0,0.0,0.39794,0.39794,0.39794,0.39794
fools,0.0,0.0,0.0,0.0,0.0,0.0,0.39794,0.39794,0.39794,0.39794
brutus,0.522879,0.522879,0.0,0.522879,0.0,0.0,0.0,0.0,0.0,0.0
in,0.0,0.0,0.0,0.0,0.0,0.0,0.39794,0.39794,0.39794,0.39794
antony,0.522879,0.522879,0.0,0.0,0.0,0.522879,0.0,0.0,0.0,0.0
to,0.0,0.0,0.0,0.0,0.0,0.0,0.39794,0.39794,0.39794,0.39794
rush,0.0,0.0,0.0,0.0,0.0,0.0,0.39794,0.39794,0.39794,0.39794


In [94]:
def get_similar_articles(q, df, threshold=0.5):
    """Print every document whose cosine similarity to ``q`` exceeds ``threshold``.

    Parameters
    ----------
    q : str
        Query text.
    df : pandas.DataFrame
        Weight matrix with one column per document. The number of rows must
        equal the length of the vectorized query.
        NOTE(review): presumably rows follow the vocabulary order of the
        vectorizer that produced the query vector -- confirm with the caller.
    threshold : float, default 0.5
        Only documents scoring strictly above this value are printed.

    Notes
    -----
    Relies on a notebook-level ``vectorizer`` object exposing ``transform``;
    it is not passed in as an argument.
    """
    print("query:", q)
    # Turn the query into a 1-D vector with one entry per row of df.
    q_vec = vectorizer.transform([q]).toarray().reshape(df.shape[0],)
    q_norm = np.linalg.norm(q_vec)

    sim = {}
    # Score every column of df rather than assuming exactly ten documents.
    for doc_id in df.columns:
        doc_vec = df.loc[:, doc_id].values
        # Cosine similarity divides by the PRODUCT of the norms; without the
        # parentheses the expression is (dot / |d|) * |q|.
        denom = np.linalg.norm(doc_vec) * q_norm
        # An all-zero document or query vector has no direction: score it 0.
        sim[doc_id] = float(np.dot(doc_vec, q_vec) / denom) if denom else 0.0

    # Best match first.
    sim_sorted = sorted(sim.items(), key=lambda x: x[1], reverse=True)
    for doc, score in sim_sorted:
        if score > threshold:
            print("Similarity value:", score)
            print("The article is:", doc)

In [95]:
# NOTE(review): `df` (and the `vectorizer` used inside get_similar_articles)
# are not defined in the cells shown above; presumably they come from an
# earlier TfidfVectorizer cell -- confirm before a Restart & Run All.
q1 = 'antony brutus'
# Call the function.
get_similar_articles(q1, df)

query: antony brutus
Similarity value: 0.6707495990011653
The article is: 1
Similarity value: 0.5835428051584892
The article is: 0
