## Libraries

In [1]:
import os
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from natsort import natsorted

import pandas as pd
import numpy as np

import math

from sklearn.feature_extraction.text import TfidfVectorizer

## Read files

In [2]:
def read_files(file):
    if 'txt' in file:
        with open(f'Articles/'+file, 'r') as f:
            return f.read()

In [3]:
documents = []
for file in os.listdir('Articles'):
    documents.append(read_files(file))

In [4]:
documents

['antony brutus caeser cleopatra mercy worser',
 'fools fear in rush to tread where',
 'antony brutus caeser calpurnia ',
 'mercy worser',
 'brutus caeser mercy worser',
 'caeser mercy worser',
 'antony caeser mercy ',
 'angels fools fear in rush to tread where',
 'angels fools fear in rush to tread where',
 'angels fools in rush to tread where']

# First Phase $:-$

## Apply tokenization

In [5]:
token_docs = []
for document in documents:
    token_docs.append(word_tokenize(document))

## Stop words

In [6]:
stop_words = stopwords.words('english')

#### Remove in , to from stop words
#### Add some extra punctuation

In [7]:
stop_words.remove('in')
stop_words.remove('to')
stop_words.remove('where')

In [8]:
documents = []
for token in token_docs:
    each_token = []
    for term in token:
        if term not in stop_words:
            each_token.append(term)
    documents.append(each_token)

In [9]:
documents[0][:10]

['antony', 'brutus', 'caeser', 'cleopatra', 'mercy', 'worser']

In [10]:
documents

[['antony', 'brutus', 'caeser', 'cleopatra', 'mercy', 'worser'],
 ['fools', 'fear', 'in', 'rush', 'to', 'tread', 'where'],
 ['antony', 'brutus', 'caeser', 'calpurnia'],
 ['mercy', 'worser'],
 ['brutus', 'caeser', 'mercy', 'worser'],
 ['caeser', 'mercy', 'worser'],
 ['antony', 'caeser', 'mercy'],
 ['angels', 'fools', 'fear', 'in', 'rush', 'to', 'tread', 'where'],
 ['angels', 'fools', 'fear', 'in', 'rush', 'to', 'tread', 'where'],
 ['angels', 'fools', 'in', 'rush', 'to', 'tread', 'where']]

# Second phase $:-$

### Implement function to do all steps in first phase

In [11]:
def preprocessing(doc):
    token_docs = word_tokenize(doc)
    stop_words = stopwords.words('english')
    stop_words.remove('in')
    stop_words.remove('to')
    stop_words.remove('where')

    prepared_doc = []
    for term in token_docs:
        if term not in stop_words:
            prepared_doc.append(term)
    return prepared_doc


In [12]:
# Initialize the file no.
fileno = 1

# Initialize the dictionary.
pos_index = {}

In [13]:
# Open files.
file_names = natsorted(os.listdir("Articles"))
print(file_names)
# For every file.
for file_name in file_names:

    # Read file contents.
    with open(f'Articles/{file_name}', 'r') as f:
        doc = f.read()
    # preprocess doc
    final_token_list = preprocessing(doc)

    # For position and term in the tokens.
    for pos, term in enumerate(final_token_list):
        # print(pos, '-->' ,term)
        
        # If term already exists in the positional index dictionary.
        if term in pos_index:
                
            # Increment total freq by 1.
            pos_index[term][0] = pos_index[term][0] + 1
            
            # Check if the term has existed in that DocID before.
            if fileno in pos_index[term][1]:
                pos_index[term][1][fileno].append(pos)
                    
            else:
                pos_index[term][1][fileno] = [pos]

        # If term does not exist in the positional index dictionary
        else:
            
            # Initialize the list.
            pos_index[term] = []
            # The total frequency is 1.
            pos_index[term].append(1)
            # The postings list is initially empty.
            pos_index[term].append({})     
            # Add doc ID to postings list.
            pos_index[term][1][fileno] = [pos]

    # Increment the file no. counter for document ID mapping             
    fileno += 1

['1.txt', '2.txt', '3.txt', '4.txt', '5.txt', '6.txt', '7.txt', '8.txt', '9.txt', '10.txt']


### displays each term 

In [14]:
pos_index

{'antony': [3, {1: [0], 2: [0], 6: [0]}],
 'brutus': [3, {1: [1], 2: [1], 4: [0]}],
 'caeser': [5, {1: [2], 2: [2], 4: [1], 5: [0], 6: [1]}],
 'cleopatra': [1, {1: [3]}],
 'mercy': [5, {1: [4], 3: [0], 4: [2], 5: [1], 6: [2]}],
 'worser': [4, {1: [5], 3: [1], 4: [3], 5: [2]}],
 'calpurnia': [1, {2: [3]}],
 'angels': [3, {7: [0], 8: [0], 9: [0]}],
 'fools': [4, {7: [1], 8: [1], 9: [1], 10: [0]}],
 'fear': [3, {7: [2], 8: [2], 10: [1]}],
 'in': [4, {7: [3], 8: [3], 9: [2], 10: [2]}],
 'rush': [4, {7: [4], 8: [4], 9: [3], 10: [3]}],
 'to': [4, {7: [5], 8: [5], 9: [4], 10: [4]}],
 'tread': [4, {7: [6], 8: [6], 9: [5], 10: [5]}],
 'where': [4, {7: [7], 8: [7], 9: [6], 10: [6]}]}

### Allow users to write phrase query 

In [61]:
def put_query(q, display=1):
    lis = [[] for i in range(10)]
    for term in q.split():

        if term in pos_index.keys():
            for key in pos_index[term][1].keys():
            
                if lis[key-1] != []:
                    
                    if lis[key-1][-1] == pos_index[term][1][key][0]-1:
                        lis[key-1].append(pos_index[term][1][key][0])
                else:
                    lis[key-1].append(pos_index[term][1][key][0])
    positions = []
    if display==1:
        for pos, list in enumerate(lis, start=1):
            if len(list) == len(q.split()):
                positions.append('document '+str(pos))
        return positions
    else:
        for pos, list in enumerate(lis, start=1):
            if len(list) == len(q.split()):
                positions.append('doc'+str(pos))
        return positions

In [62]:
q = 'fools fear'
put_query(q)

['document 7', 'document 8', 'document 10']

# Third phase $:-$

In [17]:
documents = []
files = os.listdir('Articles')
for file in range(1, 11):
    documents.append(" ".join(preprocessing(read_files(str(file)+'.txt'))))

In [18]:
all_terms = []
for doc in documents:
    for term in doc.split():
        all_terms.append(term)
all_terms = set(all_terms)

## Term Frequency
$$ tf = \frac{number of times the term appears in a document} {total number of words in the document}$$

In [19]:
def get_tf(document):
    wordDict = dict.fromkeys(all_terms, 0)
    for word in document.split():
        wordDict[word] += 1
    return wordDict

In [20]:
tf = pd.DataFrame(get_tf(documents[0]).values(), index=get_tf(documents[0]).keys())
for i in range(1, len(documents)):
    tf[i] = get_tf(documents[i]).values()
tf.columns = ['doc'+str(i) for i in range(1, 11)]

In [21]:
tf

Unnamed: 0,doc1,doc2,doc3,doc4,doc5,doc6,doc7,doc8,doc9,doc10
in,0,0,0,0,0,0,1,1,1,1
brutus,1,1,0,1,0,0,0,0,0,0
to,0,0,0,0,0,0,1,1,1,1
angels,0,0,0,0,0,0,1,1,1,0
mercy,1,0,1,1,1,1,0,0,0,0
fear,0,0,0,0,0,0,1,1,0,1
rush,0,0,0,0,0,0,1,1,1,1
fools,0,0,0,0,0,0,1,1,1,1
cleopatra,1,0,0,0,0,0,0,0,0,0
where,0,0,0,0,0,0,1,1,1,1


## Weighted tf(1+ log tf)

In [22]:
def weighted_tf(x):
    if x > 0:
        return math.log10(x) + 1
    return 0

In [23]:
w_tf = tf.copy()
for i in range(0, len(documents)):
    w_tf['doc'+str(i+1)] = tf['doc'+str(i+1)].apply(weighted_tf)

In [24]:
w_tf

Unnamed: 0,doc1,doc2,doc3,doc4,doc5,doc6,doc7,doc8,doc9,doc10
in,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
brutus,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
to,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
angels,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
mercy,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
fear,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
rush,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
fools,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
cleopatra,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
where,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0


## Inverse Document Frequency
$$ idf = \frac{number of the documents in the corups} {number of documents in the corups contain the term}$$

In [25]:
tdf = pd.DataFrame(columns=['df', 'idf'])
for i in range(len(tf)):
    in_term = w_tf.iloc[i].values.sum()

    tdf.loc[i, 'df'] = in_term

    tdf.loc[i, 'idf'] = math.log10(10 / (float(in_term)))

tdf.index=w_tf.index

In [26]:
tdf

Unnamed: 0,df,idf
in,4.0,0.39794
brutus,3.0,0.522879
to,4.0,0.39794
angels,3.0,0.522879
mercy,5.0,0.30103
fear,3.0,0.522879
rush,4.0,0.39794
fools,4.0,0.39794
cleopatra,1.0,1.0
where,4.0,0.39794


## TF.IDF

In [27]:
tf_idf = w_tf.multiply(tdf['idf'], axis=0)

In [28]:
tf_idf

Unnamed: 0,doc1,doc2,doc3,doc4,doc5,doc6,doc7,doc8,doc9,doc10
in,0.0,0.0,0.0,0.0,0.0,0.0,0.39794,0.39794,0.39794,0.39794
brutus,0.522879,0.522879,0.0,0.522879,0.0,0.0,0.0,0.0,0.0,0.0
to,0.0,0.0,0.0,0.0,0.0,0.0,0.39794,0.39794,0.39794,0.39794
angels,0.0,0.0,0.0,0.0,0.0,0.0,0.522879,0.522879,0.522879,0.0
mercy,0.30103,0.0,0.30103,0.30103,0.30103,0.30103,0.0,0.0,0.0,0.0
fear,0.0,0.0,0.0,0.0,0.0,0.0,0.522879,0.522879,0.0,0.522879
rush,0.0,0.0,0.0,0.0,0.0,0.0,0.39794,0.39794,0.39794,0.39794
fools,0.0,0.0,0.0,0.0,0.0,0.0,0.39794,0.39794,0.39794,0.39794
cleopatra,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
where,0.0,0.0,0.0,0.0,0.0,0.0,0.39794,0.39794,0.39794,0.39794


## Document length

In [29]:
def get_doc_len(col):
    return np.sqrt(tf_idf[col].apply(lambda x: x**2).sum())

In [30]:
doc_len = pd.DataFrame()
for col in tf_idf.columns:
    doc_len.loc[0, col+'_length']= get_doc_len(col)


In [31]:
doc_len

Unnamed: 0,doc1_length,doc2_length,doc3_length,doc4_length,doc5_length,doc6_length,doc7_length,doc8_length,doc9_length,doc10_length
0,1.373462,1.279618,0.498974,0.782941,0.582747,0.67427,1.223496,1.223496,1.106137,1.106137


In [32]:
doc_len['doc1_length'].values[0]

1.3734623153231016

## Normalized TF.IDF

In [33]:
def get_norm_tf_idf(col, x):
    try:
        return x / doc_len[col+'_length'].values[0]
    except:
        return 0

In [34]:
norm_tf_idf = pd.DataFrame()
for col in tf_idf.columns:
    norm_tf_idf[col] = tf_idf[col].apply(lambda x : get_norm_tf_idf(col, x))

In [35]:
norm_tf_idf

Unnamed: 0,doc1,doc2,doc3,doc4,doc5,doc6,doc7,doc8,doc9,doc10
in,0.0,0.0,0.0,0.0,0.0,0.0,0.325248,0.325248,0.359756,0.359756
brutus,0.380701,0.408621,0.0,0.667839,0.0,0.0,0.0,0.0,0.0,0.0
to,0.0,0.0,0.0,0.0,0.0,0.0,0.325248,0.325248,0.359756,0.359756
angels,0.0,0.0,0.0,0.0,0.0,0.0,0.427365,0.427365,0.472707,0.0
mercy,0.219176,0.0,0.603298,0.384486,0.51657,0.446453,0.0,0.0,0.0,0.0
fear,0.0,0.0,0.0,0.0,0.0,0.0,0.427365,0.427365,0.0,0.472707
rush,0.0,0.0,0.0,0.0,0.0,0.0,0.325248,0.325248,0.359756,0.359756
fools,0.0,0.0,0.0,0.0,0.0,0.0,0.325248,0.325248,0.359756,0.359756
cleopatra,0.728087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
where,0.0,0.0,0.0,0.0,0.0,0.0,0.325248,0.325248,0.359756,0.359756


## Input Query

In [36]:
def get_w_tf(x):
    try:
        return math.log10(x)+1
    except:
        return 0

In [63]:
def insert_query(q):
    docs_found = put_query(q, 2)
    if docs_found == []:
        return "Not Fount"
    query = pd.DataFrame(index=norm_tf_idf.index)
    query['tf'] = [1 if x in q.split() else 0 for x in list(norm_tf_idf.index)]
    query['w_tf'] = query['tf'].apply(lambda x : get_w_tf(x))
    product = norm_tf_idf.multiply(query['w_tf'], axis=0)
    query['idf'] = tdf['idf'] * query['w_tf']
    query['tf_idf'] = query['w_tf'] * query['idf']
    query['normalized'] = 0
    for i in range(len(query)):
        query['normalized'].iloc[i] = float(query['idf'].iloc[i]) / math.sqrt(sum(query['idf'].values**2))
    print('Query Details')
    print(query.loc[q.split()])
    product2 = product.multiply(query['normalized'], axis=0)
    scores = {}
    for col in put_query(q):
            scores[col] = product2[col].sum()
    product_result = product2[list(scores.keys())].loc[q.split()]
    print()
    print('Product (query*matched doc)')
    print(product_result)
    print()
    print('product sum')
    print(product_result.sum())
    print()
    print('Query Length')
    q_len = math.sqrt(sum([x**2 for x in query['idf'].loc[q.split()]]))
    print(q_len)
    print()
    print('Cosine Simliarity')
    print(product_result.sum())
    print()
    sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    print('Returned docs')
    for typle in sorted_scores:
        print(typle[0], end=" ")

In [65]:
insert_query('antony brutus')

Query Details
        tf  w_tf       idf    tf_idf  normalized
antony   1   1.0  0.522879  0.522879    0.707107
brutus   1   1.0  0.522879  0.522879    0.707107

Product (query*matched doc)
            doc1      doc2
antony  0.269196  0.288939
brutus  0.269196  0.288939

product sum
doc1    0.538393
doc2    0.577877
dtype: float64

Query Length
0.7394622130520805

Cosine Simliarity
doc1    0.538393
doc2    0.577877
dtype: float64

Returned docs
doc2 doc1 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  query['normalized'].iloc[i] = float(query['idf'].iloc[i]) / math.sqrt(sum(query['idf'].values**2))
