In [1]:
import os
from pprint import pprint
import pandas as pd 
import numpy as np
from bs4 import BeautifulSoup
from nltk import WordNetLemmatizer
from nltk import regexp_tokenize
from nltk import PorterStemmer
from nltk.corpus import stopwords
from nltk import WordPunctTokenizer
import math
from num2words import num2words
from decimal import Decimal
from word2number import w2n

stop_ws = set(stopwords.words('english'))

In [2]:
def documentText(file_name):
    """
        Args:
            file_name (str) : file name 
        return:
            string (str) : file content
    """

    with open(stories_path+'/'+file_name, 'rb') as f:
        file_content = str(f.read())
    
    return file_content

In [3]:
def getData(path):
    """
        Args:
            path (str): index.html file location
        Returns:
            dataframe:
    """
    
    loc = path + 'index.html'
    with open(loc, 'r') as f:
        data = str(f.read())
        
    sp = BeautifulSoup(data, "lxml")
    filename = []
    title = []

    for t in sp.find_all('tr'):
        flag = False

        for c in t.find_all('td'):
            if(not flag):    
                for a in c.find('a'):
                    filename.append(a)
                    flag = True  
            if(c.string != None):
                title.append(c.string.rstrip())
                flag = False
        break

    docs_frame = pd.DataFrame(columns = ["docid", "name", "title", "content"])
    # pprint(list(zip(filename, title)))

    f = 1
    for i in range(0, len(filename)):
        text = documentText(filename[i])
        docs_frame = docs_frame.append(pd.Series([f, filename[i], title[i], text], index = ["docid", "name", "title","content"]), ignore_index=True)
        f+=1

    return docs_frame

In [4]:
def tokenizeDocument(docs):
    """
        Args:
            docs (dict) : all documents
        return:
            final tokens (list) : terms for index

        0. Convert to lowercase
        1. Stop words removed
        2. Tokenize 
        3. Stemming
        4. Lemmatization
        5. Only words that starts with alphabet or digit. Front 0's removed. Num2words
    """

    

    ts = docs.split('\\n')
    docs = ' '.join(ts)
    ts = docs.split('\t')

    docs = ' '.join(ts)
  
    # Tokenization
    tokens = WordPunctTokenizer().tokenize(docs)

    # lowercase
    tokens_lowercase = [ w.lower() for w in tokens]
    
    #Remove Stop words
    tokens_stop  = [ w for w in tokens_lowercase if(w not in stop_ws)] 
    
    # Stemming 
    tokens_stem = [ PorterStemmer().stem(w) for w in tokens_stop]   # .wes. we

    # Lemmatization
    updated_tokens = [ WordNetLemmatizer().lemmatize(w) for w in tokens_stem]
     
    final_tokens = []
    
    for updated_token in updated_tokens:
        if(updated_token[0].isalpha()) and (len(updated_token) > 1):
            final_tokens.append(updated_token)
        else:
            if(updated_token.isnumeric()):
                final_tokens.append(num2words(Decimal(updated_token)))
                
#                 final_tokens.append(str(int(updated_token)))
#             else:
#                 if(updated_token[0].isdigit()):
#                     updated_token = updated_token.lstrip('0')
#                     final_tokens.append(updated_token)
        
    
    final_tokens = final_tokens[1:]  # remove b

    return final_tokens

In [5]:
def tokenizeInput(inp):
    """
        inp : Not a stop word
        
        0. Convert to lowercase
        1. Stemming
        2. Lemmatization
        3. Only words that starts with alphabet or digit.
    """
    
    if(inp.isnumeric()):
#         print('y')
        return num2words(Decimal(inp))
    
    try:
        if(isinstance(w2n.word_to_num(inp), int)):
            return inp
    
    except ValueError:
        pass
       
    
    # lowercase
    inp = inp.lower()
    
    # Stemming 
    inp_stem = PorterStemmer().stem(inp)
    
    # Lemmatization
    inp_lemma = WordNetLemmatizer().lemmatize(inp_stem)
    
    inp_lemma = inp_lemma.lstrip('0')
    
    #strip spaces 
    inp_lemma = inp_lemma.strip()
    
    return inp_lemma


In [6]:
#### Inverted Index ####

def makeIndex(token_docid):
    inverted_index = {}

    for element in token_docid:
        term = element[0]
        docid = element[1]
        if(term not in inverted_index.keys()):
            postings_list = [0,{}]  # df, key = docid, value = tf
            postings_list[0] = 1
            postings_list[1][docid] = 1 #tf 
            inverted_index[term] = postings_list
        else:
            plist = inverted_index[term] 
            if docid not in plist[1].keys():
                plist[1][docid] = 1 #tf 
                plist[0]+=1
            else:
                plist[1][docid]+=1

    return inverted_index


In [7]:
def queryTermsOR(qts, title_index, text_index):
    docs_found = []
    
    for qt in qts:
        if(qt in text_index.keys()):
            for doc_id in text_index[qt][1].keys():
                if(doc_id not in docs_found):
                    docs_found.append([qt, doc_id])
                    
        if(qt in title_index.keys()):
            for doc_id in title_index[qt][1].keys():
                if(doc_id not in docs_found):
                    docs_found.append([qt, doc_id])
            
    return docs_found


In [8]:
def getResults(query_docs, title_index, text_index):
    score = []

    for qt_docid in query_docs:
        a = 0
        b = 0
        #check qt_docid[1] is from title or text index
        
        if qt_docid[0] in title_index.keys():
            
            if qt_docid[1] in title_index[qt_docid[0]][1].keys():
                
                tf_title = 1 + math.log(title_index[qt_docid[0]][1][qt_docid[1]], 10)
                idf_title = math.log(N/title_index[qt_docid[0]][0], 10)
                a = 0.7 * (tf_title) * (idf_title)
            
        else:
            a = 0
        

        if qt_docid[0] in text_index.keys():  
            
            if qt_docid[1] in text_index[qt_docid[0]][1].keys():
                
                tf_all = 1 + math.log(text_index[qt_docid[0]][1][qt_docid[1]], 10)
                idf_all = math.log(N/text_index[qt_docid[0]][0], 10)

                if(a!=0):
                    tf_body = tf_all - tf_title
                    idf_body = idf_all - idf_title
                    b = 0.3 * (tf_body * idf_body)
                else:
                    tf_body = tf_all 
                    idf_body = idf_all 
                    b = 0.3 * (tf_body * idf_body)
              
        else:
            b = 0

        s = a + b
        score.append([s, qt_docid[0],qt_docid[1]])   # tfidf, term, doc_id

    return score

In [9]:
def docsVec():
    global tdm
    k = 0
    for term, plist in text_inverted_index.items():  # every term,plist 
        for i in range(1,N+1):      # every doc id
            # calculate tf idf of term, for every doc id
            if i in plist[1].keys():
                c = getResults([[term, i]], title_inverted_index, text_inverted_index) # tfidf, term, doc_id
                tdm[k][i-1] = c[0][0]
        k+=1

In [10]:
def queryVec(query_tokens):
    
    query_vec = np.zeros((V,1))
    k= 0
    unq, cnt = np.unique(query_tokens, return_counts = True)

    for term in text_inverted_index.keys():
        if(term in query_tokens): 
            # Query vector # find index of  term in unq then get cnt[index]
            pos = np.nonzero(unq == term)[0][0]
            tf_query = 1 + math.log(cnt[pos], 10) 
            
            df_from_textIn = text_inverted_index[term][0]
            
            if(term in title_inverted_index.keys()):
                df_from_titleIn = title_inverted_index[term][0]
            else:
                df_from_titleIn = 0
            
            body_df = df_from_textIn - df_from_titleIn
#             print(df_from_textIn, df_from_titleIn)
            if(df_from_titleIn!=0 and body_df!=0):
                
                idf_query = 0.7 * math.log(N/df_from_titleIn, 10) + 0.3 * (math.log(N/body_df, 10))
            else:
                idf_query = math.log(N/df_from_textIn, 10)
                
            query_vec[k] = tf_query * idf_query
        else:
            query_vec[k] = 0
        k+=1
    
    return query_vec

In [11]:
def  displayResult(ans, r, cs = False):
    top_answer = sorted(ans, reverse=True) # Top 10 results
    # print(top_answer)
    best_ans = []

    for ik in top_answer:
        if(ik not in best_ans):
            best_ans.append(ik)

    if(cs):
        show_result = pd.DataFrame(best_ans, columns =["score","docid"])
    else:
        show_result = pd.DataFrame(best_ans, columns =["score", "qterm", "docid"])
        
    show_docName = show_result.iloc[:,-1].unique()[:r]
    
    print("")
    for docid in show_docName:
        print(docs_table.iloc[docid-1, 1])
    print("")
    
    return

In [14]:

# Get strories title from index.html
stories_path = os.getcwd() + "/stories/"
index_path = os.getcwd() + "/stories/index/"

docs_table  = getData(index_path)
N = docs_table.shape[0]


token_docId = []
tk_docId = []


for i in range(0, docs_table.shape[0]):
    doc_id = docs_table.iloc[i, 0]
    title = docs_table.iloc[i, 2]
    content = docs_table.iloc[i, 3]
    title_tokens = tokenizeDocument(title)
    content_tokens = tokenizeDocument(content)

    for token in title_tokens:
        token_docId.append([token, doc_id])
        
    for token in content_tokens:
        tk_docId.append([token, doc_id])

In [15]:

# Title Inverted Index
title_inverted_index = makeIndex(token_docId)

text_inverted_index = makeIndex(tk_docId)

# pprint(text_inverted_index['dragon'])

# print(len(text_inverted_index), len(title_inverted_index))

V = len(text_inverted_index)

In [16]:
# Q1. Simple tf-idf
def q1(query):
    
    query_terms = query.split(' ')
    query_tokens = [ tokenizeInput(term) for term in query_terms ]

#     print(query_tokens)
    query_docs = queryTermsOR(query_tokens, title_inverted_index, text_inverted_index)
    # print(query_docs)

    ans = getResults(query_docs, title_inverted_index, text_inverted_index)

    # Take input from user
    k = 10

    #show docs names
    displayResult(ans, k)

In [17]:
# Q2. VSM

# Term Document Matrix
tdm = np.zeros((V,N))
docsVec()
# Normalize document vectors
euclid_dist = np.linalg.norm(tdm, axis=0)
# print(euclid_dist)
# new_dist = (1/euclid_dist)

for c in range(0, tdm.shape[1]):
#     tdm[:,c] = new_dist[c]*tdm[:,c]    
    tdm[:,c] = tdm[:,c]/euclid_dist[c]    

In [18]:
def q2(query):

    query_terms = query.split(' ')
    query_tokens = [ tokenizeInput(term) for term in query_terms if(term not in stop_ws) ]
#     print(query_tokens)
    query_docs = queryTermsOR(query_tokens, title_inverted_index, text_inverted_index)

    #Query Vecor
    query_vec = queryVec(query_tokens)
    

    # normalize query vector
    euclid_dist_qy = np.linalg.norm(query_vec)
    # print(euclid_dist_qy)
    query_vec = (1/euclid_dist_qy)*query_vec
    # print(query_vec.T[0])
    query_vec_new = np.array(list(query_vec.flat))
    # Cosine similarity
    possible_ans = [ qt_docid[1] for qt_docid in query_docs]

    similarity = []
    for doc_id in possible_ans:
        match_score = np.inner(tdm[:,doc_id-1], query_vec_new)
        similarity.append([match_score, doc_id])
        
    top_ans = sorted(similarity, reverse=True)
    displayResult(top_ans, 10, True)


In [None]:
run = True
while(run):
    print("1. Tf-idf")
    print("2. Cosine similarity with tf-idf")

    t = input('Choose search type : ')

    if(t.isnumeric()):
        if (int(t)== 1):
            query = input("Search : ")
            q1(query)
            
        elif(int(t) == 2):
            query = input("Search : ")
            q2(query)
            
        else:
            run = False
            print("Exiting...")
    else:
        run = False
        print("Exiting...")  

1. Tf-idf
2. Cosine similarity with tf-idf
Choose search type : 1
Search : The Adveniure of the Three Gables

3gables.txt
lionmane.txt
wisteria.txt
rocket.sf
darkness.txt
ltp
descent.poe
hound-b.txt
hitch3.txt
cybersla.txt

1. Tf-idf
2. Cosine similarity with tf-idf
Choose search type : 2
Search : The Adveniure of the Three Gables

3gables.txt
lionmane.txt
wisteria.txt
hound-b.txt
hitch3.txt
cybersla.txt

1. Tf-idf
2. Cosine similarity with tf-idf
Choose search type : 1
Search : for the news which Lestrde would bring

6napolen.txt
breaks2.asc
robotech
bureau.txt
hitch2.txt
hellmach.txt
radar_ra.txt
sre04.txt
rocket.sf
outcast.dos

1. Tf-idf
2. Cosine similarity with tf-idf
Choose search type : 2
Search : for the news which Lestrde would bring

6napolen.txt
adv_alad.txt
bullove.txt
aminegg.txt
crabhern.txt
fleas.txt
valen
empnclot.txt
bran
weeprncs.txt

1. Tf-idf
2. Cosine similarity with tf-idf
