In [1]:
import pickle
import os
from tqdm.notebook import tqdm,tnrange
from nltk.stem import PorterStemmer,SnowballStemmer,WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import stopwords
import json

#punkt and stopqwords
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ussin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ussin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
os.chdir('../CSE508_Winter2023_Dataset/CSE508_Winter2023_Dataset_XML/')

In [3]:
def preprocess(filename):
    # open the file and read the text
    with open(filename, 'r') as f:
        text = f.read()
    # Lowercase the text
    text = text.lower()
    # Perform tokenization
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    # remove punctuations but not hyphen separated words
    tokens = [w for w in tokens if w.isalpha() or '-' in w]
    # Remove blank space tokens
    tokens = [w for w in tokens if w.strip()]
    return tokens

# make a dictionary of bigram tokens as keys containg the list of files they occur in
bigram_dict = {}
def make_dict(filename, tokens):
    for i in range(len(tokens)-1):
        bigram = tokens[i] + ' ' + tokens[i+1]
        if bigram in bigram_dict:
            bigram_dict[bigram].append(filename)
        else:
            bigram_dict[bigram] = [filename]

In [4]:
# run the function preprocess and make_dict for all the files in the directory
for filename in tqdm(os.listdir()):
    tokens = preprocess(filename)
    make_dict(filename, tokens)

# save the dictionary in a pickle file
os.chdir('../InvertedIndex')
# delete file with given filename in directory
if os.path.exists('bigramindex.pickle'):
    os.remove('bigramindex.pickle')

# save the dictionary in a pickle file
with open('bigramindex.pickle', 'wb') as f:
    pickle.dump(bigram_dict, f)

# delete file with given filename in directory
if os.path.exists('bigramindex.txt'):
    os.remove('bigramindex.txt')

with open('bigramindex.txt', 'w') as f:
    f.write(json.dumps(bigram_dict, indent=4))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1400.0), HTML(value='')))




In [17]:
with open('bigramindex.pickle', 'rb') as handle:
    bigramindex = pickle.load(handle)

def get_posting_list(term):
    if term in bigramindex:
        return bigramindex[term]
    else:
        return []

def and_query(posting_list1, size1, posting_list2, size2):
    result = []
    i = 0
    j = 0
    while i < size1 and j < size2:
        if posting_list1[i] == posting_list2[j]:
            result.append(posting_list1[i])
            i += 1
            j += 1
        elif posting_list1[i] < posting_list2[j]:
            i += 1
        else:
            j += 1
    return result

def bigram_query(query):
    # query = input("Enter your query: ")
    # query = preprocess(query)
    query = query.lower()
    # Perform tokenization
    tokens = word_tokenize(query)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    # remove punctuations but not hyphen separated words
    tokens = [w for w in tokens if w.isalpha() or '-' in w]
    # Remove blank space tokens
    tokens = [w for w in tokens if w.strip()]
    # query = set(tokens)
    query = list(tokens)
    # print(query)
    bi_query = []

    for i in range(len(query)-1):
        bi_query.append(query[i] + ' ' + query[i+1])
    length = len(bi_query)-1
    result = get_posting_list(bi_query[0])
    size = len(result)
    itr = 0
    while itr < length:
        result = and_query(result, size, get_posting_list(bi_query[itr+1]), len(get_posting_list(bi_query[itr+1])))
        size = len(result)
        itr += 1
    return result
    # if query in bigram_dict:
    #     print("The query is present in the following files:")
    #     for file in bigram_dict[query]:
    #         print(file)
    # else:
    #     print("The query is not present in any file.")  


In [18]:
bigram_query('possess in order to support')

['cranfield1396']

In [7]:
os.chdir('../CSE508_Winter2023_Dataset_XML/')

In [8]:
db = {}

# dictionary of token as key and list of wehere the filename itself is dictionary and positions as value
def make_dict(tokens, filename):
    for i in range(len(tokens)):
        if tokens[i] in db:
            if filename in db[tokens[i]]:
                db[tokens[i]][filename].append(i)
            else:
                db[tokens[i]][filename] = [i]
        else:
            db[tokens[i]] = {filename: [i]}

In [9]:
for filename in tqdm(os.listdir()):
    tokens = preprocess(filename)
    make_dict(tokens, filename)

# save the dictionary in a pickle file
os.chdir('../InvertedIndex')
# delete file with given filename in directory
if os.path.exists('positional.pickle'):
    os.remove('positional.pickle')

# save the dictionary in a pickle file
with open('positional.pickle', 'wb') as f:
    pickle.dump(db, f)

# delete file with given filename in directory
if os.path.exists('positional.txt'):
    os.remove('positional.txt')

with open('positional.txt', 'w') as f:
    f.write(json.dumps(db, indent=4))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1400.0), HTML(value='')))




In [21]:
os.chdir('../InvertedIndex')

# load positional index pickle file
with open('positional.pickle', 'rb') as f:
    db = pickle.load(f)


# positional query function that takes a query as input and returns the files that contain the query in the order of the query
def positional_query(query):
    query = query.lower()
    tokens = word_tokenize(query)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    # remove punctuations but not hyphen separated words
    tokens = [w for w in tokens if w.isalpha() or '-' in w]
    # remove blank space tokens
    tokens = [w for w in tokens if w.strip()]
    # the db stored in the format of {token: {filename: [positions]}}
    # find the files that contain the first word in the query
    files = db[tokens[0]]
    # find the files that contain the rest of the words in the query
    for i in range(1, len(tokens)):
        files = [file for file in files if file in db[tokens[i]]]
    # find the files that contain the words in the query in the order of the query also they should be also if they are at same distance using position as in the query the disctionary stores {token: {filename: [positions]}}
    for file in files:
        positions = [db[tokens[0]][file]]
        for i in range(1, len(tokens)):
            positions.append(db[tokens[i]][file])
        # check if the positions are in the order of the query and also if they are at same distance using position as in the query the disctionary stores {token: {filename: [positions]}}
        for i in range(len(positions)-1):
            if(positions[i][-1] > positions[i+1][0]):
                files.remove(file)
                break
        
    return files

# get posting list function for positional index
def get_posting_list(token):
    if token in db:
        return list(db[token].items())
    else:
        return []

#positional intersect function which takes 2 posting list and takes distance as input and check is docid is same and if the positions are at same distance
# def positional_intersect(posting_list1, posting_list2, distance=1):
#     result = []
#     i = 0
#     j = 0
#     while i < len(posting_list1) and j < len(posting_list2):
#         if posting_list1[i][0] == posting_list2[j][0]:
#             if 1 <= posting_list1[i][1][-1] - posting_list2[j][1][0]  <= distance:
#                 result.append(posting_list1[i])
#             i += 1
#             j += 1
#         elif posting_list1[i][0] < posting_list2[j][0]:
#             i += 1
#         else:
#             j += 1
#     return result

# function that takes query and preprocesses it to get tokens iterate over it to get the posting list of each token and then intersect them to get the final result
# def positional_query(query):
#     query = query.lower()
#     tokens = word_tokenize(query)
#     # Remove stopwords
#     stop_words = set(stopwords.words('english'))
#     tokens = [w for w in tokens if not w in stop_words]
#     # remove punctuations
#     tokens = [w for w in tokens if w.isalpha() or '-' in w]
#     # remove blank space tokens
#     tokens = [w for w in tokens if w.strip()]
#     # the db stored in the format of {token: {filename: [positions]}}
#     # find the files that contain the first word in the query
#     posting_list = get_posting_list(tokens[0])
#     # find the files that contain the rest of the words in the query distance is the differece between psitions of the words in the query is 1 and step size is 1
#     for i in range(1, len(tokens)):
#          #distance between the words in the query is the third argumnet
#         posting_list = positional_intersect(posting_list, get_posting_list(tokens[i]), 1)
#     # now check same posting list for words seperated  by 2 words
#     for i in range(1, len(tokens),2):
#         posting_list = positional_intersect(posting_list, get_posting_list(tokens[i]), 2)
#     # now check same posting list for words seperated  by 3 words
#     for i in range(1, len(tokens),3):
#         posting_list = positional_intersect(posting_list, get_posting_list(tokens[i]), 3)
#     # now check same posting list for words seperated  by 4 words
#     for i in range(1, len(tokens),4):
#         posting_list = positional_intersect(posting_list, get_posting_list(tokens[i]), 4)
#     # now check same posting list for words seperated  by 5 words
#     for i in range(1, len(tokens),5):
#         posting_list = positional_intersect(posting_list, get_posting_list(tokens[i]), 5)

#     return posting_list
# for i in range(len(positions)-1):
#             if positions[i][-1] > positions[i+1][0]:
#                 files.remove(file)
#                 break


# for i in range(len(positions)-1):
        #     d = 0
        #     if not(d >= positions[i+1][0] - positions[i][-1] >= 1):
        #         files.remove(file)
        #         break


def positional_query(query, distance=1):
    query = query.lower()
    tokens = word_tokenize(query)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    # remove punctuations
    tokens = [w for w in tokens if w.isalpha()]
    # remove blank space tokens
    tokens = [w for w in tokens if w.strip()]
    # find the files that contain the first word in the query
    files = db[tokens[0]]
    # find the files that contain the rest of the words in the query
    for i in range(1, len(tokens)):
        files = [file for file in files if file in db[tokens[i]]]
    # find the files that contain the words in the query in the order of the query


    result = []
    for file in files:
        positions = [db[tokens[0]][file]]
        for i in range(1, len(tokens)):
            # final=[]
            positions.append(db[tokens[i]][file])
        for i in range(len(positions)-1):
            final=[]
            a =0
            b =0
            pos1=positions[i]
            pos2=positions[i+1]
            while a < len(pos1) and b < len(pos2):
                if (distance >= pos2[b]-pos1[a] >= 1):
                    final.append(pos2[b])
                    a+=1
                    b+=1
                elif pos1[a] > pos2[b]:
                    b += 1
                else:
                    a += 1
            positions[i+1]=final
        if len(positions[-1]) != 0:
            result.append(file)
    return result

In [24]:
def main():
    n = int(input())
    queries=[]
    for j in range(n):
        s = input()
        queries.append(s)
    
    for i in range(n):
        query1 = queries[i]
        query2 = queries[i]
        bi_result = bigram_query(query1)
        po_result = positional_query(query2)
        print("Number of documents retrieved for query {} using bigram inverted index: {}".format(i+1, len(bi_result)))
        print("Names of documents retrieved for query {} using using bigram inverted index: {}".format(i+1, bi_result))
        print("Number of documents retrieved for query {} using positional index: {}".format(i+1, len(po_result)))
        print("Names of documents retrieved for query {} using positional index: {}".format(i+1, po_result))

main()

Number of documents retrieved for query 1 using bigram inverted index: 0
Names of documents retrieved for query 1 using using bigram inverted index: []
Number of documents retrieved for query 1 using positional index: 1
Names of documents retrieved for query 1 using positional index: ['cranfield1396']
