In [10]:
from nltk.tokenize import RegexpTokenizer
from nltk import word_tokenize
import time
from math import log

## Getting subcorpus of 10,000 tokens

In [11]:
content_tokenizer = RegexpTokenizer('<TEXT.*?>(.*?)</TEXT>')
article_tokenizer = RegexpTokenizer('<REUTERS(.*?)</REUTERS>')
id_tokenizer = RegexpTokenizer('NEWID="(.*?)"')
metadata_tokenizer = RegexpTokenizer('<.*?>')
html_entities_tokenizer = RegexpTokenizer('^&.*?;')
token_count = 0
sub_corpus = []


def remove_metadata(text):
    tags = metadata_tokenizer.tokenize(text)
    html_entities = html_entities_tokenizer.tokenize(text)
    metadata = tags + html_entities
    if len(metadata) > 0:
        for element in metadata:
            text = text.replace(element, '')
    return text


def remove_punctuation(tokens):
    punctuation_list = [*".,:;-<>{}()[]~`&*?"]
    double_symbols = ['""', "''", "``", "...", "--", "-", ","]
    punctuation_list += double_symbols
    for t in tokens:
        if t in punctuation_list:
            tokens.remove(t)
    return tokens


for i in range(22):
    doc = f"./reuters21578/reut2-0{i:02d}.sgm"
    try:
        if i != 17:
            with open(doc, 'rt') as file:
                file = file.read()
        else:
            # Needed as file 17 gave me UnicodeDecodeError
            file = open(doc, mode="rb")
            file = file.read()
            file = str(file)
        articles = article_tokenizer.tokenize(file)

        for article in articles:
            ID = id_tokenizer.tokenize(article)[0]  # Get ID
            contents = content_tokenizer.tokenize(article)  # Get content inside <TEXT> tags
            contents = ' '.join(contents)
            contents = remove_metadata(contents)
            # Tokenize article content, need to get title and body
            tokens = word_tokenize(contents)
            tokens = remove_punctuation(tokens)
            # Add ID to sub_corpus element
            sub_corpus.append([ID])
            article_tokens = []
            for token in tokens:
                token_count += 1
                if token_count < 10000:
                    article_tokens.append(token)
                else:
                    sub_corpus[int(ID) - 1].append(article_tokens)
                    raise StopIteration
            sub_corpus[int(ID) - 1].append(article_tokens)
    except IOError:
        print("Error: File does not exist")
    except StopIteration:
        print("Reached 10,000 tokens")
        break

Reached 10,000 tokens


In [12]:
print(sub_corpus)

[['1', ['BAHIA', 'COCOA', 'REVIEW', 'SALVADOR', 'Feb', '26', 'Showers', 'continued', 'throughout', 'the', 'week', 'in', 'the', 'Bahia', 'cocoa', 'zone', 'alleviating', 'the', 'drought', 'since', 'early', 'January', 'and', 'improving', 'prospects', 'for', 'the', 'coming', 'temporao', 'although', 'normal', 'humidity', 'levels', 'have', 'not', 'been', 'restored', 'Comissaria', 'Smith', 'said', 'in', 'its', 'weekly', 'review', 'The', 'dry', 'period', 'means', 'the', 'temporao', 'will', 'be', 'late', 'this', 'year', 'Arrivals', 'for', 'the', 'week', 'ended', 'February', '22', 'were', '155,221', 'bags', 'of', '60', 'kilos', 'making', 'a', 'cumulative', 'total', 'for', 'the', 'season', 'of', '5.93', 'mln', 'against', '5.81', 'at', 'the', 'same', 'stage', 'last', 'year', 'Again', 'it', 'seems', 'that', 'cocoa', 'delivered', 'earlier', 'on', 'consignment', 'was', 'included', 'in', 'the', 'arrivals', 'figures', 'Comissaria', 'Smith', 'said', 'there', 'is', 'still', 'some', 'doubt', 'as', 'to', '

## Naive indexer code

In [13]:
f = []

# Start timer
start_time = time.time()

# Get doc,ID pairs

for i in range(len(sub_corpus)):
    for token in sub_corpus[i][1]:
        f.append((sub_corpus[i][0], token))

# Sort and remove duplicates
f.sort(key=lambda x: int(x[0]))
f = list(dict.fromkeys(f))
index_naive = {}
for pair in f:
    # Pair[0] is docID and pair[1] is word
    if pair[1] in index_naive:
        index_naive[pair[1]].append(pair[0])
    else:
        index_naive[pair[1]] = [pair[0]]
# End timer
print(f"Naive indexer took: {(time.time() - start_time)} seconds")

Naive indexer took: 0.020310163497924805 seconds


## SPIMI indexer

In [14]:
index_spimi = {}

# Start timer
start_time = time.time()
token_count = 0

for article in sub_corpus:
    ID = article[0]
    for token in article[1]:
        if token in index_spimi:
            if not ID in index_spimi[token]:
                index_spimi[token].append(ID)
        else:
            index_spimi[token] = [ID]
# End timer
print(f"SPIMI indexer took: {(time.time() - start_time)} seconds")

SPIMI indexer took: 0.011925220489501953 seconds


## Create inverted index without compression techniques

### Create corpus of reuters in the form (ID,tokens)

In [15]:
reuters_corpus = []
index_corpus = {}

for i in range(22):
    doc = f"./reuters21578/reut2-0{i:02d}.sgm"
    try:
        if i != 17:
            with open(doc, 'rt') as file:
                file = file.read()
        else:
            # Needed as file 17 gave me UnicodeDecodeError
            file = open(doc, mode="rb")
            file = file.read()
            file = str(file)
        articles = article_tokenizer.tokenize(file)
        for article in articles:
            ID = id_tokenizer.tokenize(article)[0]  # Get ID
            contents = content_tokenizer.tokenize(article)  # Get content inside <TEXT> tags
            contents = ' '.join(contents)
            contents = remove_metadata(contents)
            tokens = word_tokenize(contents)
            reuters_corpus.append((ID, tokens))
    except IOError:
        print("Error: File does not exist")

### Indexing

In [16]:
# Start timer
start_time = time.time()
for article in reuters_corpus:
    ID = article[0]
    tokens = set(article[1])
    for token in tokens:
        if token in index_corpus:
            index_corpus[token].append(ID)
        else:
            index_corpus[token] = [ID]
# End timer
print(f"Whole corpus indexer took: {(time.time() - start_time)} seconds")

Whole corpus indexer took: 1.7319340705871582 seconds


# Subproject 2


## Create index with term frequency

In [17]:
inverted_index = {}
doc_len_list = []
for doc in reuters_corpus:
    ID = doc[0]
    doc_len_list.append(len(doc[1]))  # Appends doc length for use in BM25
    tokens = remove_punctuation(doc[1])
    no_duplicates = set(tokens)
    for token in no_duplicates:
        freq = tokens.count(token)
        if token in inverted_index:
            if not ID in inverted_index[token]:
                inverted_index[token][ID] = freq
        else:
            inverted_index[token] = {ID: freq}

In [50]:
avg_doc_len = int(sum(doc_len_list) / len(doc_len_list))
total_doc_num = len(reuters_corpus)


def compute_bm25(tf, df, doc_len, k, b):
    log_part = log(total_doc_num / df)
    numerator = (k + 1) * tf
    denominator = (k * ((1 - b) + b * (doc_len / avg_doc_len))) + tf
    score = log_part * (numerator / denominator)
    return score


# Modes are bm25, AND, OR
def process_query(query, mode):
    query = set(word_tokenize(query))
    results = []
    # OR query
    if mode == "OR":
        results = {}
        for element in query:
            if element in inverted_index:
                for ID, tf in inverted_index[element].items():
                    if ID in results:
                        results[ID] += tf
                    else:
                        results[ID] = tf
        results = sorted(results.items(), key=lambda item: item[1], reverse=True)
        results = [x[0] for x in results]
    elif mode == "AND":
        for element in query:
            if element in inverted_index:
                results.append(list(inverted_index[element].keys()))
        # Find intersection
        if results:
            intersection = set(results[0])
            for x in results[1:]:
                temp_list = set(x) & intersection
                intersection = temp_list
            results = intersection
    elif mode == "BM25":
        k = 1.1
        b = 0.5
        results = {}
        for word in query:
            if word in inverted_index:
                df = len(inverted_index[word])
                for doc in inverted_index[word].keys():
                    tf = inverted_index[word][doc]
                    doc_len = len(reuters_corpus[int(doc) - 1][1])  # len of doc
                    # print(doc)
                    # print(tf)
                    # print(df)
                    # print(doc_len)
                    score = compute_bm25(tf, df, doc_len, k, b)
                    if word in results:
                        results[doc] += score
                    else:
                        results[doc] = score
        results = sorted(results.items(), key=lambda item: item[1], reverse=True)
        results = [x[0] for x in results]
    return results

## Test queries

In [51]:
test_query_a = "Samjens"
print(process_query(test_query_a, "BM25"))  # Returns list of IDs, for OR and BM25 they are ordered by docID

['18071', '17863', '19419', '17837']


In [52]:
test_query_b = "Smith likes play football"
print(process_query(test_query_b, "BM25"))  # Returns list of IDs, for OR and BM25 they are ordered by docID

['1294', '6918', '15592', '3699', '19331', '8927', '13197', '20457', '21090', '11971', '15589', '13908', '1', '19641', '3877', '19698', '11786', '19496', '8991', '5869', '664', '11194', '4923', '16617', '18267', '5711', '7088', '19885', '2669', '6933', '3380', '9885', '2292', '10793', '11561', '6662', '21273', '25', '222', '7232', '18847', '18938', '21200', '17756', '2277', '1252', '2473', '9817', '3798', '16593', '1144', '7946', '9456', '7195', '7822', '2233', '5016', '3941', '15711', '16445', '7373', '12772', '14533', '16430', '15058', '3727', '10116', '15761', '8417', '3705', '3712', '12107', '11600', '9099', '5480', '4391', '32', '55', '3864', '1963', '18452', '3372', '17754', '3682', '18443', '12089', '18167', '4970', '18350', '6401', '12370', '16588', '11141', '2916', '3716', '5468', '9150', '14820', '14904', '8035', '8097', '8110', '8189', '8192', '21080', '11533', '12828', '17721', '17508', '20030', '19539', '5888', '14220', '3943', '13039', '19882', '2735', '16061', '10979', '

In [53]:
test_query_c = "Hong Kong investment firm"
print(process_query(test_query_c, "AND"))  # Returns list of IDs, for OR and BM25 they are ordered by docID

{'223', '12427', '3933', '10291', '960', '9206', '12610', '14888', '12397', '8085', '9717', '11782', '16872', '125'}


In [54]:
test_query_d = "Concordia university"
print(process_query(test_query_d, "OR"))  # Returns list of IDs, for OR and BM25 they are ordered by docID

['9479', '10134', '15075', '19594', '153', '340', '8729', '8763', '10179', '10199', '10346', '12431', '14635', '17428', '18448']


In [55]:
deliverables_queries = ['Democrats’ welfare and healthcare reform policies', 'Drug company bankruptcies', 'George Bush']
f = open("test_queries.txt", "w")
for x in deliverables_queries:
    f.write("Query: ")
    f.write(str(x))
    f.write("BM25 results:")
    f.write(str(process_query(x, "BM25")))
    f.write("OR results:")
    f.write(str(process_query(x, "OR")))
    f.write("AND results:")
    f.write(str(list(process_query(x, "AND"))))
    print("Digestible format")
    print("Query: " + x)
    print("BM25:")
    print(process_query(x, "BM25")[:20])
    print("OR:")
    print(process_query(x, "OR")[:20])
    print("AND:")
    print(list(process_query(x, "AND"))[:20])

f.close()

Digestible format
Query: Democrats’ welfare and healthcare reform policies
BM25:
['21577', '14976', '7467', '219', '9248', '9096', '12326', '1999', '9347', '7806', '13257', '15661', '13671', '9688', '1908', '3551', '18161', '18449', '20890', '3289']
OR:
['9636', '9180', '11224', '7200', '17953', '5589', '12277', '11204', '7135', '6657', '5891', '5230', '5386', '340', '7589', '1999', '3798', '5318', '12806', '6395']
AND:
[]
Digestible format
Query: Drug company bankruptcies
BM25:
['16771', '8209', '7125', '12461', '12808', '2679', '12242', '18072', '14165', '4435', '11553', '3776', '7506', '9546', '3756', '8789', '15257', '18059', '9085', '3151']
OR:
['21251', '1501', '7094', '12277', '18138', '6157', '1718', '1860', '2428', '3287', '5104', '5482', '12917', '15213', '3115', '3242', '5473', '6203', '9376', '11533']
AND:
[]
Digestible format
Query: George Bush
BM25:
['20891', '16824', '8593', '2711', '20860', '20719', '16780', '4853', '4008', '2766', '10400', '6564', '5459', '2733', '8500