In [57]:
from nltk.tokenize import RegexpTokenizer
from nltk import word_tokenize
import time

## Naive indexer code

In [58]:
# Regex-based extractors for the Reuters SGML files.  NLTK's RegexpTokenizer
# compiles its pattern with re.UNICODE | re.MULTILINE | re.DOTALL by default,
# so `.` also matches newlines; a pattern containing one capturing group
# yields only the captured text for each match.
content_tokenizer = RegexpTokenizer('<TEXT.*?>(.*?)</TEXT>')    # text between <TEXT ...> and </TEXT>
article_tokenizer = RegexpTokenizer('<REUTERS(.*?)</REUTERS>')  # one match per <REUTERS> article
id_tokenizer = RegexpTokenizer('NEWID="(.*?)"')                 # value of the NEWID attribute
metadata_tokenizer = RegexpTokenizer('<.*?>')                   # any tag
# Character references such as &lt; &amp; or &#3;, wherever they occur.
# The former pattern '^&.*?;' only recognised a reference that started a line
# and, because `.` spans newlines, could run from a leading '&' over
# unrelated text up to the next ';'.
html_entities_tokenizer = RegexpTokenizer(r'&#?\w+;')
postings_list = {}
f = []  # (docID, token) pairs collected by the naive indexer
reuters_corpus = []
token_count = 0  # running count of tokens consumed by the indexing loop


def remove_metadata(text):
    """Return ``text`` with all markup fragments deleted.

    The fragments are whatever ``metadata_tokenizer`` and
    ``html_entities_tokenizer`` find in the original ``text``; every
    occurrence of each fragment is removed from the string.
    """
    fragments = metadata_tokenizer.tokenize(text) + html_entities_tokenizer.tokenize(text)
    cleaned = text
    for fragment in fragments:
        cleaned = cleaned.replace(fragment, '')
    return cleaned


# Tokens that consist of punctuation only: the single symbols plus the
# multi-character quote and ellipsis tokens.
_PUNCTUATION_TOKENS = frozenset([*".,:;-<>{}()[]~`", '""', "''", "``", "..."])


def remove_punctuation(tokens):
    """Remove punctuation-only tokens from ``tokens``.

    The list is filtered in place and also returned, so both
    ``tokens = remove_punctuation(tokens)`` and a bare call work.

    Args:
        tokens: list of token strings; it is modified in place.

    Returns:
        The same list object, without any token found in
        ``_PUNCTUATION_TOKENS``.
    """
    # Rebuild the contents in a single pass.  Calling list.remove() while
    # iterating over the same list skips the element that follows each
    # removal, which left adjacent punctuation tokens behind.
    tokens[:] = [token for token in tokens if token not in _PUNCTUATION_TOKENS]
    return tokens


# Start timer
start_time = time.time()

# Pass 1: collect (docID, token) pairs from the corpus files until exactly
# 10,000 tokens have been recorded.
for i in range(22):
    doc = f"./reuters21578/reut2-0{i:02d}.sgm"
    try:
        # Read every file the same way.  latin-1 maps each byte to one
        # character, so no file can raise UnicodeDecodeError (file 17 did
        # under the default codec).  The old workaround indexed str(bytes),
        # i.e. the "b'...'" repr with escaped newlines, and never closed the
        # file handle.  Assumes the collection is single-byte encoded.
        with open(doc, 'rt', encoding='latin-1') as sgm_file:
            file = sgm_file.read()
        articles = article_tokenizer.tokenize(file)

        for article in articles:
            ID = id_tokenizer.tokenize(article)[0]  # Get ID
            contents = content_tokenizer.tokenize(article)  # Get content inside <TEXT> tags
            contents = ' '.join(contents)
            contents = remove_metadata(contents)
            # Tokenize article content, need to get title and body
            tokens = word_tokenize(contents)
            tokens = remove_punctuation(tokens)
            for token in tokens:
                # Record the pair before testing the budget so the 10,000th
                # token is indexed too (the check used to fire first and
                # dropped it).
                f.append((ID, token))
                token_count += 1
                if token_count == 10000:
                    # The exception unwinds all three nested loops at once.
                    raise StopIteration
    except IOError:
        print("Error: File does not exist")
    except StopIteration:
        print("Reached 10,000 tokens")
        break

# Pass 2: sort by numeric docID (stable, so token order within a document is
# kept), drop duplicate pairs while preserving order, then group by term.
f.sort(key=lambda x: int(x[0]))
f = list(dict.fromkeys(f))
index_naive = {}
for doc_id, term in f:
    index_naive.setdefault(term, []).append(doc_id)

# End timer
print(f"Naive indexer took: {(time.time() - start_time)} seconds")

Reached 10,000 tokens
Naive indexer took: 0.5081589221954346 seconds


## SPIMI indexer

In [59]:
reuters_corpus = []
token_count = 0
index_spimi = {}  # term -> list of docIDs, in the order the documents are read

# Start timer
start_time = time.time()

# Single pass: add each token to its postings list as soon as it is seen,
# stopping once exactly 10,000 tokens have been processed.
for i in range(22):
    doc = f"./reuters21578/reut2-0{i:02d}.sgm"
    try:
        # Read every file the same way.  latin-1 maps each byte to one
        # character, so no file can raise UnicodeDecodeError (file 17 did
        # under the default codec).  The old workaround indexed str(bytes),
        # i.e. the "b'...'" repr with escaped newlines, and never closed the
        # file handle.  Assumes the collection is single-byte encoded.
        with open(doc, 'rt', encoding='latin-1') as sgm_file:
            file = sgm_file.read()
        articles = article_tokenizer.tokenize(file)

        for article in articles:
            ID = id_tokenizer.tokenize(article)[0]  # Get ID
            contents = content_tokenizer.tokenize(article)  # Get content inside <TEXT> tags
            contents = ' '.join(contents)
            contents = remove_metadata(contents)
            # Tokenize article content, need to get title and body
            tokens = word_tokenize(contents)
            tokens = remove_punctuation(tokens)
            for token in tokens:
                # Create the postings list on first sight of the term and
                # only ever append to it.  The previous if/else replaced the
                # whole list with [ID] whenever a term repeated inside one
                # document, discarding every earlier document.
                postings = index_spimi.setdefault(token, [])
                # An article's tokens are handled in one contiguous run, so a
                # docID already recorded for this term is the last entry.
                # NOTE(review): relies on NEWID being unique per article.
                if not postings or postings[-1] != ID:
                    postings.append(ID)
                # Count after indexing so the 10,000th token is included.
                token_count += 1
                if token_count == 10000:
                    # The exception unwinds all three nested loops at once.
                    raise StopIteration

    except IOError:
        print("Error: File does not exist")
    except StopIteration:
        print("Reached 10,000 tokens")
        break

# End timer
print(f"SPIMI indexer took: {(time.time() - start_time)} seconds")

Reached 10,000 tokens
SPIMI indexer took: 0.16207098960876465 seconds


## Create inverted index without compression techniques

In [62]:
reuters_corpus = []
index_spimi = {}  # term -> list of docIDs, in the order the documents are read

# Start timer
start_time = time.time()

# Single pass over the whole corpus with no token limit.
for i in range(22):
    doc = f"./reuters21578/reut2-0{i:02d}.sgm"
    try:
        # Read every file the same way.  latin-1 maps each byte to one
        # character, so no file can raise UnicodeDecodeError (file 17 did
        # under the default codec).  The old workaround indexed str(bytes),
        # i.e. the "b'...'" repr with escaped newlines, and never closed the
        # file handle.  Assumes the collection is single-byte encoded.
        with open(doc, 'rt', encoding='latin-1') as sgm_file:
            file = sgm_file.read()
        articles = article_tokenizer.tokenize(file)

        for article in articles:
            ID = id_tokenizer.tokenize(article)[0]  # Get ID
            contents = content_tokenizer.tokenize(article)  # Get content inside <TEXT> tags
            contents = ' '.join(contents)
            contents = remove_metadata(contents)
            # Tokenize article content, need to get title and body
            tokens = word_tokenize(contents)
            tokens = remove_punctuation(tokens)
            for token in tokens:
                # Create the postings list on first sight of the term and
                # only ever append to it.  The previous if/else replaced the
                # whole list with [ID] whenever a term repeated inside one
                # document, discarding every earlier document.
                postings = index_spimi.setdefault(token, [])
                # An article's tokens are handled in one contiguous run, so a
                # docID already recorded for this term is the last entry;
                # checking it avoids scanning the whole list per token.
                # NOTE(review): relies on NEWID being unique per article.
                if not postings or postings[-1] != ID:
                    postings.append(ID)

    except IOError:
        print("Error: File does not exist")

# End timer
print(f"Whole corpus indexer took: {(time.time() - start_time)} seconds")

Whole corpus indexer took: 28.37154483795166 seconds
