# Inverted Index

In [3]:
# data\tot25\subsets\train80\train100k-corpus.jsonl.gz
# train corpus directory
# evaluation code in Models/posting_list_operations.ipynb

In [3]:
import pandas as pd
from collections import defaultdict, Counter
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import json
import gzip
import pickle 
import numpy as np
from array import array
from datetime import datetime
import shelve
import math

## Preprocessing

In [2]:
#for stopword removal
nltk.download('stopwords')
stop = set(stopwords.words('english') + list(string.punctuation))


# Function to tokenize and preprocess a document
def preprocess(text):
    tokens = set(word_tokenize(text.lower())) # get all tokens
    return [i for i in tokens if i not in stop] # get all tokens without stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kai\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Inverted  Index Creation

In [None]:
try:
    import orjson as _json
    loads = _json.loads
except Exception:
    import json as _json
    loads = _json.loads
    
OUT_PATH = "./data/inverted_index.db"
corpus = "../data/tot25/subsets/train80/train100k-corpus.jsonl.gz"

UNSIGNED_INTEGER_MAX = 2**32 - 1 # max 32 bit unsigned integer size

counter = 0
inverted_index = defaultdict(lambda:array('I'))
#r --> read, t --> text not bytes
with gzip.open(corpus, "rt", encoding = "utf8") as f_in:
    for line_number, line in enumerate(f_in, start = 1):
        counter += 1
        line = line.strip()
        data = loads(line)
        id = int(data.get("id"))
        #safety check to see whether everything can be stored as 32 bit unsigned integer (else need Q parameter for defaultdict parameter)
        if not UNSIGNED_INTEGER_MAX > id > 0:
            raise OverflowError("Invalid ID for 32 bit unsigned integer, need 64 bit") 
        tokens = set(preprocess(data.get("text")))
        for token in tokens:
            inverted_index[token].append(int(data.get("id", "")))
        if counter % 5_000 == 0:
            print(f"read {counter} lines")
# 30 mins

read 5000 lines
read 10000 lines
read 15000 lines
read 20000 lines
read 25000 lines
read 30000 lines
read 35000 lines
read 40000 lines
read 45000 lines
read 50000 lines
read 55000 lines
read 60000 lines
read 65000 lines
read 70000 lines
read 75000 lines
read 80000 lines
read 85000 lines
read 90000 lines
read 95000 lines
read 100000 lines


In [None]:
print(len(inverted_index))
counter = 0
with shelve.open(OUT_PATH) as db:
    for term, posting_list in inverted_index.items():
        db[term] = posting_list
        counter += 1
        if counter % 5_000 == 0:
            print(f"processed {counter} words")
    db.sync()

#28 mins

1572117
processed 5000 words
processed 10000 words
processed 15000 words
processed 20000 words
processed 25000 words
processed 30000 words
processed 35000 words
processed 40000 words
processed 45000 words
processed 50000 words
processed 55000 words
processed 60000 words
processed 65000 words
processed 70000 words
processed 75000 words
processed 80000 words
processed 85000 words
processed 90000 words
processed 95000 words
processed 100000 words
processed 105000 words
processed 110000 words
processed 115000 words
processed 120000 words
processed 125000 words
processed 130000 words
processed 135000 words
processed 140000 words
processed 145000 words
processed 150000 words
processed 155000 words
processed 160000 words
processed 165000 words
processed 170000 words
processed 175000 words
processed 180000 words
processed 185000 words
processed 190000 words
processed 195000 words
processed 200000 words
processed 205000 words
processed 210000 words
processed 215000 words
processed 220000 words


Quick sanity check

In [10]:
INVERTED_INDEX_PATH = "./data/inverted_index.db"

with shelve.open(INVERTED_INDEX_PATH, flag = "r") as db:
    print(f"number of terms: {len(db)}")
    print(f"documents containing mannheim: {db["mannheim"]}")

number of terms: 1572117
documents containing mannheim: array('I', [700, 4407019, 4577884, 5814879, 6253823, 6272189, 6343860, 7037805, 8132208, 8250877, 8351187, 8710621, 11210342, 11210433, 11210473, 11210507, 11210643, 11210745, 11451764, 11981133, 12037783, 12325101, 12556186, 12989055, 14956021, 15584759, 16006531, 17997957, 19218099, 20071648, 20435694, 20865031, 22145651, 23248722, 23643764, 23672923, 24355525, 24449669, 25272420, 26281786, 27995863, 50564, 56166, 58865, 62654, 63742, 68322, 73419, 78747, 81586, 86351, 88817, 95184, 102198, 129332, 144135, 147687, 31996140, 33244287, 33792780, 35313233, 36029833, 37909853, 37918094, 39246316, 43128406, 44348411, 44542304, 50109950, 50294920, 57311508, 59439318, 60797380, 65562716, 66250007, 67041387, 67278971, 68500305, 70375857, 72304317, 72535805, 164640, 177032, 184843, 184846, 184847, 187709, 248551, 276356, 369155, 379216, 492132, 537560, 980439, 1191147, 1319121, 1522205, 1544929, 1866112, 1971360, 2150754, 2197440, 228174

In [15]:
with gzip.open(corpus) as f:
    for line in f:
        line = line.strip()
        data = loads(line)
        id = int(data.get("id"))
        if id == 700: 
            if "Mannheim" in data.get("text"):
                print("passed test")
            print(data)
            break
        if id > 10000:
            print("failed test")
            break

passed test
{'id': '700', 'url': 'https://en.wikipedia.org/wiki/Arthur%20Schopenhauer', 'title': 'Arthur Schopenhauer', 'text': 'Arthur Schopenhauer ( , ; 22 February 1788\xa0– 21 September 1860) was a German philosopher. He is best known for his 1818 work The World as Will and Representation (expanded in 1844), which characterizes the phenomenal world as the manifestation of a blind and irrational noumenal will. Building on the transcendental idealism of Immanuel Kant (1724–1804), Schopenhauer developed an atheistic metaphysical and ethical system that rejected the contemporaneous ideas of German idealism. He was among the first thinkers in Western philosophy to share and affirm significant tenets of Indian philosophy, such as asceticism, denial of the self, and the notion of the world-as-appearance. His work has been described as an exemplary manifestation of philosophical pessimism. Though his work failed to garner substantial attention during his lifetime, Schopenhauer had a posthu

## TF-IDF

In [4]:
#for stopword removal
nltk.download('stopwords')
stop = set(stopwords.words('english') + list(string.punctuation))


# Function to tokenize and preprocess a document
def preprocess_list(text):
    tokens = list(word_tokenize(text.lower())) # get all tokens
    return [i for i in tokens if i not in stop] # get all tokens without stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kai\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


creating term frequencies

In [None]:
try:
    import orjson as _json
    loads = _json.loads
except Exception:
    import json as _json
    loads = _json.loads

OUT_PATH_TF_IDF = "./data/tf_idf.db"
corpus = "../data/tot25/subsets/train80/train100k-corpus.jsonl.gz"

term_postings = defaultdict(Counter) # here we map each term to a counter map doc_id \mapsto tf

counter = 0
with gzip.open(corpus, "rt", encoding = "utf8") as f_in:
    for line_number, line in enumerate(f_in, start = 1):
        counter += 1
        data = loads(line.strip())
        id = int(data.get("id"))
        # the upper preprocess function returns each token only once, hence we adapt to another one
        tokens = preprocess_list(data.get("text"))
        tf_counts = Counter(tokens)
        for term, tf in tf_counts.items():
            term_postings[term][id] = tf
        
        if counter % 5_000 == 0:
            print(f"processed {counter} lines")
    
# 23 min

processed 5000 lines
processed 10000 lines
processed 15000 lines
processed 20000 lines
processed 25000 lines
processed 30000 lines
processed 35000 lines
processed 40000 lines
processed 45000 lines
processed 50000 lines
processed 55000 lines
processed 60000 lines
processed 65000 lines
processed 70000 lines
processed 75000 lines
processed 80000 lines
processed 85000 lines
processed 90000 lines
processed 95000 lines
processed 100000 lines


from term frequencies creating idf and storing everything in db

In [None]:
TOTAL_DOCUMENTS = 100_000 # from 
with shelve.open(OUT_PATH_TF_IDF) as db:
    for term, posting in term_postings.items():
        # note here counter is the map for document to frequency, not a counter in terms of how many iterations have been executed!
        df = len(posting)
        idf = math.log(TOTAL_DOCUMENTS / df)
        #  since we sort beforehand the document id and tfs are now sorted the same way as well and belong together with the same index :)
        items = sorted(posting.items())
        doc_ids = array("I", (doc for doc, _ in items))
        # note: for the doc_ids we use I (32 bit integer), as they account for the max range of the document ids and H (16 bit) with a maximum of 65536 (2^16 - 1) is not big enough
        # for the term frequencies, since no wikipedia abstract contains nearly as many tokens as that, we use H
        # uppercase means unsigned, lower case means signed!
        tfs = array('H', (tf for _, tf in items))
        db[term] = {"df": df, "idf": idf, "doc_ids": doc_ids, "tfs": tfs}
    db.sync()
    
# 20 mins

Quick sanity check

In [18]:
TF_IDF_PATH = "./data/tf_idf.db"

with shelve.open(TF_IDF_PATH, flag = "r") as db:
    print(f"number of terms: {len(db)}")
    results_mannheim = db["mannheim"]
    print(results_mannheim)
    if 700 in results_mannheim.get("doc_ids"):
        print("passed")
    else:
        print("failed test")

number of terms: 1572117
{'df': 107, 'idf': 6.840096630508322, 'doc_ids': array('I', [700, 50564, 56166, 58865, 62654, 63742, 68322, 73419, 78747, 81586, 86351, 88817, 95184, 102198, 129332, 144135, 147687, 164640, 177032, 184843, 184846, 184847, 187709, 248551, 276356, 369155, 379216, 492132, 537560, 980439, 1191147, 1319121, 1522205, 1544929, 1866112, 1971360, 2150754, 2197440, 2281745, 2290662, 2547575, 2629664, 2783571, 4407019, 4577884, 5814879, 6253823, 6272189, 6343860, 7037805, 8132208, 8250877, 8351187, 8710621, 11210342, 11210433, 11210473, 11210507, 11210643, 11210745, 11451764, 11981133, 12037783, 12325101, 12556186, 12989055, 14956021, 15584759, 16006531, 17997957, 19218099, 20071648, 20435694, 20865031, 22145651, 23248722, 23643764, 23672923, 24355525, 24449669, 25272420, 26281786, 27995863, 31996140, 33244287, 33792780, 35313233, 36029833, 37909853, 37918094, 39246316, 43128406, 44348411, 44542304, 50109950, 50294920, 57311508, 59439318, 60797380, 65562716, 66250007, 670