In [2]:
import gzip
import json
import os
import pickle 
import re
import string
from array import array
from collections import defaultdict
from datetime import datetime

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Code for preprocessing of text
### Used for tokenization and other preprocessing

In [3]:
# Make sure the NLTK stopword corpus is available locally.
nltk.download('stopwords')

# Tokens that preprocess() discards: the English stopword list plus every
# single character of string.punctuation.
stop = {*stopwords.words('english'), *string.punctuation}


# Function to tokenize and preprocess a document
def preprocess(text):
    """Tokenize a document and return its distinct, non-stopword tokens.

    Parameters
    ----------
    text : str or None
        Raw document text. ``None`` or an empty string yields an empty list
        instead of raising.

    Returns
    -------
    list[str]
        Lower-cased tokens produced by ``word_tokenize`` with every member of
        the module-level ``stop`` set removed. Each token appears exactly once,
        in order of first occurrence, so the result is the same on every run.
    """
    if not text:
        return []

    tokens = word_tokenize(text.lower())  # get all tokens

    # TODO: add lemmatization later

    # dict.fromkeys() removes duplicates while keeping first-seen order; going
    # through a set would make the order depend on string hash randomization.
    return [tok for tok in dict.fromkeys(tokens) if tok not in stop]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kai\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Creation of the inverted index

In [4]:
# Use orjson when it is installed, otherwise fall back to the standard-library
# json module. Only a failed import triggers the fallback, so unrelated errors
# are not silently hidden.
try:
    import orjson as _json
except ImportError:
    import json as _json

# Single entry point used by the indexing loop to decode one JSONL record.
loads = _json.loads

In [None]:
OUTPUT_PATH = "inverted_index.jsonl"  # NOTE(review): not referenced in this cell

#create terms used in each document
#extend with count
# Gzipped JSONL corpus; the 1-based line number is used as the document id.
corpus = 'trec-tot-2025-corpus.jsonl.gz'
INDEX_PICKLE_FILE = 'inverted_index.pkl'
PROGRESS_INTERVAL = 10000      # print the current line number this often
CHECKPOINT_INTERVAL = 100000   # write an intermediate pickle this often


def _write_index_pickle(index, path=INDEX_PICKLE_FILE):
    """Pickle ``index`` as ``dict[str, np.ndarray[int32]]`` and return that dict.

    The data is written to a temporary sibling file first and moved over
    ``path`` with ``os.replace`` only once it is complete, so an interrupted
    run cannot leave a truncated pickle in place of the previous checkpoint.
    """
    snapshot = {term: np.array(ids, dtype=np.int32) for term, ids in index.items()}
    tmp_path = path + '.tmp'
    with open(tmp_path, 'wb') as f_out:
        pickle.dump(snapshot, f_out)
        f_out.flush()
        os.fsync(f_out.fileno())
    os.replace(tmp_path, path)
    return snapshot


print("Starting with the creation of the inverted index (6,407,814 entries)") # based on the number on the website

print(datetime.now())

# term -> compact unsigned-int array of the line numbers that contain the term
inverted_index = defaultdict(lambda: array('I'))
with gzip.open(corpus, 'rt', encoding='utf-8') as f_in:
    for line_number, line in enumerate(f_in, start=1):
        line = line.strip()
        if line:  # empty lines carry no document
            try:
                data = loads(line)
            except json.JSONDecodeError as e:
                print(f"[Line {line_number}] Invalid JSON: {e}")
            else:
                # Tolerate records that are not objects or whose "text" is null.
                text = (data.get("text") or "") if isinstance(data, dict) else ""
                # One posting per (term, document); term counts may be added later.
                for token in set(preprocess(text)):
                    inverted_index[token].append(line_number)

        if line_number % PROGRESS_INTERVAL == 0:  # progress information
            print(line_number)
        if line_number % CHECKPOINT_INTERVAL == 0:  # intermediate save
            _write_index_pickle(inverted_index)

# Final save; the converted dictionary stays available as inverted_index_np.
inverted_index_np = _write_index_pickle(inverted_index)

print("Finished creation of the inverted index.")
print(datetime.now())

Starting with the creation of the inverted index (6,407,814 entries)
2025-10-24 23:54:01.632149
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000
490000
500000
510000
520000
530000
540000
550000
560000
570000
580000
590000
600000
610000
620000
630000
640000
650000
660000
670000
680000
690000
700000
710000
720000
730000
740000
750000
760000
770000
780000
790000
800000
810000
820000
830000
840000
850000
860000
870000
880000
890000
900000
910000
920000
930000
940000
950000
960000
970000
980000
990000
1000000
1010000
1020000
1030000
1040000
1050000
1060000
1070000
1080000
1090000
1100000
1110000
1120000
1130000
1140000
1150000
1160000
1170000
1180000
1190000
1200000
1210000
1220000
1230000
1240000
1250000
1260000
1270

In [None]:
import gzip, pickle, json, os, sys
from collections import defaultdict
from array import array
from datetime import datetime
from pathlib import Path
import numpy as np

# === Paths & settings ===
CORPUS_PATH = Path("trec-tot-2025-corpus.jsonl.gz")
PICKLE_PATH = Path("inverted_index.pkl")         # intermediate checkpoint of the index
PROGRESS_PATH = Path("inverted_index.progress")  # optional progress sidecar file
SAVE_EVERY = 100_000                             # intended checkpoint interval in lines (same value as before)
PRINT_EVERY = 10_000                             # intended progress-print interval in lines

def load_checkpoint():
    """
    Restore the indexing state from disk, if a checkpoint exists.

    Reads PICKLE_PATH (a pickled dict mapping term -> np.ndarray of line
    numbers) and rebuilds:
      - inverted_index: defaultdict(lambda: array('I')), so postings can be
        appended to again
      - last_done: highest line number known to be processed (0 for a fresh run)

    PROGRESS_PATH is only consulted when the pickle exists. Without the pickle
    the index is empty, and honoring a leftover progress file would skip lines
    whose postings were never stored.

    Returns
    -------
    tuple[defaultdict, int]
        ``(inverted_index, last_done)``.
    """
    inverted_index = defaultdict(lambda: array('I'))
    last_done = 0

    if not PICKLE_PATH.exists():
        # Nothing to resume from: start at the first line with an empty index.
        return inverted_index, last_done

    # SECURITY: pickle.load can execute arbitrary code. Only load checkpoint
    # files that were written by this notebook.
    with open(PICKLE_PATH, "rb") as f:
        inv_np = pickle.load(f)  # dict[str, np.ndarray(dtype=int32)]

    # Convert back to array('I') for cheap appends.
    for term, arr in inv_np.items():
        inverted_index[term].fromlist(arr.tolist())
        if arr.size:
            last_done = max(last_done, int(arr.max()))

    # If a sidecar exists, take the maximum of both sources.
    if PROGRESS_PATH.exists():
        try:
            prog = json.loads(PROGRESS_PATH.read_text(encoding="utf-8"))
            last_done = max(last_done, int(prog.get("last_line", 0)))
        except Exception as exc:
            # Best effort: the pickle alone is enough to resume, so report the
            # problem instead of aborting or hiding it.
            print(f"Ignoring unreadable progress file {PROGRESS_PATH}: {exc}")

    return inverted_index, last_done

def save_checkpoint(inverted_index, last_line):
    """Persist the index pickle and the progress file.

    Each file is first written to a temporary sibling, flushed and fsynced,
    and then moved over its final path with ``os.replace``. The pickle is
    written before the progress file.

    Parameters
    ----------
    inverted_index : mapping of term -> array('I')
        Postings to store; every array is converted to np.ndarray[int32].
    last_line : int
        Line number recorded as ``"last_line"`` in the progress file.
    """
    def _write_then_swap(target, tmp_suffix, writer, **open_kwargs):
        # Write via `writer` into a temporary file, force it to disk, then
        # move it onto `target`.
        scratch = target.with_suffix(tmp_suffix)
        with open(scratch, **open_kwargs) as handle:
            writer(handle)
            handle.flush()
            os.fsync(handle.fileno())
        os.replace(scratch, target)

    # 1) Build dict[str, np.ndarray[int32]] from the raw array buffers.
    postings = {}
    for term, ids in inverted_index.items():
        as_unsigned = np.frombuffer(ids.tobytes(), dtype=np.uint32)
        postings[term] = as_unsigned.astype(np.int32, copy=False)

    # 2) Write the pickle.
    def _write_pickle(handle):
        pickle.dump(postings, handle, protocol=pickle.HIGHEST_PROTOCOL)

    _write_then_swap(PICKLE_PATH, ".pkl.tmp", _write_pickle, mode="wb")

    # 3) Write the progress record.
    def _write_progress(handle):
        record = {"last_line": int(last_line), "saved_at": datetime.now().isoformat()}
        json.dump(record, handle)

    _write_then_swap(PROGRESS_PATH, ".progress.tmp", _write_progress,
                     mode="w", encoding="utf-8")


# === Resume or fresh run ===
# Continues building the inverted index from the last checkpoint, or starts
# at line 1 when load_checkpoint() finds nothing to resume from.
print("Starte (Weiter-)Erstellung des Inverted Index …")
print(datetime.now())

inverted_index, last_done = load_checkpoint()
start_line = last_done + 1
print(f"Weiter ab Zeile: {start_line}")

# Use orjson when it is installed, otherwise the standard-library json module.
try:
    import orjson as _json
except ImportError:
    import json as _json
loads = _json.loads

processed = 0          # documents indexed during this run
last_seen = last_done  # last corpus line consumed so far

with gzip.open(CORPUS_PATH, 'rt', encoding='utf-8') as f_in:
    for line_number, line in enumerate(f_in, start=1):
        if line_number < start_line:
            continue  # already covered by the loaded checkpoint
        last_seen = line_number
        line = line.strip()
        if line:  # empty lines carry no document
            try:
                data = loads(line)
            except json.JSONDecodeError as e:
                print(f"[Line {line_number}] Invalid JSON: {e}")
            else:
                # Tolerate records that are not objects or whose "text" is null.
                text = (data.get("text") or "") if isinstance(data, dict) else ""
                # One posting per (term, document); term counts may be added later.
                for token in set(preprocess(text)):
                    inverted_index[token].append(line_number)
                processed += 1

        if line_number % PRINT_EVERY == 0:  # progress information
            print(line_number)
        if line_number % SAVE_EVERY == 0:
            # Writes the pickle and the progress file via temp file + os.replace.
            save_checkpoint(inverted_index, line_number)

# Final save once the corpus has been read to the end.
save_checkpoint(inverted_index, last_seen)

Starte (Weiter-)Erstellung des Inverted Index …
2025-10-24 23:50:31.253880
Weiter ab Zeile: 700001
710000
720000
730000
740000
750000
760000
770000
780000
790000
800000


KeyboardInterrupt: 

# Opening the inverted index

In [3]:
# Load the pickled index back into memory.
# NOTE: unpickling can run arbitrary code, so only open pickle files that you
# created yourself.
with open('inverted_index.pkl', 'rb') as pickle_file:
    inverted_index = pickle.load(pickle_file)

In [4]:
# Number of entries (keys) in the loaded index.
print(len(inverted_index))

17335873


# Convert to db

In [10]:
import shelve

# Copy every (term, postings) pair from the in-memory index into a shelve
# database on disk, printing the running count every 50,000 terms.
counter = 0
with shelve.open("inverted_index.db") as db:
    for counter, (term, posting_list) in enumerate(inverted_index.items(), start=1):
        if not counter % 50000:
            print(counter)
        db[term] = posting_list
print("fertig")

50000
100000
150000
200000
250000
300000
350000
400000
450000
500000
550000
600000
650000
700000
750000
800000
850000
900000
950000
1000000
1050000
1100000
1150000
1200000
1250000
1300000
1350000
1400000
1450000
1500000
1550000
1600000
1650000
1700000
1750000
1800000
1850000
1900000
1950000
2000000
2050000
2100000
2150000
2200000
2250000
2300000
2350000
2400000
2450000
2500000
2550000
2600000
2650000
2700000
2750000
2800000
2850000
2900000
2950000
3000000
3050000
3100000
3150000
3200000
3250000
3300000
3350000
3400000
3450000
3500000
3550000
3600000
3650000
3700000
3750000
3800000
3850000
3900000
3950000
4000000
4050000
4100000
4150000
4200000
4250000
4300000
4350000
4400000
4450000
4500000
4550000
4600000
4650000
4700000
4750000
4800000
4850000
4900000
4950000
5000000
5050000
5100000
5150000
5200000
5250000
5300000
5350000
5400000
5450000
5500000
5550000
5600000
5650000
5700000
5750000
5800000
5850000
5900000
5950000
6000000
6050000
6100000
6150000
6200000
6250000
6300000
6350000
6400

In [8]:
# Spot-check the shelve database: show the postings of one example term, then
# the number of stored keys and the first twenty of them.
term = "sciences"
with shelve.open("inverted_index.db") as db:
    postings = db[term]
    print(postings)
    # Iterating the shelf yields its keys; collect them all once.
    keys = list(db)
    print(f"Number of terms: {len(keys)}")
    print("First 20 keys:", keys[0:20])

[      4       5       7 ... 6407781 6407783 6407793]
Number of terms: 99999
First 20 keys: ["''", "'-teen", "'.880", "'.ao", "'/", "'24-hour", "'2s", "'30s", "'31", "'32", "'39", "'70", "'80–240", "'\\n", "'abba", "'abd", "'abraxas", "'academic", "'accessing", "'act"]
