# IRS-Inverted Index

In [1]:
import os
import re
from collections import defaultdict

def tokenize(text):
    """
    Break a string into lowercase word tokens.

    The input is lowercased first; every maximal run of regex "word"
    characters (letters, digits and the underscore) then becomes one token.
    Tokens are returned as a list in order of appearance, duplicates included.
    """
    # findall with a group-less pattern yields the matched substrings themselves.
    return re.findall(r'\w+', text.lower())

def build_inverted_index(folder_path):
    """
    Index every ``.txt`` file found directly inside ``folder_path``.

    Filenames are visited in sorted (string) order. Each file that is read
    successfully receives the next sequential document number, starting at 1;
    a file that cannot be read is reported on stdout and does not consume a
    number.

    Returns a ``defaultdict(set)`` mapping each token to the set of document
    numbers in which it occurs.
    """
    postings = defaultdict(set)
    next_doc_id = 1

    # Filter first, then walk the remaining names in their sorted order.
    txt_names = [name for name in sorted(os.listdir(folder_path)) if name.endswith(".txt")]

    for name in txt_names:
        file_path = os.path.join(folder_path, name)
        try:
            with open(file_path, "r", encoding="utf-8") as handle:
                content = handle.read()
        except Exception as e:
            print(f"Error reading {file_path}: {e}")
            continue

        # Record this document under every token it contains.
        for term in tokenize(content):
            postings[term].add(next_doc_id)

        # Only successfully read files advance the counter.
        next_doc_id += 1

    return postings

def print_inverted_index(inverted_index):
    """
    Write the inverted index to stdout.

    Output layout:
      INVERTED INDEX
      ==============
      Format: term: doc1, doc2, ...

      <term>: <doc id list>
      ...

    Terms appear in sorted order; each term's document IDs are listed in
    ascending order, separated by ", ".
    """
    # Header block (the trailing "\n" yields the blank separator line).
    print("INVERTED INDEX")
    print("==============")
    print("Format: term: doc1, doc2, ...\n")

    for term in sorted(inverted_index):
        ascending_ids = sorted(inverted_index[term])
        print(f"{term}: {', '.join(map(str, ascending_ids))}")


In [2]:

if __name__ == "__main__":
    # Folder containing the .txt files to index.
    folder_path = r'C:\Users\Public\Dev\Ph.D\2nd Semester\CS-675-IRS\Assignments\First\TXT 1-250-1'
    # Check for a directory specifically: a path that exists but points at a
    # regular file would pass an existence check and then make os.listdir()
    # inside build_inverted_index raise.
    if not os.path.isdir(folder_path):
        print("The folder path does not exist. Please check the path.")
    else:
        # Build the index in memory and dump it to stdout.
        inverted_index = build_inverted_index(folder_path)
        print_inverted_index(inverted_index)


INVERTED INDEX
Format: term: doc1, doc2, ...

0: 1, 2, 3
00: 2, 3
000: 1
0002: 1
00020: 3
001: 2
00200: 3
00265: 1
003: 2
01: 2
01069: 2
02: 2
020: 2
029: 1
03: 1
03426: 2
03734: 2
04: 1
05: 2
05147: 2
05918: 3
06: 1, 3
07: 2
07332: 3
07781: 2
1: 1, 2, 3
10: 1, 2, 3
100: 2
1000: 1, 3
1002: 2
1005: 2
1007: 2
1008: 2
1016: 2
1024: 1
1024_our: 1
10k: 3
11: 1, 2, 3
1106: 2
11072: 2
1109: 2
111: 3
11208: 1
113: 1, 2
1135: 2
1137: 1
1144: 2
1145: 1, 2, 3
1149: 1
1150: 2
1159: 2
11929: 3
12: 1, 2, 3
121: 1
123: 1
128: 1, 3
1281: 2
1284: 2
13: 1, 2, 3
13487: 3
135: 2
13th: 1
14: 1, 2, 3
1412: 3
1423: 2
146: 2
1488: 2
14899: 3
15: 1, 2, 3
15th: 1
16: 1, 2, 3
1602: 3
1611: 2
1630: 2
16th: 1
16x16: 3
17: 1, 2, 3
171111kysb20200002: 1
1735: 1
1780: 1
18: 1, 2, 3
1802: 2
1804: 2
1809: 2
18653: 2
1893_: 2
19: 1, 2, 3
1907: 3
193: 2
1997: 1
1http: 2
1k: 1
1𝑒: 3
2: 1, 2, 3
20: 1, 2, 3
200: 3
2005: 3
2009: 1
200k: 3
2010: 2, 3
2011: 2
2012: 2
2014: 1, 3
2015: 1
2016: 1, 2, 3
2017: 1, 2, 3
2018: 1, 2, 3

In [3]:
import os
import re
from collections import defaultdict

def tokenize(text):
    """
    Lowercase ``text`` and return its word tokens as a list.

    A token is a maximal run of regex "word" characters, i.e. letters,
    digits and the underscore. Tokens keep their order of appearance.
    """
    word_pattern = re.compile(r'\w+')
    lowered = text.lower()
    return word_pattern.findall(lowered)

def build_inverted_index(folder_path):
    """
    Build an inverted index from all .txt files in the folder.

    Each token (term) maps to a set of document IDs. IDs are assigned
    sequentially, starting at 1, in sorted-filename order; a file that cannot
    be read is reported on stdout and does not consume an ID.

    Args:
        folder_path: Directory whose ``.txt`` files are indexed.

    Returns:
        A ``defaultdict(set)`` mapping each token to the set of document IDs
        of the files in which it occurs.
    """
    inverted_index = defaultdict(set)
    doc_id = 1  # Document IDs start at 1

    # Process files in sorted order to ensure reproducibility in doc_id assignment.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".txt"):
            # Not an input document: skip it without logging, so the progress
            # output lists only files that are actually tokenized.
            continue

        print(f'Tokenizing file: {filename}')
        file_path = os.path.join(folder_path, filename)
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                text = f.read()
        except Exception as e:
            print(f"Error reading {file_path}: {e}")
            continue

        # Record this document under every token it contains.
        for token in tokenize(text):
            inverted_index[token].add(doc_id)

        doc_id += 1

    return inverted_index

def store_inverted_index(inverted_index, output_filename):
    """
    Save the inverted index to ``output_filename`` (UTF-8, overwriting any
    existing file) and print a confirmation message.

    File layout:
      INVERTED INDEX
      ==============
      Format: term: doc1, doc2, ...

      <term>: <doc id list>
      ...

    Terms are written in sorted order, each followed by its document IDs in
    ascending order separated by ", ".
    """
    header_lines = (
        "INVERTED INDEX\n",
        "==============\n",
        "Format: term: doc1, doc2, ...\n\n",
    )

    with open(output_filename, "w", encoding="utf-8") as out:
        out.writelines(header_lines)

        for term in sorted(inverted_index):
            postings = ", ".join(map(str, sorted(inverted_index[term])))
            out.write(f"{term}: {postings}\n")

    print(f"Inverted index stored in '{output_filename}'.")


In [4]:

if __name__ == "__main__":
    # Input the folder path containing your .txt files.
    # folder_path = input("Enter the folder path containing the .txt files: ").strip()
    folder_path = r'C:\Users\Public\Dev\Ph.D\2nd Semester\CS-675-IRS\Assignments\First\TXT 1-250'

    # Check for a directory specifically: a path that exists but points at a
    # regular file would pass an existence check and then make os.listdir()
    # inside build_inverted_index raise.
    if not os.path.isdir(folder_path):
        print("The folder path does not exist. Please check the path.")
    else:
        inverted_index = build_inverted_index(folder_path)
        output_file = "inverted_index.txt"  # You can change the output file name if needed.
        # Persist the index to a file instead of printing it.
        store_inverted_index(inverted_index, output_file)


Tokenizing file: 1.txt
Tokenizing file: 10.txt
Tokenizing file: 100.txt
Tokenizing file: 108.txt
Tokenizing file: 109.txt
Tokenizing file: 11.txt
Tokenizing file: 110.txt
Tokenizing file: 111.txt
Tokenizing file: 112.txt
Tokenizing file: 113.txt
Tokenizing file: 114.txt
Tokenizing file: 115.txt
Tokenizing file: 116.txt
Tokenizing file: 117.txt
Tokenizing file: 118.txt
Tokenizing file: 119.txt
Tokenizing file: 12.txt
Tokenizing file: 120.txt
Tokenizing file: 121.txt
Tokenizing file: 122.txt
Tokenizing file: 123.txt
Tokenizing file: 124.txt
Tokenizing file: 125.txt
Tokenizing file: 126.txt
Tokenizing file: 127.txt
Tokenizing file: 128.txt
Tokenizing file: 129.txt
Tokenizing file: 13.txt
Tokenizing file: 130.txt
Tokenizing file: 131.txt
Tokenizing file: 132.txt
Tokenizing file: 133.txt
Tokenizing file: 134.txt
Tokenizing file: 135.txt
Tokenizing file: 136.txt
Tokenizing file: 137.txt
Tokenizing file: 138.txt
Tokenizing file: 139.txt
Tokenizing file: 14.txt
Tokenizing file: 140.txt
Tokeniz

Checking path: C:\Users\Public\Dev\Ph.D\2nd Semester\CS-675-IRS\Assignments\First\TXT 1-250
Path exists? True


In [5]:
import os
import re
from collections import defaultdict

def tokenize(text):
    """
    Return the lowercase word tokens of ``text``, in order of appearance.

    Lowercasing happens before matching. Each token is one maximal run of
    regex "word" characters (letters, digits, underscore).
    """
    return [match.group() for match in re.finditer(r'\w+', text.lower())]

def get_doc_id_from_filename(filename):
    """
    Turn a filename such as '1.txt' or '123.txt' into its integer ID (1, 123).

    The extension is stripped and the remaining stem is passed to ``int``.
    Raises ValueError when the stem is not a valid integer literal, so the
    filename is expected to be of the form '<number>.txt'.
    """
    # splitext("123.txt") -> ("123", ".txt"); only the stem is needed.
    stem = os.path.splitext(filename)[0]
    return int(stem)

def build_inverted_index(folder_path):
    """
    Build an inverted index from all .txt files in the folder,
    using the filename's numeric part as the doc_id (e.g. "7.txt" -> doc_id = 7).

    Args:
        folder_path: Directory whose ``.txt`` files are indexed.

    Returns:
        A ``defaultdict(set)`` mapping each token to the set of doc IDs of
        the files in which it occurs.

    Files whose name is not of the form '<number>.txt', and files that cannot
    be read, are reported on stdout and skipped rather than aborting the
    whole build.
    """
    inverted_index = defaultdict(set)

    # Resolve each candidate file's numeric ID up front. A stray non-numeric
    # name (e.g. "notes.txt") is reported and skipped here instead of letting
    # the ValueError from get_doc_id_from_filename abort the entire build.
    numbered_files = []
    for filename in os.listdir(folder_path):
        if not filename.endswith(".txt"):
            continue
        try:
            doc_id = get_doc_id_from_filename(filename)
        except ValueError:
            print(f"Skipping {filename}: filename is not of the form '<number>.txt'")
            continue
        numbered_files.append((doc_id, filename))

    # Sorting the (doc_id, filename) pairs gives true numeric order
    # (1, 2, ..., 10), unlike a plain string sort of the filenames, which
    # would place "10.txt" before "2.txt". The resulting index does not
    # depend on this order; it only keeps any messages in a predictable
    # sequence.
    for doc_id, filename in sorted(numbered_files):
        file_path = os.path.join(folder_path, filename)

        # Read the file text
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                text = f.read()
        except Exception as e:
            print(f"Error reading {file_path}: {e}")
            continue

        # Populate the inverted index with every token of this document.
        for token in tokenize(text):
            inverted_index[token].add(doc_id)

    return inverted_index

def store_inverted_index(inverted_index, output_filename):
    """
    Dump the inverted index to ``output_filename`` as UTF-8 text, replacing
    any existing file, then print a confirmation message.

    Layout of the file:

    INVERTED INDEX
    ==============
    Format: term: doc1, doc2, ...

    <term>: <doc id list>
    ...

    Terms are emitted in sorted order and each term's doc IDs in ascending
    order, joined by ", ".
    """
    with open(output_filename, "w", encoding="utf-8") as sink:
        # Fixed three-line header followed by a blank separator line.
        for header_line in ("INVERTED INDEX\n",
                            "==============\n",
                            "Format: term: doc1, doc2, ...\n\n"):
            sink.write(header_line)

        # Keys are unique, so ordering the items by key alone is a total order.
        for term, doc_set in sorted(inverted_index.items(), key=lambda entry: entry[0]):
            ordered_ids = [str(did) for did in sorted(doc_set)]
            joined_ids = ", ".join(ordered_ids)
            sink.write(f"{term}: {joined_ids}\n")

    print(f"Inverted index stored in '{output_filename}'.")


In [7]:

if __name__ == "__main__":
    # folder_path = input("Enter the folder path containing the .txt files: ").strip()
    folder_path = r'C:\Users\Public\Dev\Ph.D\2nd Semester\CS-675-IRS\Assignments\First\TXT 1-250'

    # Check for a directory specifically: a path that exists but points at a
    # regular file would pass an existence check and then make os.listdir()
    # inside build_inverted_index raise.
    if not os.path.isdir(folder_path):
        print("The folder path does not exist. Please check the path.")
    else:
        inverted_index = build_inverted_index(folder_path)
        output_file = "inverted_index.txt"
        # Persist the index; doc IDs in the file come from the numeric filenames.
        store_inverted_index(inverted_index, output_file)


Inverted index stored in 'inverted_index.txt'.
