# Inverted Index Generation

In [28]:
import os
import json
import PyPDF2
import re
import nltk
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

### Download necessary NLTK resources


In [29]:
# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Initialize components

In [30]:
# Initialize components
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

In [31]:
def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF file."""
    text = ""
    try:
        with open(pdf_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + " "
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
    return text

In [32]:
def extract_text_from_txt(txt_path):
    """Extract text from a TXT file."""
    try:
        with open(txt_path, "r", encoding="utf-8") as file:
            return file.read()
    except Exception as e:
        print(f"Error reading {txt_path}: {e}")
        return ""

In [33]:
def clean_text(text):
    """Tokenize, remove stopwords, numbers, and non-alphabetic words, and lemmatize."""
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\b\d+\b', '', text)  # Remove standalone numbers
    text = re.sub(r'\S*@\S*\s?', '', text)  # Remove email addresses
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-z\s]', '', text)  # Remove special characters and punctuation
    
    words = word_tokenize(text)
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words and len(word) > 2]
    
    return words

In [34]:
def build_inverted_index(folder_path):
    """Build an inverted index from files in the folder."""
    inverted_index = defaultdict(set)
    doc_id = 1
    doc_map = {}  # Mapping of doc_id to filenames

    for filename in os.listdir(folder_path):
        print(filename)
        file_path = os.path.join(folder_path, filename)
        
        # Extract text based on file type
        if filename.endswith(".pdf"):
            text = extract_text_from_pdf(file_path)
        elif filename.endswith(".txt"):
            text = extract_text_from_txt(file_path)
        else:
            continue  # Skip non-supported file types
        
        # Process the text and update inverted index
        words = clean_text(text)
        for word in words:
            inverted_index[word].add(doc_id)
        
        # Store document mapping
        doc_map[doc_id] = filename
        doc_id += 1
    
    # Convert sets to lists for JSON serialization
    inverted_index = {word: list(docs) for word, docs in inverted_index.items()}

    return inverted_index, doc_map

In [35]:
def save_index_to_json(inverted_index, doc_map, output_file="clean_inverted_index.json"):
    """Save the inverted index and document map to a JSON file."""
    index_data = {"inverted_index": inverted_index, "document_map": doc_map}
    with open(output_file, "w", encoding="utf-8") as json_file:
        json.dump(index_data, json_file, indent=4)

In [38]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [39]:
# if __name__ == "__main__":
    # folder_path = input("Enter the folder path containing the documents: ").strip()
    # folder_path = r"C:\Users\Public\Dev\Ph.D\2nd Semester\CS-675-IRS\Assignments\First\TXT 1-250"
# folder_path = r"C:\Users\Public\Dev\Ph.D\2nd Semester\CS-675-IRS\Assignments\First\PDFs 1-250"
folder_path = r'C:\Users\Public\Dev\Ph.D\2nd Semester\CS-675-IRS\Assignments\First\TXT 1-250'
inverted_index, doc_map = build_inverted_index(folder_path)
print(f'inverted_index: {inverted_index}')
print(f'doc_map: {doc_map}')
save_index_to_json(inverted_index, doc_map)
print(f"Cleaned inverted index saved as 'clean_inverted_index.json'")

1.txt
10.txt
100.txt
108.txt
109.txt
11.txt
110.txt
111.txt
112.txt
113.txt
114.txt
115.txt
116.txt
117.txt
118.txt
119.txt
12.txt
120.txt
121.txt
122.txt
123.txt
124.txt
125.txt
126.txt
127.txt
128.txt
129.txt
13.txt
130.txt
131.txt
132.txt
133.txt
134.txt
135.txt
136.txt
137.txt
138.txt
139.txt
14.txt
140.txt
141.txt
142.txt
15.txt
151.txt
152.txt
153.txt
154.txt
155.txt
156.txt
157.txt
16.txt
17.txt
173.txt
174.txt
175.txt
176.txt
177.txt
178.txt
179.txt
18.txt
180.txt
188.txt
189.txt
19.txt
190.txt
191.txt
192.txt
193.txt
194.txt
195.txt
196.txt
197.txt
198.txt
199.txt
2.txt
20.txt
200.txt
21.txt
214.txt
215.txt
216.txt
217.txt
218.txt
219.txt
22.txt
220.txt
221.txt
222.txt
223.txt
224.txt
225.txt
226.txt
227.txt
228.txt
229.txt
23.txt
230.txt
231.txt
232.txt
233.txt
234.txt
235.txt
236.txt
237.txt
238.txt
24.txt
25.txt
26.txt
27.txt
28.txt
29.txt
3.txt
30.txt
31.txt
32.txt
33.txt
34.txt
35.txt
36.txt
37.txt
38.txt
39.txt
4.txt
40.txt
41.txt
42.txt
43.txt
44.txt
45.txt
46.txt
47.tx

Checking path: C:\Users\Public\Dev\Ph.D\2nd Semester\CS-675-IRS\Assignments\First\TXT 1-250
Path exists? True


In [25]:
print("Items in folder:", os.listdir(folder_path))

Items in folder: ['1.txt', '10.txt', '100.txt', '108.txt', '109.txt', '11.txt', '110.txt', '111.txt', '112.txt', '113.txt', '114.txt', '115.txt', '116.txt', '117.txt', '118.txt', '119.txt', '12.txt', '120.txt', '121.txt', '122.txt', '123.txt', '124.txt', '125.txt', '126.txt', '127.txt', '128.txt', '129.txt', '13.txt', '130.txt', '131.txt', '132.txt', '133.txt', '134.txt', '135.txt', '136.txt', '137.txt', '138.txt', '139.txt', '14.txt', '140.txt', '141.txt', '142.txt', '15.txt', '151.txt', '152.txt', '153.txt', '154.txt', '155.txt', '156.txt', '157.txt', '16.txt', '17.txt', '173.txt', '174.txt', '175.txt', '176.txt', '177.txt', '178.txt', '179.txt', '18.txt', '180.txt', '188.txt', '189.txt', '19.txt', '190.txt', '191.txt', '192.txt', '193.txt', '194.txt', '195.txt', '196.txt', '197.txt', '198.txt', '199.txt', '2.txt', '20.txt', '200.txt', '21.txt', '214.txt', '215.txt', '216.txt', '217.txt', '218.txt', '219.txt', '22.txt', '220.txt', '221.txt', '222.txt', '223.txt', '224.txt', '225.txt'