In [None]:
#NLTK Initialization and Setup
# Install (or upgrade to the latest) NLTK in the notebook runtime.
!pip install -U nltk
# Install the `emoji` package, which the next cell imports.
!pip install emoji
# Download NLTK's `punkt_tab` resource.
# NOTE(review): presumably the tokenizer data `word_tokenize` relies on — confirm for the installed NLTK version.
!python -m nltk.downloader punkt_tab

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
import numpy as np
import pandas as pd
import nltk
import re
import emoji
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import os
import string
import logging
from collections import defaultdict , Counter

**Loading Text Files**

In [None]:
import os
import logging

# Function to load .txt files from a specified folder
def load_text_files(folder_path):
    """
    Read every ``.txt`` file in a folder and give each one a numeric document ID.

    Files are visited in sorted filename order so that the same folder always
    produces the same doc_id assignment (``os.listdir`` itself returns entries
    in arbitrary order). Sub-directories are not descended into, and directory
    entries whose name happens to end in ``.txt`` are skipped.

    Args:
        folder_path (str): Directory to scan.

    Returns:
        tuple: ``(data, doc_id_to_filename)``. ``data`` maps doc_id (int,
        starting at 0) to the file's full text; ``doc_id_to_filename`` maps the
        same doc_id to the file's name.

    Raises:
        OSError: If the folder cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    data = {}  # doc_id -> file content
    doc_id_to_filename = {}  # doc_id -> filename

    # Sort so doc_ids are reproducible across runs and machines.
    txt_files = sorted(
        name
        for name in os.listdir(folder_path)
        if name.endswith(".txt") and os.path.isfile(os.path.join(folder_path, name))
    )

    for doc_id, filename in enumerate(txt_files):
        file_path = os.path.join(folder_path, filename)

        with open(file_path, 'r', encoding='utf-8') as file:
            data[doc_id] = file.read()
        doc_id_to_filename[doc_id] = filename

        logging.info(f"Loaded file: {filename} with doc_id: {doc_id}")

    return data, doc_id_to_filename

In [None]:
# Set up logging so the per-file messages emitted by load_text_files are shown
logging.basicConfig(level=logging.INFO)

# Specify the folder path
folder_path = '/content/drive/MyDrive/Documents' # This is the directory containing the files

# Load the text files
document_data, doc_id_to_filename = load_text_files(folder_path)

# Show a short preview per document instead of dumping every file's full text
# into the cell output (the full content stays available in document_data).
PREVIEW_CHARS = 200
for preview_id, preview_name in doc_id_to_filename.items():
    preview_text = document_data[preview_id]
    print(f"[{preview_id}] {preview_name} ({len(preview_text)} chars): {preview_text[:PREVIEW_CHARS]!r}")

{0: 'Python Data Types: \nEvery value has a datatype, and variables can hold values. Python is a powerfully composed language; consequently, we don\'t have to characterize the sort of variable while announcing it. The interpreter binds the value implicitly to its type.\n\na = 5  \nWe did not specify the type of the variable a, which has the value five from an integer. The Python interpreter will automatically interpret the variable as an integer.\n\nWe can verify the type of the program-used variable thanks to Python. The type() function in Python returns the type of the passed variable.\n\nConsider the following illustration when defining and verifying the values of various data types.\nADVERTISEMENT\n\na=10  \nb="Hi Python"  \nc = 10.5  \nprint(type(a))  \nprint(type(b))  \nprint(type(c))  \nOutput:\n\n<type \'int\'>\n<type \'str\'>\n<type \'float\'>\nStandard data types\nA variable can contain a variety of values. On the other hand, a person\'s id must be stored as an integer, while

**Text Cleaning Pipeline**

In [None]:
import nltk
# Fetch the NLTK corpora used below: 'wordnet' for WordNetLemmatizer and
# 'stopwords' for stopwords.words('english').
nltk.download('wordnet')
nltk.download('stopwords')

import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Initialize lemmatizer and stopwords
# Module-level so clean_text does not rebuild them on every call.
LEMMATIZER = WordNetLemmatizer()
# A set gives constant-time membership checks in the stopword filter.
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """Normalize raw text into a list of lemmatized, stopword-free tokens.

    Order matters: URLs, @mentions, #hashtags and emojis are handled *before*
    the character filter and *before* tokenization. Previously the text was
    tokenized first, so those steps never influenced the returned tokens.

    Pipeline:
      1. Replace emojis with their descriptive names (e.g. ``thumbs_up``);
         the underscore is later stripped, leaving one token per emoji.
      2. Lowercase.
      3. Remove URLs, @mentions and #hashtags.
      4. Drop everything except letters, whitespace and apostrophes.
      5. Collapse whitespace and tokenize.
      6. Re-attach contraction suffixes split off by the tokenizer
         (``do`` + ``n't`` -> ``don't``) so they match stopword entries.
      7. Strip stray quotes and possessive ``'s``, remove stopwords, lemmatize.

    Args:
        text (str): Raw input text. ``None`` or ``""`` yields ``[]``.

    Returns:
        list[str]: Cleaned tokens in document order.
    """
    if not text:
        return []

    # Convert emojis to descriptive text first; the character filter below
    # would otherwise delete them. Space delimiters keep them separate words.
    text = emoji.demojize(text, delimiters=(" ", " "))

    # Convert to lowercase
    text = text.lower()

    # These patterns need ':', '/', '@' and '#' to still be present.
    text = re.sub(r'http\S+', '', text)   # URLs
    text = re.sub(r'@\w+', '', text)      # mentions (@usernames)
    text = re.sub(r'#\w+', '', text)      # hashtags

    # Retain apostrophes in contractions, remove other special characters and numbers
    text = re.sub(r"[^a-zA-Z\s']", '', text)

    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text).strip()

    # Tokenize the cleaned text
    tokens = word_tokenize(text)

    # Reattach contraction endings that the tokenizer separated
    contractions = {"n't", "'m", "'re", "'ve", "'ll", "'s", "'d"}
    merged_tokens = []
    for token in tokens:
        if token in contractions and merged_tokens:
            merged_tokens[-1] += token
        else:
            merged_tokens.append(token)

    # Remove stopwords and lemmatize
    cleaned_tokens = []
    for token in merged_tokens:
        word = token.strip("'")  # drop leftover quote marks around a word
        if word.endswith("'s") and word not in STOPWORDS:
            word = word[:-2]  # possessive: "python's" -> "python"
        if word and word not in STOPWORDS:
            cleaned_tokens.append(LEMMATIZER.lemmatize(word))

    return cleaned_tokens

# Example usage:
# Small in-memory corpus used to demonstrate clean_text. Each string starts
# with its own "Doc_N:" label, which survives cleaning as the token 'doc'.
documents = [
    "Doc_1: Python Variables: A variable is the name given to a memory location. It has 5 types.",
    "Doc_2: Python Data Types: Every value has a datatype, and variables can hold values. Numbers like 12345 are removed.",
    "Doc_3: Python Literals: Python Literals can be defined as data that is given in a variable or constant like 789.",
    "Doc_4: Python Tuples: A comma-separated group of items is called a Python triple with 3 values.",
    "Doc_5: Python Arrays: The Array is used in every programming language like C, C++, Java, Python, R, JavaScript."
]

# One token list per document, in corpus order.
cleaned_documents = list(map(clean_text, documents))
print(cleaned_documents)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


[['doc', 'python', 'variable', 'variable', 'name', 'given', 'memory', 'location', 'type'], ['doc', 'python', 'data', 'type', 'every', 'value', 'datatype', 'variable', 'hold', 'value', 'number', 'like', 'removed'], ['doc', 'python', 'literal', 'python', 'literal', 'defined', 'data', 'given', 'variable', 'constant', 'like'], ['doc', 'python', 'tuples', 'commaseparated', 'group', 'item', 'called', 'python', 'triple', 'value'], ['doc', 'python', 'array', 'array', 'used', 'every', 'programming', 'language', 'like', 'c', 'c', 'java', 'python', 'r', 'javascript']]


**Construction of Inverted Index**

In [None]:
from collections import defaultdict, Counter
import re
import os

def clean_text(text):
    """Lowercase the text, drop every character that is not a letter, digit or
    whitespace, and split the result on whitespace.

    NOTE: this definition replaces the earlier, NLTK-based ``clean_text`` in
    the kernel namespace; no stopword removal or lemmatization happens here.

    Args:
        text (str): Raw input text.

    Returns:
        list[str]: Lowercased alphanumeric tokens in document order.
    """
    lowered = text.lower()
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', lowered)
    return alnum_only.split()

def build_inverted_index(data):
    """Build a term -> document-ID index plus corpus-wide term counts.

    Args:
        data (dict): Maps doc_id to that document's raw text.

    Returns:
        tuple: ``(inverted_index, term_frequencies)``. ``inverted_index`` is a
        ``defaultdict(set)`` mapping each token to the set of doc_ids
        containing it; ``term_frequencies`` is a ``Counter`` of how many times
        each token occurs across all documents combined.
    """
    postings = defaultdict(set)
    occurrence_counts = Counter()

    for doc_id, content in data.items():
        tokens = clean_text(content)
        # Count every occurrence, then record which document it came from.
        occurrence_counts.update(tokens)
        for token in tokens:
            postings[token].add(doc_id)

    return postings, occurrence_counts

# Specify the folder path
folder_path = '/content/drive/MyDrive/Documents'

# Load the text files from the directory. Document IDs start at 1 here.
# Filenames are sorted so IDs are assigned the same way on every run
# (os.listdir order is arbitrary), and files are read as UTF-8 explicitly
# rather than with the platform's default encoding.
data = {}
doc_id = 1
for filename in sorted(os.listdir(folder_path)):
    if filename.endswith(".txt"):
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            text = file.read()
        data[doc_id] = text
        doc_id += 1

# Build the inverted index
inverted_index, term_frequencies = build_inverted_index(data)

# Print the inverted index
print("Inverted Index:")
for token, doc_ids in inverted_index.items():
    print(f"{token}: {doc_ids}")

# Print the term frequencies
print("\nTerm Frequencies:")
for token, frequency in term_frequencies.items():
    print(f"{token}: {frequency}")

Inverted Index:
python: {1, 2, 3, 4, 5}
data: {1, 2, 3, 4}
types: {1, 2, 3}
every: {1, 4}
value: {1, 4, 5}
has: {1}
a: {1, 2, 3, 4, 5}
datatype: {1}
and: {1, 3, 4, 5}
variables: {1, 5}
can: {1, 2, 3, 4, 5}
hold: {1}
values: {1, 4}
is: {1, 2, 3, 4, 5}
powerfully: {1}
composed: {1}
language: {1, 4, 5}
consequently: {1}
we: {1, 2, 3, 4, 5}
dont: {1}
have: {1, 3, 4, 5}
to: {1, 2, 3, 4, 5}
characterize: {1}
the: {1, 2, 3, 4, 5}
sort: {1}
of: {1, 2, 3, 4, 5}
variable: {1, 2, 4, 5}
while: {1}
announcing: {1}
it: {1, 4, 5}
interpreter: {1}
binds: {1}
implicitly: {1}
its: {1, 4, 5}
type: {1, 5}
5: {1, 3, 4}
did: {1}
not: {1, 3, 5}
specify: {1, 5}
which: {1}
five: {1}
from: {1, 4}
an: {1, 3, 4, 5}
integer: {1, 5}
will: {1, 5}
automatically: {1}
interpret: {1}
as: {1, 2, 3, 4, 5}
verify: {1}
programused: {1}
thanks: {1}
function: {1, 5}
in: {1, 2, 3, 4, 5}
returns: {1}
passed: {1}
consider: {1}
following: {1, 2, 5}
illustration: {1}
when: {1, 4}
defining: {1}
verifying: {1}
various: {1, 3, 5}
adv

**Boolean Query Processing: AND Operation**

In [None]:
def boolean_query_and(inverted_index, term1, term2):
    """
    Answer a two-term Boolean AND query against the inverted index.

    Terms are looked up exactly as given (no lowercasing or lemmatization).
    A term missing from the index contributes an empty posting set, so the
    result is empty; the lookup uses ``.get`` and never inserts into the index.

    Args:
        inverted_index (dict): Maps each term to a set of document IDs.
        term1 (str): The first query term.
        term2 (str): The second query term.

    Returns:
        set: Document IDs that contain both query terms.
    """
    first_postings, second_postings = (
        inverted_index.get(term, set()) for term in (term1, term2)
    )
    return first_postings & second_postings

# Demo: which documents mention both terms?
term1, term2 = "python", "variables"
result = boolean_query_and(inverted_index, term1=term1, term2=term2)
print(f"Documents containing both '{term1}' and '{term2}': {result}")

# Step 3: Perform AND query processing
def boolean_query_and(query_terms, inverted_index):
    """Answer a Boolean AND query over any number of terms.

    NOTE: this redefinition replaces the three-argument ``boolean_query_and``
    defined earlier in the same cell and takes its arguments in a different
    order.

    Each term is lowercased and lemmatized before lookup. If *any* term is
    absent from the index the result is empty: a document cannot contain a
    term the corpus has never seen. Previously, unknown terms were silently
    dropped, which returned matches for the remaining terms only.

    Args:
        query_terms (iterable of str): The terms that must all be present.
        inverted_index (dict): Maps each term to a set of document IDs.

    Returns:
        set: A new set of document IDs containing every query term; empty if
        there are no query terms or any term is missing from the index.
    """
    # Clean and process the query terms
    cleaned_query = [LEMMATIZER.lemmatize(term.lower()) for term in query_terms]

    if not cleaned_query:
        return set()

    doc_sets = []
    for term in cleaned_query:
        # Membership test (not indexing) so a defaultdict index is not mutated.
        if term not in inverted_index:
            return set()
        doc_sets.append(inverted_index[term])

    # Documents that contain all terms
    return set.intersection(*doc_sets)

Documents containing both 'python' and 'variables': {1, 5}


**Main Function**

In [None]:
def main(dir_path='/content/drive/MyDrive/Documents',
         output_path="processed_tokens_Kritika Joshi.txt",
         tokens_per_line=3):
    """Tokenize every ``.txt`` file in a folder and write the tokens to a file.

    Each line of each file is passed to whichever ``clean_text`` is currently
    defined in the kernel. The combined token list is stored in the global
    ``final_tokens``, written to ``output_path`` with ``tokens_per_line``
    space-separated tokens per line, and printed together with its length.

    Args:
        dir_path (str): Folder containing the input ``.txt`` files.
        output_path (str): File the processed tokens are written to.
        tokens_per_line (int): Number of tokens written on each output line.

    Raises:
        OSError: If the folder cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If an input file is not valid UTF-8.
    """
    global final_tokens  # Kept global so later cells can inspect the tokens
    final_tokens = []

    # Sorted so the token order does not depend on arbitrary os.listdir order.
    for filename in sorted(os.listdir(dir_path)):
        if filename.endswith(".txt"):
            file_path = os.path.join(dir_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                for line in file:
                    final_tokens.extend(clean_text(line))

    # Explicit encoding so the output does not depend on the platform default.
    with open(output_path, "w", encoding="utf-8") as file:
        for i in range(0, len(final_tokens), tokens_per_line):
            file.write(" ".join(final_tokens[i:i+tokens_per_line]) + "\n")

    print(f"Final tokens: {final_tokens}")
    print(f"Number of tokens: {len(final_tokens)}")

if __name__ == "__main__":
    main()

Final tokens: ['python', 'data', 'types', 'every', 'value', 'has', 'a', 'datatype', 'and', 'variables', 'can', 'hold', 'values', 'python', 'is', 'a', 'powerfully', 'composed', 'language', 'consequently', 'we', 'dont', 'have', 'to', 'characterize', 'the', 'sort', 'of', 'variable', 'while', 'announcing', 'it', 'the', 'interpreter', 'binds', 'the', 'value', 'implicitly', 'to', 'its', 'type', 'a', '5', 'we', 'did', 'not', 'specify', 'the', 'type', 'of', 'the', 'variable', 'a', 'which', 'has', 'the', 'value', 'five', 'from', 'an', 'integer', 'the', 'python', 'interpreter', 'will', 'automatically', 'interpret', 'the', 'variable', 'as', 'an', 'integer', 'we', 'can', 'verify', 'the', 'type', 'of', 'the', 'programused', 'variable', 'thanks', 'to', 'python', 'the', 'type', 'function', 'in', 'python', 'returns', 'the', 'type', 'of', 'the', 'passed', 'variable', 'consider', 'the', 'following', 'illustration', 'when', 'defining', 'and', 'verifying', 'the', 'values', 'of', 'various', 'data', 'types'

In [None]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import defaultdict

# Initialize lemmatizer and stopwords
# Both names are also assigned in the earlier "Text Cleaning Pipeline" cell;
# they are rebuilt here with the same expressions.
LEMMATIZER = WordNetLemmatizer()
# A set gives constant-time membership checks in clean_text's stopword filter.
STOPWORDS = set(stopwords.words('english'))

# Step 1: Clean and tokenize the documents
def clean_text(text):
    """Turn raw text into lemmatized tokens with stopwords removed.

    The text is lowercased, stripped of everything except letters and
    whitespace (so digits and punctuation disappear), tokenized with
    ``word_tokenize``, filtered against ``STOPWORDS`` and lemmatized.

    Args:
        text (str): Raw input text.

    Returns:
        list[str]: Cleaned tokens in document order.
    """
    letters_only = re.sub(r"[^a-zA-Z\s]", "", text.lower())

    kept = []
    for word in word_tokenize(letters_only):
        if word in STOPWORDS:
            continue  # stopwords carry no retrieval value
        kept.append(LEMMATIZER.lemmatize(word))
    return kept

# Example documents
# Each string carries its own "Doc_N:" label. After clean_text strips the
# underscore and digit, that label becomes the token 'doc' in every document.
# List position (0-based) is the document ID used by build_inverted_index.
documents = [
    "Doc_1: Python Variables: A variable is the name given to a memory location. It has 5 types.",
    "Doc_2: Python Data Types: Every value has a datatype, and variables can hold values. Numbers like 12345 are removed.",
    "Doc_3: Python Literals: Python Literals can be defined as data that is given in a variable or constant like 789.",
    "Doc_4: Python Tuples: A comma-separated group of items is called a Python triple with 3 values.",
    "Doc_5: Python Arrays: The Array is used in every programming language like C, C++, Java, Python, R, JavaScript."
]

# Step 2: Build an inverted index
def build_inverted_index(docs):
    """Build a term -> document-position index from a sequence of texts.

    Args:
        docs (iterable of str): Raw documents; each one's 0-based position in
            the sequence serves as its document ID.

    Returns:
        defaultdict: Maps each cleaned token to the set of document IDs in
        which it appears.
    """
    postings = defaultdict(set)
    for position, raw_doc in enumerate(docs):
        for term in clean_text(raw_doc):
            postings[term].add(position)
    return postings

# Step 3: Perform AND query processing
def boolean_query_and(query_terms, inverted_index):
    """Answer a Boolean AND query over any number of terms.

    Each term is lowercased and lemmatized before lookup. If *any* term is
    absent from the index the result is empty: a document cannot contain a
    term the corpus has never seen. Previously, unknown terms were silently
    dropped, which returned matches for the remaining terms only.

    Args:
        query_terms (iterable of str): The terms that must all be present.
        inverted_index (dict): Maps each term to a set of document IDs.

    Returns:
        set: A new set of document IDs containing every query term; empty if
        there are no query terms or any term is missing from the index.
    """
    # Clean and process the query terms
    cleaned_query = [LEMMATIZER.lemmatize(term.lower()) for term in query_terms]

    if not cleaned_query:
        return set()

    doc_sets = []
    for term in cleaned_query:
        # Membership test (not indexing) so a defaultdict index is not mutated.
        if term not in inverted_index:
            return set()
        doc_sets.append(inverted_index[term])

    # Documents that contain all terms
    return set.intersection(*doc_sets)

# Build the inverted index
inverted_index = build_inverted_index(documents)

# Step 4: Query example
query = ["python", "variable"]
result = boolean_query_and(query, inverted_index)

# Display results
# Iterate in sorted order (set iteration order is not guaranteed) and print
# each document as-is: every string already begins with its own "Doc_N:"
# label, so adding another prefix produced "Doc_1: Doc_1: ...".
print(f"Documents that match the query {query}:")
for doc_id in sorted(result):
    print(documents[doc_id])


Documents that match the query ['python', 'variable']:
Doc_1: Doc_1: Python Variables: A variable is the name given to a memory location. It has 5 types.
Doc_2: Doc_2: Python Data Types: Every value has a datatype, and variables can hold values. Numbers like 12345 are removed.
Doc_3: Doc_3: Python Literals: Python Literals can be defined as data that is given in a variable or constant like 789.
