In [9]:
import pandas as pd
import json
import string

# Load the CSV dataset into a DataFrame.
# The rest of this script reads its "Content" and "Document ID" columns.
df = pd.read_csv("semi_strut.csv")

# Tokenization function to extract terms from the JSON-like content
def tokenize_content(content):
    """Extract raw terms from one JSON-encoded document.

    Terms are collected, in this order, from the document's "title" and
    "author" (split on whitespace), its "keywords" (added verbatim, so a
    multi-word keyword such as "data analysis" stays a single term), and the
    "title" and "content" of every entry in "sections" (split on whitespace).

    Fields that are absent, JSON null, or not strings contribute no terms
    instead of raising.

    Args:
        content: JSON text encoding an object with the fields above.

    Returns:
        A list of raw (not yet normalized) terms.

    Raises:
        json.JSONDecodeError: If ``content`` is not valid JSON.
    """
    content_dict = json.loads(content)

    def words(value):
        # A field that is absent, JSON null, or not a string yields no tokens.
        return value.split() if isinstance(value, str) else []

    # Extract terms from various fields (title, author)
    terms = words(content_dict.get("title")) + words(content_dict.get("author"))

    # Extract keywords. `or []` covers an explicit JSON null; a bare string is
    # wrapped so that extend() does not split it into single characters.
    keywords = content_dict.get("keywords") or []
    if isinstance(keywords, str):
        keywords = [keywords]
    terms.extend(keywords)

    # Extract terms from sections' titles and content
    for section in content_dict.get("sections") or []:
        terms.extend(words(section.get("title")))
        terms.extend(words(section.get("content")))

    return terms

# Apply the tokenization function to every value of the "Content" column and
# store the returned list of raw terms in a new column "Terms"
df["Terms"] = df["Content"].apply(tokenize_content)

# Preprocess the terms and create another new column "Terms_preprocessed"

# Common stop words excluded from the index; built once rather than per call.
_STOP_WORDS = frozenset({
    "a", "an", "the", "and", "is", "in", "it", "to", "of", "for", "on", "with", "as",
})


def preprocess_terms(terms):
    """Normalize raw terms for indexing.

    Each term is lower-cased and has leading/trailing ASCII punctuation
    stripped; punctuation inside a term (e.g. the hyphen in "scikit-learn")
    is kept. Stop words are dropped, as are terms that are empty after
    stripping (e.g. a lone "-"), so the empty string never becomes an index
    term.

    Args:
        terms: Iterable of strings.

    Returns:
        A new list of normalized terms in their original order.
    """
    # Remove punctuation and convert to lowercase
    normalized = (term.lower().strip(string.punctuation) for term in terms)

    # Remove empty tokens and stop words
    return [term for term in normalized if term and term not in _STOP_WORDS]

# Normalize each row's raw term list with preprocess_terms and store the
# result in a new column "Terms_preprocessed"
df["Terms_preprocessed"] = df["Terms"].apply(preprocess_terms)

# Initialize an empty inverted index dictionary (term -> set of document IDs)
inverted_index = {}

# Build the inverted index.
# Only two columns are needed, so walk them in lockstep rather than using
# df.iterrows(), which constructs a full Series object for every row.
for document_id, terms in zip(df["Document ID"], df["Terms_preprocessed"]):
    # Update the inverted index with terms and document IDs
    for term in terms:
        # setdefault creates the posting set the first time a term is seen
        inverted_index.setdefault(term, set()).add(document_id)

# Display the inverted index
print("Inverted Index:")
print(inverted_index)

# Perform Boolean operations on postings lists for Boolean search operations

# 1. "Python" OR "Pandas"
query1 = ["python", "pandas"]
result1 = set()

# Union of the posting sets; a term absent from the index contributes nothing.
for term in query1:
    result1 |= inverted_index.get(term, set())

print("Documents matching 'Python' OR 'Pandas':", result1)

# 2. "Python" AND "data"
query2 = ["python", "data"]
result2 = set(df["Document ID"])  # Initialize with all document IDs

for term in query2:
    # A term missing from the index matches no document, so it must empty the
    # result; skipping it would silently drop that condition from the query.
    result2 &= inverted_index.get(term, set())

print("Documents matching 'Python' AND 'Data':", result2)


Inverted Index:
{'introduction': {1, 2, 4, 5}, 'python': {1, 2, 3, 4, 5}, 'john': {1}, 'doe': {1}, 'programming': {1}, 'beginner': {1}, 'getting': {1, 3}, 'started': {1, 3}, 'versatile': {1, 5}, 'language': {1}, 'basic': {1}, 'syntax': {1}, 'easy': {1}, 'understand': {1}, 'data': {2, 5}, 'analysis': {2}, 'pandas': {2}, 'jane': {2}, 'smith': {2}, 'data analysis': {2}, 'popular': {2}, 'library': {2, 5}, 'dataframes': {2}, 'are': {2}, 'core': {2}, 'structure': {2}, 'web': {3}, 'development': {3}, 'flask': {3}, 'mike': {3}, 'johnson': {3}, 'web development': {3}, 'lightweight': {3}, 'framework': {3}, 'routing': {3}, 'defines': {3}, 'url': {3}, 'patterns': {3}, 'views': {3}, 'machine': {4}, 'learning': {4}, 'scikit-learn': {4}, 'emily': {4}, 'davis': {4}, 'machine learning': {4}, 'subfield': {4}, 'artificial': {4}, 'intelligence': {4}, 'supervised': {4}, 'type': {4}, 'visualization': {5}, 'matplotlib': {5}, 'robert': {5}, 'clark': {5}, 'data visualization': {5}, 'creating': {5}, 'plots': {5