In [38]:
# Import necessary libraries
import pandas as pd
import json
import string 

# Read the semi-structured documents from the CSV file into a DataFrame
DATASET_PATH = "semi_strut.csv"
df = pd.read_csv(DATASET_PATH)

# Preview the first rows to check that the file loaded as expected
df.head()

Unnamed: 0,Document ID,Content
0,1,"{\r\n ""title"": ""Introduction to Python"",\r\n..."
1,2,"{\r\n ""title"": ""Data Analysis with Pandas"",\..."
2,3,"{\r\n ""title"": ""Web Development with Flask"",..."
3,4,"{\r\n ""title"": ""Machine Learning with Scikit..."
4,5,"{\r\n ""title"": ""Data Visualization with Matp..."


In [40]:
# Tokenization function to extract terms from the JSON-like content
# Remember to extract both
# 1 .Extract terms from various fields (title, author)
def tokenize_content(content):
    """Split the title and author of a JSON-encoded document into terms.

    Parameters
    ----------
    content : str
        JSON text describing a single document.

    Returns
    -------
    list of str
        The whitespace-separated words of the "title" field followed by
        those of the "author" field. A missing field contributes no terms.
    """
    document = json.loads(content)
    # Title words come first, then author words.
    return [
        word
        for field in ("title", "author")
        for word in document.get(field, "").split()
    ]
# 2. Apply to all rows in the pandas df by creating a new column "Terms"
 # your code here.....
# Tokenize every document and store the resulting term lists in "Terms"
terms_list = [tokenize_content(raw_content) for raw_content in df["Content"]]
df["Terms"] = terms_list

df.head()

Unnamed: 0,Document ID,Content,Terms
0,1,"{\r\n ""title"": ""Introduction to Python"",\r\n...","[Introduction, to, Python, John, Doe]"
1,2,"{\r\n ""title"": ""Data Analysis with Pandas"",\...","[Data, Analysis, with, Pandas, Jane, Smith]"
2,3,"{\r\n ""title"": ""Web Development with Flask"",...","[Web, Development, with, Flask, Mike, Johnson]"
3,4,"{\r\n ""title"": ""Machine Learning with Scikit...","[Machine, Learning, with, Scikit-Learn, Emily,..."
4,5,"{\r\n ""title"": ""Data Visualization with Matp...","[Data, Visualization, with, Matplotlib, Robert..."


In [49]:
# 4. Implement a preprocessing function that converts terms to lowercase, removes punctuation, and removes common stop words.
    # Create another new column "Terms_prep"
def preprocess_terms(terms):
    """Normalize raw terms for indexing and querying.

    Each term is lowercased and stripped of leading/trailing punctuation
    (inner punctuation, such as the hyphen in "scikit-learn", is kept).
    Stop words and terms that become empty after stripping are discarded.

    Parameters
    ----------
    terms : iterable of str
        Raw terms, e.g. the output of a whitespace split.

    Returns
    -------
    list of str
        The normalized terms, in their original order.
    """
    # Define a set of common stop words
    stop_words = {
        "a", "an", "the", "and", "is", "in", "it", "to", "of", "for", "on", "with", "as"
    }

    cleaned_terms = []
    for term in terms:
        cleaned = term.lower().strip(string.punctuation)
        # A punctuation-only token (e.g. "-" or "...") strips down to "";
        # skip it so the empty string never becomes an index term.
        if cleaned and cleaned not in stop_words:
            cleaned_terms.append(cleaned)

    return cleaned_terms
# Preprocess each document's term list and store the result in "Terms_prep"
preprocess_list = [preprocess_terms(doc_terms) for doc_terms in df["Terms"]]
df["Terms_prep"] = preprocess_list

In [61]:

# Map every preprocessed term to the set of IDs of the documents containing it
inverted_index = {}

for _, row in df.iterrows():
    doc_id = row["Document ID"]
    for term in row["Terms_prep"]:
        # setdefault creates the posting set the first time a term is seen
        inverted_index.setdefault(term, set()).add(doc_id)

# Show the finished index
inverted_index

{'introduction': {1},
 'python': {1},
 'john': {1},
 'doe': {1},
 'data': {2, 5},
 'analysis': {2},
 'pandas': {2},
 'jane': {2},
 'smith': {2},
 'web': {3},
 'development': {3},
 'flask': {3},
 'mike': {3},
 'johnson': {3},
 'machine': {4},
 'learning': {4},
 'scikit-learn': {4},
 'emily': {4},
 'davis': {4},
 'visualization': {5},
 'matplotlib': {5},
 'robert': {5},
 'clark': {5}}

In [71]:
# Posting function
def or_postings(str1, str2):
    """Return the IDs of documents that contain ``str1`` OR ``str2``.

    Both query terms are normalized with ``preprocess_terms`` before being
    looked up in the global ``inverted_index``. A term that preprocessing
    removes (e.g. a stop word) or that is absent from the index is treated
    as having an empty posting list instead of raising an error.

    Parameters
    ----------
    str1, str2 : str
        Raw query terms.

    Returns
    -------
    list
        Document IDs in ascending order, without duplicates.
    """
    def sorted_postings(raw_term):
        cleaned = preprocess_terms([raw_term])
        if not cleaned:
            return []
        # Posting sets are unordered, but the merge below is only correct
        # when both lists are in ascending order.
        return sorted(inverted_index.get(cleaned[0], ()))

    posting1 = sorted_postings(str1)
    posting2 = sorted_postings(str2)
    p1 = 0  # cursor into posting1
    p2 = 0  # cursor into posting2
    result = []
    while p1 < len(posting1) and p2 < len(posting2):
        if posting1[p1] == posting2[p2]:
            # Same document in both lists: emit it once, advance both cursors
            result.append(posting1[p1])
            p1 += 1
            p2 += 1
        elif posting1[p1] > posting2[p2]:
            # Emit the smaller ID and advance the list it came from
            result.append(posting2[p2])
            p2 += 1
        else:
            result.append(posting1[p1])
            p1 += 1
    # At most one list still has entries left; append that tail unchanged
    result.extend(posting1[p1:])
    result.extend(posting2[p2:])
    return result

def and_postings(str1, str2):
    """Return the IDs of documents that contain both ``str1`` AND ``str2``.

    Both query terms are normalized with ``preprocess_terms`` before being
    looked up in the global ``inverted_index``. A term that preprocessing
    removes (e.g. a stop word) or that is absent from the index is treated
    as having an empty posting list, so the result is then empty.

    Parameters
    ----------
    str1, str2 : str
        Raw query terms.

    Returns
    -------
    list
        Document IDs in ascending order.
    """
    def sorted_postings(raw_term):
        cleaned = preprocess_terms([raw_term])
        if not cleaned:
            return []
        # Posting sets are unordered, but the merge below is only correct
        # when both lists are in ascending order.
        return sorted(inverted_index.get(cleaned[0], ()))

    posting1 = sorted_postings(str1)
    posting2 = sorted_postings(str2)
    p1 = 0  # cursor into posting1
    p2 = 0  # cursor into posting2
    result = []
    while p1 < len(posting1) and p2 < len(posting2):
        if posting1[p1] == posting2[p2]:
            # Document appears in both lists: keep it, advance both cursors
            result.append(posting1[p1])
            p1 += 1
            p2 += 1
        elif posting1[p1] > posting2[p2]:
            # Skip the smaller ID; it cannot occur later in the other list
            p2 += 1
        else:
            p1 += 1
    return result
# Perform Boolean search operations on the postings lists
# 1. "Python" OR "Pandas"
# The result is a list, so convert it with str() before concatenating;
# adding a list to a str raises TypeError.
print("Posting List of Python OR Pandas: " + str(or_postings("Python", "Pandas")))
# 2. "Python" AND "data"
print("Posting List of Python AND data: " + str(and_postings("Python", "data")))

[1, 2]