In [18]:
import os
import pandas as pd

In [19]:
# Base directory for every path built in this notebook: the kernel's working
# directory at the moment this cell runs.
current_dir = os.getcwd()
# Bare expression so the notebook displays the resolved directory.
current_dir

'/home/cxiang/aiha_new'

In [20]:
# Name of the resource being ingested. Later cells use it as a folder name,
# as the stem of the markdown file, and as the prefix of the chunk ids.
resource_name = "90YearsofDedication"
# Folder that holds this resource's markdown file and its picture files.
md_result_path = os.path.join(current_dir, f"results/md/{resource_name}")

### Create db directory

In [21]:
# Root of the Chroma database directory, under the working directory.
db_path = os.path.join(current_dir, "chroma")
# Per-resource folder that receives the picture files of this resource.
media_path = os.path.join(db_path, f"media/{resource_name}")
# exist_ok=True makes the cell safe to re-run and removes the window between
# an explicit existence check and the directory creation.
os.makedirs(media_path, exist_ok=True)

### Load MD resources

In [22]:
# md_result_path is already built from current_dir, so the markdown file is
# addressed relative to it directly.
md_path = os.path.join(md_result_path, f"{resource_name}.md")
# Decode explicitly as UTF-8 so the result does not depend on the locale's
# default encoding.
with open(md_path, "r", encoding="utf-8") as file:
    md = file.read()

Move the picture files into the db media directory

In [23]:
# Collect the picture files (.jpeg / .png) that sit next to the markdown file,
# then move each one into the resource's media folder.
picture_files = [
    name for name in os.listdir(md_result_path)
    if name.endswith((".jpeg", ".png"))
]
for file in picture_files:
    os.rename(f"{md_result_path}/{file}", f"{media_path}/{file}")

### Chunk MD

In [24]:
import re

# Sections shorter than this many characters are folded into the section
# that precedes them instead of becoming chunks of their own.
MIN_CHUNK_CHARS = 200

# Split right before every header line (# to ####); the lookahead keeps the
# header text at the start of the section it introduces.
sections = re.split(r'\n(?=#{1,4}\s)', md)

# Filter out empty sections and strip surrounding whitespace
chunks = [section.strip() for section in sections if section.strip()]

# chunks_grouped and section_titles stay index-aligned with `chunks`;
# a section that was merged into its parent leaves a None placeholder.
chunks_grouped = []
section_titles = []

# Index (into chunks_grouped) of the last section that was kept as its own chunk.
parent_section = None
for i, chunk in enumerate(chunks):
    # The title is the header text on the first line, when there is a header.
    title_match = re.match(r'^#{1,4}\s+(.+)$', chunk.split('\n')[0])
    section_titles.append(title_match.group(1) if title_match else "Untitled Section")

    # Compare against None explicitly: the parent index can legitimately be 0,
    # and a truthiness test would then refuse to merge into the first chunk.
    if len(chunk) < MIN_CHUNK_CHARS and parent_section is not None:
        chunks_grouped[parent_section] += "\n" + chunk
        chunks_grouped.append(None)
    else:
        chunks_grouped.append(chunk)
        parent_section = i

# Drop the None placeholders of merged sections and renumber the rows, then
# show a summary of the resulting frame.
df = (
    pd.DataFrame({"chunks": chunks_grouped, "title": section_titles})
    .dropna(axis=0)
    .reset_index(drop=True)
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284 entries, 0 to 283
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   chunks  284 non-null    object
 1   title   284 non-null    object
dtypes: object(2)
memory usage: 4.6+ KB


Save the chunks in db for citation use

In [25]:
# Folder that stores the section-level chunks of this resource.
source_path = os.path.join(current_dir, f"chroma/source/{resource_name}")
# exist_ok=True makes the cell re-runnable without a separate existence check.
os.makedirs(source_path, exist_ok=True)
# The DataFrame index is written out as the chunk_id column.
df.to_csv(os.path.join(source_path, "chunks.csv"), index=True, index_label="chunk_id")

Split chunks that are too big

In [26]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter configured for pieces of at most 512 characters (length measured
# with len) and an overlap of 128 characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=128,
    length_function=len,
)

# Parallel lists, one entry per piece; they become DataFrame columns below.
new_texts = []
original_chunk_ids = []
chunk_titles = []
chunk_images = []

# Markdown image syntax ![alt](target); group 1 captures the target.
img_pattern = r'!\[.*?\]\((.*?)\)'


# Process each section-level chunk of the original dataframe
for idx, row in df.iterrows():
    # Distinct loop names so the section list `chunks` built earlier in the
    # notebook is not overwritten by this cell.
    for piece in text_splitter.split_text(row['chunks']):
        # Point every image reference at chroma/media/<resource_name>/<file name>
        piece = re.sub(
            img_pattern,
            lambda m: f'![](chroma/media/{resource_name}/{os.path.basename(m.group(1))})',
            piece,
        )

        # After the rewrite each captured target already carries the
        # chroma/media/<resource_name>/ prefix, so only its file name is
        # joined onto media_path; joining the whole target would repeat
        # that prefix inside the resulting path.
        images = [
            os.path.join(media_path, os.path.basename(target))
            for target in re.findall(img_pattern, piece)
        ]

        new_texts.append(piece)
        original_chunk_ids.append(idx)
        chunk_titles.append(row['title'])
        chunk_images.append(images)  # images referenced inside this piece

# Total number of pieces; used to number the new chunk ids.
new_chunk_ids = len(new_texts)

# Create new dataframe with split chunks
chunks_to_embed = pd.DataFrame({
    'original_chunk_id': original_chunk_ids,
    'text': new_texts,
    'resource_name': [resource_name] * len(new_texts),
    'images': chunk_images,
    'new_chunk_id': [f"{resource_name}_{i}" for i in range(new_chunk_ids)],
    'chunk_title': chunk_titles
})

print(f"Original chunks: {len(df)}")
print(f"New chunks: {len(chunks_to_embed)}")
chunks_to_embed.to_json("temp.json", orient="records")

Original chunks: 284
New chunks: 2079


### Create chroma DB

In [27]:
import chromadb
from chromadb.utils import embedding_functions
from chromadb.utils.data_loaders import ImageLoader

# Persistent Chroma client rooted at the db directory created earlier.
client = chromadb.PersistentClient(db_path)

# Text embedding function backed by the BAAI/bge-m3 model.
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="BAAI/bge-m3")

# Image loader instance; the visible cells do not pass it to the collection.
data_loader = ImageLoader()

In [28]:
# One shared collection; documents are embedded with the text embedding
# function configured above.
collection = client.get_or_create_collection(name="md_data", embedding_function=sentence_transformer_ef)

# upsert instead of add: the ids are deterministic (<resource_name>_<n>), so
# re-running the notebook replaces the stored documents under those ids
# rather than colliding with the entries written by an earlier run.
# NOTE(review): ids left over from an earlier run that produced more pieces
# are not removed by this call.
collection.upsert(
    documents=chunks_to_embed["text"].tolist(),
    metadatas=chunks_to_embed[["original_chunk_id", "resource_name", "chunk_title"]].to_dict(orient="records"),
    ids=chunks_to_embed["new_chunk_id"].tolist(),
)

In [29]:
# Smoke test: ask the collection for the five results closest to a single
# query text and display the raw response.
collection.query(query_texts=["Engineering"], n_results=5)

{'ids': [['90YearsofDedication_1527',
   '75YearsofEngineering_7',
   '75YearsofEngineering_566',
   '90YearsofDedication_952',
   '90YearsofDedication_87']],
 'embeddings': None,
 'documents': [['# Engineering Education in the New Millennium',
   '## ENGINEERING TOPICS',
   '#### INDUSTRIAL ENGINEERING',
   '- Engineering\n- Wireless Communications',
   'Engineering, as explained in the dictionary, means "the practical application of scientific knowledge in the design, construction and control of machines, public services such as roads, bridges, etc., electrical apparatus, chemicals, etc." It is therefore apparent that engineering affects every aspect of our communities, and it will always remain essential and vital for the betterment of the quality of our lives.\n\n![](chroma/media/90YearsofDedication/_page_26_Picture_4.jpeg)']],
 'uris': None,
 'data': None,
 'metadatas': [[{'chunk_title': 'Abstract',
    'original_chunk_id': 204,
    'resource_name': '90YearsofDedication'},
   {'ch