## Context

In [31]:
import docx2txt
import re
from pathlib import Path
import os
from langchain.docstore.document import Document
from pathlib import Path
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

In [34]:
# Define paths.
# The paths are assembled with os.path.join instead of hard-coded "\" separators
# so the notebook also runs on Linux/macOS; on Windows the resulting strings are
# identical to the previous literals.
CUSTOMIZATIONSTXT_PATH = os.path.join("..", "data", "processed", "customizations_doc.txt")
FAQTXT_PATH = os.path.join("..", "data", "processed", "faq_doc.txt")
UPDATED_FAQTXT_PATH = os.path.join("..", "data", "processed", "updated_faq_doc.txt")
CHROMADB_PATH = os.path.join("..", "chromadb")

In [20]:
def read_docx(filepath: str) -> str:
    """Extract the text of a .docx file as clean, newline-separated lines.

    The raw text returned by ``docx2txt.process`` is split on newline
    characters, every line is stripped of surrounding whitespace, and lines
    that are empty after stripping are dropped.

    Args:
        filepath: Location of the .docx file; passed unchanged to
            ``docx2txt.process``.

    Returns:
        The remaining lines joined with a single newline character.
    """
    raw_text = docx2txt.process(filepath)
    # Strip *before* testing for emptiness so whitespace-only lines (spaces,
    # tabs) are removed as well instead of surviving as blank lines.
    stripped_lines = (line.strip() for line in raw_text.split("\n"))
    return "\n".join(line for line in stripped_lines if line)


def create_updated_faq(
        faq_filepath: str,
        customizations_filepath: str,
        updated_faq_filepath: str = r".\updated_faq_doc.txt"
        ) -> None:
    """Write a copy of the FAQ extended with customization-derived entries.

    The existing FAQ text is read, the entries produced by
    ``generate_customizations_faq_entry`` for the customizations file are
    joined with newlines and appended after one separating newline, and the
    result is written to the output file.

    Args:
        faq_filepath: Text file holding the existing FAQ.
        customizations_filepath: Text file describing the customizations.
        updated_faq_filepath: Destination of the combined FAQ text.
    """
    with open(faq_filepath, "r", encoding="utf-8-sig") as faq_file:
        original_faq = faq_file.read()

    generated_block = "\n".join(generate_customizations_faq_entry(customizations_filepath))
    combined_faq = f"{original_faq}\n{generated_block}"

    with open(updated_faq_filepath, "w", encoding="utf-8-sig") as output_file:
        output_file.write(combined_faq)


def parse_faq_file(filepath: str) -> list[str]:
    """Collect the question/answer pairs of a FAQ text file.

    A pair is a "Q:" fragment running to the end of its line, immediately
    followed on the next line by an "A:" fragment running to the end of that
    line. Both parts are stripped and lower-cased.

    Args:
        filepath: FAQ text file to read.

    Returns:
        One string per pair: "question:<question>", a newline, a single
        space, then "answer:<answer>".
    """
    with open(filepath, "r", encoding="utf-8-sig") as faq_file:
        contents = faq_file.read()

    qa_regex = re.compile(r'Q:(.*)\nA:(.*)')

    entries = []
    for qa_match in qa_regex.finditer(contents):
        question_text = qa_match.group(1).strip().lower()
        answer_text = qa_match.group(2).strip().lower()
        entries.append(f"question:{question_text}\n answer:{answer_text}")
    return entries


def parse_customizations_file(filepath: str) -> dict[str, list[str]]:
    """Parse a customizations text file into a category -> values mapping.

    The file is expected to contain sections shaped like::

        Category name:
        - first value
        - second value

    The whole text is lower-cased before parsing, so categories and values
    are returned in lower case. For the "sizes" category every value is
    additionally split on commas, so "- s, m, l" yields three values.

    Args:
        filepath: Customizations text file to read.

    Returns:
        A dict mapping each category name to the list of its values, in
        file order. Values are stripped and empty values are dropped. If a
        category appears more than once, the last occurrence wins.
    """
    with open(filepath, "r", encoding="utf-8-sig") as f:
        text = f.read().lower()

    # The category may contain word characters and non-newline whitespace
    # only. Allowing newlines here would let a preceding line without a colon
    # (e.g. a document title) be swallowed into the category name.
    pattern = re.compile(
        r'^(?P<category>(?:\w|[^\S\n])+):\s*\n(?P<values>(?:\s*-.*(?:\n|$))+)',
        re.MULTILINE
    )

    customizations = {}
    for match in pattern.finditer(text):
        category = match.group('category').strip()

        # Handle the bullet block line by line so a bare "-" can never pull
        # the following line in as its value.
        values = []
        for line in match.group('values').split("\n"):
            item = line.strip()
            if not item.startswith("-"):
                continue  # blank line inside the block
            value = item[1:].strip()
            if value:
                values.append(value)

        if category == "sizes":
            # Several sizes may share one bullet, separated by commas.
            values = [
                size.strip()
                for value in values
                for size in value.split(",")
                if size.strip()
            ]
        customizations[category] = values
    return customizations


def generate_customizations_faq_entry(filepath: str) -> list[str]:
    """Build FAQ entries describing the available t-shirt customizations.

    The customizations file is parsed with ``parse_customizations_file``.
    The first entry lists every attribute name; it is followed by one entry
    per attribute listing that attribute's values.

    Args:
        filepath: Customizations text file to read.

    Returns:
        A list of entries, each a "Q: ..." line and an "A: ..." line, both
        terminated by a newline.
    """
    options_by_attribute = parse_customizations_file(filepath)

    def _format_entry(question_line: str, answer_line: str) -> str:
        # An entry is the question line and the answer line, each newline-terminated.
        return f"{question_line}\n{answer_line}\n"

    # Overview entry naming every customizable attribute.
    overview_entry = _format_entry(
        "Q: What are the available customization options of t-shirts do you have?",
        "A: We have " + ", ".join(options_by_attribute),
    )

    # One entry per attribute, listing its values.
    attribute_entries = [
        _format_entry(
            f"Q: What are the available {name} of t-shirts do you have?",
            "A: We have " + ", ".join(options),
        )
        for name, options in options_by_attribute.items()
    ]
    return [overview_entry, *attribute_entries]

## Convert the initial files to TXT format
- We assume that the initial customization and FAQ files are placed in the `data\raw` directory.

In [26]:
# Provide paths to the initial files.
# The paths are composed with pathlib's "/" operator instead of a hard-coded "\"
# separator so this cell works on Windows, Linux and macOS alike.
RAW_DATA_DIR = Path("..") / "data" / "raw"
CUSTOMIZATIONSDOCX_PATH = RAW_DATA_DIR / "Anadea homework  - Tee Customizer Shirts.docx"
FAQDOCX_PATH = RAW_DATA_DIR / "Anadea homework  -Tee Customizer FAQ.docx"

# Fail early, before any conversion work starts, if an input document is missing.
assert os.path.exists(CUSTOMIZATIONSDOCX_PATH), r"Initial customization file is not present in the 'data\raw' dir."
assert os.path.exists(FAQDOCX_PATH), r"Initial FAQ file is not present in the 'data\raw' dir."

In [33]:
# Map each source .docx file to the .txt file it is converted into
filepath_mapping = {CUSTOMIZATIONSDOCX_PATH: CUSTOMIZATIONSTXT_PATH,
                    FAQDOCX_PATH: FAQTXT_PATH}

# Convert the .docx files to .txt
for src_path, new_path in filepath_mapping.items():
    # Derive the output folder from the target path itself (instead of a
    # separately hard-coded literal that can drift from the path constants)
    # and create it, including missing parents, if it does not exist yet.
    target_dir = os.path.dirname(new_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    content = read_docx(src_path)
    with open(new_path, "w", encoding="utf-8-sig") as f:
        f.write(content)

## Update FAQ file
- We will extend the FAQ file with entries generated from the customizations document.

In [28]:
# Create a new updated FAQ file.
# The path constants from the configuration cell are reused here so the input
# and output locations are defined in exactly one place.
create_updated_faq(FAQTXT_PATH,
                   CUSTOMIZATIONSTXT_PATH,
                   UPDATED_FAQTXT_PATH)

## Create a vector database
- We will use ChromaDB as the vector database.
- We will populate the vector database with the question/answer pairs from the updated FAQ document.

In [36]:
# Extract questions and answers from the FAQ doc
faq_data = parse_faq_file(UPDATED_FAQTXT_PATH)
# An empty result means the FAQ file did not contain any "Q:"/"A:" pairs;
# stop here rather than building an empty vector store.
assert faq_data, "No question/answer pairs were parsed from the updated FAQ file."

metadata={"source": "Tee Customizer FAQ"}
# Every document gets its own copy of the metadata so that the entries do not
# share one mutable dict.
docs = [Document(page_content=text, metadata=dict(metadata)) for text in faq_data]

# Deterministic, position-based ids: without explicit ids, each run of this cell
# stores the documents under fresh random ids, so the persisted collection
# accumulates duplicates on every re-run. With stable ids a re-run is expected
# to overwrite the same records instead.
# NOTE(review): if the FAQ shrinks, records with higher indices from an earlier
# run stay in the collection; delete the persist directory to rebuild from scratch.
doc_ids = [f"faq-{index}" for index in range(len(docs))]

embedding_function = HuggingFaceEmbeddings(model_name="BAAI/llm-embedder")
vector_store = Chroma.from_documents(documents=docs,
                                     embedding=embedding_function,
                                     ids=doc_ids,
                                     persist_directory=CHROMADB_PATH,
                                     collection_name="faq_collection")