# Overview

Reference: LangChain Microsoft Word document loaders — https://python.langchain.com/docs/integrations/document_loaders/microsoft_word/

# Setup

In [1]:
import os
from getpass import getpass

# The endpoint is not a secret, so keep the original value as a default that an
# already-exported AZURE_OPENAI_ENDPOINT can override.
os.environ.setdefault("AZURE_OPENAI_ENDPOINT", "https://dalle3-swo.openai.azure.com/")

# Never commit API keys to a notebook. Take the key from the environment and
# only prompt (without echoing) when it is missing.
# NOTE(review): the key that was previously hardcoded here is exposed in the
# file history and should be rotated.
if not os.environ.get("AZURE_OPENAI_API_KEY"):
    os.environ["AZURE_OPENAI_API_KEY"] = getpass("Azure OpenAI API key: ")

In [2]:
from langchain_community.utilities import GoogleSearchAPIWrapper
from langchain_community.vectorstores import Chroma
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings, OpenAIEmbeddings
from langchain_core.output_parsers import BaseOutputParser
from typing import List, Optional
import re
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import chain
from langchain_community.utilities import GoogleSerperAPIWrapper
from langchain_core.output_parsers import StrOutputParser

# Every Azure OpenAI client in this notebook targets the same API version.
AZURE_OPENAI_API_VERSION = "2024-02-15-preview"


def _azure_chat_model(deployment):
    """Build a temperature-0 AzureChatOpenAI client for the given deployment name."""
    return AzureChatOpenAI(
        openai_api_version=AZURE_OPENAI_API_VERSION,
        azure_deployment=deployment,
        temperature=0,
    )


# LLM
llm = _azure_chat_model("gpt-35-turbo")
gpt_35_turbo_16k = _azure_chat_model("gpt-35-turbo-16k")
smart_llm = _azure_chat_model("gpt-4")
# NOTE(review): "gpt-35-turbo-instruct" looks like a completions-style
# deployment; confirm it actually works through the chat client.
gpt_35_turbo_instruct = _azure_chat_model("gpt-35-turbo-instruct")

# Embedding
embeddings = AzureOpenAIEmbeddings(
    azure_deployment="text-embedding-ada-002",
    openai_api_version=AZURE_OPENAI_API_VERSION,
)

In [5]:
# Word document (.docx) passed to the Unstructured loader further down.
file_path = "../../data/qna_update_data/PSG Programme Handbook.docx"

# Using Docx2txt

Drawbacks:
- Does not extract tables well

In [3]:
from langchain_community.document_loaders import Docx2txtLoader

# Reuse the shared `file_path` defined above instead of repeating the literal,
# so this loader and the Unstructured loader always read the same document.
loader = Docx2txtLoader(file_path)

# load() returns a list of documents; the cell below prints the first one.
docx2txt_data = loader.load()


In [4]:
# Show the text that Docx2txtLoader produced for the first returned document.
print(docx2txt_data[0].page_content)

PERSONAL & SOCIAL GROWTH (PSG) PROGRAMME 

HANDBOOK



This handbook provides BUV students with key information about the Personal and Social Growth (PSG) Programme, including the programme introduction, benefits for students' growth, main offerings with respective PSG points and rewards, frequently asked questions (FAQs), and other information.

 

Students are encouraged to read this handbook thoroughly, consult PSG Team and other student support teams to select suitable PSG activities, and design an individualised development plan at the beginning of every academic year. To get the latest information about PSG activities with points and rewards, as well as to review and adjust the PSG plan based on different development paths, distinct characteristics and aspirations, students should regularly revisit the handbook throughout their years at BUV.  



 

 



TABLE OF CONTENTS

	I. INTRODUCTION	3

	1. Personal and Social Growth (PSG) programme Overview	3

	2. Benefits of the PSG Progr

# Using unstructured

In [6]:
from langchain_community.document_loaders import UnstructuredWordDocumentLoader
# Load the document at `file_path` with the Unstructured-based loader so its
# output can be compared with the Docx2txt result above.
unstructured_loader = UnstructuredWordDocumentLoader(file_path)
unstructured_data = unstructured_loader.load()

[nltk_data] Downloading package punkt to /Users/macos/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/macos/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [10]:
# Show the text that UnstructuredWordDocumentLoader produced for the first returned document.
print(unstructured_data[0].page_content)

PERSONAL & SOCIAL GROWTH (PSG) PROGRAMME 

HANDBOOK

This handbook provides BUV students with key information about the Personal and Social Growth (PSG) Programme, including the programme introduction, benefits for students' growth, main offerings with respective PSG points and rewards, frequently asked questions (FAQs), and other information.

Students are encouraged to read this handbook thoroughly, consult PSG Team and other student support teams to select suitable PSG activities, and design an individualised development plan at the beginning of every academic year. To get the latest information about PSG activities with points and rewards, as well as to review and adjust the PSG plan based on different development paths, distinct characteristics and aspirations, students should regularly revisit the handbook throughout their years at BUV.  

I. INTRODUCTION

1. Personal and Social Growth (PSG) programme Overview

The Personal and Social Growth (PSG) Programme is a unique initiative

# Azure AI Document Intelligence loader

In [3]:
import os
from getpass import getpass

from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader

# The endpoint is not a secret; keep the original value as the default but let
# an environment variable override it.
endpoint = os.environ.get(
    "AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT",
    "https://di-buv.cognitiveservices.azure.com/",
)
# Never commit the service key. Read it from the environment and only prompt
# (without echoing) when it is missing.
# NOTE(review): the key that was previously hardcoded here is exposed in the
# file history and should be rotated.
key = os.environ.get("AZURE_DOCUMENT_INTELLIGENCE_KEY") or getpass(
    "Azure AI Document Intelligence key: "
)

# The FAQ document is analysed with the "prebuilt-layout" model.
loader = AzureAIDocumentIntelligenceLoader(
    api_endpoint=endpoint,
    api_key=key,
    file_path="../../data/qna/2. SU-JAN24-FREQUENTLY ASKED QUESTIONS.docx",
    api_model="prebuilt-layout",
)
azure_ai_DI_data = loader.load()

In [4]:
# Inspect the raw text of the first loaded document; the Q&A parsing below
# splits this string on its blank lines and answer markers.
azure_ai_DI_data[0].page_content



# Create LangChain documents

In [5]:
import re  
  
def extract_qna_from_extracted_word(extracted_word):
    """Flatten extracted FAQ text into a list of text segments.

    The text is first cut at every "Answer / Câu trả lời:" marker (the marker
    must be surrounded by blank lines). Each resulting chunk is then cut at
    blank lines:

    * a chunk with a single paragraph is appended unchanged;
    * otherwise every paragraph except the last is re-joined with blank lines
      and appended as one segment, followed by the last paragraph as its own
      segment (treated as the next question).

    Args:
        extracted_word: Full text of the document as one string.

    Returns:
        A flat list of strings in document order. Whatever precedes the first
        question and whatever follows the final marker is included too, so
        callers are expected to trim the list themselves.
    """
    answer_marker = r"\n\nAnswer / Câu trả lời:\n\n"
    q_a_list = []
    for chunk in re.split(answer_marker, extracted_word):
        # Everything before the final paragraph belongs to the previous answer;
        # the final paragraph is assumed to be the next question.
        *answer_paragraphs, trailing_paragraph = re.split(r"\n\n", chunk)
        if answer_paragraphs:
            q_a_list.append("\n\n".join(answer_paragraphs))
        q_a_list.append(trailing_paragraph)
    return q_a_list


In [6]:
# Split the first document's text into a flat list of segments (expected to
# alternate between question and answer after trimming).
list_qna = extract_qna_from_extracted_word(azure_ai_DI_data[0].page_content)

In [8]:
# Drop the first and last segments - presumably the text before the first
# question and the text after the last answer.
# NOTE(review): if the document has no preamble, or the last answer is a single
# paragraph, this slice discards a real question/answer - confirm against the data.
refine_qna_list = list_qna[1:-1]

In [9]:
from langchain.docstore.document import Document

# Pair up the flat list: even positions are questions, odd positions are answers.
# The question becomes the searchable page_content; the answer rides along in
# metadata. A trailing unpaired entry (odd-length list) is ignored.
docs = [
    Document(page_content=question, metadata={"answer": answer})
    for question, answer in zip(refine_qna_list[0::2], refine_qna_list[1::2])
]


# Create vector store

In [16]:
# # Child splitter - RecursiveCharacterTextSplitter
# from langchain_text_splitters import RecursiveCharacterTextSplitter

# text_splitter = RecursiveCharacterTextSplitter(
#     chunk_size=1024, chunk_overlap=200, add_start_index=True
# )
# all_splits = text_splitter.split_documents(md_header_splits)

# Embed the Q&A documents and save the resulting Chroma store to disk.
Chroma.from_documents(
    documents=docs,
    embedding=embeddings,
    persist_directory="./chroma_db/SU_QnA",
)


<langchain_community.vectorstores.chroma.Chroma at 0x1405c4610>