## Introduction
Libraries such as requests and bs4 make web scraping easier by fetching the HTML of web pages. However, in this particular case, the questions and answers on the pages did not follow a consistent pattern, in either their HTML classes or their formatting. Therefore, I decided it would be less labor-intensive and more efficient to do this scraping with a slightly more "manual" method.

# Library Imports

In [1]:
#import libraries
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
import pandas as pd

# URLs and Key Text Sections

In [2]:
#list of FAQ page URLs under https://goias.gov.br/social/
#each entry is loaded with WebBaseLoader in the text extraction step further down
urls = ["https://goias.gov.br/social/perguntas-e-respostas-frequentes-aprendiz-do-futuro/",
        "https://goias.gov.br/social/perguntas-e-respostas-frequentes-dignidade/",
        "https://goias.gov.br/social/perguntas-e-respostas-frequentes-maes-de-goias/",
        "https://goias.gov.br/social/perguntas-e-respostas-frequentes-goias-por-elas/",
        "https://goias.gov.br/social/perguntas-e-respostas-frequentes-sistema-socioeducativo/",
        "https://goias.gov.br/social/perguntas-e-respostas-frequentes-auxilios-agua-e-energia-pao-e-leite/",
        "https://goias.gov.br/social/perguntas-e-respostas-frequentes-carteira-de-identificacao-do-autista2/",
        "https://goias.gov.br/social/bolsa-familia/",
        "https://goias.gov.br/social/cadastro-unico-cadunico/",
        "https://goias.gov.br/social/perguntas-e-respostas-frequentes-passaporte-intermunicipal-da-pessoa-idosa/",
        "https://goias.gov.br/social/perguntas-e-respostas-frequentes-passe-livre-intermunicipal-da-pessoa-com-deficiencia/",
        "https://goias.gov.br/social/perguntas-e-respostas-frequentes-centro-de-referencia-estadual-da-igualdade-crei/"]

#The text stored in the "start" variable always appears right before the first question of each URL
start = """Categoria  

Perguntas e Respostas Frequentes"""

#The text stored in the "end" variable always appears right after the last answer of each URL
end = """


 











 









 













 

 





 Governo na palma da mão"""

#The question extraction step scans backwards from each "?" to the previous ".", ";" or line break, so a
#question that contains a period in the middle of the sentence is cut short. This prefix is used by that
#step to recover the full text of such a question, for example:
#Fui vítima de racismo. O que devo fazer? (in English: "I was a victim of racism. What should I do?")
text_complement = "Fui vítima de"


# Creating Lists and Extracting Texts

In [45]:
#containers used while pulling the page texts out of the URLs with langchain
raw_texts, content_texts, processed_texts = [], [], []

#load every URL and keep the text content of the first document returned for it
for url in urls:
    texts = WebBaseLoader(url)
    raw_texts = texts.load()
    first_document = raw_texts[0]
    content_texts.append(first_document.page_content)

# Extracting Only the Questions and Answers

In [46]:
#lengths of the marker texts defined above, kept for later operations
start_length = len(start)
end_length = len(end)
complement_length = len(text_complement)

#number of characters skipped right after the "start" marker before the stored text begins
#NOTE(review): the value 6 is inherited from the previous implementation; presumably it skips the
#whitespace that follows the marker - confirm against the page text
start_padding = 6

#Keep only the questions-and-answers section of each page and store it in processed_texts
for individual_answers in content_texts:
    #str.find returns -1 when the marker is missing; such a page is skipped instead of reusing the
    #position found in a previous page (or failing because no position was ever set)
    first_start = individual_answers.find(start)
    if first_start == -1:
        continue
    #only the first "end" marker that follows the "start" marker closes the section
    end_position = individual_answers.find(end, first_start + start_length)
    if end_position == -1:
        continue
    #if the "start" marker shows up more than once, the occurrence closest to the "end" marker is used
    phrase_start = individual_answers.rfind(start, 0, end_position) + start_length
    processed_texts.append(individual_answers[phrase_start + start_padding:end_position])

# Initializing the Lists

In [48]:
#empty containers filled by the steps that follow; the generator builds a separate list for every name
split_texts, questions, questions_pt, answers, answers_pt = ([] for _ in range(5))

# Extracting Questions from the Text

In [49]:
#Characters that close the sentence placed before a question: the period of the previous answer, or in
#some isolated cases ";" or a line break
question_delimiters = (".", ";", "\n")

#Go through every text in the processed_texts list
for text in processed_texts:
    #Every question mark closes one question, so each "?" yields one entry in the questions list
    for i, character in enumerate(text):
        if character != "?":
            continue
        #Position of the closest delimiter before the question mark. rfind returns -1 for a delimiter that
        #does not occur, so max() falls back to index 0 (start of the text) when none of them is found.
        #This also covers a "?" at index 0, which used to be scanned with negative indexes.
        question_start = max(0, *(text.rfind(delimiter, 0, i) for delimiter in question_delimiters))
        #The question goes from that position (delimiter included) up to and including the "?"
        question = text[question_start:i + 1]
        #Special case: the question has a period in the middle, so only its second half was captured
        if question == ". O que devo fazer?":
            #Look backwards for the closest text_complement ("Fui vítima de") and restart the question
            #there; when it is not present the short form is kept instead of searching forever
            complement_start = text.rfind(text_complement, 0, question_start)
            if complement_start != -1:
                question = text[complement_start:i + 1]
        #Store the question
        questions.append(question)

# Cleaning the Data in the Questions List

In [50]:
#Drop every entry that is exactly 'O que é o CREI?'. According to the original note, that question has two
#question marks in the source text, so the extraction step stores this fragment as an extra entry.
#A new list is built instead of deleting while iterating, which skipped the element after each deletion.
questions = [question for question in questions if question != 'O que é o CREI?']
#Remove the line breaks from the questions (they are replaced with an empty string)
questions = [question.replace("\n", "") for question in questions]

# Extracting Answers from the Text

In [51]:
#Search position: index of the text being scanned in processed_texts and the offset inside it from which
#the next question is looked for. The questions list was filled in the order of processed_texts, so the
#search only moves forward.
page_index = 0
search_from = 0

#Exactly one answer is stored per question, so both lists keep the same size
for count, current_question in enumerate(questions):
    #Look for the question in the current text (from the current offset) and then in the following texts
    question_position = -1
    for candidate_index in range(page_index, len(processed_texts)):
        offset = search_from if candidate_index == page_index else 0
        question_position = processed_texts[candidate_index].find(current_question, offset)
        if question_position != -1:
            page_index = candidate_index
            break
    if question_position == -1:
        #The question could not be located: store an empty answer instead of looping forever
        answers.append("")
        continue

    text = processed_texts[page_index]
    #The answer starts right after the last character of the question, i.e. after the "?"
    response_start = question_position + len(current_question)
    #The answer ends where the next question starts; for the last question of a text (and for the last
    #question overall) it runs until the end of that same text
    response_end = -1
    if count + 1 < len(questions):
        next_question = questions[count + 1]
        #A question with two question marks is stored both as a fragment and as the full question, and
        #both start at the same position; the lookup therefore begins at the question itself, which gives
        #the fragment an empty answer. Identical consecutive entries are looked up after the current one.
        search_from = response_start if next_question == current_question else question_position
        response_end = text.find(next_question, search_from)
    if response_end == -1:
        response_end = len(text)
    #strip() removes the separator whitespace around the answer; the previous fixed +1/-1 offsets could
    #drop a real character of the answer
    answers.append(text[response_start:response_end].strip())

# Cleaning the Data in the Answers List

In [52]:
#Remove every line break and every "– " marker from the answers (both are replaced with an empty string)
answers = [answer.replace("\n", "").replace("– ", "") for answer in answers]

# Storing Questions and Answers in a Single List

In [53]:
#Build the all_data list with one record per question, pairing it with the answer stored at the same
#position (both lists are expected to have the same size)
all_data = [
    {'question': question_text, 'answer': answers[position]}
    for position, question_text in enumerate(questions)
]

# Creating a .csv File Containing Questions and Answers

In [59]:
#Build a dataframe object from the records in the all_data list
data_df = pd.DataFrame(data=all_data)

#Write the questions and answers to the rag_doc.csv file, leaving out the index column
data_df.to_csv(path_or_buf='rag_doc.csv', index=False)