#### Get Source Data:

In [None]:
import nest_asyncio
# Patch asyncio so an already-running event loop can be re-entered.
# NOTE(review): presumably required because the notebook kernel already runs
# a loop and the scraper used below drives asyncio itself — confirm.
nest_asyncio.apply()

In [None]:
import openai
import os
from dotenv import load_dotenv
from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info

# Load variables from a local .env file (if one exists) into the environment.
load_dotenv()

# Evaluates to None when OPENAI_API_KEY is not set; the value is forwarded
# unchanged to the scraper configuration defined later in this notebook.
openai_key = os.environ.get("OPENAI_API_KEY")

In [None]:
# LLM settings shared by every SmartScraperGraph built in this notebook.
llm_settings = {
    "api_key": openai_key,
    "model": "openai/gpt-3.5-turbo",  # specify model here
}
graph_config = {"llm": llm_settings}

In [None]:

# Build a scraper that asks the configured LLM to list every link found on
# the documentation index page given as `source` (placeholder to fill in).
smart_scraper_graph = SmartScraperGraph(
   prompt="""Extract all links from the given documentation page.
               """,
   source="-------------- Your link of the site to fetch data from ---------------",
   config=graph_config
)

# Run the scrape and show the raw answer.
# NOTE(review): the following cell reads result['links'], so the answer is
# expected to be a dict with a 'links' key — confirm on the printed output.
result = smart_scraper_graph.run()
print(result)

In [None]:
base_url = "-----------Add base link to combine with extracted links------------"

# Plain string concatenation: every extracted link is appended to base_url
# as-is, so the links are presumably site-relative paths — verify against
# the printed scrape result.
extracted_links = result['links']
full_links = [base_url + path for path in extracted_links]

# Print one combined URL per line for a quick visual check.
for combined_url in full_links:
    print(combined_url)

In [None]:
# Keep only the URLs in which the substring "https" occurs exactly once.
# NOTE(review): presumably this drops links that were already absolute before
# base_url was prepended (those would contain "https" twice) — confirm.
filtered_links = list(filter(lambda url: url.count("https") == 1, full_links))
len(filtered_links)

In [None]:
def json_to_text(data, indent=0):
    """Render nested JSON-like data as indented plain text.

    Args:
        data: A dict, a list, or any other value. Dicts and lists are walked
            recursively; anything else is formatted with its string form.
        indent: Number of leading spaces for this nesting level. Each nested
            level is indented two further spaces.

    Returns:
        A string in which every dict key appears as ``key:``, every list
        element as ``- Item N:`` (1-based), and every leaf value on its own
        line. An empty dict or list yields an empty string.
    """
    pad = ' ' * indent

    if isinstance(data, dict):
        return ''.join(
            f"{pad}{name}:\n" + json_to_text(child, indent + 2)
            for name, child in data.items()
        )

    if isinstance(data, list):
        return ''.join(
            f"{pad}- Item {position}:\n" + json_to_text(element, indent + 2)
            for position, element in enumerate(data, start=1)
        )

    # Leaf value: a single indented line.
    return f"{pad}{data}\n"

In [None]:
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

# The same extraction prompt is used for every page; only the source URL
# differs between iterations.
page_prompt = """Extract all key information including headings with paragraphs, lists, source codes and any relevant data points from the provided link.
                """

# One Document per scraped page, holding the flattened text of the answer.
docs = []

for link in filtered_links:
    smart_scraper_graph = SmartScraperGraph(
        prompt=page_prompt,
        source=link,
        config=graph_config,
    )

    # Scrape the page, then flatten the structured answer into plain text.
    scraped = smart_scraper_graph.run()
    page_text = json_to_text(scraped)
    print(page_text, '\n\n')

    docs.append(Document(page_content=page_text))
    print(f'\niterated on link {link}\n\n')

#### Splitting

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking options: size 4000 with an overlap of 200 between neighbours.
# add_start_index asks the splitter to record where each chunk starts in its
# source document (per the LangChain splitter docs — confirm if relied upon).
splitter_options = {
    "chunk_size": 4000,
    "chunk_overlap": 200,
    "add_start_index": True,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Split every scraped page into chunks and report how many were produced.
all_splits = text_splitter.split_documents(docs)

print(len(all_splits))


#### Storing in Vector Store

In [None]:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

# Embed every chunk with OpenAI embeddings and index the result in Chroma.
embeddings = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(documents=all_splits, embedding=embeddings)

#### Specifying Model

In [None]:
from langchain_openai import ChatOpenAI

# Chat model that generates the answers in the RAG chain built later in this
# notebook (the scraping steps use the separate model set in graph_config).
llm = ChatOpenAI(model="gpt-4o-mini")

#### Generate Data Using RAG:

In [None]:
# pandas is used here and again when the results are written out, but no
# cell of this notebook imported it, which raised NameError on `pd`.
import pandas as pd

# Load the prompts to answer; the generation loop reads the 'prompt' column.
df = pd.read_csv('--------- Your Data Frame Containing Prompts -----------')
df

In [None]:
def format_docs(docs):
    """Concatenate the text of several documents into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The ``page_content`` values joined by a blank line, or an empty
        string when ``docs`` is empty.
    """
    page_texts = [document.page_content for document in docs]
    return "\n\n".join(page_texts)

In [None]:
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# The retriever, prompt and chain do not depend on the current prompt, so
# they are built once up front instead of being rebuilt for every row.
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})

# The chain below feeds the prompt a dict with the keys "context" (retrieved
# chunks joined by format_docs) and "question" (the raw prompt), so the real
# template should reference {context} and {question}.
template = """
        ---------------- Your Prompt Here For Generating Data ---------------
    """
custom_rag_prompt = PromptTemplate.from_template(template)

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | custom_rag_prompt
    | llm
    | StrOutputParser()
)

# One {'prompt', 'responses'} record per input prompt.
reses = []

for question in df['prompt']:
    # The chain performs the retrieval itself; no separate retriever call is
    # needed (the previous extra call only duplicated the lookup).
    response = rag_chain.invoke(question)

    # Turn literal "\n" / "\t" escape sequences in the model output into real
    # newlines and tabs so the stored text is readable.
    readable_response = response.replace('\\n', '\n').replace('\\t', '\t')

    # Store and print the cleaned response (previously an undefined name
    # `resp` was used here, which raised NameError on the first row).
    reses.append({'prompt': question, 'responses': readable_response})

    print(readable_response)

#### Dump Generated Data into DataFrame

In [None]:
# Turn the collected prompt/response records into a table and write it to
# data.csv in the working directory, leaving out the index column.
data = pd.DataFrame(reses)
data.to_csv('data.csv', index = False)