In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%run helper_functions.ipynb
from variables import *

# Build `all_file_mapping` once, up front: it is passed as the second argument to
# extract_document_objects() in later cells. Neither process_md_files nor
# DIRECTORY_PATH is defined in this notebook -- presumably they are supplied by the
# `%run helper_functions.ipynb` / `from variables import *` lines above (TODO confirm).
all_file_mapping = process_md_files(DIRECTORY_PATH)

In [None]:
def get_databricks_data(cluster_id: str, query: str, profile: str) -> pd.DataFrame:
    """Run a SQL query through a Databricks-backed Spark session and return the result.

    Args:
        cluster_id: Cluster identifier forwarded to ``Config``.
        query: SQL text handed to ``spark.sql``.
        profile: Profile name forwarded to ``Config``.

    Returns:
        Whatever ``spark.sql(query)`` returns.
        NOTE(review): the annotation says ``pd.DataFrame``, but the only
        consumer visible in this notebook calls ``.toPandas()`` on the result,
        so this is presumably a Spark DataFrame -- confirm and correct the
        annotation.
    """
    # Both settings go straight into the connection config; nothing is read
    # from (or written to) notebook-level constants inside this function.
    config = Config(profile=profile, cluster_id=cluster_id)
    spark = SparkSession.builder.sdkConfig(config).getOrCreate()

    return spark.sql(query)
    

In [None]:
def filter_and_save_databricks_data(spark_df: pd.DataFrame, filter_id_list: list, save_df:bool, save_path:str) -> None:
    """Filter the query result down to usable rows and optionally write it to CSV.

    Steps, in order:
      1. Convert ``spark_df`` with ``.toPandas()``.
      2. Drop rows whose ``id`` is in ``filter_id_list``.
      3. Number rows within each distinct ``text`` value (``rownum``, starting
         at 1) and cast the two flag columns to ``bool``.
      4. Drop every ``id`` that has a DevRel reply with ``rownum > 1``.
      5. Keep only rows that are a DevRel reply or carry a DevRel arrow-up.

    Args:
        spark_df: Object exposing ``.toPandas()``. NOTE(review): annotated as
            ``pd.DataFrame`` but used as a Spark DataFrame -- confirm.
        filter_id_list: ``id`` values to exclude; ``None`` excludes nothing.
        save_df: When true, write the result with ``DataFrame.to_csv``.
        save_path: Destination handed to ``to_csv`` (the index is written too).

    Returns:
        None. With ``save_df`` false the filtered frame is discarded.

    NOTE(review): ``astype(bool)`` turns NaN into ``True``; confirm the flag
    columns never contain missing values.
    """
    if filter_id_list is None:
        filter_id_list = []

    df = spark_df.toPandas()

    # .assign returns a new frame, so the derived columns are never written
    # onto a slice of `df` (the pattern that triggers SettingWithCopyWarning).
    filtered_df = df[~df['id'].isin(filter_id_list)].assign(
        rownum=lambda frame: frame.groupby('text').cumcount() + 1,
        reply_is_devrel=lambda frame: frame['reply_is_devrel'].astype(bool),
        reply_has_devrel_arrow_up=lambda frame: frame['reply_has_devrel_arrow_up'].astype(bool),
    )

    # A DevRel reply beyond the first row for a text means the thread took
    # several rounds to resolve. The authors treat those as too complicated to
    # feed to the LLM, so the whole id is dropped, not just the extra replies.
    is_repeat_devrel_reply = filtered_df['reply_is_devrel'] & (filtered_df['rownum'] > 1)
    dev_rel_responded_multiple_times_id = filtered_df.loc[is_repeat_devrel_reply, 'id'].unique()
    filtered_more_df = filtered_df[~filtered_df['id'].isin(dev_rel_responded_multiple_times_id)]

    # Rows that are neither a DevRel reply nor arrow-upped by DevRel are
    # community messages without endorsement; discard them.
    is_endorsed = filtered_more_df['reply_is_devrel'] | filtered_more_df['reply_has_devrel_arrow_up']
    final_df = filtered_more_df[is_endorsed]

    if save_df:
        final_df.to_csv(save_path)
    

In [None]:
def create_new_vectorstore(path, persist_directory: str = "./chroma_db") -> None:
    """Build a Chroma vector store from the documents extracted from ``path``.

    Args:
        path: Forwarded to ``extract_document_objects`` together with the
            notebook-level ``all_file_mapping``.
        persist_directory: Directory handed to ``Chroma.from_documents``.
            Defaults to ``"./chroma_db"``, the same location the app cell
            re-opens with ``Chroma(persist_directory="./chroma_db", ...)``.

    Returns:
        None. The store is created for the effect of ``Chroma.from_documents``
        only; it is deliberately not bound to a name here, because a local
        called ``vectorstore`` would not update the global ``vectorstore``
        that the click handlers read.
    """
    document_objects = extract_document_objects(path, all_file_mapping)
    Chroma.from_documents(
        documents=document_objects,
        embedding=OpenAIEmbeddings(),
        persist_directory=persist_directory,
    )
    

In [None]:
def run_databricks_vectorstore_wrapper():
    """Run the full refresh: query Databricks, filter and save, rebuild the store.

    Takes no arguments. ``CLUSTER_ID``, ``query``, ``PROFILE``,
    ``filtered_id_list`` and ``DATABRICKS_SAVE_PATH`` are read from the
    notebook's global namespace. The filtered data is always saved
    (``save_df=True``) because the vector store is built from that same path.
    """
    query_result = get_databricks_data(CLUSTER_ID, query, PROFILE)

    filter_and_save_databricks_data(
        spark_df=query_result,
        filter_id_list=filtered_id_list,
        save_df=True,
        save_path=DATABRICKS_SAVE_PATH,
    )

    create_new_vectorstore(path=DATABRICKS_SAVE_PATH)
    

In [None]:
def _retrieve_for_current_inputs():
    """Read the question/threshold widgets and run the similarity search once.

    Shared by both button handlers so the retriever is configured in one place.

    Returns:
        A ``(question, retriever, retrieved_docs)`` tuple: the text of
        ``text_box``, the retriever built from the global ``vectorstore`` with
        the slider's score threshold, and the documents it returned for the
        question.
    """
    question = text_box.value
    score_threshold = float(slider.value)

    retriever = vectorstore.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={"score_threshold": score_threshold},
    )

    return question, retriever, retriever.get_relevant_documents(question)


# Define the function to handle the question submission and display the results
def on_question_submit(b):
    """Click handler: list the documents retrieved for the current question.

    Args:
        b: The clicked button, as passed by ``Button.on_click``; unused.
    """
    _question, _retriever, retrieved_docs = _retrieve_for_current_inputs()

    # Clear the previous output only after retrieval has finished, so the old
    # result stays visible while the search is running.
    output.clear_output()

    with output:
        if not retrieved_docs:
            print("No relevant documents found.")
            return

        print("Relevant Docs\n\n")
        for index, doc in enumerate(retrieved_docs, start=1):
            print(str(index) + ":", doc.page_content)
            print('\n')
            print('----------------------------------------------')
            print('\n')


# Define the function to handle the "Generate Response" button click and display the result
def generate_response(b):
    """Click handler: answer the current question with a RetrievalQA chain.

    The chain is only built when the retriever returned at least one document;
    otherwise a "no relevant documents" message is printed and no LLM is
    created.

    Args:
        b: The clicked button, as passed by ``Button.on_click``; unused.
    """
    question, retriever, retrieved_docs = _retrieve_for_current_inputs()

    # Clear the previous output
    output.clear_output()

    with output:
        if not retrieved_docs:
            print(f"No relevant documents found with the query: {question}")
            return

        # Create the ChatOpenAI model and the RetrievalQA chain
        llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
        qa_chain = RetrievalQA.from_chain_type(llm, retriever=retriever, return_source_documents=True)
        result = qa_chain({"query": question})

        # Display the result
        print(result['result'])
            

In [None]:
if 'HEROKU_APP_CONTEXT' not in os.environ:
    # No HEROKU_APP_CONTEXT in the environment: run the data refresh / vector
    # store build instead of showing the UI.
    run_databricks_vectorstore_wrapper()

else:
    # HEROKU_APP_CONTEXT is set: build and display the question-answering UI.

    # Extracted documents; used below only to show how many there are.
    document_objects = extract_document_objects(DATABRICKS_SAVE_PATH, all_file_mapping)

    # Re-open the store from the same directory create_new_vectorstore writes to.
    vectorstore = Chroma(persist_directory="./chroma_db", embedding_function=OpenAIEmbeddings())

    # --- input widgets -----------------------------------------------------
    # text_box and slider are read by the click handlers as globals.
    text_box = widgets.Text(
        description='Write your question:',
        layout=widgets.Layout(width='50%'),
        style={'description_width': 'initial'},
    )

    # Read-only field showing the document count.
    doc_display_text_box = widgets.Text(
        value=str(len(document_objects)),
        placeholder='Type something',
        description='Number of Docs in Repository:',
        disabled=True,
        style={'description_width': 'initial'},
    )

    slider = widgets.FloatSlider(
        value=0.7,
        min=0.0,
        max=1.0,
        step=0.01,
        description='Score Threshold:',
        layout=widgets.Layout(width='50%'),
        style={'description_width': 'initial'},
    )

    # --- buttons, shown side by side ---------------------------------------
    submit_button = widgets.Button(description='Source Document')
    submit_button.on_click(on_question_submit)

    generate_button = widgets.Button(description='Generate Response')
    generate_button.on_click(generate_response)

    buttons_box = widgets.HBox([submit_button, generate_button])

    # --- result area; the click handlers print into this widget --------------
    output = widgets.Output(
        layout=widgets.Layout(
            width='90%',
            height='300px',
            border='1px solid gray',
            overflow='auto',
        )
    )

    # --- page header ---------------------------------------------------------
    title_html = '<h1><b>GX DocBot (Beta)</b></h1>'
    subtitle_html = '<h5>This application is for retrieving relevant docs related to question or generating a response.</h5>'
    subtitle_html_2 = '<h5>The app utilizes our public-facing docs + Dev Rel slack Q and A data from slack community channel.</h5>'
    subtitle_html_3 = '<h6>Score Threshold determines how relevant docs are to question. If question does not meet threshold, app will not return an answer.</h6>'

    title_widget, subtitle_widget, subtitle_widget_2, subtitle_widget_3 = [
        HTML(markup)
        for markup in (title_html, subtitle_html, subtitle_html_2, subtitle_html_3)
    ]

    # Render everything top to bottom; display() shows its arguments in order.
    display(
        title_widget,
        subtitle_widget,
        subtitle_widget_2,
        subtitle_widget_3,
        HTML("<br>"),
        text_box,
        slider,
        doc_display_text_box,
        buttons_box,
        output,
    )