In [None]:
import time
from datetime import datetime
import os
from ipywidgets import Layout
import pandas as pd
import ipywidgets as widgets
from IPython.display import display, clear_output, HTML
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.schema import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from bs4 import BeautifulSoup
import markdown
import re
from databricks.connect import DatabricksSession as SparkSession
from databricks.sdk.core import Config
import pandas as pd
import pathlib
import boto3
from github import Github
from github import Auth


def call_and_save_github_api(
    access_token: str,
    repo_name: str = "great-expectations/great_expectations",
    docs_path: str = "docs",
) -> dict:
    """Collect the plain text of every Markdown file under a GitHub repo path.

    Walks ``docs_path`` of ``repo_name`` through the GitHub API, descending
    into sub-directories, and runs each ``.md`` file through
    ``process_md_file``. Despite the name, nothing is written to disk; the
    result is only returned.

    Args:
        access_token: GitHub access token used to authenticate the API calls.
        repo_name: ``owner/name`` of the repository to read.
        docs_path: Path inside the repository where the walk starts.

    Returns:
        dict mapping each document title to its plain text. Files that fail
        to download/parse, or for which ``process_md_file`` yields no result,
        are skipped and reported via ``print``. If two files share a title,
        the one visited last overwrites the earlier one.
    """
    # Named to avoid shadowing the builtin ``map``.
    title_to_text = {}
    skipped = []

    # Authenticate with the access token against the public GitHub API.
    auth = Auth.Token(access_token)
    g = Github(auth=auth)
    repo = g.get_repo(repo_name)

    contents = repo.get_contents(docs_path)
    # get_contents returns a single item (not a list) when the path is a file.
    if not isinstance(contents, list):
        contents = [contents]

    while contents:
        file_content = contents.pop(0)

        if file_content.type == "dir":
            contents.extend(repo.get_contents(file_content.path))
            continue

        if not file_content.name.endswith('.md'):
            continue

        # Best effort: one bad file must not abort the whole crawl, but the
        # failure is recorded instead of being swallowed silently.
        try:
            raw_text = file_content.decoded_content.decode('utf-8')
            processed = process_md_file(raw_text)
        except Exception as e:
            skipped.append(f"{file_content.path}: {type(e).__name__}: {e}")
            continue

        if processed is None:
            skipped.append(f"{file_content.path}: no result from process_md_file")
            continue

        text, text_title = processed
        title_to_text[text_title] = text

    if skipped:
        print(f"Skipped {len(skipped)} Markdown file(s):")
        for reason in skipped:
            print(f"  - {reason}")

    return title_to_text
        

def process_md_file(content: str):
    """Turn a Markdown document into plain text and extract its ``title:`` line.

    Args:
        content: Full text of the Markdown file.

    Returns:
        A ``(text, title)`` tuple, where ``text`` is the rendered document with
        markup removed and newlines replaced by spaces, and ``title`` is the
        stripped value of the first line matching ``title: ...``.
        Returns ``None`` when the document has no ``title:`` line.
    """
    # The title is mandatory for a result, so look for it first and skip the
    # Markdown/HTML conversion entirely for untitled documents.
    title_match = re.search(r'^title:\s*(.*)$', content, re.MULTILINE)
    if title_match is None:
        return None
    title = title_match.group(1).strip()

    html = markdown.markdown(content)
    soup = BeautifulSoup(html, 'html.parser')

    # Replace every <TechnicalTag> element with the value of its ``text``
    # attribute (the parser exposes the tag name in lowercase).
    for tag in soup.find_all('technicaltag'):
        replacement = tag.get('text')
        if replacement is None:
            # No ``text`` attribute: keep whatever text the tag encloses
            # rather than raising KeyError and losing the whole document.
            replacement = tag.get_text()
        tag.replace_with(replacement)

    text_purified = soup.get_text().replace("\n", " ")

    return text_purified, title
        

In [None]:
def save_string_to_text_file(string_to_save, file_path, encoding="utf-8"):
    """Write a string to a text file, replacing any existing content.

    Args:
        string_to_save: Text to write.
        file_path: Destination path; the file is created or truncated.
        encoding: Text encoding used for the file. Defaults to UTF-8 so the
            output does not depend on the platform's locale encoding and
            non-ASCII text can always be written.

    Returns:
        None. The outcome is reported with ``print``: a success message, or
        ``Error: ...`` when the file cannot be opened or written.
    """
    try:
        with open(file_path, 'w', encoding=encoding) as file:
            file.write(string_to_save)
    except OSError as e:
        # IOError is an alias of OSError in Python 3.
        print(f"Error: {e}")
    else:
        print(f"The string has been successfully saved to '{file_path}'.")


In [None]:

# Define the function to extract the "document_objects" object
def extract_document_objects(PATH, all_file_mapping):
    """Build the list of ``Document`` objects from Q&A rows and titled texts.

    Args:
        PATH: Location handed to ``return_df``; the resulting frame must
            provide the columns ``text`` and ``reply_text``.
        all_file_mapping: Mapping of title -> document text.

    Returns:
        A list of ``Document`` objects: first one per frame row, formatted as
        a question/answer pair, then one per mapping entry, formatted as the
        title followed by the text on the next line.
    """
    qa_frame = return_df(PATH)

    # One document per row: the question and its reply.
    qa_documents = [
        Document(page_content='Question: ' + str(question) + '\n' + 'Answer: ' + str(answer))
        for question, answer in qa_frame[['text', 'reply_text']].values
    ]

    # One document per file: the title on the first line, then the body.
    file_documents = [
        Document(page_content=heading + "\n" + body)
        for heading, body in all_file_mapping.items()
    ]

    return qa_documents + file_documents


In [None]:
def return_df(path):
    """Read the CSV file at ``path`` and return it as a pandas DataFrame."""
    frame = pd.read_csv(path)
    return frame