In [None]:
# confluence section
!pip install python-dotenv requests langchain-community
!pip install langchain
!pip install atlassian-python-api
!pip install lxml
!pip install tiktoken
!pip install pandas
!pip install boto3
!pip install markdownify

# Github section

!pip install os-sys
!pip install GitPython
!pip install langchain
!pip install langchain_community
!pip install PyGithub
!pip install base64

# Getting all confluence data into a dataframe

In [1]:
import os
import sys
import pandas as pd
from dotenv import load_dotenv, find_dotenv
from langchain.document_loaders import ConfluenceLoader
from langchain.text_splitter import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
# BeautifulSoup is needed for HTML parsing; re-raise with an actionable message if it is missing
try:
    from bs4 import BeautifulSoup
except ImportError as e:
    raise ImportError(
        "BeautifulSoup (bs4) could not be imported. "
        "Install it with `pip install beautifulsoup4 lxml` and restart the kernel."
    ) from e


# Environment variable configuration
sys.path.append('../')
# Populate os.environ from the nearest .env file, if one is found
load_dotenv(find_dotenv())

# Confluence configuration (all values come from the environment / .env file)
CONFLUENCE_URL = os.getenv("CONFLUENCE_URL")
CONFLUENCE_API_KEY = os.getenv("CONFLUENCE_API_KEY")
CONFLUENCE_USERNAME = os.getenv("CONFLUENCE_USERNAME")
CONFLUENCE_SPACE_KEY = os.getenv("CONFLUENCE_SPACE_KEY")

# Fail fast with a readable message: the URL and the space key are used unconditionally
# below, and os.getenv silently returns None when a variable is not set.
_missing_confluence_settings = [
    name
    for name, value in (
        ("CONFLUENCE_URL", CONFLUENCE_URL),
        ("CONFLUENCE_SPACE_KEY", CONFLUENCE_SPACE_KEY),
    )
    if not value
]
if _missing_confluence_settings:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_confluence_settings)
        + ". Define them in your .env file or shell before running this cell."
    )

# Load documents from Confluence
loader = ConfluenceLoader(
    url=CONFLUENCE_URL,
    username=CONFLUENCE_USERNAME,
    api_key=CONFLUENCE_API_KEY
)
docs = loader.load(
    space_key=CONFLUENCE_SPACE_KEY,
    # limit=1,
    # max_pages=5,
    keep_markdown_format=True
)

# Split documents based on Markdown headers
def split_markdown_documents(docs):
    """Split documents on Markdown headers, then into size-bounded chunks.

    Args:
        docs: Iterable of documents exposing ``page_content`` (Markdown text)
            and ``metadata`` (dict).

    Returns:
        The list returned by ``RecursiveCharacterTextSplitter.split_documents``.
        Every chunk carries the header metadata of its section merged with the
        metadata of the document it came from.
    """
    # Header levels to split on, and the metadata key each level is stored under
    headers_to_split_on = [
        ("#", "Title 1"),
        ("##", "Subtitle 1"),
        ("###", "Subtitle 2"),
    ]
    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

    # Split based on markdown headers and re-attach the original document metadata
    md_docs = []
    for doc in docs:
        for section in markdown_splitter.split_text(doc.page_content):
            # On a key collision the source document's metadata wins (right operand of `|`)
            section.metadata = section.metadata | doc.metadata
            md_docs.append(section)

    # The separators are written as regular expressions (raw strings, a lookbehind),
    # so the splitter must be told to interpret them as regex. Without
    # is_separator_regex=True they are escaped and matched as literal text, meaning
    # newlines and sentence ends would never be used as split points.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=512,
        chunk_overlap=20,
        separators=[r"\n\n", r"\n", r"(?<=\. )", " ", ""],
        is_separator_regex=True,
    )

    return splitter.split_documents(md_docs)


# Chunk every Confluence document held in `docs`; `texts` is the list of resulting chunks
texts = split_markdown_documents(docs)

# print first 10 chunks
def pretty_print(chunks, limit=10):
    """Print the content and metadata of the first ``limit`` chunks.

    Args:
        chunks: Sliceable sequence of objects exposing ``page_content`` and ``metadata``.
        limit: Maximum number of chunks to print (default 10).
    """
    divider = '=' * 50
    for position, chunk in enumerate(chunks[:limit], start=1):
        print(f"Chunk {position} Content:\n{chunk.page_content}\n---\nMetadata:\n{chunk.metadata}\n{divider}\n")

# pretty_print(texts)

def create_dataframe(texts):
    """Build the Confluence DataFrame from a sequence of chunks.

    Args:
        texts: Sequence of objects exposing ``page_content`` and ``metadata``.

    Returns:
        pandas.DataFrame with columns ``source`` (always ``"confluence"``),
        ``page_content`` and ``metadata``, one row per chunk.
    """
    page_contents = []
    metadata_records = []
    for chunk in texts:
        page_contents.append(chunk.page_content)
        metadata_records.append(chunk.metadata)

    return pd.DataFrame(
        {
            "source": ["confluence"] * len(texts),
            "page_content": page_contents,
            "metadata": metadata_records,
        }
    )

# Output directory shared by every CSV this notebook writes
directory_path = "../sources_data"

# exist_ok=True creates the directory only when it is absent, which keeps the cell safe to
# re-run and avoids the check-then-create gap of a separate os.path.exists() test.
os.makedirs(directory_path, exist_ok=True)



# Assuming texts is already populated from the split_markdown_documents function
df_confluence = create_dataframe(texts)
file_path = os.path.join(directory_path, "df_confluence.csv")
df_confluence.to_csv(file_path)


# Webpage Ingestion into a dataframe

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time
import re

def is_valid_url(url, base_url, excluded_extensions=('.png', '.jpg', '.jpeg', '.svg', '.pdf')):
    """Check that a URL is crawlable: http(s), same host as ``base_url``, not a media file.

    Args:
        url: Candidate absolute URL.
        base_url: URL whose host the candidate must share.
        excluded_extensions: Path suffixes that mark a URL as a non-HTML asset.
            Matching is case-insensitive, so ``.PNG`` is rejected like ``.png``.

    Returns:
        bool: True when the URL should be crawled, False otherwise.
    """
    parsed = urlparse(url)
    if parsed.scheme not in ('http', 'https'):
        return False
    # Host names are case-insensitive, so compare them lower-cased
    if urlparse(base_url).netloc.lower() != parsed.netloc.lower():
        return False
    blocked_suffixes = tuple(ext.lower() for ext in excluded_extensions)
    return not parsed.path.lower().endswith(blocked_suffixes)

def crawl_and_extract_text(url, depth=3, delay=1.0, timeout=10.0):
    """Crawl a website breadth-first to a given depth and extract each page's text.

    Args:
        url: Start URL; also used as the base when validating discovered links.
        depth: Maximum link distance from the start URL (0 fetches only the start page).
        delay: Seconds to sleep after every attempted request.
        timeout: Seconds to wait on each HTTP request before giving up. Without a
            timeout a single unresponsive server would stall the crawl indefinitely.

    Returns:
        list[dict]: One ``{'page_content': str, 'metadata': {'url': str}}`` entry per
        successfully fetched page, in visiting order. Pages that fail to download are
        reported on stdout and skipped.
    """
    from collections import deque
    from urllib.parse import urldefrag

    # Fragments (#section) address the same document, so strip them to avoid
    # fetching and storing the same page several times.
    start_url = urldefrag(url)[0]
    seen = {start_url}  # every URL already queued or fetched
    pages_to_visit = deque([(start_url, 0)])
    all_texts = []

    while pages_to_visit:
        current_page, current_depth = pages_to_visit.popleft()
        if current_depth > depth:
            continue
        try:
            response = requests.get(current_page, timeout=timeout)
            response.raise_for_status()  # Ensure successful response
            soup = BeautifulSoup(response.text, 'html.parser')
            text = soup.get_text(separator=' ', strip=True)
            all_texts.append({'page_content': text, 'metadata': {'url': current_page}})
            if current_depth < depth:
                for link in soup.find_all('a', href=True):
                    full_url = urldefrag(urljoin(current_page, link['href']))[0]
                    # The queue is FIFO, so the first time a URL is seen is also its
                    # shallowest depth; later sightings can be dropped safely.
                    if full_url not in seen and is_valid_url(full_url, url):
                        seen.add(full_url)
                        pages_to_visit.append((full_url, current_depth + 1))
        except requests.RequestException as e:
            print(f"Failed to retrieve {current_page}: {e}")
        time.sleep(delay)  # Delay between requests

    return all_texts

class TextSplitter:
    """Group the sentences of a text into chunks of roughly ``chunk_size`` characters."""

    # Split on whitespace that follows '.', '?' or '!', except after patterns that look
    # like abbreviations ("e.g.", "Mr.").
    _SENTENCE_BOUNDARY = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s')

    def __init__(self, chunk_size=1000):
        """Store the target chunk size in characters (default 1000)."""
        self.chunk_size = chunk_size

    def split_text(self, text):
        """Split ``text`` into sentence-aligned chunks.

        Sentences are accumulated until adding the next one would exceed
        ``chunk_size``. A single sentence longer than ``chunk_size`` is kept whole
        and therefore produces an oversized chunk.

        Args:
            text: The text to split.

        Returns:
            list[str]: Non-empty, whitespace-stripped chunks. Empty or
            whitespace-only input yields an empty list.
        """
        chunks = []
        current_chunk = ""

        for sentence in self._SENTENCE_BOUNDARY.split(text):
            if len(current_chunk) + len(sentence) <= self.chunk_size:
                current_chunk += sentence + " "
            else:
                self._flush(current_chunk, chunks)
                current_chunk = sentence + " "
        self._flush(current_chunk, chunks)

        return chunks

    @staticmethod
    def _flush(pending, chunks):
        """Append ``pending`` to ``chunks`` unless it is empty once stripped."""
        stripped = pending.strip()
        # Skipping blanks avoids the empty chunk that would otherwise be emitted for
        # empty input, or ahead of a first sentence that is longer than chunk_size.
        if stripped:
            chunks.append(stripped)

def process_texts(all_texts):
    """Chunk every crawled page and flatten the result into one list.

    Args:
        all_texts: Iterable of dicts with a ``'page_content'`` string and a
            ``'metadata'`` dict containing ``'url'``.

    Returns:
        list[dict]: One ``{'content': chunk, 'url': page_url}`` entry per chunk,
        produced with a default-configured ``TextSplitter``.
    """
    chunker = TextSplitter()
    return [
        {'content': piece, 'url': page['metadata']['url']}
        for page in all_texts
        for piece in chunker.split_text(page['page_content'])
    ]

# Usage: crawl the site starting at base_url, following links up to 3 levels deep,
# then split every fetched page into chunks.
base_url = 'https://www.webconnex.com/'
# crawl_and_extract_text sleeps `delay` seconds (default 1.0) after each request,
# so this call can take a while on a large site.
all_texts = crawl_and_extract_text(base_url, depth=3)
web_processed_texts = process_texts(all_texts)

def create_web_dataframe(all_texts):
    """Build the web DataFrame from processed text chunks.

    Args:
        all_texts: Sequence of dicts with ``'content'`` and ``'url'`` keys.

    Returns:
        pandas.DataFrame with columns ``source`` (always ``'web'``),
        ``page_content`` and ``metadata`` (a ``{'url': ...}`` dict per row).
    """
    page_contents = []
    url_records = []
    for entry in all_texts:
        page_contents.append(entry['content'])
    for entry in all_texts:
        url_records.append({'url': entry['url']})

    return pd.DataFrame(
        {
            'source': ['web'] * len(all_texts),
            'page_content': page_contents,
            'metadata': url_records,
        }
    )

# Build the DataFrame from the processed web chunks and write it to df_web.csv.
# NOTE: `pd`, `os` and `directory_path` are not defined in this cell; they come from the
# Confluence cell earlier in the notebook, which therefore has to be run first.
web_df = create_web_dataframe(web_processed_texts)
file_path = os.path.join(directory_path, "df_web.csv")
web_df.to_csv(file_path)

# Github ingestion into a dataframe

Set `repo_name` below to the GitHub repository you want to query, in `owner/repo` format.

In [None]:
# GitHub repository to ingest, written as "owner/repo"; it is passed to get_repo() and
# split on "/" to build the local download directory.
repo_name = "webconnex/data-pipeline-sql"

In [None]:
import os
# from git import Repo
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers import LanguageParser
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter
from github import Github
import base64
# Use the current working directory as the root for downloaded files
# (despite its name, `home_dir` is os.getcwd(), not the user's home directory)
home_dir = os.getcwd()
# Create the GitHub API client and look up the repository.
# Nothing is cloned here: files are fetched one by one through the contents API below.
# os.getenv returns None when GITHUB_API_KEY is not set, and that None is passed on as is.
GITHUB_API_KEY = os.getenv("GITHUB_API_KEY")
g = Github(GITHUB_API_KEY)
repo = g.get_repo(repo_name)


# Create a local directory for the repository: <cwd>/<owner>/<repo>
repo_dir = os.path.join(home_dir, *repo_name.split('/'))
os.makedirs(repo_dir, exist_ok=True)

# List the entries at the repository root; sub-directories are expanded by the loop further down
contents = repo.get_contents("")

def download_file(file_content):
    """Write one repository file below ``repo_dir``, mirroring its path in the repo.

    Files whose reported size exceeds 1,000,000 bytes are skipped, as are entries whose
    encoding is not ``'base64'``. Every outcome is reported on stdout; any exception
    raised while decoding or writing is caught and printed rather than propagated.

    Args:
        file_content: Object exposing ``size``, ``path``, ``encoding`` and ``content``.
    """
    if file_content.size > 1000000:  # Larger than 1MB: not downloaded
        print(f"File {file_content.path} is too large for direct API download.")
        return

    try:
        if file_content.encoding != 'base64':
            print(f"Skipped {file_content.path} due to unsupported encoding or empty content")
            return

        payload = base64.b64decode(file_content.content)
        destination = os.path.join(repo_dir, file_content.path)
        # Recreate the file's parent folders locally before writing
        os.makedirs(os.path.dirname(destination), exist_ok=True)

        with open(destination, 'wb') as handle:
            handle.write(payload)
        print(f"Downloaded {file_content.path}")
    except Exception as e:
        print(f"Failed to download {file_content.path}: {str(e)}")

# Walk the repository tree. `contents` is used as a queue: entries are taken from the
# front, and the children of each directory are appended to the back.
while contents:
    file_content = contents.pop(0)
    if file_content.type == "dir":
        # Expand the directory; its entries are handled in later iterations
        contents.extend(repo.get_contents(file_content.path))
    elif file_content.type == "file":
        download_file(file_content)
    elif file_content.type == "symlink":
        print(f"Skipped symlink {file_content.path}")
    else:
        print(f"Skipped {file_content.path} due to unsupported file type or content")

# Load supported programming languages using LanguageParser
supported_languages = [Language.PYTHON, Language.JS, Language.JAVA, Language.GO, Language.CPP]

# File suffixes handled by each language. Each language must only load its own files:
# passing every suffix to every language-specific parser would load each file once per
# language (5x duplicates) and tag it with languages it is not written in.
language_suffixes = {
    Language.PYTHON: [".py"],
    Language.JS: [".js"],
    Language.JAVA: [".java"],
    Language.GO: [".go"],
    Language.CPP: [".cpp", ".c", ".cc", ".cxx", ".h", ".hpp"],
}
supported_documents = []

for lang in supported_languages:
    loader = GenericLoader.from_filesystem(
        repo_dir,
        glob="**/*",
        suffixes=language_suffixes[lang],
        parser=LanguageParser(language=lang, parser_threshold=500),
    )
    supported_documents.extend(loader.load())

# Load unsupported file types using a generic parsing algorithm
unsupported_loader = GenericLoader.from_filesystem(
    repo_dir,
    glob="**/*",
    suffixes=[".sql", ".yml", ".txt", ".dockerfile"],
    parser=LanguageParser(parser_threshold=500)
)
unsupported_documents = []
try:
    unsupported_documents = unsupported_loader.load()
except ValueError as e:
    # Keep going with the language-specific documents only
    print(f"Error loading documents: {str(e)}")

# Combine all documents
documents = supported_documents + unsupported_documents

# Split the documents into chunks, using a language-aware splitter where possible
supported_texts = []
for lang in supported_languages:
    # .get() tolerates documents that carry no 'language' entry in their metadata
    lang_documents = [doc for doc in supported_documents if doc.metadata.get('language') == lang]
    splitter = RecursiveCharacterTextSplitter.from_language(
        language=lang, chunk_size=2000, chunk_overlap=200
    )
    lang_texts = splitter.split_documents(lang_documents)
    supported_texts.extend(lang_texts)

generic_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
unsupported_texts = generic_splitter.split_documents(unsupported_documents)

# NOTE: this rebinds `texts`, which earlier in the notebook held the Confluence chunks
texts = supported_texts + unsupported_texts

def create_github_dataframe(texts):
    """Convert GitHub code chunks into a DataFrame with the standard structure.

    Args:
        texts: Sequence of chunk objects exposing ``page_content`` (str) and
            ``metadata`` (dict). The file location is read from the ``'path'``
            metadata key when present and from ``'source'`` otherwise; ``'language'``
            is optional.

    Returns:
        pandas.DataFrame with columns ``source`` (always ``'github_platform_code'``),
        ``page_content`` and ``metadata`` (``{'file_path': ..., 'language': ...}``).
    """
    data = {
        'source': ['github_platform_code'] * len(texts),
        # The chunks produced by the splitters store their text under `page_content`
        'page_content': [text.page_content for text in texts],
        'metadata': [
            {
                # The loaders used in this notebook record the file location under
                # 'source'; 'path' is still honoured first when a caller provides it.
                'file_path': text.metadata.get('path', text.metadata.get('source')),
                'language': text.metadata.get('language'),
            }
            for text in texts
        ],
    }

    # Create the DataFrame
    return pd.DataFrame(data)

# Build the DataFrame of code chunks and write it to github_code.csv.
# NOTE: `pd` and `directory_path` are not defined in this cell; they come from the
# Confluence cell earlier in the notebook.
github_df = create_github_dataframe(texts)
file_path = os.path.join(directory_path, "github_code.csv")
github_df.to_csv(file_path)
# `documents` counts loaded files/segments, `texts` counts the chunks after splitting
print(f"Total number of documents: {len(documents)}")
print(f"Total number of chunks: {len(texts)}")

That covers the repository's code. Next, we collect the remaining repository information: branches, pull requests, and commits.

In [None]:
import pickle

def collect_github_data(repo, base_branch='main'):
    """Collect branches, pull requests and commits of a repository as text records.

    Args:
        repo: Repository object exposing ``full_name``, ``get_branches()``,
            ``get_pulls(state=, sort=, base=)`` and ``get_commits(sha=)``.
        base_branch: Branch whose pull requests (as merge target) and commit history
            are collected. Defaults to ``'main'``; pass another name for repositories
            that use a different main branch.

    Returns:
        list[dict]: Records with keys ``'content'`` (str), ``'type'``
        (``'branch'``, ``'pull request'`` or ``'commit'``) and ``'repository'``,
        ordered branches first, then pull requests, then commits.
    """
    # Label records with the repository that was actually passed in, rather than with a
    # module-level variable that may refer to a different repository.
    repository = repo.full_name
    github_docs = []

    # Collect branches
    for branch in repo.get_branches():
        github_docs.append({
            'content': f"Branch: {branch.name}",
            'type': 'branch',
            'repository': repository
        })

    # Collect pull requests in every state that target the base branch
    for pr in repo.get_pulls(state='all', sort='created', base=base_branch):
        github_docs.append({
            'content': f"PR #{pr.number}: {pr.title} by {pr.user.login}",
            'type': 'pull request',
            'repository': repository
        })

    # Collect commits reachable from the base branch, including per-file change stats
    for commit in repo.get_commits(sha=base_branch):
        lines = [
            f"Commit {commit.sha}",
            f"Author: {commit.commit.author.name}",
            f"Date: {commit.commit.author.date}",
            f"Message: {commit.commit.message}",
        ]
        for changed in commit.files:
            lines.append(f"{changed.filename} +{changed.additions} -{changed.deletions} changes")
            if changed.patch:  # falsy patches are omitted
                lines.append(f"Patch:\n{changed.patch}")
        github_docs.append({
            # Every line, including the last, is newline-terminated
            'content': "\n".join(lines) + "\n",
            'type': 'commit',
            'repository': repository
        })

    return github_docs

def split_github_documents(github_docs, chunk_size=512):
    """Cut each GitHub record into fixed-size character chunks.

    Args:
        github_docs: Iterable of dicts with ``'content'``, ``'type'`` and
            ``'repository'`` keys.
        chunk_size: Maximum number of characters per chunk (default 512).

    Returns:
        list[dict]: One ``{'page_content': str, 'metadata': {'type', 'repository'}}``
        entry per chunk, in input order. Records with empty content produce no chunk.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    splitted_docs = []
    for doc in github_docs:
        content = doc['content']
        for start in range(0, len(content), chunk_size):
            splitted_docs.append({
                'page_content': content[start:start + chunk_size],
                # A fresh dict per chunk, so editing one chunk's metadata cannot affect another
                'metadata': {
                    'type': doc['type'],
                    'repository': doc['repository']
                }
            })
    return splitted_docs

def save_texts(texts, filename='github_data.pkl'):
    """Pickle ``texts`` to ``filename`` and print a confirmation.

    Args:
        texts: Any picklable object.
        filename: Destination path (default ``'github_data.pkl'``); an existing file
            is overwritten.
    """
    with open(filename, mode='wb') as sink:
        pickle.dump(texts, sink)
    print(f"Data saved to {filename}")

# Usage: collect the repository information, then cut every record into chunks.
# collect_github_data reads `commit.files` for every commit on the branch, so this call
# can be slow on repositories with a long history.
github_docs = collect_github_data(repo)
splitted_texts = split_github_documents(github_docs)
# Optional: persist the chunks with pickle
# save_texts(splitted_texts)

def create_github_dataframe(github_docs):
    """Convert GitHub repository-info records into a DataFrame with the standard structure.

    Two record shapes are accepted:

    * raw records: ``{'content', 'type', 'repository'}``
    * chunked records: ``{'page_content', 'metadata': {'type', 'repository'}}``

    This notebook calls the function with chunked records, which have no ``'content'``
    key, so both shapes must be supported.

    NOTE: this definition reuses the name of the code-chunk variant defined earlier in
    the notebook and replaces it once this cell has run.

    Args:
        github_docs: Sequence of record dicts in either shape described above.

    Returns:
        pandas.DataFrame with columns ``source`` (always ``'github_platform_info'``),
        ``page_content`` and ``metadata`` (``{'type': ..., 'repository': ...}``).
    """
    page_contents = []
    metadata_records = []
    for doc in github_docs:
        if 'page_content' in doc:
            # Chunked shape: text and metadata are already separated
            page_contents.append(doc['page_content'])
            details = doc['metadata']
        else:
            # Raw shape: type/repository sit next to the content
            page_contents.append(doc['content'])
            details = doc
        metadata_records.append({'type': details['type'], 'repository': details['repository']})

    data = {
        'source': ['github_platform_info'] * len(page_contents),
        'page_content': page_contents,
        'metadata': metadata_records,
    }

    # Create the DataFrame
    return pd.DataFrame(data)

# Process and create DataFrame
# `splitted_texts` was already built from collect_github_data(repo) earlier in this cell.
# Collecting again here would repeat every GitHub API request (one per commit for the
# file list) without its result being used, so the chunks are reused directly.
github_df = create_github_dataframe(splitted_texts)
file_path = os.path.join(directory_path, "github_info.csv")
github_df.to_csv(file_path)