In [None]:
import os
import requests
import csv
import logging
import time
import openai
import sys
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
csv.field_size_limit(10**9)

In [None]:
# Central configuration for the notebook. Edit the placeholder values below
# before running the scraper / summarizer cells.
CONFIG = {
    # GitHub Configuration
    "github": {
        # Secrets should not be saved inside the notebook: the GITHUB_TOKEN
        # environment variable takes precedence, and the placeholder literal
        # is only the fallback when that variable is not set.
        "token": os.environ.get("GITHUB_TOKEN", "github_pat"),
        "repo_url": "add_repo_url",
        "exclude_folders": ['list of folders to exclude']
    },

    # API Configuration
    "api": {
        "base_url": "https://llama8b.gaia.domains/v1",
        "model_name": "llama",
        # Same precedence rule as the GitHub token: GAIA_API_KEY wins when set.
        "api_key": os.environ.get("GAIA_API_KEY", "GAIA")
    },

    # File paths
    "paths": {
        # Written by the scraper, then read by the summarizer.
        "input_csv": "path_to_input_file.csv",
        "output_csv": "path_to_summarized_output.csv"
    }
}

# Create all global variables needed by functions
GITHUB_TOKEN = CONFIG["github"]["token"]
API_BASE_URL = CONFIG["api"]["base_url"]
MODEL_NAME = CONFIG["api"]["model_name"]
API_KEY = CONFIG["api"]["api_key"]

# Parameters for GitHub scraper (its CSV output is the summarizer's input)
GITHUB_PARAMS = {
    "repo_url": CONFIG["github"]["repo_url"],
    "output_path": CONFIG["paths"]["input_csv"],
    "exclude_folders": CONFIG["github"]["exclude_folders"]
}

# Parameters for summarizer
SUMMARIZER_PARAMS = {
    "input_path": CONFIG["paths"]["input_csv"],
    "output_path": CONFIG["paths"]["output_csv"]
}

In [None]:
def get_github_contents(repo_url, timeout=30):
    """Fetch the GitHub "contents" listing for a repository URL.

    Accepted URL shapes:
        https://github.com/<user>/<repo>
        https://github.com/<user>/<repo>/tree/<branch>[/<sub/path>]

    Args:
        repo_url: Browser URL of the repository (optionally of a branch or
            a folder inside a branch). A trailing slash is ignored.
        timeout: Seconds to wait for the GitHub API before giving up.

    Returns:
        The decoded JSON body of the contents API response.

    Raises:
        ValueError: If the URL is not a github.com repository URL.
        requests.exceptions.HTTPError: If the API answers with an error status.
    """
    parts = repo_url.rstrip('/').split('/')

    if len(parts) < 5 or parts[2] != "github.com":
        raise ValueError("Invalid GitHub URL. Ensure the URL is in the format: https://github.com/user/repo/tree/branch/path")

    user = parts[3]
    repo = parts[4]

    # "tree" only marks a branch when it directly follows the repo segment and
    # is itself followed by a branch name. Checking the position (instead of
    # `"tree" in parts`) keeps a user/repo/folder that happens to be called
    # "tree" from triggering an IndexError or a wrong branch lookup.
    # NOTE(review): a branch name containing "/" cannot be told apart from a
    # sub-path here; only the first segment after "tree" is used as the ref.
    if len(parts) > 6 and parts[5] == "tree":
        branch = parts[6]
        subpath = '/'.join(parts[7:])
        api_url = f"https://api.github.com/repos/{user}/{repo}/contents/{subpath}?ref={branch}"
    else:
        api_url = f"https://api.github.com/repos/{user}/{repo}/contents/"

    headers = {
        "Authorization": f"Bearer {GITHUB_TOKEN}"
    }

    # Without a timeout a stalled connection would hang the notebook cell.
    response = requests.get(api_url, headers=headers, timeout=timeout)
    response.raise_for_status()
    return response.json()

def _collect_contents(contents, paths, parent_path, exclude_folders, headers, timeout):
    """Recursively append every downloadable file below ``contents`` to ``paths``."""
    for item in contents:
        path = parent_path + item['name']
        item_type = item['type']

        if item_type == 'dir' and item['name'] in exclude_folders:
            print(f"Skipping folder: {path}")
            continue

        print(f"Processing: {path}")

        if item_type == 'dir':
            dir_response = requests.get(item['url'], headers=headers, timeout=timeout)
            dir_response.raise_for_status()
            _collect_contents(dir_response.json(), paths, path + "/",
                              exclude_folders, headers, timeout)
        elif item_type == 'file':
            download_url = item.get('download_url')
            if not download_url:
                # Some entries are listed as files but carry no download URL
                # (the GitHub contents API documents submodules this way);
                # requesting a null URL would abort the whole scrape.
                print(f"Skipping entry without a download URL: {path}")
                continue
            file_response = requests.get(download_url, headers=headers, timeout=timeout)
            file_response.raise_for_status()
            paths.append({"Path": path, "Content": file_response.text})


def process_contents(contents, paths=None, parent_path="", exclude_folders=None, timeout=30):
    """Walk a GitHub contents listing and download every file in it.

    Args:
        contents: List of item dicts as returned by the GitHub contents API
            (each needs 'name' and 'type', plus 'url' for directories and
            'download_url' for files).
        paths: Optional list to append results to. A new list is created when
            omitted, so separate calls never share results.
        parent_path: Prefix prepended to every item name (ends with "/" for
            nested directories).
        exclude_folders: Directory names to skip, matched against the bare
            folder name at any depth. Defaults to no exclusions.
        timeout: Seconds to wait for each HTTP request.

    Returns:
        The ``paths`` list, holding one ``{"Path": ..., "Content": ...}``
        dict per downloaded file.

    Raises:
        requests.exceptions.HTTPError: If any request returns an error status.
    """
    # None sentinels instead of `[]` defaults: a mutable default is created
    # once and would keep growing every time the scraper cell is re-run.
    if paths is None:
        paths = []
    if exclude_folders is None:
        exclude_folders = []

    headers = {
        "Authorization": f"Bearer {GITHUB_TOKEN}"
    }
    _collect_contents(contents, paths, parent_path, exclude_folders, headers, timeout)

    # Reported once for the whole walk rather than after every sub-directory,
    # where the running total was misleading.
    print(f"Finished processing. Total files processed: {len(paths)}.")
    return paths

def transform_and_write_csv(data, output_csv):
    """Wrap each scraped file in a labelled text block and write it as a one-column CSV.

    Args:
        data: Iterable of dicts with a 'Path' and a 'Content' key.
        output_csv: Destination file; it is overwritten.

    Source files with a known extension become a fenced code block tagged
    ``<language>:<path>``; markdown, plain-text and all other files get a
    one-line description followed by the content between ``------`` rules.
    Extension matching is case-sensitive.
    """
    # extension -> language tag used on the opening code fence
    fence_labels = {
        '.rs': 'rust',
        '.sh': 'bash',
        '.py': 'python',
        '.js': 'javascript',
        '.json': 'json',
        '.toml': 'toml',
        '.jsx': 'jsx',
        '.css': 'css',
        '.java': 'java',
        '.hpp': 'hpp',
        '.c': 'c',
        '.yml': 'yml',
        '.xml': 'xml',
    }
    # extension -> introductory sentence for non-code documents
    prose_intros = {
        '.md': "The following is a markdown document located at",
        '.txt': "The following is a plain text file located at",
    }
    fallback_intro = "The following document is located at"

    def render(entry):
        file_path, body = entry['Path'], entry['Content']
        suffix = os.path.splitext(file_path)[1]
        label = fence_labels.get(suffix)
        if label is not None:
            return f"```{label}:{file_path}\n{body}\n```"
        intro = prose_intros.get(suffix, fallback_intro)
        return f"{intro} {file_path}\n------\n{body}\n------"

    with open(output_csv, mode='w', newline='', encoding='utf-8') as outfile:
        csv.writer(outfile).writerows([render(entry)] for entry in data)

def run_github_scraper(repo_url=GITHUB_PARAMS["repo_url"],
                       output_path=GITHUB_PARAMS["output_path"],
                       exclude_folders=GITHUB_PARAMS["exclude_folders"]):
    """Scrape a GitHub repository into a CSV file.

    Fetches the top-level listing for ``repo_url``, downloads every file
    outside ``exclude_folders`` and writes the formatted result to
    ``output_path``. Failures are reported on stdout instead of being
    raised, so the calling cell keeps running.

    The defaults are read from GITHUB_PARAMS when this function is defined.
    """
    try:
        print(f"Starting script for repository: {repo_url}")
        scraped_files = process_contents(
            get_github_contents(repo_url),
            exclude_folders=exclude_folders,
        )
        transform_and_write_csv(scraped_files, output_path)
        print(f"CSV file '{output_path}' generated successfully.")
    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP Error occurred: {http_err}")
    except Exception as err:
        print(f"An unexpected error occurred: {err}")

In [None]:
class ProcessingError(Exception):
    """Signals that processing still failed once the retries were used up."""

def create_retry_decorator():
    """Build the tenacity retry decorator used for LLM API calls.

    Policy: retry on ``openai.APIError`` / ``openai.APITimeoutError``, at
    most 2 attempts in total, with an exponential wait clamped to 4-10
    seconds between them.

    Returns:
        A decorator. The decorated function raises ``ProcessingError``
        (chained to the last API error) once the attempts are exhausted.
    """
    def log_retry(retry_state):
        # before_sleep only fires when another attempt will follow, so this
        # is purely a progress message.
        print(f"Retry attempt {retry_state.attempt_number} after {retry_state.outcome.exception()}")

    def raise_processing_error(retry_state):
        # tenacity calls retry_error_callback when the stop condition is hit.
        # Raising here replaces tenacity's own RetryError, so callers can
        # catch ProcessingError. The previous check inside before_sleep could
        # never run for the final attempt.
        raise ProcessingError("Failed to process after maximum retries") from retry_state.outcome.exception()

    return retry(
        retry=retry_if_exception_type((openai.APIError, openai.APITimeoutError)),
        stop=stop_after_attempt(2),
        wait=wait_exponential(multiplier=1, min=4, max=10),
        before_sleep=log_retry,
        retry_error_callback=raise_processing_error,
    )

@create_retry_decorator()
def make_api_call(client, messages, model):
    """Send one non-streaming chat-completion request and return the response.

    The call is wrapped by the retry policy from ``create_retry_decorator()``.

    Args:
        client: Object exposing ``chat.completions.create``.
        messages: Chat messages to send.
        model: Model name to request.
    """
    request_kwargs = {
        "messages": messages,
        "model": model,
        "stream": False,
    }
    return client.chat.completions.create(**request_kwargs)

def summarize(source_text):
    """Return the model's summary of ``source_text``.

    Builds a two-message chat (fixed PR-review system prompt, then the text
    as the user message), sends it through ``make_api_call`` using the
    configured endpoint and model, and returns the first choice's content.
    """
    llm_client = openai.OpenAI(base_url=API_BASE_URL, api_key=API_KEY)

    # The prompt's leading whitespace is part of the string sent to the model.
    system_prompt = """
            You are an AI assistant designed to review pull requests (PRs) in GitHub repositories. Your task is to:

            1. Summarize Code-related Files:
            - Focus on key changes in the code, including additions, deletions, and modifications.
            - Capture essential details such as the purpose of the code, any new functions, classes, or methods, and the overall impact of these changes on the project.
            - Highlight any dependencies, error handling, or performance implications.

            2. Summarize Markdown Files:
            - Extract key points from documentation, readme files, and other markdown content.
            - Identify sections related to project setup, usage instructions, change logs, or contributor guidelines.
            - Note updates in the documentation and the implications for users or developers.
            """

    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": source_text},
    ]
    response = make_api_call(llm_client, conversation, MODEL_NAME)
    return response.choices[0].message.content

def qgen(source_text):
    """Ask the model for questions that ``source_text`` can answer.

    The system prompt requests 10 questions, one per line, with no
    numbering or preamble. Returns the raw content of the first choice.
    """
    llm_client = openai.OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
    instructions = "Respond with a list of 10 questions. The text in the user message must contain specific answers to each question. Each question must be on its own line. Just list the questions without any introductory text or numbers."
    conversation = [
        {"role": "system", "content": instructions},
        {"role": "user", "content": source_text},
    ]
    response = make_api_call(llm_client, conversation, MODEL_NAME)
    return response.choices[0].message.content

def agen(source_text, question):
    """Answer ``question`` using ``source_text`` as the context.

    The context is appended to the fixed instructions in the system
    message; the question is sent as the user message. Returns the content
    of the first choice.
    """
    llm_client = openai.OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
    instructions = "Give a comprehensive and well-reasoned answer to the user question strictly based on the context below and try to give a detailed explanation while answering the questions. Also try to add some bonus tip to in each answer and some relevant example outside of the content.\n"
    conversation = [
        {"role": "system", "content": instructions + source_text},
        {"role": "user", "content": question},
    ]
    response = make_api_call(llm_client, conversation, MODEL_NAME)
    return response.choices[0].message.content

def process_row(row, csv_writer, processed_contents, row_count, max_chars=32000):
    """Summarize one input row and write its summary and Q&A rows.

    Args:
        row: One record from the input CSV; only the first column is used.
        csv_writer: Object with ``writerow`` receiving ``[content, text]`` rows.
        processed_contents: Set of contents already handled; updated in place
            after a successful write.
        row_count: Number of rows summarized so far.
        max_chars: Contents longer than this many characters are skipped.

    Returns:
        ``(row_count, skipped)``: ``row_count`` is incremented only when the
        row was summarized; ``skipped`` is 1 when the row failed (empty row,
        exhausted retries, or any other error) and 0 otherwise.
    """
    # NOTE(review): duplicates and over-long contents are logged as skipped
    # but reported with skipped == 0, so they are not part of the caller's
    # skip total. Kept as-is; confirm whether that is intended.
    if not row:
        # csv.reader yields [] for blank lines; report it explicitly rather
        # than through an IndexError from row[0].
        print(f"Skipping row {row_count + 1}: row is empty")
        return row_count, 1

    try:
        main_content = row[0]

        if main_content in processed_contents:
            print(f"Skipping row because content has already been processed")
            return row_count, 0

        if len(main_content) > max_chars:
            print(f"Skipping row {row_count + 1}: content exceeds {max_chars} characters")
            return row_count, 0

        summary = summarize(main_content)
        # A completion may come back without text; treat it as "no questions"
        # so the summary is still written.
        qs = qgen(main_content) or ""

        qna_list = []
        for q in qs.splitlines():
            if not q.strip():
                continue
            answer = agen(main_content, q)
            qna_list.append(f"Q: {q}\nA: {answer}")

        # Nothing is written until every model call has succeeded, so a
        # failure part-way leaves no partial output for this row.
        csv_writer.writerow([main_content, f"Summary:\n{summary}"])
        for qna in qna_list:
            csv_writer.writerow([main_content, qna])

        processed_contents.add(main_content)
        row_count += 1
        print(f"Processed row {row_count}")
        return row_count, 0

    except ProcessingError as pe:
        print(f"Skipping row {row_count + 1} due to timeout: {str(pe)}")
        return row_count, 1
    except Exception as e:
        print(f"Error processing row {row_count + 1}: {str(e)}")
        return row_count, 1

def load_processed_contents(output_path):
    """Collect the contents that already appear in an existing output CSV.

    Args:
        output_path: Path of the summarizer's output CSV. It may not exist.

    Returns:
        A set holding the first column of every non-empty row, or an empty
        set when the file does not exist.
    """
    if not os.path.exists(output_path):
        return set()

    with open(output_path, 'r', newline='', encoding='utf-8') as outfile:
        # csv.reader yields [] for blank lines; indexing those would raise
        # IndexError, so they are filtered out.
        return {row[0] for row in csv.reader(outfile) if row}

def run_summarizer(input_path=SUMMARIZER_PARAMS["input_path"],
                   output_path=SUMMARIZER_PARAMS["output_path"]):
    """Summarize every row of the input CSV, appending results to the output CSV.

    Contents already present in ``output_path`` are loaded first and passed
    to ``process_row``. The output file is opened in append mode and flushed
    after each row. A KeyboardInterrupt or any other error ends the loop
    with a message, and the totals are always printed at the end.

    The defaults are read from SUMMARIZER_PARAMS when this function is defined.
    """
    already_done = load_processed_contents(output_path)
    summarized = 0
    skipped_total = 0

    try:
        with open(input_path, 'r', newline='', encoding='utf-8') as source_file:
            with open(output_path, 'a', newline='', encoding='utf-8') as sink_file:
                writer = csv.writer(sink_file)

                for record in csv.reader(source_file):
                    summarized, newly_skipped = process_row(
                        record, writer, already_done, summarized
                    )
                    skipped_total += newly_skipped
                    # Push each row's output to the file before the next one starts.
                    sink_file.flush()

    except KeyboardInterrupt:
        print("Process interrupted by user. Progress saved.")
    except Exception as err:
        logging.error(f"Unexpected error: {str(err)}")
    finally:
        print(f"Modified data has been written to {output_path}")
        print(f"Total rows summarized: {summarized}")
        print(f"Total rows skipped: {skipped_total}")

In [None]:
# Run GitHub scraper: uses the defaults from GITHUB_PARAMS and writes the
# scraped files to the CSV at CONFIG["paths"]["input_csv"].
run_github_scraper()
# Run summarizer: uses the defaults from SUMMARIZER_PARAMS, reading that CSV
# and appending summaries / Q&A rows to CONFIG["paths"]["output_csv"].
run_summarizer()