In [None]:
import os
import re
import csv
import json
import time
import aiohttp
import logging
import asyncio
import requests
import threading
from urllib.parse import urlparse
from langchain import PromptTemplate
from langchain_openai import ChatOpenAI
from datetime import datetime, timedelta
from langchain.chains.llm import LLMChain
from dotenv import load_dotenv, find_dotenv
from langchain.tools.base import StructuredTool
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnableBranch
from langchain.schema.runnable import RunnableLambda
from langchain.memory import ConversationBufferMemory
from langchain.schema.output_parser import StrOutputParser
from langchain_core.output_parsers import JsonOutputParser
from IPython.display import Markdown, display, clear_output
from langchain_core.messages import HumanMessage, SystemMessage, AIMessage

In [None]:
# Load variables from the nearest .env file into the process environment
# before any os.environ lookups below.
load_dotenv(find_dotenv())

# The environment is currently pinned to 'prod'; the commented-out line shows
# the environment-variable driven alternative.
# environment = os.getenv('ENVIRONMENT', 'dev')
environment = 'prod'

# 'dev' gets verbose, timestamped DEBUG logging; anything else only surfaces
# ERROR-level records as bare messages. Note that with the 'prod' setting the
# logging.info(...) calls used throughout this notebook are not displayed.
if environment == 'dev':
    logging.basicConfig(
        level=logging.DEBUG, 
        format='%(asctime)s - %(levelname)s - %(message)s'
    )
else:
    logging.basicConfig(
        level=logging.ERROR, 
        format='%(message)s' 
    )

# Credentials are read from the environment (never hard-coded); either value
# is None when the corresponding variable is not set.
GITHUB_ACCESS_TOKEN = os.environ.get("GITHUB_ACCESS_TOKEN")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [None]:
def get_repo_from_url(github_url):
    """Extract the ``(owner, repo)`` pair from a GitHub repository URL.

    Handles trailing path components (``/tree/main``), a trailing ``.git``
    suffix, duplicated slashes and scheme-less URLs such as
    ``github.com/owner/repo``.

    Args:
        github_url: URL of the repository, e.g. ``https://github.com/owner/repo``.

    Returns:
        A tuple ``(owner, repo)``; ``(None, None)`` (after logging an error)
        when the URL does not contain both parts.
    """
    if not isinstance(github_url, str):
        logging.error("Invalid GitHub URL")
        return None, None

    parsed_url = urlparse(github_url.strip())

    # Drop empty segments so "//" or a trailing "/" never yields an empty name.
    path_parts = [part for part in parsed_url.path.split('/') if part]

    # Without a scheme urlparse puts the host into the path; skip it.
    if not parsed_url.netloc and path_parts and path_parts[0].lower().endswith("github.com"):
        path_parts = path_parts[1:]

    if len(path_parts) < 2:
        logging.error("Invalid GitHub URL")
        return None, None

    owner, repo = path_parts[0], path_parts[1]

    # Clone URLs end in ".git", which is not part of the repository name.
    if repo.endswith(".git"):
        repo = repo[:-len(".git")]

    if not repo:
        logging.error("Invalid GitHub URL")
        return None, None

    return owner, repo

In [None]:
# Repository every request in this notebook targets.
github_url = "https://github.com/freeCodeCamp/freeCodeCamp"

# owner/repo are (None, None) when the URL cannot be parsed; `metadata` is the
# module-level dict the fetch helpers read to build API URLs.
owner, repo = get_repo_from_url(github_url)
metadata = { "owner": owner, "repo": repo }

# Shared chat model used by the Agent class defined further down.
chat_model = ChatOpenAI(model="gpt-4o", temperature=0.5)

In [None]:
async def fetch_github_data(endpoint, params=None):
    """Fetch one page of JSON from a sub-resource of the configured repository.

    NOTE: this notebook defines ``fetch_github_data`` a second time in a later
    cell (with a default page size of 10); when the notebook is run top to
    bottom that later definition replaces this one.

    Args:
        endpoint: Path below ``/repos/{owner}/{repo}/`` (e.g. ``"commits"``).
        params: Optional query parameters. ``per_page`` defaults to 5 when the
            caller does not provide it. The dict is copied, never modified.

    Returns:
        The decoded JSON payload, ``[]`` when the payload is empty, or ``None``
        when the request fails or the body cannot be decoded.
    """
    url = f"https://api.github.com/repos/{metadata['owner']}/{metadata['repo']}/{endpoint}"

    headers = {
        "Authorization": f"Bearer {GITHUB_ACCESS_TOKEN}",
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": "2022-11-28"
    }

    # Work on a copy so the caller's dict is not mutated as a side effect.
    query_params = dict(params) if params else {}
    query_params.setdefault("per_page", 5)

    logging.info(f"Fetching data from {url} with params: {query_params}")

    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(url, headers=headers, params=query_params) as response:
                if response.status != 200:
                    # The request headers are deliberately NOT logged: they
                    # contain the bearer token.
                    logging.error(f"Failed to fetch data: HTTP {response.status}")
                    logging.error(f"Response text: {await response.text()}")
                    return None

                try:
                    response_json = await response.json()
                except (aiohttp.ClientResponseError, ValueError) as e:
                    # ValueError covers json.JSONDecodeError for malformed bodies.
                    logging.error("Failed to parse JSON response")
                    logging.exception(e)
                    return None
    except aiohttp.ClientError as e:
        # Connection-level failures are reported like any other failed fetch.
        logging.error("Request to GitHub failed")
        logging.exception(e)
        return None

    if not response_json:
        logging.info("No data found")
        return []

    logging.info(f"Total data fetched: {len(response_json)} items")
    return response_json


In [None]:
async def get_repository():
    """Fetch summary metadata for the configured repository.

    Sends the same GitHub headers as the other fetch helpers; the
    ``Authorization`` header is only added when a token is configured, so
    unauthenticated access to public repositories keeps working.

    Returns:
        A dict with name, description, visibility, owner, homepage, dates,
        default branch, counters, language and license name; ``{}`` when the
        API does not answer with HTTP 200.
    """
    url = f"https://api.github.com/repos/{metadata.get('owner')}/{metadata.get('repo')}"
    logging.info(f"Fetching data from {url}")

    headers = {
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": "2022-11-28"
    }
    if GITHUB_ACCESS_TOKEN:
        headers["Authorization"] = f"Bearer {GITHUB_ACCESS_TOKEN}"

    async with aiohttp.ClientSession() as session:
        async with session.get(url, headers=headers) as response:
            if response.status == 200:
                repository = await response.json()

                # Both values can be JSON null (e.g. a repository without a
                # license), in which case a `.get(key, {})` default is not used.
                owner_info = repository.get("owner") or {}
                license_info = repository.get("license") or {}

                return {
                    "name": repository.get("name"),
                    "description": repository.get("description"),
                    "visibility": "private" if repository.get("private") else "public",
                    "owner": owner_info.get("login"),
                    "homepage": repository.get("homepage"),
                    "created_at": repository.get("created_at"),
                    "last_pushed": repository.get("pushed_at"),
                    "default_branch": repository.get("default_branch"),
                    "stars": repository.get("stargazers_count"),
                    "watchers": repository.get("watchers_count"),
                    "subscribers": repository.get("subscribers_count"),
                    "forks": repository.get("forks_count"),
                    "open_issues": repository.get("open_issues_count"),
                    "language": repository.get("language"),
                    "license": license_info.get("name"),
                }

            logging.error(f"Failed to fetch data: HTTP {response.status}")
            return {}

In [None]:
async def fetch_github_data(endpoint, params=None):
    """Fetch one page of JSON from a sub-resource of the configured repository.

    This redefinition replaces the version from the earlier cell; the only
    intended difference is the default page size of 10.

    Args:
        endpoint: Path below ``/repos/{owner}/{repo}/`` (e.g. ``"commits"``).
        params: Optional query parameters. ``per_page`` defaults to 10 when the
            caller does not provide it. The dict is copied, never modified.

    Returns:
        The decoded JSON payload, ``[]`` when the payload is empty, or ``None``
        when the request fails or the body cannot be decoded.
    """
    url = f"https://api.github.com/repos/{metadata['owner']}/{metadata['repo']}/{endpoint}"

    headers = {
        "Authorization": f"Bearer {GITHUB_ACCESS_TOKEN}",
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": "2022-11-28"
    }

    # Work on a copy so the caller's dict is not mutated as a side effect.
    query_params = dict(params) if params else {}
    query_params.setdefault("per_page", 10)

    logging.info(f"Fetching data from {url} with params: {query_params}")

    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(url, headers=headers, params=query_params) as response:
                if response.status != 200:
                    # The request headers are deliberately NOT logged: they
                    # contain the bearer token.
                    logging.error(f"Failed to fetch data: HTTP {response.status}")
                    logging.error(f"Response text: {await response.text()}")
                    return None

                try:
                    response_json = await response.json()
                except (aiohttp.ClientResponseError, ValueError) as e:
                    # ValueError covers json.JSONDecodeError for malformed bodies.
                    logging.error("Failed to parse JSON response")
                    logging.exception(e)
                    return None
    except aiohttp.ClientError as e:
        # Connection-level failures are reported like any other failed fetch.
        logging.error("Request to GitHub failed")
        logging.exception(e)
        return None

    if not response_json:
        logging.info("No data found")
        return []

    logging.info(f"Total data fetched: {len(response_json)} items")
    return response_json


In [None]:
async def get_commits(params=None):
    """Fetch commits and flatten each one into a small dict.

    Args:
        params: Optional query parameters forwarded to ``fetch_github_data``.

    Returns:
        A list of dicts with ``message``, ``html_url``, ``hash``,
        ``author_name`` and ``commit_date`` (taken from the git commit), plus
        ``author_login`` and ``author_url`` (taken from the linked GitHub
        account; both ``None`` when the API reports no account for the commit).
        An empty list when nothing could be fetched.
    """
    commit_data = await fetch_github_data("commits", params)

    # None signals a failed request; an empty list is a valid "no commits" answer.
    if commit_data is None:
        logging.error("Failed to fetch commits from the GitHub API.")
        return []
    if not commit_data:
        return []

    processed_commits = []
    for commit_entry in commit_data:
        git_commit = commit_entry["commit"]
        # The top-level "author" is the GitHub user object; it is null when the
        # commit e-mail is not associated with any account.
        github_user = commit_entry.get("author") or {}

        processed_commits.append({
            "message": git_commit["message"],
            "html_url": commit_entry["html_url"],
            "hash": commit_entry["sha"],
            "author_name": git_commit["author"]["name"],
            "commit_date": git_commit["author"]["date"],
            # The git author name is a free-form display name, so the profile
            # link must come from the account data instead.
            "author_login": github_user.get("login"),
            "author_url": github_user.get("html_url"),
        })

    return processed_commits

def generate_commits_markdown(commits=None):
    """Render processed commit dicts as a Markdown report.

    Args:
        commits: Iterable of dicts with ``message``, ``html_url``, ``hash``,
            ``author_name`` and ``commit_date`` (ISO 8601). An optional
            ``author_url`` key supplies the author's profile link: when the key
            is present and empty the author is shown as plain text; when the
            key is absent the link falls back to ``https://github.com/<author_name>``
            as in earlier versions (which is only correct when the git author
            name happens to equal the GitHub login).

    Returns:
        The Markdown text, one section per commit; ``""`` for no commits.
    """
    sections = []

    for commit in commits or []:
        full_message = commit['message']
        commit_hash = commit['hash']
        author_name = commit['author_name']
        commit_title = full_message.split('\n')[0]
        commit_date = datetime.fromisoformat(commit['commit_date'].replace('Z', '+00:00'))

        if 'author_url' in commit:
            author_profile_url = commit['author_url']
        else:
            author_profile_url = f"https://github.com/{author_name}"

        section = f"### [{commit_title}]({commit['html_url']})\n\n"
        section += f"![Commit](https://img.shields.io/badge/commit-{commit_hash[:7]}-orange)\n\n"

        if author_profile_url:
            section += f"**Author:** [{author_name}]({author_profile_url})  \n"
        else:
            section += f"**Author:** {author_name}  \n"
        section += f"**Date:** {commit_date.strftime('%Y-%m-%d %H:%M:%S')} UTC  \n"
        section += f"**Hash:** `{commit_hash}`  \n"

        # Everything after the first blank line is the commit body.
        message_parts = full_message.split('\n\n', 1)
        if len(message_parts) > 1:
            body = message_parts[1].strip()
            section += f"**Commit Message:** \n `{body}`  \n"

        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # report is deterministic (a set would not be).
        references = list(dict.fromkeys(re.findall(r'#(\d+)', full_message)))
        if references:
            section += "\n **References:** \n"
            section += "".join(f"- #{ref}\n" for ref in references)

        section += "\n---\n\n"
        sections.append(section)

    return "".join(sections)
    
# Commit HTML URL Structure - "html_url": "https://github.com/<owner>/<repo>/commit/<commit_hash>"
# Commit title = commit_entry["commit"]["message"].split('\n')[0]

In [None]:
# Numbers of every pull request seen so far; get_pull_requests appends to it
# on each call (so repeated calls accumulate, duplicates included).
pr_numbers = []

async def get_pull_requests(params=None):
    """Fetch pull requests and flatten each one into a small dict.

    Nullable API fields (``user``, ``assignee``, ``milestone``, ``head``,
    ``base``, list fields) are read defensively so a single unusual pull
    request cannot abort the whole listing.

    Args:
        params: Optional query parameters forwarded to ``fetch_github_data``.

    Returns:
        A list of dicts with title, URL, number, state, author, timestamps,
        assignee, requested reviewers, labels, milestone, branches and
        description. An empty list when nothing could be fetched.

    Side effects:
        Appends each pull request number to the module-level ``pr_numbers``.
    """
    pull_requests = await fetch_github_data("pulls", params=params)

    # None signals a failed request; an empty list is a valid "no PRs" answer.
    if pull_requests is None:
        logging.error("Failed to fetch pull requests from the GitHub API.")
        return []
    if not pull_requests:
        return []

    processed_prs = []
    for pr in pull_requests:
        author = pr.get("user") or {}
        assignee = pr.get("assignee") or {}
        milestone = pr.get("milestone") or {}

        processed_pr = {
            "pr_title": pr["title"],
            "pr_url": pr["html_url"],
            "pr_number": pr["number"],
            "state": pr["state"],
            "author_name": author.get("login"),
            "author_url": author.get("html_url"),
            "created_at": pr["created_at"],
            "updated_at": pr["updated_at"],
            "closed_at": pr["closed_at"],
            "merged_at": pr["merged_at"],
            "assignee_name": assignee.get("login"),
            "assignee_url": assignee.get("html_url"),
            "requested_reviewers": [
                {"reviewer_name": reviewer["login"], "reviewer_url": reviewer["html_url"]}
                for reviewer in pr.get("requested_reviewers") or []
            ],
            "labels": [label["name"] for label in pr.get("labels") or []],
            "milestone_title": milestone.get("title"),
            "commit_ref": (pr.get("head") or {}).get("ref"),
            "base_branch": (pr.get("base") or {}).get("ref"),
            "description": pr["body"],
        }
        processed_prs.append(processed_pr)
        pr_numbers.append(pr["number"])

    return processed_prs

def generate_pull_requests_markdown(pull_requests=[]):
    """Render processed pull-request dicts as a Markdown report.

    Each pull request becomes one section: a linked title, a status badge
    (green for "open", red otherwise), author and timestamps, then the
    optional assignee, reviewers, labels, milestone and description, closed
    by a horizontal rule.

    Args:
        pull_requests: Iterable of dicts shaped like the ones produced by
            ``get_pull_requests``.

    Returns:
        The Markdown text; ``""`` when there are no pull requests.
    """
    chunks = []

    for pr in pull_requests:
        badge_color = "red" if pr['state'] != "open" else "brightgreen"

        chunks.append(f"### [{pr['pr_title']}]({pr['pr_url']}) (#{pr['pr_number']})\n\n")
        chunks.append(f"![Status](https://img.shields.io/badge/{pr['state']}-{badge_color})\n\n")
        chunks.append(f"**Author:** [{pr['author_name']}]({pr['author_url']})  \n")
        chunks.append(f"**Created:** {pr['created_at']}  \n")
        chunks.append(f"**Last Updated:** {pr['updated_at']}  \n")

        # Optional single-value fields are only printed when they are set.
        if pr['closed_at']:
            chunks.append(f"**Closed:** {pr['closed_at']}  \n")
        if pr['merged_at']:
            chunks.append(f"**Merged:** {pr['merged_at']}  \n")
        if pr['assignee_name']:
            chunks.append(f"**Assignee:** [{pr['assignee_name']}]({pr['assignee_url']})  \n")

        reviewers = pr['requested_reviewers']
        if reviewers:
            chunks.append("**Requested Reviewers:**  \n")
            chunks.extend(
                f"- [{entry['reviewer_name']}]({entry['reviewer_url']})  \n"
                for entry in reviewers
            )

        label_names = pr['labels']
        if label_names:
            quoted = ", ".join(f"`{name}`" for name in label_names)
            chunks.append("**Labels:** " + quoted + "  \n")

        if pr['milestone_title']:
            chunks.append(f"**Milestone:** {pr['milestone_title']}  \n")

        chunks.append(f"**Branch:** `{pr['commit_ref']}` → `{pr['base_branch']}`  \n")

        if pr['description']:
            chunks.append("**Description:** \n\n")
            chunks.append(f"{pr['description']}\n\n")

        chunks.append("---\n\n")

    return "".join(chunks)


In [None]:
async def fetch_github_count(endpoint, params=None):
    """Count the items of a repository sub-resource using pagination metadata.

    The request asks for one item per page, so the page number of the
    ``rel="last"`` link in the ``Link`` response header equals the total
    number of items. Without such a link everything fit on the single
    returned page and its length is the count.

    Args:
        endpoint: Path below ``/repos/{owner}/{repo}/`` (e.g. ``"pulls"``).
        params: Optional filter parameters. The dict is copied, never
            modified; ``per_page`` is always forced to 1.

    Returns:
        The item count as ``int``, or ``None`` when the request fails.
    """
    url = f"https://api.github.com/repos/{metadata.get('owner')}/{metadata.get('repo')}/{endpoint}"

    headers = {
        "Authorization": f"Bearer {GITHUB_ACCESS_TOKEN}",
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": "2022-11-28"
    }

    # Work on a copy so the caller's dict is not mutated as a side effect.
    query_params = dict(params) if params else {}
    query_params["per_page"] = 1

    logging.info(f"Fetching count from {url} with params: {query_params}")

    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(url, headers=headers, params=query_params) as response:
                if response.status != 200:
                    logging.error(f"Failed to fetch count: HTTP {response.status}")
                    logging.error(f"Response text: {await response.text()}")
                    return None

                for link in response.headers.get("Link", "").split(","):
                    if 'rel="last"' not in link:
                        continue
                    # Match the `page` query parameter wherever it sits in the
                    # URL; `[?&]` keeps `per_page=` from matching.
                    page_match = re.search(r"[?&]page=(\d+)", link)
                    if page_match:
                        return int(page_match.group(1))

                return len(await response.json())
    except aiohttp.ClientError as e:
        # Connection-level failures are reported like any other failed fetch.
        logging.error("Request to GitHub failed")
        logging.exception(e)
        return None
            

In [None]:
def format_since_datetime(since):
    """Convert a relative duration such as ``"7d"`` into a UTC timestamp.

    Args:
        since: ``"<number><unit>"`` where unit is ``h`` (hours), ``d`` (days),
            ``m`` (months, counted as 30 days) or ``y`` (years, counted as
            365 days). Surrounding whitespace and letter case are ignored.

    Returns:
        The moment that long ago as ``YYYY-MM-DDTHH:MM:SSZ`` (UTC), or
        ``None`` when ``since`` is not a string, is empty, or does not match
        the expected format (the latter is logged as an error).
    """
    # Imported locally: the notebook's import cell only brings in
    # `datetime` and `timedelta`.
    from datetime import timezone

    if not isinstance(since, str) or len(since) == 0:
        return None

    match = re.fullmatch(r"(\d+)([hdmy])", since.strip().lower())
    if match is None:
        logging.error(f"Unsupported 'since' value: {since!r}")
        return None

    time_value = int(match.group(1))
    time_unit = match.group(2)

    try:
        if time_unit == "h":
            delta = timedelta(hours=time_value)
        else:
            days_per_unit = {"d": 1, "m": 30, "y": 365}
            delta = timedelta(days=time_value * days_per_unit[time_unit])

        # The trailing "Z" promises UTC, so the reference time must be UTC
        # too (not the machine's local time).
        since_date = datetime.now(timezone.utc) - delta
    except OverflowError:
        logging.error(f"'since' value out of range: {since!r}")
        return None

    return since_date.strftime("%Y-%m-%dT%H:%M:%SZ")


In [None]:
async def get_count(action, params):
    """Return the number of items for a countable endpoint.

    Args:
        action: Endpoint name; only "commits", "pulls" and "issues" are counted.
        params: Query parameters forwarded to ``fetch_github_count``.

    Returns:
        Whatever ``fetch_github_count`` returns for a supported endpoint,
        otherwise the string "No Response".
    """
    countable_endpoints = ("commits", "pulls", "issues")

    # Guard clause: anything else is answered without touching the API.
    if action not in countable_endpoints:
        return "No Response"

    return await fetch_github_count(action, params)

async def get_multiple_documents(action, params):
    """Fetch a page of items for an endpoint and render it as Markdown.

    Only "commits" and "pulls" are handled here; every other action
    (including "issues") yields the string "No Response".

    Args:
        action: Endpoint name.
        params: Query parameters forwarded to the matching fetch helper.

    Returns:
        The Markdown report for the endpoint, or "No Response".
    """
    if action == "commits":
        return generate_commits_markdown(await get_commits(params))

    if action == "pulls":
        return generate_pull_requests_markdown(await get_pull_requests(params))

    return "No Response"

In [None]:
from langchain.tools import tool
from langchain.pydantic_v1 import BaseModel, Field
from enum import Enum
from typing import Optional

async def repository_report_exec() -> dict:
    """
    Fetch the repository report. Always show as a list in markdown and end with a summary.
    """
    # The docstring above is written for the model: no explicit description is
    # passed when this function is wrapped as a tool, and the prompt builder
    # prints `tool.description`. Keep developer notes in comments like this one.
    #
    # The repository summary is wrapped under a single "data" key.
    return {"data": await get_repository()}

# Wrap the report coroutine as a LangChain tool. The same async function is
# passed both as `func` and as `coroutine`; the chat loop in this notebook
# only ever awaits `repository_report_tool.coroutine()`.
repository_report_tool = StructuredTool.from_function(
    func=repository_report_exec,
    name="repository_report_tool",
    coroutine=repository_report_exec
)

In [None]:
# Repository sub-resources the GitHub tool can be pointed at. Mixing in `str`
# makes every member compare equal to its plain string value.
class EndpointEnum(str, Enum):
    commits = "commits"
    pulls = "pulls"
    issues = "issues"

# Kind of answer requested from the GitHub tool. github_exec handles "count"
# and "multiple_documents"; for any other scope it returns {"data": None}.
class ScopeEnum(str, Enum):
    count = "count"
    single_document = "single_document"
    multiple_documents = "multiple_documents"

# State filter for pull requests / issues; github_exec forwards it unchanged
# as the `state` query parameter.
class StateEnum(str, Enum):
    open = "open"
    closed = "closed"
    all = "all"

# Argument schema advertised for the GitHub tool. Every field is optional and
# defaults to None; the field descriptions are printed into the agent prompt.
# NOTE(review): the schema names the first field `endpoint`, while github_exec
# reads the key "action" from its argument — the chat loop bridges the two by
# passing {"action": parser.get("endpoint"), ...}.
class GithubToolInput(BaseModel):
    endpoint: Optional[EndpointEnum] = Field(None, description="Type of action ('commits', 'pulls', 'issues').")
    scope: Optional[ScopeEnum] = Field(None, description="Scope of the query ('count', 'single_document', 'multiple_documents').")
    state: Optional[StateEnum] = Field(None, description="State of the pull requests/issues ('open', 'closed', 'all').")
    limit: Optional[int] = Field(None, description="Maximum number of items to retrieve.")
    # The doubled braces are not inside an f-string or template here, so they
    # appear literally in the description text.
    since: Optional[str] = Field(None, description="Date range since format '{{number}}{{h/d/m/y}}' (e.g., '7d' for last 7 days).")
    author: Optional[str] = Field(None, description="Author's username to filter items.")

async def github_exec(args: GithubToolInput) -> dict:
    """
    This tool interacts with the GitHub API to retrieve various types of data from a repository. 
    It can fetch information about commits, pull requests, or issues, and can either count items, retrieve multiple items, or get details of a single item.
    (To use this tool, query must be classified as per the description of the function arguments.)
    """
    # Developer notes live in comments on purpose: no explicit description is
    # passed when this function is wrapped as a tool, so the docstring above
    # is presumably what the agent prompt shows as the tool description.
    #
    # `args` may be a plain dict (how the chat loop calls this coroutine) or a
    # GithubToolInput instance as the annotation promises. The endpoint is
    # accepted under either the "action" key (used by the chat loop) or the
    # "endpoint" key (the schema's field name).
    #
    # Returns {"data": <count>} for scope "count", {"data": <markdown>} for
    # scope "multiple_documents" and {"data": None} for anything else.
    if isinstance(args, dict):
        fields = args
    elif hasattr(args, "dict"):
        fields = args.dict()
    else:
        fields = dict(args)

    def _plain(value):
        # Enum members are unwrapped to their underlying value so comparisons
        # and query parameters always deal with plain strings.
        return getattr(value, "value", value)

    action = _plain(fields.get("action") or fields.get("endpoint"))
    scope = _plain(fields.get("scope"))
    state = _plain(fields.get("state"))
    limit = fields.get("limit")
    since = fields.get("since")
    author = fields.get("author")

    since = format_since_datetime(since)
    # Page size is clamped to 1..10: never more than ten items per call, and
    # never zero or negative when the model supplies a nonsensical limit.
    per_page = max(1, min(10, int(limit))) if limit is not None else 10

    params = {}
    if isinstance(author, str):
        params["author"] = author
    if isinstance(state, str):
        params["state"] = state
    if isinstance(since, str):
        params["since"] = since

    if scope == "count":
        count = await get_count(action, params)
        return { "data": count }

    params["per_page"] = per_page

    if scope == "multiple_documents":
        docs = await get_multiple_documents(action, params)
        return { "data": docs }

    return { "data": None }

# Wrap github_exec as a LangChain tool with GithubToolInput as its argument
# schema. The same async function is passed both as `func` and as `coroutine`;
# the chat loop in this notebook awaits `github_tool.coroutine(...)` directly
# with a single dict argument.
github_tool = StructuredTool.from_function(
    func=github_exec,
    name="github_tool",
    args_schema=GithubToolInput,
    coroutine=github_exec
)

In [None]:
class Agent:
    """Minimal chat wrapper that keeps the running conversation history.

    The history is a list of ``{"role": ..., "content": ...}`` dicts. It starts
    with the system prompt (when one is given) and grows by one user entry
    and one assistant entry per call.
    """

    def __init__(self, system=""):
        self.system = system
        # An empty system prompt produces an empty history.
        self.messages = [{"role": "system", "content": system}] if self.system else []

    def __call__(self, message):
        """Send ``message`` as the user turn and return the model's reply."""
        user_turn = {"role": "user", "content": message}
        self.messages.append(user_turn)

        reply = self.execute()

        assistant_turn = {"role": "assistant", "content": reply}
        self.messages.append(assistant_turn)
        return reply

    def execute(self):
        """Invoke the shared chat model on the full history; return its text."""
        return chat_model.invoke(self.messages).content


In [None]:
# Tools the agent may call. The name list is derived from the tool objects so
# the prompt cannot drift from what is actually registered.
tools = [repository_report_tool, github_tool]
tool_names = [registered_tool.name for registered_tool in tools]

# Human-readable description of every tool and its arguments, embedded in the
# system prompt below. The loop variable is deliberately not called `tool`,
# which would shadow the `tool` decorator imported from langchain.tools.
tools_details = ""
for registered_tool in tools:
    tools_details += f"Tool Name: {registered_tool.name}\n"
    tools_details += f"Tool description: {registered_tool.description}\n"

    tool_args = registered_tool.args_schema.__fields__.items()
    if len(tool_args) > 0:
        tools_details += "Arguments:\n"
    else:
        tools_details += "No Arguments\n"

    for field_name, field_info in tool_args:
        tools_details += f" - {field_name}: {field_info}\n"
        tools_details += f"   Description: {field_info.field_info.description}\n"

    tools_details += "\n\n"

# System prompt for the agent. Tool names are joined into a plain
# comma-separated list; interpolating the list object itself would render as
# [['repository_report_tool', 'github_tool']] in the prompt.
github_agent_prompt=f"""
You are a GitHub Repository Analysis Agent named GitBot. 
Answer the following questions as best you can on Github Repository: {github_url}. 
You have access to the following tools:

{tools_details}

Use the following format:
    Question: the input question you must answer
    Thought: you should always think about what to do
    Action: the action to take, should be one of [{', '.join(tool_names)}]. just mention the tool_name. If you are dependent on action, Stop Observing. 
    Action Input: if Action is github_tool, do <Query Classification> (JSON format) PAUSE. otherwise skip.
    Observation: the result of the action
    ... (this Thought/Action/Action Input/Observation can repeat N times)
    Thought: I now know the final answer
    Final Answer: the final answer to the original input question

    If there is no Action to execute. Then output only the Final Answer. 
    Final output should be precise on the given query. Don't make one understand your internal step.
    Don't mention "Final Answer" in the output

    Begin!
"""

def parse_and_validate_output(output_string):
    """Extract the requested tool and its JSON arguments from a model reply.

    The reply is expected to contain ``Action: <tool_name>`` and, optionally,
    ``Action Input: { ...json... }`` (possibly inside a code fence and/or
    followed by ``PAUSE``).

    Args:
        output_string: Raw text produced by the model.

    Returns:
        A dict with the keys ``action``, ``endpoint``, ``scope``, ``state``,
        ``since``, ``limit`` and ``author``. Missing or unparsable values are
        ``None``; ``limit`` is converted to ``int``. ``action`` always comes
        from the ``Action:`` line, never from the JSON payload.
    """
    result = {
        "action": None,
        "endpoint": None,
        "scope": None,
        "state": None,
        "since": None,
        "limit": None,
        "author": None
    }

    action_match = re.search(r"Action:\s*(\w+)", output_string)
    if action_match:
        result["action"] = action_match.group(1)

    # Locate the opening brace of the JSON payload (optionally wrapped in a
    # ``` / ```json fence); the lookahead leaves the match end on the brace.
    input_match = re.search(r"Action Input:\s*(?:```(?:json)?\s*)?(?=\{)", output_string)
    if input_match is None:
        return result

    try:
        # raw_decode reads exactly one JSON value and ignores what follows,
        # so nested objects and a trailing "PAUSE" are both handled.
        action_input, _ = json.JSONDecoder().raw_decode(output_string, input_match.end())
    except json.JSONDecodeError as e:
        logging.error("Error: Unable to parse Action Input JSON")
        logging.error(f"JSONDecodeError details: {str(e)}")
        return result

    if not isinstance(action_input, dict):
        logging.error("Error: Action Input JSON is not an object")
        return result

    logging.debug(f"Parsed JSON: {action_input}")

    # Some replies name the endpoint "action"; accept that as an alias, but
    # never let it overwrite the tool name parsed from the "Action:" line.
    if "endpoint" not in action_input and "action" in action_input:
        action_input = dict(action_input, endpoint=action_input["action"])

    for key in result:
        if key == "action" or key not in action_input:
            continue

        value = action_input[key]
        if value is None or value == "None":
            result[key] = None
        elif key == "limit":
            try:
                result[key] = int(value)
            except (TypeError, ValueError):
                logging.error(f"Ignoring non-numeric limit: {value!r}")
                result[key] = None
        else:
            result[key] = value

    return result

# Conversation state shared by every turn of the chat loop below.
query = Agent(github_agent_prompt)

async def chat():
    """Run the interactive GitBot loop until the user types exit/quit/q.

    For each user message the agent is queried up to five times: whenever
    its reply names a tool, the tool is executed and its JSON-encoded result
    is sent back as the next prompt; the first reply without a known tool
    ends the turn. The last reply is then rendered as Markdown.
    """
    exit_words = ["exit", "quit", "q"]

    while True:
        user_input = input("User: ")

        if user_input.lower() in exit_words:
            print("Goodbye!")
            break

        next_prompt = user_input
        result = ""

        # At most five agent round-trips per user message.
        for _ in range(5):
            result = query(next_prompt)

            print("=========================")
            print(result)
            parser = parse_and_validate_output(result)
            print("=========================")

            requested_action = parser["action"]

            if requested_action == "github_tool":
                print(">>>>>>>> Exploring github Api...")

                # The parsed "endpoint" is handed over under the key "action",
                # which is the name github_exec reads.
                tool_args = {"action": parser.get("endpoint")}
                for key in ("scope", "state", "limit", "since", "author"):
                    tool_args[key] = parser.get(key)

                data = await github_tool.coroutine(tool_args)
                next_prompt = json.dumps(data)

            elif requested_action == "repository_report_tool":
                print(">>>>>>>>> Preparing repository report...")

                data = await repository_report_tool.coroutine()
                next_prompt = json.dumps(data)

            else:
                print(">>>>>>>>> Actions are completed...")
                break

        display(Markdown(f"**Git-Bot:** \n {result}"))

# Start the interactive session. A bare top-level `await` is only valid in
# environments that run cells inside an event loop (as Jupyter/IPython does);
# in a plain script this line would need asyncio.run(chat()) instead.
await chat()