In [None]:
import os
import csv
import json
import time
import logging
import requests
from urllib.parse import urlparse
from dotenv import load_dotenv, find_dotenv

In [None]:
# Run mode for the notebook; fetch_github_data stops paginating after 10 pages while this is 'dev'.
environment = 'dev'
# Load variables from the .env file located by find_dotenv() into the process environment.
load_dotenv(find_dotenv())

# Root logger: INFO level, each record prefixed with a timestamp and its level name.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# Credentials come from the environment (None when unset) instead of being hard-coded.
GITHUB_ACCESS_TOKEN = os.environ.get("GITHUB_ACCESS_TOKEN")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [None]:
def get_repo_from_url(github_url):
    """Extract the owner and repository name from a GitHub repository URL.

    Args:
        github_url: URL such as ``https://github.com/<owner>/<repo>``. Extra
            path segments (``/tree/main``, ``/issues/1``), doubled slashes and
            a trailing ``.git`` clone suffix are tolerated. ``None`` or an
            empty string is treated as an invalid URL.

    Returns:
        tuple: ``(owner, repo)`` on success, or ``(None, None)`` when the URL
        path does not contain both parts (an error is logged in that case).
    """
    parsed_url = urlparse(github_url or "")

    # Drop empty segments so doubled or trailing slashes cannot yield an empty owner/repo.
    path_parts = [part for part in parsed_url.path.split('/') if part]

    if len(path_parts) >= 2:
        owner, repo = path_parts[0], path_parts[1]

        # Clone URLs end in ".git", which is not part of the repository name.
        if repo.endswith(".git"):
            repo = repo[:-len(".git")]

        if repo:
            return owner, repo

    logging.error("Invalid GitHub URL")
    return None, None

In [None]:
# Repository under analysis. get_repository and fetch_github_data build their
# request URLs from the `metadata` dict assembled here.
github_url = "https://github.com/openai/openai-python"

owner, repo = get_repo_from_url(github_url)

metadata = dict(owner=owner, repo=repo)
metadata_json = json.dumps(metadata)
print(metadata_json)

In [None]:
def get_repository():
    """Fetch summary information about the repository named in ``metadata``.

    Issues an unauthenticated ``GET https://api.github.com/repos/{owner}/{repo}``
    and flattens the fields of interest into a plain dict.

    Returns:
        dict: Selected repository attributes (name, owner login, homepage,
        timestamps, default branch, counters, feature flags, language and
        licence name). An empty dict is returned when the request cannot be
        completed or the API answers with a non-200 status; the failure is
        logged in both cases.
    """
    url = f"https://api.github.com/repos/{metadata['owner']}/{metadata['repo']}"
    logging.info(f"Fetching data from {url}")

    try:
        # A timeout keeps a stalled connection from hanging the notebook indefinitely.
        response = requests.get(url, timeout=30)
    except requests.RequestException as error:
        logging.error(f"Failed to fetch data: {error}")
        return {}

    if response.status_code != 200:
        logging.error(f"Failed to fetch data: HTTP {response.status_code}")
        logging.error(f"Response text: {response.text}")
        return {}

    repository = response.json()

    # "license" is present but null for repositories without a detected licence,
    # so `.get("license", {})` would hand back None; `or {}` covers both cases.
    license_info = repository.get("license") or {}
    owner_info = repository.get("owner") or {}

    return {
        "name": repository.get("name"),
        "is_private_repo": repository.get("private"),

        "owner": owner_info.get("login"),
        "homepage": repository.get("homepage"),

        "created_at": repository.get("created_at"),
        "updated_at": repository.get("updated_at"),
        "pushed_at": repository.get("pushed_at"),
        "default_branch": repository.get("default_branch"),

        "stars_count": repository.get("stargazers_count"),
        "watchers_count": repository.get("watchers_count"),
        "subscribers_count": repository.get("subscribers_count"),
        "forks_count": repository.get("forks_count"),
        "open_issues_count": repository.get("open_issues_count"),

        "has_issues": repository.get("has_issues"),
        "has_discussions": repository.get("has_discussions"),

        "language": repository.get("language"),
        "license": license_info.get("name"),
    }

In [None]:
# Wall-clock timing harness for the repository fetch. The fetch itself is
# commented out below, so as written only the harness overhead is measured.
start_time = time.time()

# repository = get_repository()
# print(json.dumps(repository, indent=4))

end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken to fetch repository data: {elapsed_time:.4f} seconds")

In [None]:
def fetch_github_data(endpoint, params=None, repo=None):
    """Fetch every page of a repository-scoped list endpoint from the GitHub REST API.

    Args:
        endpoint: Path below ``/repos/{owner}/{repo}/``, e.g. ``"commits"`` or
            ``"issues/1/comments"``.
        params: Optional query parameters. The dict is copied, so the caller's
            object is never modified.
        repo: Optional ``"owner/repo"`` string. When omitted, the repository
            named in the notebook-level ``metadata`` dict is used.

    Returns:
        list | None: Items of all fetched pages concatenated in request order,
        or ``None`` when a request fails, returns a non-200 status, or the body
        is not a JSON list. While ``environment == 'dev'`` at most 10 pages
        are fetched.
    """
    repo_path = repo if repo else f"{metadata['owner']}/{metadata['repo']}"
    url = f"https://api.github.com/repos/{repo_path}/{endpoint}"

    headers = {
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": "2022-11-28"
    }
    # Authenticate only when a token is configured, rather than sending the
    # literal string "Bearer None".
    if GITHUB_ACCESS_TOKEN:
        headers["Authorization"] = f"Bearer {GITHUB_ACCESS_TOKEN}"

    per_page = 100
    max_dev_pages = 10

    # Work on a copy so a dict the caller reuses is not polluted with page/per_page.
    query = dict(params or {})
    query["per_page"] = per_page

    responses = []
    page = 1

    while True:
        if environment == 'dev' and page > max_dev_pages:
            break

        query["page"] = page

        logging.info(f"Fetching data from {url} with params: {query}")
        try:
            response = requests.get(url, headers=headers, params=query, timeout=30)
        except requests.RequestException as error:
            logging.error(f"Failed to fetch data: {error}")
            return None

        if response.status_code != 200:
            # The request headers are deliberately not logged: they carry the access token.
            logging.error(f"Failed to fetch data: HTTP {response.status_code}")
            logging.error(f"Response text: {response.text}")
            return None

        try:
            page_items = response.json()
        except requests.JSONDecodeError as e:
            logging.error("Failed to parse JSON response")
            logging.exception(e)
            return None

        if not page_items:
            break

        if not isinstance(page_items, list):
            # Extending with a dict would silently append its keys instead of records.
            logging.error(f"Expected a JSON list from {url}, got {type(page_items).__name__}")
            return None

        responses.extend(page_items)
        page += 1

        # A short page means the listing is exhausted.
        if len(page_items) != per_page:
            break

    logging.info(f"Total data fetched: {len(responses)} items")
    return responses

In [None]:
def get_commits():
    """Return a flat summary of the commits reported by the ``commits`` endpoint.

    Returns:
        list[dict]: One dict per commit with the keys ``hash``, ``message``,
        ``author_name``, ``author_email`` and ``commit_date``. An empty list
        is returned (and an error logged) when ``fetch_github_data`` yields
        nothing.
    """
    raw_commits = fetch_github_data("commits")

    if not raw_commits:
        logging.error("Failed to fetch commits from the GitHub API.")
        return []

    def summarize(entry):
        # Fields are read in the same order as they appear in the output dict.
        summary = {"hash": entry["sha"]}
        git_commit = entry["commit"]
        summary["message"] = git_commit["message"]
        author = git_commit["author"]
        summary["author_name"] = author["name"]
        summary["author_email"] = author["email"]
        summary["commit_date"] = author["date"]
        return summary

    return [summarize(entry) for entry in raw_commits]

# print(json.dumps(get_commits(), indent=4))

# Commit HTML URL Structure - "html_url": "https://github.com/<owner>/<repo>/commit/<commit_hash>"
# Commit title = commit_entry["commit"]["message"].split('\n')[0]

In [None]:
# Every pull request is an issue, but not every issue is a pull request. For this reason, "shared" actions for both features, like managing assignees, labels, and milestones, are provided within the Issues endpoints.

# 1. The node ID typically starts with PR_, signaling it's a pull request. The node ID typically starts with I_, signaling it's an issue.
# 2. Both pull requests and issues have a timeline_url to track the events associated with them (comments, status changes, etc.).
# 3. The html_url contains /pull/ in the path, indicating it is a pull request. The html_url contains /issues/, indicating it is an issue.
# 4. Presence of the pull_request key. pull_request should be None for issues and should exist for pull requests

def get_issues_and_pull_requests(state='all'):
    """Fetch issues (which, per the notes above, include pull requests) and flatten them.

    Args:
        state: Value sent as the ``state`` query parameter of the ``issues``
            endpoint. Defaults to ``'all'``.

    Returns:
        list[dict]: One dict per issue/pull request with number, title, URL,
        state information, timestamps, author, closer, labels, assignee,
        description, reactions, comment count and lock flag. An empty list is
        returned (and an error logged) when ``fetch_github_data`` yields
        nothing.
    """
    issues_data = fetch_github_data(endpoint="issues", params={"state": state})

    if not issues_data:
        logging.error("Failed to fetch issues from the GitHub API.")
        return []

    processed_issues = []
    for issue in issues_data:
        # These nested objects can be present but null; `.get(key, {})` would
        # return None in that case and the chained `.get`/index would raise.
        author = issue.get("user") or {}
        closed_by = issue.get("closed_by") or {}
        assignee = issue.get("assignee") or {}

        processed_issues.append({
            "number": issue["number"],
            "title": issue["title"],
            "html_url": issue["html_url"],
            "state": issue["state"],
            "state_reason": issue.get("state_reason"),
            "created_at": issue["created_at"],
            "updated_at": issue["updated_at"],
            "closed_at": issue.get("closed_at"),
            "author_name": author.get("login"),
            "closed_by": closed_by.get("login"),
            # Accept plain-string labels as well as label objects.
            "labels": [
                label["name"] if isinstance(label, dict) else label
                for label in issue.get("labels") or []
            ],
            "assignee_name": assignee.get("login"),
            "description": issue.get("body"),
            "reactions": issue.get("reactions"),
            "comments": issue.get("comments"),
            "locked": issue.get("locked")
        })

    return processed_issues

# html_url = https://github.com/<owner>/<repo>/<issues or pull>/<number>
# author_url = https://github.com/<author_name>
# pull_request = https://github.com/<owner>/<repo>/pull/<number>

In [None]:
# Wall-clock timing harness for the issues/pull-request fetch. The fetch itself
# is commented out below, so as written only the harness overhead is measured.
start_time = time.time()

# issues_and_prs = get_issues_and_pull_requests()
# print(json.dumps(issues_and_prs, indent = 4))

end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken to fetch issues and pull requests: {elapsed_time:.4f} seconds")

### TODO

1. Work on getting the current version.
2. Get owner and repo from metadata. Not from function args.
3. Find how many files are updated in a commit.
4. Issue and PR details api call "on_demand".
5. Issue and PR timeline api call "on_demand".
6. Branch details on a commit/PR.


In [None]:
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI

# Smoke test for the LangChain OpenAI chat model: send one prompt and print the reply.
# NOTE(review): this cell performs a live model call every time it runs.
load_dotenv()

# NOTE(review): presumably ChatOpenAI picks up OPENAI_API_KEY from the environment
# populated by load_dotenv() — confirm against the langchain_openai documentation.
model = ChatOpenAI(model="gpt-4o")

result = model.invoke("What is 81 divided by 9?")
# Print the whole returned object first, then only its `content` attribute.
print("Full result:")
print(result)
print("Content only:")
print(result.content)

In [None]:
# 


In [None]:
# Numbers of every pull request seen so far; filled by fetch_and_process_pull_requests
# and read by get_pr_messages.
pr_numbers = []

def fetch_and_process_pull_requests(owner, repo):
    """Fetch the repository's pull requests and flatten each into a dict.

    Side effect: the number of every returned pull request is recorded in the
    module-level ``pr_numbers`` list (each number at most once, so re-running
    the cell does not create duplicates).

    Args:
        owner: Repository owner login.
        repo: Repository name.

    Returns:
        list[dict]: One flattened record per pull request; the raw API object
        is kept under ``"__main_from_pr_list"``. An empty list is returned
        (and an error logged) when ``fetch_github_data`` returns ``None``.
    """
    # NOTE(review): this passes `repo=`; confirm fetch_github_data accepts that keyword.
    pull_requests = fetch_github_data(endpoint="pulls", repo=f"{owner}/{repo}")

    # fetch_github_data signals failure with None; iterating over it would raise TypeError.
    if pull_requests is None:
        logging.error("Failed to fetch pull requests from the GitHub API.")
        return []

    known_numbers = set(pr_numbers)
    processed_prs = []
    for pr in pull_requests:
        processed_prs.append({
            "pr_title": pr["title"],
            "pr_url": pr["html_url"],
            "pr_number": pr["number"],
            "state": pr["state"],
            "author_name": pr["user"]["login"],
            "author_url": pr["user"]["html_url"],
            "created_at": pr["created_at"],
            "updated_at": pr["updated_at"],
            "closed_at": pr["closed_at"],
            "merged_at": pr["merged_at"],
            "assignee_name": pr["assignee"]["login"] if pr["assignee"] else None,
            "assignee_url": pr["assignee"]["html_url"] if pr["assignee"] else None,
            "requested_reviewers": [
                {"reviewer_name": reviewer["login"], "reviewer_url": reviewer["html_url"]}
                for reviewer in pr["requested_reviewers"]
            ],
            "labels": [label["name"] for label in pr["labels"]],
            "milestone_title": pr["milestone"]["title"] if pr["milestone"] else None,
            "commit_sha": pr["head"]["sha"],
            "commit_ref": pr["head"]["ref"],
            "base_branch": pr["base"]["ref"],
            "description": pr["body"],
            "review_comments_url": pr["review_comments_url"],
            "commits_url": pr["commits_url"],
            "statuses_url": pr["statuses_url"],
            "pr_author_association": pr["author_association"],
            "__main_from_pr_list": pr
        })

        # Record each number once so repeated runs keep the global list duplicate-free.
        if pr["number"] not in known_numbers:
            known_numbers.add(pr["number"])
            pr_numbers.append(pr["number"])

    return processed_prs

def get_pull_requests(owner, repo):
    """Return the repository's processed pull requests via ``get_data_with_backup``.

    The helper receives the backup file name ``{owner}_{repo}_pull_requests.csv``
    and a zero-argument callable that runs ``fetch_and_process_pull_requests``.

    NOTE(review): ``get_data_with_backup`` must be defined before this is
    called — confirm it is available as live code in the notebook.
    """
    backup_file = f"{owner}_{repo}_pull_requests.csv"

    def load_from_api():
        return fetch_and_process_pull_requests(owner, repo)

    return get_data_with_backup(backup_file, load_from_api)

# pull_requests = get_pull_requests(owner, repo)


In [None]:
def extract_review_comment_data(review_comment):
    """Flatten one pull-request review comment object into a record.

    Args:
        review_comment: Mapping with the keys ``url``,
            ``pull_request_review_id``, ``id``, ``path``, ``body``, ``user``
            (itself holding ``login`` and ``html_url``), ``created_at``,
            ``updated_at`` and ``pull_request_url``.

    Returns:
        dict: The selected fields under this notebook's own key names.
    """
    # (output key, source key) pairs for the fields copied over verbatim.
    leading_fields = (
        ("comment_url", "url"),
        ("review_id", "pull_request_review_id"),
        ("comment_id", "id"),
        ("file_path", "path"),
        ("comment_body", "body"),
    )
    record = {target: review_comment[source] for target, source in leading_fields}

    author = review_comment["user"]
    record["author_login"] = author["login"]
    record["author_url"] = author["html_url"]

    for timestamp_key in ("created_at", "updated_at"):
        record[timestamp_key] = review_comment[timestamp_key]

    record["pr_url"] = review_comment["pull_request_url"]
    return record

def extract_conversation_data(conversation):
    """Flatten one issue/pull-request conversation comment into a record.

    Args:
        conversation: Mapping with the keys ``url``, ``html_url``, ``user``
            (itself holding ``login`` and ``html_url``), ``created_at``,
            ``updated_at`` and ``body``.

    Returns:
        dict: The selected fields under this notebook's own key names.
    """
    record = {
        "comment_url": conversation["url"],
        "html_url": conversation["html_url"],
    }

    author = conversation["user"]
    record["author_login"] = author["login"]
    record["author_url"] = author["html_url"]

    for timestamp_key in ("created_at", "updated_at"):
        record[timestamp_key] = conversation[timestamp_key]

    record["comment_body"] = conversation["body"]
    return record

def get_pr_review_comments(owner, repo, pull_number):
    """Return the flattened review comments of one pull request.

    Requests ``pulls/{pull_number}/comments`` through ``fetch_github_data`` and
    maps every item through ``extract_review_comment_data``. An empty list is
    returned when the fetch yields ``None``.

    NOTE(review): this passes `repo=`; confirm fetch_github_data accepts that keyword.
    """
    raw_comments = fetch_github_data(endpoint=f"pulls/{pull_number}/comments", repo=f"{owner}/{repo}")
    if raw_comments is None:
        return []
    return list(map(extract_review_comment_data, raw_comments))

def get_pr_conversations(owner, repo, pull_number):
    """Return the flattened conversation comments of one pull request.

    Requests ``issues/{pull_number}/comments`` through ``fetch_github_data``
    and maps every item through ``extract_conversation_data``. An empty list
    is returned when the fetch yields ``None``.

    NOTE(review): this passes `repo=`; confirm fetch_github_data accepts that keyword.
    """
    raw_conversations = fetch_github_data(endpoint=f"issues/{pull_number}/comments", repo=f"{owner}/{repo}")
    if raw_conversations is None:
        return []
    return list(map(extract_conversation_data, raw_conversations))
        

In [None]:
def get_all_pr_data(owner, repo, pr_numbers):
    """Collect review comments and conversation comments for several pull requests.

    Args:
        owner: Repository owner login.
        repo: Repository name.
        pr_numbers: Iterable of pull-request numbers to look up.

    Returns:
        tuple[list, list]: ``(review_comments, conversations)``, each holding
        the records of all requested pull requests in iteration order.
    """
    all_review_comments, all_conversations = [], []

    for number in pr_numbers:
        # Review comments are requested before conversations for each pull request.
        all_review_comments += get_pr_review_comments(owner, repo, number)
        all_conversations += get_pr_conversations(owner, repo, number)

    return all_review_comments, all_conversations

def get_pr_messages(owner, repo):
    """Return review comments and conversations for the pull requests in ``pr_numbers``.

    When both CSV backups exist under ``DATA_DIRECTORY`` and
    ``is_backup_required`` is truthy, the data is read from those files.
    Otherwise it is fetched with ``get_all_pr_data`` for the module-level
    ``pr_numbers`` and both CSV files are written.

    NOTE(review): ``DATA_DIRECTORY``, ``is_backup_required``,
    ``read_csv_to_array`` and ``create_csv_file`` must be defined before this
    is called — confirm they are available as live code in the notebook.

    Returns:
        tuple: ``(pr_review_comments, pr_conversations)``.
    """
    review_comments_file = f"{owner}_{repo}_pr_review_comments.csv"
    pr_conversations_file = f"{owner}_{repo}_pr_conversations.csv"

    # Both existence checks run up front, before the decision is taken.
    backups_present = [
        os.path.exists(os.path.join(DATA_DIRECTORY, file_name))
        for file_name in (review_comments_file, pr_conversations_file)
    ]

    if not (all(backups_present) and is_backup_required):
        logging.info(f"File '{review_comments_file}' does not exist. Fetching data from the GitHub API.")
        pr_review_comments, pr_conversations = get_all_pr_data(owner, repo, pr_numbers)
        create_csv_file(pr_review_comments, review_comments_file)
        create_csv_file(pr_conversations, pr_conversations_file)
        logging.info(f"{review_comments_file} and {pr_conversations_file} created with backup data.")
        return pr_review_comments, pr_conversations

    logging.info(f"File '{review_comments_file}' exists. Reading data from the file.")
    pr_review_comments = read_csv_to_array(review_comments_file)
    pr_conversations = read_csv_to_array(pr_conversations_file)
    return pr_review_comments, pr_conversations
    

In [None]:
# pr_review_comments, pr_conversations = get_pr_messages(owner,repo)

In [None]:
# print(json.dumps(pr_review_comments, indent = 4))

In [None]:
# print(json.dumps(pr_conversations, indent = 4))

Every pull request is an issue, but not every issue is a pull request. For this reason, "shared" actions for both features, like managing assignees, labels, and milestones, are provided within the Issues endpoints.


1. The node ID typically starts with `PR_`, signaling it's a pull request. The node ID typically starts with `I_`, signaling it's an issue.
2. Both pull requests and issues have a timeline_url to track the events associated with them (comments, status changes, etc.).
3. The html_url contains /pull/ in the path, indicating it is a pull request. The html_url contains /issues/, indicating it is an issue.
4. Presence of the `pull_request` key. `pull_request` should be `None` for issues and should exist for pull requests


In [None]:
from datetime import datetime

# Timestamp layout expected on timeline events, e.g. "2024-01-31T12:00:00Z".
_TIMELINE_TIMESTAMP_FORMAT = "%Y-%m-%dT%H:%M:%SZ"


def _dig(mapping, *keys):
    """Follow ``keys`` through nested dicts; return None once a level is missing or not a dict."""
    current = mapping
    for key in keys:
        if not isinstance(current, dict):
            return None
        current = current.get(key)
    return current


def _parse_event_timestamp(event):
    """Return the event's timestamp as a naive datetime, or None when it has none that parses."""
    # Not every event carries "created_at"; fall back to the other date fields
    # an event may expose (review submission time, commit committer/author date).
    raw_timestamp = (
        event.get("created_at")
        or event.get("submitted_at")
        or _dig(event, "committer", "date")
        or _dig(event, "author", "date")
    )
    if not raw_timestamp:
        return None

    try:
        return datetime.strptime(raw_timestamp, _TIMELINE_TIMESTAMP_FORMAT)
    except (TypeError, ValueError):
        logging.warning(f"Unrecognised timeline timestamp: {raw_timestamp!r}")
        return None


def _build_event_metadata(event):
    """Return the type-specific metadata dict for an event, or None when there is none."""
    event_type = event.get("event")

    if event_type == "commented":
        return {
            "comment_body": event.get("body"),
            "comment_id": event.get("id")
        }

    if event_type == "labeled":
        return {
            "label_name": _dig(event, "label", "name"),
            "label_color": _dig(event, "label", "color")
        }

    if event_type == "cross-referenced":
        source_issue = _dig(event, "source", "issue") or {}
        pull_request = source_issue.get("pull_request")

        # Only cross-references that originate from a pull request get metadata.
        if not pull_request:
            return None

        return {
            "source_issue_url": source_issue.get("html_url"),
            "pull_request_url": _dig(pull_request, "html_url"),
            "pull_request_number": source_issue.get("number"),
            # NOTE(review): the "repository" key is filled with the source issue's
            # body — confirm whether a repository identifier was intended.
            "repository": source_issue.get("body", ""),
            "source_issue": source_issue
        }

    if event_type == "assigned":
        return {
            "assignee": _dig(event, "assignee", "login"),
            "assigner": _dig(event, "assigner", "login")
        }

    if event_type == "unassigned":
        return {
            "unassignee": _dig(event, "assignee", "login"),
            "unassigner": _dig(event, "assigner", "login")
        }

    if event_type == "closed":
        return {
            "closed_by": _dig(event, "actor", "login"),
            "commit_id": event.get("commit_id"),
            "commit_url": event.get("commit_url")
        }

    if event_type == "reopened":
        return {
            "reopened_by": _dig(event, "actor", "login"),
            "created_at": event.get("created_at")
        }

    if event_type == "merged":
        return {
            "merged_by": _dig(event, "actor", "login"),
            "commit_id": event.get("commit_id"),
            "commit_url": event.get("commit_url")
        }

    if event_type == "milestoned":
        return {
            "milestone_title": _dig(event, "milestone", "title"),
            "actor": _dig(event, "actor", "login")
        }

    if event_type == "review_requested":
        return {
            "reviewer": _dig(event, "requested_reviewer", "login"),
            "requested_by": _dig(event, "review_requester", "login")
        }

    if event_type == "connected":
        return {
            "issue_url": _dig(event, "source", "issue", "html_url"),
            "pull_request_url": _dig(event, "source", "pull_request", "html_url")
        }

    return None


def process_timeline_with_metadata(timeline):
    """Normalise timeline events and attach type-specific metadata.

    Args:
        timeline: Iterable of timeline event dicts, or ``None`` (treated as an
            empty timeline, e.g. when the fetch failed).

    Returns:
        list[dict]: One record per event with ``event_type``, actor fields,
        ``timestamp`` (naive ``datetime`` or ``None``), ``event_url`` and
        ``performed_via_github_app``; recognised event types also get a
        ``metadata`` dict. Records are sorted by timestamp, with events lacking
        a usable timestamp placed last in their original relative order.
    """
    processed_events = []

    for event in timeline or []:
        processed_event = {
            "event_type": event.get("event"),
            # _dig tolerates an actor that is missing or explicitly null.
            "actor_login": _dig(event, "actor", "login"),
            "actor_id": _dig(event, "actor", "id"),
            "actor_avatar_url": _dig(event, "actor", "avatar_url"),
            "timestamp": _parse_event_timestamp(event),
            "event_url": event.get("url"),
            "performed_via_github_app": event.get("performed_via_github_app", False)
        }

        event_metadata = _build_event_metadata(event)
        if event_metadata is not None:
            processed_event['metadata'] = event_metadata

        processed_events.append(processed_event)

    # None cannot be compared with datetime, so undated events sort after dated ones.
    processed_events.sort(
        key=lambda item: (item["timestamp"] is None, item["timestamp"] or datetime.min)
    )
    return processed_events


# timeline = process_timeline_with_metadata(fetch_github_data(endpoint="issues/42256/timeline", repo=f"{owner}/{repo}"))
# print(json.dumps(timeline, indent=4, sort_keys=True, default=str))

In [None]:
# import requests
# import json

# GITHUB_TOKEN = GITHUB_ACCESS_TOKEN
# GITHUB_API_URL = "https://api.github.com/graphql"

# def get_issue_timeline(owner, repo, issue_number):
#     timeline_items = []
#     has_next_page = True
#     cursor = None
    
#     while has_next_page:
#         query = f"""
#         {{
#           repository(owner: "{owner}", name: "{repo}") {{
#             issue(number: {issue_number}) {{
#               title
#               body
#               comments(first: 100) {{
#                 nodes {{
#                   bodyText
#                   author {{
#                     login
#                   }}
#                 }}
#               }}
#               timelineItems(first: 100{', after: "' + cursor + '"' if cursor else ''}) {{
#                 pageInfo {{
#                   hasNextPage
#                   endCursor
#                 }}
#                 nodes {{
#                   __typename
#                   ... on ClosedEvent {{
#                     id
#                     createdAt
#                     url
#                     closer {{
#                       __typename
#                       ... on Commit {{
#                         message
#                         url
#                       }}
#                       ... on PullRequest {{
#                         number
#                         title
#                         url
#                       }}
#                     }}
#                   }}
#                   ... on CrossReferencedEvent {{
#                     actor {{
#                       login
#                     }}
#                     createdAt
#                     id
#                     isCrossRepository
#                     referencedAt
#                     resourcePath
#                     url
#                     willCloseTarget
#                     source {{
#                       __typename
#                       ... on PullRequest {{
#                         number
#                         title
#                         url
#                       }}
#                       ... on Issue {{
#                         number
#                         title
#                         url
#                       }}
#                     }}
#                     target {{
#                       __typename
#                       ... on PullRequest {{
#                         number
#                         title
#                         url
#                       }}
#                       ... on Issue {{
#                         number
#                         title
#                         url
#                       }}
#                     }}
#                   }}
#                 }}
#               }}
#             }}
#           }}
#         }}
#         """

#         headers = {
#             "Authorization": f"Bearer {GITHUB_TOKEN}"
#         }

#         response = requests.post(GITHUB_API_URL, json={"query": query}, headers=headers)

#         if response.status_code == 200:
#             data = response.json()
#             issue_data = data["data"]["repository"]["issue"]
#             timeline_data = issue_data["timelineItems"]
#             timeline_items.extend(timeline_data["nodes"])  # Collect the nodes
            
#             page_info = timeline_data["pageInfo"]
#             has_next_page = page_info["hasNextPage"]
#             cursor = page_info["endCursor"]
            
#         else:
#             print(f"Query failed with status code {response.status_code}: {response.text}")
#             break

#     return timeline_items

# owner = "freeCodeCamp"
# repo = "freeCodeCamp"
# issue_number = 42256

# timeline_items = get_issue_timeline(owner, repo, issue_number)
# print(json.dumps(timeline_items, indent=4))


In [None]:
# !pip install openai

# import os
# from openai import OpenAI

# client = OpenAI(
#     # This is the default and can be omitted
#     api_key="",
# )

# chat_completion = client.chat.completions.create(
#     messages=[
#         {
#             "role": "user",
#             "content": "Say this is a test",
#         }
#     ],
#     model="gpt-3.5-turbo",
# )

# print(chat_completion)

In [None]:
# DATA_DIRECTORY = "data"
# os.makedirs(DATA_DIRECTORY, exist_ok=True)

# def create_csv_file(data_list, file_name):
#     file_path = os.path.join(DATA_DIRECTORY, file_name)
#     with open(file_path, mode='w', newline='') as csv_file:
#         csv_writer = csv.DictWriter(csv_file, fieldnames=data_list[0].keys())
#         csv_writer.writeheader()
#         csv_writer.writerows(data_list)
#     logging.info(f"CSV file '{file_path}' created successfully.")

# def read_csv_to_array(file_name):
#     data_list = []
#     file_path = os.path.join(DATA_DIRECTORY, file_name)
    
#     try:
#         with open(file_path, mode='r') as csv_file:
#             csv_reader = csv.DictReader(csv_file)
#             for row in csv_reader:
#                 data_list.append(row)
#         logging.info(f"Data successfully read from '{file_path}' into array.")
#     except FileNotFoundError:
#         logging.error(f"File '{file_path}' not found.")
#     except Exception as error:
#         logging.error(f"An error occurred while reading '{file_path}': {error}")
    
#     return data_list

# def get_data_with_backup(filename, cb):
#     csv_path = os.path.join(DATA_DIRECTORY, filename)
    
#     if os.path.exists(csv_path) and is_backup_required:
#         logging.info(f"File '{csv_path}' exists. Reading data from the file.")
#         return read_csv_to_array(filename)
#     else:
#         logging.info(f"File '{csv_path}' does not exist. Executing GitHub API.")
#         data = cb()
#         if data:
#             create_csv_file(data, filename)
#         return data

In [None]:
# !pyenv install 3.12.4
# !pyenv local 3.12.4
# !python -m venv .venv
# !source .venv/bin/activate

In [None]:
# github_url = "https://github.com/apache/kafka"
# github_url = "https://github.com/midday-ai/v1"
# github_url = "https://github.com/midday-ai/midday"
# github_url = "https://github.com/LetsTrie/Grind-75"
# github_url = "https://github.com/jonogon/jonogon-mono"
# github_url = "https://github.com/OptimalBits/bull"
# github_url = "https://github.com/freeCodeCamp/freeCodeCamp"