In [None]:
# Target repository in "owner/name" form; it is interpolated into the search
# string as `repo:<name>` and used (with "/" replaced) for the output paths.
repository_name = 'tensorflow/tensorflow'
# repository_name = 'zero-ai-prs'
# Lower bound of the PR search window, parsed later with "%Y-%m-%dT%H:%M:%S".
# NOTE(review): the name suggests the repository's creation time — confirm the
# value actually matches the repository selected above.
repository_created_at = "2024-11-08T05:07:28"

In [None]:
import os

# Filesystem-friendly form of the repository name ("/" becomes "__").
parsed_repo_name = repository_name.replace("/", "__")
# Per-repository output directory, relative to the current working directory.
os.makedirs(f"../datasets/{parsed_repo_name}", exist_ok=True)
# Pickle file that later cells use to save and reload the fetch progress.
pkl_filename = f'../datasets/{parsed_repo_name}/{parsed_repo_name}-progress.pkl'

In [None]:
import requests
import json
import itertools
import datetime
from gql import gql, Client
from gql.transport.requests import RequestsHTTPTransport
import pandas as pd
import random
from tqdm import tqdm
import time

# Read tokens from a text file (one token per line)
tokens_file = "./env/tokens.txt"
with open(tokens_file, "r") as file:
    # Strip surrounding whitespace and drop blank lines so that an empty or
    # padded line never turns into an invalid "Bearer " credential.
    tokens = [line.strip() for line in file if line.strip()]

if not tokens:
    # Without this guard random.randint(0, -1) below fails with an error that
    # says nothing about the missing tokens.
    raise ValueError(f"No API tokens found in {tokens_file}")

# Choose a random start index
# (the name `start_index` is reassigned later by the progress-loading cell)
start_index = random.randint(0, len(tokens) - 1)

# Rotate the tokens list starting at a random position
rotated_tokens = tokens[start_index:] + tokens[:start_index]

# Create an infinite cycle iterator from the rotated list
token_iterator = itertools.cycle(rotated_tokens)

# Get the first token
current_token = next(token_iterator)

In [None]:
# List of User-Agents for randomization
# One entry is picked at random for the initial headers; execute_query and
# test_all_tokens pick again for each request they prepare.
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.1 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.1 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
]

# Define headers to authenticate using the first token
# This dict is shared module state: execute_query mutates it in place when it
# changes the User-Agent or rotates to another token.
headers = {
    "Authorization": f"Bearer {current_token}",
    "User-Agent": random.choice(user_agents),
}

# Setup GraphQL endpoint and client
# `transport` and `client` are module-level objects reused by the later cells.
# NOTE(review): fetch_schema_from_transport=True presumably makes gql download
# the API schema through the transport — confirm the extra request is wanted.
graphql_url = "https://api.github.com/graphql"
transport = RequestsHTTPTransport(url=graphql_url, headers=headers, use_json=True)
client = Client(transport=transport, fetch_schema_from_transport=True)

In [None]:
def log_activity(activity: str):
    """Append one timestamped line describing ``activity`` to the repository's output log.

    The log lives next to the other per-repository artefacts and is opened in
    append mode on every call, so earlier entries are preserved.
    """
    stamped_line = f"{datetime.datetime.now()}: {activity}\n"
    log_path = f"../datasets/{parsed_repo_name}/{parsed_repo_name}-output.log"
    with open(log_path, "a") as sink:
        sink.write(stamped_line)

In [None]:
# Test all tokens to verify their validity
def test_all_tokens():
    """Run a minimal ``viewer { login }`` query with every token and log the outcome.

    Each token gets its own transport and client, so the shared module-level
    ``client`` is left untouched. Failures are logged, never raised.
    """
    viewer_query = gql(
        """
        {
          viewer {
            login
          }
        }
        """
    )
    total = len(rotated_tokens)
    for position, candidate in enumerate(rotated_tokens, start=1):
        probe_transport = RequestsHTTPTransport(
            url=graphql_url,
            headers={
                "Authorization": f"Bearer {candidate}",
                "User-Agent": random.choice(user_agents),
            },
            use_json=True,
        )
        probe_client = Client(
            transport=probe_transport, fetch_schema_from_transport=True
        )

        try:
            viewer = probe_client.execute(viewer_query)["viewer"]
            log_activity(
                f"Token {position}/{total} is valid. Logged in as: {viewer['login']}"
            )
        except Exception as e:
            log_activity(f"Token {position}/{total} failed with error: {e}")


# Run the token validation
# Runs at cell execution time: the viewer query is executed once per token and
# every result (valid or failed) is written to the log file.
test_all_tokens()

In [None]:
# Define the GraphQL query
# Search query executed by execute_query. Variables:
#   $keyword     - the search string
#   $afterCursor - pagination cursor (may be omitted for the first page)
#   $first       - page size
# For results that are pull requests it selects PR, repository, author and label
# fields. Nested lists are capped by the query itself: at most 20 languages,
# 20 organizations and 10 labels per pull request are returned.
# `number`, `timeline`, `headRefName` and `baseRefName` are requested here but
# are not copied into the records built by result_processor.
# NOTE(review): `timeline` may be deprecated in favour of `timelineItems` —
# confirm against the current GitHub schema.
query_template = gql(
    """
    query searchIssues($keyword: String!, $afterCursor: String, $first: Int) {
      search(query: $keyword, type: ISSUE, first: $first, after: $afterCursor) {
        issueCount
        edges {
          cursor
          node {
            ... on PullRequest {
              id
              number
              title
              url
              comments {
                totalCount
              }
              state
              closed
              merged
              createdAt
              updatedAt
              closedAt
              deletions
              mergeCommit {
                oid
              }
              timeline {
                totalCount
              }
              commits {
                totalCount
              }
              changedFiles
              headRefName
              baseRefName
              repository {
                id
                nameWithOwner
                stargazerCount
                description
                codeOfConduct {
                  body
                  id
                  name
                  url
                }
                homepageUrl
                assignableUsers {
                  totalCount
                }
                mentionableUsers {
                  totalCount
                }
                forkCount
                watchers {
                  totalCount
                }
                isFork
                languages(first: 20) {
                  edges {
                    node {
                      name
                    }
                  }
                }
              }
              author {
                ... on User {
                  login
                  url
                  createdAt
                  repositories {
                    totalCount
                  }
                  followers {
                    totalCount
                  }
                  following {
                    totalCount
                  }
                  repositoryDiscussions {
                    totalCount
                  }
                  repositoryDiscussionComments {
                    totalCount
                  }
                  organizations (first: 20){
                    edges {
                      node {
                        name
                        login
                        url
                        membersWithRole {
                          totalCount
                        }
                      }
                    }
                  }
                }
              }
              labels(first: 10) {
                edges {
                  node {
                    name
                  }
                }
              }
              body
              bodyHTML
              bodyText
            }
          }
        }
        pageInfo {
          endCursor
          hasNextPage
        }
      }
    }
    """
)

In [None]:
# Re-install the shared headers dict on the transport before probing.
transport.headers = headers
# Check rate limit before executing the main query
# One-off probe executed at cell run time; the returned `rateLimit` object
# (limit, remaining, used, resetAt) is written to the log file.
rate_limit_query = gql(
    """
    query {
      viewer {
        login
      }
      rateLimit {
        limit
        remaining
        used
        resetAt
      }
    }
    """
)
rate_limit_response = client.execute(rate_limit_query)
log_activity(f"Rate limit: {rate_limit_response['rateLimit']}")

In [None]:
# Rate-limit probe used by execute_query before every main query and after every
# token switch. The query text is the same as in the previous cell; defining it
# here as well means this cell does not depend on that cell having been run.
rate_limit_query = gql(
    """
    query {
      viewer {
        login
      }
      rateLimit {
        limit
        remaining
        used
        resetAt
      }
    }
    """
)


def _switch_to_next_token():
    """Advance the token cycle and install the new token on the shared headers/transport."""
    global current_token
    current_token = next(token_iterator)
    headers["Authorization"] = f"Bearer {current_token}"
    transport.headers = headers


def execute_query(keyword, first=100, after_cursor=None):
    """Run ``query_template`` for ``keyword`` and return the response, retrying until it succeeds.

    Before each attempt the current token's remaining quota is probed; tokens
    with too little quota are rotated out, and if every token is low the
    function sleeps for an hour. Errors mentioning "API rate limit" trigger a
    token switch; any other error halves the page size. Once the page size is
    already 1 and the query still fails, the function sleeps for 30 minutes,
    switches token and starts another round.

    Retry rounds are implemented as a loop rather than by calling this function
    recursively, so a long outage cannot grow the call stack.

    :param keyword: Search string passed as the ``$keyword`` variable.
    :param first: Requested page size; may be reduced internally after errors.
    :param after_cursor: Pagination cursor, or None for the first page.
    :return: The dict returned by ``client.execute`` for the search query.
    """
    min_remaining_quota = 100          # below this a token is considered exhausted
    all_tokens_low_sleep_seconds = 3600
    failed_round_sleep_seconds = 1800

    while True:  # one iteration per retry round
        log_activity(
            f"Executing query with keyword: {keyword}, first: {first}, afterCursor: {after_cursor}"
        )
        while True:
            try:
                # Randomize User-Agent for each query
                headers["User-Agent"] = random.choice(user_agents)
                transport.headers = headers
                # Check rate limit before executing the main query
                rate_limit_response = client.execute(rate_limit_query)
                remaining = rate_limit_response["rateLimit"]["remaining"]
                if remaining < min_remaining_quota:
                    log_activity(
                        f"Rate limit remaining ({remaining}) is below threshold. Switching token..."
                    )
                    # Remember where the rotation started to detect a full cycle
                    initial_token = current_token

                    while True:
                        _switch_to_next_token()

                        # Check the rate limit of the new token
                        rate_limit_response = client.execute(rate_limit_query)
                        remaining = rate_limit_response["rateLimit"]["remaining"]

                        if remaining >= min_remaining_quota:
                            log_activity(
                                f"Switched to a new token with sufficient rate limit ({remaining} remaining)."
                            )
                            break

                        # Back at the starting token: every token is low
                        if current_token == initial_token:
                            log_activity("All tokens are below threshold. Waiting for 1 hour...")
                            time.sleep(all_tokens_low_sleep_seconds)
                            break

                    # Re-probe the selected token before running the main query
                    continue
                return client.execute(
                    query_template,
                    variable_values={
                        "keyword": keyword,
                        "first": first,
                        "afterCursor": after_cursor,
                    },
                )
            except Exception as e:
                if "API rate limit" in str(e):
                    log_activity(
                        f"Rate limit reached: {e}, switching token... (Attempt with first {first})"
                    )
                    _switch_to_next_token()
                elif first > 1:
                    first = max(1, first // 2)
                    log_activity(
                        f"Error: {e}, reducing number of results and retrying... (Attempt with first {first})"
                    )
                else:
                    log_activity(f"Query failed completely after retries: {e}")
                    break
        log_activity("Max retries reached. Sleeping for 30 minutes and switching token...")
        time.sleep(failed_round_sleep_seconds)
        _switch_to_next_token()

In [None]:
import pickle
# Reload previously saved progress if the pickle exists, otherwise start empty.
# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# pickles that this notebook wrote itself.
if os.path.exists(pkl_filename):
    with open(pkl_filename, "rb") as f:
        progress_data = pickle.load(f)
        # Previously accumulated records; result_processor appends to `df`.
        df = progress_data["df"]
        # Overwrites the `start_index` set by the token-rotation cell. In the
        # visible code this value is only written back into the pickle; it is
        # not used to skip date ranges that were already fetched.
        start_index = progress_data["start_index"]
else:
    df = []
    start_index = 0

In [None]:
# df = []
# start_index = 0

In [None]:
import datetime
import pickle


def execute_with_dynamic_date_range(
    repo_name,
    execute_query,
    process_results,
    start_date_arg,
    se_fm_repository_data,
    max_total_allowed_results=950,
    default_days_interval=60,
):
    """
    Searches a repository's pull requests in consecutive creation-date windows,
    shrinking a window whenever the search reports too many hits.

    Progress is pickled to the module-level ``pkl_filename`` after every window
    and once more before an error is re-raised.

    :param repo_name: Repository in "owner/name" form, used in the ``repo:`` qualifier.
    :param execute_query: Function to execute the query; called as
        ``execute_query(keyword, first=..., after_cursor=...)`` and expected to
        return the response dict containing ``search``.
    :param process_results: Function to process the query results (one call per page).
    :param start_date_arg: Start date in ISO format ("%Y-%m-%dT%H:%M:%S").
    :param se_fm_repository_data: Object stored under the "df" key of the progress pickle.
    :param max_total_allowed_results: Max allowed results before reducing date range.
    :param default_days_interval: Initial days interval for date range.
    :raises Exception: Any error raised while fetching or processing a window is
        logged, progress is saved, and the error is re-raised.
    """
    # Smallest window in days (0.00069 * 1440 is just under one minute); the
    # date_range string below only has minute resolution.
    min_days_interval = 0.00069

    def save_progress(index):
        # Always persist the collection that was handed to this function.
        with open(pkl_filename, "wb") as f:
            pickle.dump({"df": se_fm_repository_data, "start_index": index}, f)

    start_date = datetime.datetime.strptime(start_date_arg, "%Y-%m-%dT%H:%M:%S")
    end_date: datetime.datetime = datetime.datetime.now()
    days_interval = default_days_interval

    while start_date < end_date:
        next_date_candidate = start_date + datetime.timedelta(days=days_interval)
        next_date = min(next_date_candidate, end_date)
        date_range = None  # bound up front so the error log can always format it

        try:
            after_cursor = None
            while True:
                date_range = f"{start_date.strftime('%Y-%m-%dT%H:%M')}..{next_date.strftime('%Y-%m-%dT%H:%M')}"
                search_keyword = f"is:pr is:public archived:false created:{date_range} repo:{repo_name}"
                response = execute_query(
                    search_keyword, first=10, after_cursor=after_cursor
                )
                issue_count = response["search"]["issueCount"]
                log_activity(f'response count: {issue_count}\n')

                if issue_count == 0:
                    days_interval = default_days_interval  # Reset interval
                    break

                # Adjust interval if issue count exceeds max allowed
                if issue_count > max_total_allowed_results:
                    reduced_interval = (
                        max(1, days_interval // 2)
                        if days_interval > 1
                        else max(min_days_interval, days_interval / 2)
                    )
                    if reduced_interval < days_interval:
                        log_activity(f"Reducing interval to {reduced_interval} days...")
                        days_interval = reduced_interval
                        next_date = start_date + datetime.timedelta(days=days_interval)
                        # The query changes, so a cursor from the wider window is stale.
                        after_cursor = None
                        continue
                    # Already at the smallest window: retrying the same query
                    # would loop forever, so take what the search returns.
                    log_activity(
                        f"Interval cannot be reduced below {days_interval} days; "
                        f"processing the results returned for {date_range}."
                    )

                # Process results
                process_results(
                    response,
                )

                # Pagination
                page_info = response["search"]["pageInfo"]
                if page_info["hasNextPage"]:
                    after_cursor = page_info["endCursor"]
                else:
                    break
            save_progress(start_index + 1)

            # Reset interval to default after a successful run
            days_interval = default_days_interval
        except Exception as e:
            log_activity(
                f"Error fetching data for '{repo_name}' in range {date_range}: {e}"
            )
            # Save progress before terminating
            save_progress(start_index)
            raise

        start_date = next_date  # Move to the next date interval

In [None]:
import pandas as pd


def result_processor(
    response,
):
    """Flatten every pull request in a search response page and append it to ``df``.

    Edges whose node is empty are skipped. Author-related values are None (or an
    empty list for organizations) when the author object is missing or empty.
    Author counters are stored under keys suffixed with the current date, since
    they describe the author at fetch time rather than at PR creation time.
    """
    for edge in response["search"]["edges"]:
        pr = edge["node"]
        if not pr:
            continue

        author = pr["author"]

        def author_total(field):
            # totalCount of an author connection, or None when unavailable
            return author[field]["totalCount"] if author and author[field] else None

        def author_value(field):
            return author[field] if author else None

        organization_edges = (
            (author.get("organizations") or {}).get("edges") if author else None
        )
        if organization_edges:
            author_organizations = [
                org_edge["node"] for org_edge in organization_edges if org_edge["node"]
            ]
        else:
            author_organizations = []

        repo = pr["repository"]
        merge_commit = pr["mergeCommit"]
        timestamp_suffix = f"_as_at_{datetime.datetime.now().strftime('%Y-%m-%d')}"

        record = {
            "id": pr["id"],
            "title": pr["title"],
            "url": pr["url"],
            "state": pr["state"],
            "comments_count": pr["comments"]["totalCount"],
            "deletions": pr["deletions"],
            "closed": pr["closed"],
            "closed_at": pr["closedAt"],
            "merged": pr["merged"],
            "body": pr["body"],
            "bodyHTML": pr["bodyHTML"],
            "bodyText": pr["bodyText"],
            "created_at": pr["createdAt"],
            "updated_at": pr["updatedAt"],
            "repository": repo,
            "repository_name_with_owner": repo["nameWithOwner"],
            "repository_stargazer_count": repo["stargazerCount"],
            "repository_watcher_count": repo["watchers"]["totalCount"],
            "repository_is_fork": repo["isFork"],
            "repository_languages": [
                lang_edge["node"]["name"] for lang_edge in repo["languages"]["edges"]
            ],
            "merge_commit": merge_commit["oid"] if merge_commit else None,
            "labels": [
                label_edge["node"]["name"] for label_edge in pr["labels"]["edges"]
            ],
            "commits_count": pr["commits"]["totalCount"],
            "changed_files_count": pr["changedFiles"],
            "author_name": author_value("login"),
            "author_url": author_value("url"),
            "author_account_created_at": author_value("createdAt"),
            f"author_repository_count{timestamp_suffix}": author_total("repositories"),
            f"author_followers_count{timestamp_suffix}": author_total("followers"),
            f"author_following_count{timestamp_suffix}": author_total("following"),
            f"author_repository_discussions_count{timestamp_suffix}": author_total(
                "repositoryDiscussions"
            ),
            f"author_repository_discussion_comments_count{timestamp_suffix}": author_total(
                "repositoryDiscussionComments"
            ),
            f"author_organizations{timestamp_suffix}": author_organizations,
        }
        df.append(record)


# Start (or resume) the fetch for the configured repository. `df` is passed as
# the collection to pickle; result_processor appends to that same global list.
execute_with_dynamic_date_range(
    repo_name=repository_name,
    execute_query=execute_query,
    process_results=result_processor,
    start_date_arg=repository_created_at,
     se_fm_repository_data=df,
)

In [None]:
import pandas as pd
import pickle

# Load the .pkl file
# NOTE(review): pickle.load can execute arbitrary code; only load pickles that
# this notebook wrote itself.
with open(pkl_filename, "rb") as file:
    data = pickle.load(file)

stored_records = data["df"]

# Local names are used on purpose: the global `df` stays the list that the
# fetch cells append to, so those cells can be re-run after this one.
if isinstance(stored_records, pd.DataFrame):
    # Already a DataFrame: remove duplicates by 'id' (nothing to do without that column)
    if "id" in stored_records.columns:
        deduplicated = stored_records.drop_duplicates(subset="id", keep="first")
    else:
        deduplicated = stored_records
elif isinstance(stored_records, list) and all(
    isinstance(d, dict) for d in stored_records
):
    if stored_records:
        # Convert to DataFrame, remove duplicates by 'id', convert back to records
        deduplicated = (
            pd.DataFrame(stored_records)
            .drop_duplicates(subset="id", keep="first")
            .to_dict(orient="records")
        )
    else:
        # An empty frame has no 'id' column and drop_duplicates would raise
        deduplicated = []
else:
    # Fail before the file is reopened for writing, so the existing progress
    # pickle is not truncated.
    raise TypeError(
        f"Unsupported type for data['df']: {type(stored_records).__name__}"
    )

cleaned_data = {**data, "df": deduplicated}

# Save the cleaned data
with open(pkl_filename, "wb") as file:
    pickle.dump(cleaned_data, file)

log_activity("Duplicates removed and data saved successfully.")

In [None]:
import pickle
import pandas as pd


def save_pkl_content_as_csv_and_json(filepath):
    """
    Read a pickle file and write the data stored under its "df" key to a CSV
    file next to it (same path with the trailing ".pkl" replaced by ".csv").

    NOTE(review): despite the function name, only the CSV file is written here;
    no JSON file is produced — confirm whether JSON output is still wanted.

    Errors are not raised to the caller: any exception while reading the pickle
    or writing the CSV is caught and written to the activity log.

    Args:
        filepath (str): The path to the pickle file to be read.
    """
    try:
        # NOTE(review): pickle.load can execute arbitrary code; only use on
        # files produced by this notebook.
        with open(filepath, "rb") as f:
            data = pickle.load(f)

        # Strip only a trailing ".pkl"; str.replace would also remove ".pkl"
        # occurring earlier in the path (e.g. inside a directory name).
        suffix = ".pkl"
        filename = filepath[: -len(suffix)] if filepath.endswith(suffix) else filepath
        pd.DataFrame(data["df"]).to_csv(f"{filename}.csv", index=True)
        log_activity(f"Data written to {filename}.csv successfully.")
    except Exception as e:
        log_activity(f"An unexpected error occurred: {e}")


# Export the progress pickle of the configured repository.
filepath = pkl_filename
save_pkl_content_as_csv_and_json(filepath)

In [None]:
# generate metadata
# Rebinds the globals used by the fetch cells: from here on `df` collects the
# metadata rows appended by generate_metadata, not pull-request records.
df = []
start_index = 0


def generate_metadata(filepath):
    """Summarise the pull requests stored in a progress pickle and write the summary to disk.

    Computes the total number of pull requests, the number of distinct authors,
    and the merged/closed counts with their ratios. The summary row is appended
    to the module-level ``df`` list, which is then written to
    ``<filepath without .pkl>.meta.csv`` and ``<filepath without .pkl>.meta.json``.

    Errors are not raised to the caller; they are written to the activity log.

    Args:
        filepath (str): Path to the pickle file whose "df" entry holds the pull requests.
    """
    try:
        # NOTE(review): pickle.load can execute arbitrary code; only use on
        # files produced by this notebook.
        with open(filepath, "rb") as f:
            data = pickle.load(f)

        # Strip only a trailing ".pkl" (str.replace would hit every occurrence)
        suffix = ".pkl"
        stem = filepath[: -len(suffix)] if filepath.endswith(suffix) else filepath
        filename = stem + ".meta"

        pull_requests = data["df"]
        if isinstance(pull_requests, pd.DataFrame):
            # Iterating a DataFrame yields column names, not rows
            pull_requests = pull_requests.to_dict(orient="records")

        unique_pr_authors = set()
        merged_pr_count = 0
        closed_pr_count = 0
        for pull_request in pull_requests:
            author_name = pull_request["author_name"]
            if author_name:
                unique_pr_authors.add(author_name)
            else:
                log_activity(
                    f"Warning: Pull request missing 'author_name' {pull_request}"
                )

            if pull_request["merged"]:
                merged_pr_count += 1
            # "closed" is a boolean flag, so test its value: an `is not None`
            # check would count every pull request as closed.
            if pull_request["closed"]:
                closed_pr_count += 1

        total_prs = len(pull_requests)
        unique_pr_author_count = len(unique_pr_authors)

        def ratio(count):
            return round(count / total_prs, 3) if total_prs > 0 else 0

        df.append(
            {
                "total_prs": total_prs,
                "unique_pr_author_count": unique_pr_author_count,
                "unique_pr_author_ratio": ratio(unique_pr_author_count),
                "merged_pr_count": merged_pr_count,
                "merged_pr_ratio": ratio(merged_pr_count),
                "closed_pr_count": closed_pr_count,
                "closed_pr_ratio": ratio(closed_pr_count),
            }
        )

        pd.DataFrame(df).to_csv(f"{filename}.csv", index=True)
        log_activity(f"Data written to {filename}.csv successfully.")
        try:
            with open(f"{filename}.json", "w") as f:
                json.dump(df, f, indent=4)
            log_activity(f"Data written to {filename}.json successfully.")
        except Exception as e:
            log_activity(f"An error occurred: {e}")
    except Exception as e:
        log_activity(f"An unexpected error occurred: {e}")


# Build the metadata summary from the progress pickle of the configured repository.
filepath = pkl_filename
generate_metadata(filepath)