In [11]:
import requests
import json
import itertools
import os
from datetime import datetime
from gql import gql, Client
from gql.transport.requests import RequestsHTTPTransport
import pandas as pd
import random
from tqdm import tqdm
import time

# Read tokens from a text file
tokens_file = "./env/tokens.txt"
with open(tokens_file, "r") as file:
    tokens = file.read().splitlines()

# Create an iterator to cycle through the tokens
token_iterator = itertools.cycle(tokens)
current_token = next(token_iterator)

In [12]:
# List of User-Agents for randomization
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.1 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.1 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
]

# Define headers to authenticate using the first token
headers = {
    "Authorization": f"Bearer {current_token}",
    "User-Agent": random.choice(user_agents),
}

# Setup GraphQL endpoint and client
graphql_url = "https://api.github.com/graphql"
transport = RequestsHTTPTransport(url=graphql_url, headers=headers, use_json=True)
client = Client(transport=transport, fetch_schema_from_transport=True)

In [13]:
# Test all tokens to verify their validity
def test_all_tokens():
    test_query = gql(
        """
        {
          viewer {
            login
          }
        }
        """
    )
    for i, token in enumerate(tokens):
        headers = {
            "Authorization": f"Bearer {token}",
            "User-Agent": random.choice(user_agents),
        }
        transport = RequestsHTTPTransport(
            url=graphql_url, headers=headers, use_json=True
        )
        client = Client(transport=transport, fetch_schema_from_transport=True)

        try:
            response = client.execute(test_query)
            print(
                f"Token {i+1}/{len(tokens)} is valid. Logged in as: {response['viewer']['login']}"
            )
        except Exception as e:
            print(f"Token {i+1}/{len(tokens)} failed with error: {e}")


# Run the token validation
test_all_tokens()

Token 1/3 is valid. Logged in as: JosephAyo
Token 2/3 is valid. Logged in as: JosephAyo
Token 3/3 is valid. Logged in as: AY-BAMZ


In [14]:
# Define the GraphQL query
query_template = gql(
    """
    query searchIssues($keyword: String!, $afterCursor: String, $first: Int) {
      search(query: $keyword, type: ISSUE, first: $first, after: $afterCursor) {
        issueCount
        edges {
          cursor
          node {
            ... on PullRequest {
              id
              number
              title
              url
              comments(first: 100) {
                totalCount # it still gives the total count regardless of the first parameter
                edges {
                  node {   
                    author { ... on User { login } }
                    editor { ... on User { login } }
                    body
                    createdAt
                    lastEditedAt
                  }
                }
              }
              state
              closed
              merged
              createdAt
              updatedAt
              mergeCommit {
                oid
              }
              timeline(last: 100) {
                edges {
                  node {
                    __typename
                    ... on LabeledEvent {
                      actor { ... on User { login } }
                      label { ... on Label { name }}
                      createdAt
                    }
                    ... on ClosedEvent { 
                      actor { ... on User { login } }
                      createdAt
                    }
                  }
                }
              }
              commits(first: 100) {
                totalCount
                pageInfo {
                  hasNextPage
                  endCursor
                }
                edges {
                  node {
                    commit {
                      oid
                      message
                      author { ... on GitActor { name } }
                      changedFilesIfAvailable
                      commitUrl
                      committedDate
                    }
                  }
                }
              }
              changedFiles
              headRefName
              baseRefName
              repository {
                nameWithOwner
                stargazerCount
                watchers {
                  totalCount
                }
                isFork
                languages(first: 20) {
                  edges {
                    node {
                      name
                    }
                  }
                }
              }
              author {
                ... on User {
                  login
                  url
                  createdAt
                  repositories {
                    totalCount
                  }
                  followers {
                    totalCount
                  }
                  following {
                    totalCount
                  }
                  repositoryDiscussions {
                    totalCount
                  }
                  repositoryDiscussionComments {
                    totalCount
                  }
                  organizations (first: 20){
                    edges {
                      node {
                        name
                        login
                        url
                        membersWithRole {
                          totalCount
                        }
                      }
                    }
                  }
                }
              }
              labels(first: 10) {
                edges {
                  node {
                    name
                  }
                }
              }
              body
            }
          }
        }
        pageInfo {
          endCursor
          hasNextPage
        }
      }
    }
    """
)

In [15]:
# def get_contributor_count(repo_owner, repo_name):
#     global current_token
#     max_retries = 3
#     retries = 0
#     while retries < max_retries:
#         try:
#             # Randomize User-Agent for each query
#             headers["User-Agent"] = random.choice(user_agents)
#             headers["Authorization"] = f"Bearer {current_token}"
#             url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/contributors?per_page=1&anon=true"
#             response = requests.get(url, headers=headers)
#             if response.status_code == 200:
#                 return int(response.headers.get("Link", "").split(",")[-1].split("&page=")[-1].split(">")[0]) if "Link" in response.headers else len(response.json())
#             elif response.status_code == 403:
#                 print(f"Rate limit exceeded, switching token... (Attempt {retries + 1}/{max_retries})")
#                 current_token = next(token_iterator)
#                 retries += 1
#             else:
#                 response.raise_for_status()
#         except Exception as e:
#             print(f"Error: {e}, retrying... (Attempt {retries + 1}/{max_retries})")
#             retries += 1
#     raise Exception("Max retries reached. Unable to complete the request.")

In [16]:
transport.headers = headers
# Check rate limit before executing the main query
rate_limit_query = gql(
    """
    query {
      viewer {
        login
      }
      rateLimit {
        limit
        remaining
        used
        resetAt
      }
    }
    """
)
rate_limit_response = client.execute(rate_limit_query)
print(f"Rate limit: {rate_limit_response['rateLimit']}")

Rate limit: {'limit': 5000, 'remaining': 4990, 'used': 10, 'resetAt': '2025-02-07T09:50:57Z'}


In [17]:
rate_limit_query = gql(
    """
    query {
      viewer {
        login
      }
      rateLimit {
        limit
        remaining
        used
        resetAt
      }
    }
    """
)


def execute_query(keyword, first=100, after_cursor=None):
    global current_token
    print(
        f"Executing query with keyword: {keyword}, first: {first}, afterCursor: {after_cursor}"
    )
    while True:
        try:
            # Randomize User-Agent for each query
            headers["User-Agent"] = random.choice(user_agents)
            transport.headers = headers
            # Check rate limit before executing the main query
            rate_limit_response = client.execute(rate_limit_query)
            remaining = rate_limit_response["rateLimit"]["remaining"]
            if remaining < 100:
                print(
                    f"Rate limit remaining ({remaining}) is below threshold. Switching token..."
                )
                # Set up to track whether we have cycled through all tokens
                all_tokens_checked = False
                initial_token = current_token

                while not all_tokens_checked:
                    # Switch to the next token
                    current_token = next(token_iterator)
                    headers["Authorization"] = f"Bearer {current_token}"
                    transport.headers = headers

                    # Check the rate limit of the new token
                    rate_limit_response = client.execute(rate_limit_query)
                    remaining = rate_limit_response["rateLimit"]["remaining"]

                    if remaining >= 100:
                        print(
                            f"Switched to a new token with sufficient rate limit ({remaining} remaining)."
                        )
                        break

                    # Check if we have cycled through all tokens
                    if current_token == initial_token:
                        print("All tokens are below threshold. Waiting for 1 hour...")
                        time.sleep(3600)
                        all_tokens_checked = True

                continue
            return client.execute(
                query_template,
                variable_values={
                    "keyword": keyword,
                    "first": first,
                    "afterCursor": after_cursor,
                },
            )
        except Exception as e:
            if "API rate limit" in str(e):
                print(
                    f"Rate limit reached: {e}, switching token... (Attempt with first {first})"
                )
                current_token = next(token_iterator)
                headers["Authorization"] = f"Bearer {current_token}"
            else:
                if first > 1:
                    first = max(1, first // 2)
                    print(
                        f"Error: {e}, reducing number of results and retrying... (Attempt with first {first})"
                    )
                else:
                    break
    print("Max retries reached. Sleeping for 60 minutes and switching token...")
    time.sleep(3600)
    current_token = next(token_iterator)
    headers["Authorization"] = f"Bearer {current_token}"
    transport.headers = headers
    return execute_query(keyword, first, after_cursor)

In [36]:
import pickle

if os.path.exists("non-spam-progress.pkl"):
    with open("non-spam-progress.pkl", "rb") as f:
        progress_data = pickle.load(f)
        df = progress_data["df"]
        start_index = progress_data["start_index"]
else:
    df = []
    start_index = 0

In [94]:
df = []
start_index = 0

In [95]:
import datetime
from os import close
import pickle

with open("progress.pkl", "rb") as f:
    data = pickle.load(f)
    spam_prs = data["df"][1:2]


# template
index = start_index
se_fm_repository_data = df
for pull_request in spam_prs:
    repository_name_with_owner = pull_request.get("repository_name_with_owner")
    # created_at = pull_request.get("created_at")
    # closed_at = pull_request.get("closed_at")

    created_at = "2024-01-01"
    closed_at = "2025-01-01"
    if not repository_name_with_owner or not created_at or not closed_at:
        continue
    search_keyword = f"-label:spam repo:{repository_name_with_owner} is:pr is:public comments:>2 archived:false created:{created_at}..{closed_at}"
    # search_keyword = f"repository:{repository_name_with_owner} is:pr is:public comments:>2 archived:false created:{created_at}...{closed_at}"
    try:
        after_cursor = None
        while True:
            response = execute_query(
                search_keyword, first=10, after_cursor=after_cursor
            )

            print("response", response)
            if response["search"]["issueCount"] == 0:
                break
            # Extract pr
            for edge in response["search"]["edges"]:
                pull_request = edge["node"]

                if not pull_request:
                    continue
                timeline = pull_request["timeline"]["edges"]
                labeled_spam_event = next(
                    filter(
                        lambda x: x["node"]
                        and x["node"]["__typename"] == "LabeledEvent"
                        and x["node"]["label"]["name"]
                        and (x["node"]["label"]["name"]).lower() == "spam",
                        timeline,
                    ),
                    None,
                )
                labeled_spam_event_node = (
                    labeled_spam_event["node"] if labeled_spam_event else None
                )
                closed_event = next(
                    filter(
                        lambda x: x["node"]
                        and x["node"]["__typename"] == "ClosedEvent",
                        timeline,
                    ),
                    None,
                )
                closed_event_node = closed_event["node"] if closed_event else None
                author = pull_request["author"]
                comments = [
                    comment["node"] for comment in pull_request["comments"]["edges"]
                ]
                labeled_spam_at = (
                    labeled_spam_event_node["createdAt"]
                    if labeled_spam_event_node
                    else None
                )

                labeled_spam_by = (
                    labeled_spam_event_node["actor"]["login"]
                    if labeled_spam_event_node and labeled_spam_event_node["actor"]
                    else None
                )
                comments_by_spam_labeler = []

                closed_by = (
                    closed_event_node["actor"]["login"]
                    if closed_event_node and closed_event_node["actor"]
                    else None
                )
                closed_at = (
                    closed_event_node["createdAt"] if closed_event_node else None
                )
                comments_by_closer = [
                    {
                        **closer_comment,
                        "commented_before_closing": (
                            closer_comment["createdAt"] < closed_at
                            if closed_at
                            else False
                        ),
                    }
                    for closer_comment in comments
                    if (
                        (
                            closer_comment["author"]
                            and closer_comment["author"]["login"] == closed_by
                        )
                        or (not closer_comment["author"] and not closed_by)
                    )
                ]

                author_organizations = (
                    [
                        organization["node"]
                        for organization in author["organizations"]["edges"]
                        if organization["node"]
                    ]
                    if author
                    and author.get("organizations")
                    and author["organizations"].get("edges")
                    else []
                )

                timestamp_suffix = (
                    f"_as_at_{datetime.datetime.now().strftime('%Y-%m-%d')}"
                )

                df.append(
                    {
                        "id": pull_request["id"],
                        "title": pull_request["title"],
                        "url": pull_request["url"],
                        "state": pull_request["state"],
                        "comments_count": pull_request["comments"]["totalCount"],
                        "comments_by_spam_labeler_count": len(comments_by_spam_labeler),
                        "comments_by_spam_labeler": comments_by_spam_labeler,
                        "labeled_spam_by": (
                            labeled_spam_event_node["actor"]["login"]
                            if labeled_spam_event_node
                            and labeled_spam_event_node["actor"]
                            else None
                        ),
                        "is_labeled_spam_by_bot": False,
                        "labeled_spam_at": labeled_spam_at,
                        "comments_by_closer_count": len(comments_by_closer),
                        "comments_by_closer": comments_by_closer,
                        "closed": pull_request["closed"],
                        "is_closed_by_bot": closed_by is None,
                        "closed_by": closed_by,
                        "closed_at": closed_at,
                        "merged": pull_request["merged"],
                        "body": pull_request["body"],
                        "created_at": pull_request["createdAt"],
                        "updated_at": pull_request["updatedAt"],
                        "repository_name_with_owner": pull_request["repository"][
                            "nameWithOwner"
                        ],
                        "repository_stargazer_count": pull_request["repository"][
                            "stargazerCount"
                        ],
                        "repository_watcher_count": pull_request["repository"][
                            "watchers"
                        ]["totalCount"],
                        "repository_is_fork": pull_request["repository"]["isFork"],
                        "repository_languages": [
                            language["node"]["name"]
                            for language in pull_request["repository"]["languages"][
                                "edges"
                            ]
                        ],
                        "merge_commit": (
                            pull_request["mergeCommit"]["oid"]
                            if pull_request["mergeCommit"]
                            else None
                        ),
                        "labels": [
                            label["node"]["name"]
                            for label in pull_request["labels"]["edges"]
                        ],
                        "commits_count": pull_request["commits"]["totalCount"],
                        "changed_files_count": pull_request["changedFiles"],
                        "commits": pull_request["commits"]["edges"],
                        "author_name": (author["login"] if author else None),
                        "author_url": (author["url"] if author else None),
                        "author_account_created_at": (
                            author["createdAt"] if author else None
                        ),
                        f"author_repository_count{timestamp_suffix}": (
                            author["repositories"]["totalCount"]
                            if author and author["repositories"]
                            else None
                        ),
                        f"author_followers_count{timestamp_suffix}": (
                            author["followers"]["totalCount"]
                            if author and author["followers"]
                            else None
                        ),
                        f"author_following_count{timestamp_suffix}": (
                            author["following"]["totalCount"]
                            if author and author["following"]
                            else None
                        ),
                        f"author_repository_discussions_count{timestamp_suffix}": (
                            author["repositoryDiscussions"]["totalCount"]
                            if author and author["repositoryDiscussions"]
                            else None
                        ),
                        f"author_repository_discussion_comments_count{timestamp_suffix}": (
                            author["repositoryDiscussionComments"]["totalCount"]
                            if author and author["repositoryDiscussionComments"]
                            else None
                        ),
                        f"author_organizations{timestamp_suffix}": author_organizations,
                    }
                )

            # Pagination
            page_info = response["search"]["pageInfo"]
            if page_info["hasNextPage"]:
                after_cursor = page_info["endCursor"]
            else:
                break
        with open("non-spam-progress.pkl", "wb") as f:
            pickle.dump({"df": se_fm_repository_data, "start_index": index + 1}, f)

    except Exception as e:
        print(f"Failed to retrieve data for keywords '{keyword}': {e}")
        # Save progress before terminating
        with open("non-spam-progress.pkl", "wb") as f:
            pickle.dump({"df": df, "start_index": index}, f)
        raise

Executing query with keyword: -label:spam repo:jenkins-infra/plugin-modernizer-tool is:pr is:public comments:>2 archived:false created:2024-01-01..2025-01-01, first: 10, afterCursor: None
Executing query with keyword: -label:spam repo:jenkins-infra/plugin-modernizer-tool is:pr is:public comments:>2 archived:false created:2024-01-01..2025-01-01, first: 10, afterCursor: Y3Vyc29yOjEw
response {'search': {'issueCount': 55, 'edges': [{'cursor': 'Y3Vyc29yOjEx', 'node': {'id': 'PR_kwDOL2-Lu85_qApm', 'number': 355, 'title': 'feat(recipes): Add the FixJellyIssues recipe.', 'url': 'https://github.com/jenkins-infra/plugin-modernizer-tool/pull/355', 'comments': {'totalCount': 8, 'edges': [{'node': {'author': {'login': 'gounthar'}, 'editor': None, 'body': 'The recipe supposed to fix the missing `XML` declaration has been [released](https://github.com/openrewrite/rewrite-jenkins/releases/tag/v0.17.0), but it looks like it\'s doing nothing. 😢 \r\n```\r\njava -jar plugin-modernizer-cli/target/jenkins-

In [96]:
import pickle
import pandas as pd


def display_pkl_content(filepath):
    """
    Display the content of a pickle file and save it as CSV and JSON files.

    This function reads a pickle file from the given filepath, prints its content,
    and saves the data contained in the "df" key to both a CSV file and a JSON file.
    The CSV file is saved with the name "spam_data_without_org_join_date.csv" and the JSON file is saved
    with the name "spam_data_without_org_join_date.json".

    Args:
        filepath (str): The path to the pickle file to be read.

    Raises:
        Exception: If there is an error reading the pickle file or writing the CSV/JSON files.
    """
    try:
        with open(filepath, "rb") as f:
            data = pickle.load(f)

        print(f"Content of {filepath}:\n")
        filename = "non-spam_data_without_org_join_date"
        pd.DataFrame(data["df"]).to_csv(f"{filename}.csv", index=True)
        print(f"Data written to {filename}.csv successfully.")
        try:
            with open(f"{filename}.json", "w") as f:
                json.dump(data["df"], f, indent=4)
            print(f"Data written to {filename}.json successfully.")
        except Exception as e:
            print(f"An error occurred: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")


filepath = "non-spam-progress.pkl"
display_pkl_content(filepath)

Content of non-spam-progress.pkl:

Data written to non-spam_data_without_org_join_date.csv successfully.
Data written to non-spam_data_without_org_join_date.json successfully.


In [97]:
# # generate metadata
# df = []
# start_index = 0



# def generate_metadata(filepath):
#     """ """
#     try:
#         with open(filepath, "rb") as f:
#             data = pickle.load(f)

#         print(f"Content of {filepath}:\n")
#         filename = "non-spam_data.meta"
#         pull_requests = data["df"]
#         unique_repository = {}
#         unique_pr_author = {}
#         unique_pr_spam_labeler = {}
#         unique_pr_closer = {}
#         merged_pr_count = 0
#         closed_pr_count = 0
#         for pull_request in pull_requests:
#             def update_unique_value_dict(info_dict, key, value):
#                 if not value:
#                     print(f"Warning: Pull request missing '{key}' {pull_request}")
#                     return False
#                 if value not in info_dict:
#                     info_dict[value] = value
#                 return True

#             # Update repository count
#             update_unique_value_dict(unique_repository, "repository_name_with_owner", pull_request["repository_name_with_owner"])

#             # Update author count
#             update_unique_value_dict(unique_pr_author, "author_name", pull_request["author_name"])

#             # Update spam labeler count
#             update_unique_value_dict(unique_pr_spam_labeler, "labeled_spam_by", pull_request["labeled_spam_by"])

#             # Update closer count
#             update_unique_value_dict(unique_pr_closer, "closed_by", pull_request["closed_by"])

#             merged_pr_count += 1 if pull_request["merged"] else 0
#             closed_pr_count += 1 if pull_request["closed"] is not None else 0


#         total_prs= len(pull_requests)
#         unique_repository_count= len(unique_repository)
#         unique_pr_author_count= len(unique_pr_author)
#         unique_pr_spam_labeler_count= len(unique_pr_spam_labeler)
#         unique_pr_closer_count= len(unique_pr_closer)
        
#         df.append(
#             {
#             "total_prs": total_prs,
#             "unique_repository_count": unique_repository_count,
#             "unique_repository_ratio": round(unique_repository_count / total_prs, 3),
#             "unique_pr_author_count": unique_pr_author_count,
#             "unique_pr_author_ratio": round(unique_pr_author_count / total_prs, 3),
#             "unique_pr_spam_labeler_count": unique_pr_spam_labeler_count,
#             "unique_pr_spam_labeler_ratio": round(unique_pr_spam_labeler_count / total_prs, 3),
#             "unique_pr_closer_count": unique_pr_closer_count,
#             "unique_pr_closer_ratio": round(unique_pr_closer_count / total_prs, 3),
#             "merged_pr_count": merged_pr_count,
#             "merged_pr_ratio": round(merged_pr_count / total_prs, 3),
#             "closed_pr_count": closed_pr_count,
#             "closed_pr_ratio": round(closed_pr_count / total_prs, 3),
#             }
#         )

#         pd.DataFrame(df).to_csv(f"{filename}.csv", index=True)
#         print(f"Data written to {filename}.csv successfully.")
#         try:
#             with open(f"{filename}.json", "w") as f:
#                 json.dump(df, f, indent=4)
#             print(f"Data written to {filename}.json successfully.")
#         except Exception as e:
#             print(f"An error occurred: {e}")
#     except Exception as e:
#         print(f"An unexpected error occurred: {e}")


# filepath = "non-spam-progress.pkl"
# generate_metadata(filepath)