In [None]:
import json
import itertools
import os
from gql import gql, Client
from gql.transport.requests import RequestsHTTPTransport
import pandas as pd
import random
from tqdm import tqdm
import time

# Read tokens from a text file (one token per line)
tokens_file = "./env/tokens.txt"
with open(tokens_file, "r") as file:
    # Strip surrounding whitespace and drop blank lines so that an empty line
    # never ends up in the rotation as an empty "Bearer " credential.
    # dict.fromkeys de-duplicates while keeping file order: the rotation logic
    # in this notebook detects a full cycle by comparing token values, which
    # repeated entries would trip early.
    tokens = list(dict.fromkeys(line.strip() for line in file if line.strip()))

# Fail with an explicit message instead of the bare StopIteration that
# next(itertools.cycle([])) would raise on an empty file.
if not tokens:
    raise ValueError(f"No tokens found in {tokens_file}")

# Create an iterator to cycle through the tokens
token_iterator = itertools.cycle(tokens)
current_token = next(token_iterator)

In [None]:
# GitHub GraphQL endpoint targeted by the transport below
graphql_url = "https://api.github.com/graphql"

# Pool of browser User-Agent strings to choose from at random
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.1 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.1 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
]

# Request headers: authenticate with the current (first) token and present a
# randomly chosen User-Agent
headers = dict()
headers["Authorization"] = f"Bearer {current_token}"
headers["User-Agent"] = random.choice(user_agents)

# HTTP transport and GraphQL client bound to those headers
transport = RequestsHTTPTransport(use_json=True, headers=headers, url=graphql_url)
client = Client(fetch_schema_from_transport=True, transport=transport)

In [None]:
# Validate every configured token with a minimal `viewer` query
def test_all_tokens():
    """Print, for each entry in `tokens`, whether a `viewer { login }` query succeeds.

    A dedicated transport and client are built per token, so the module-level
    `client` is not touched. Failures are reported via `print`, never raised.
    """
    viewer_query = gql(
        """
        {
          viewer {
            login
          }
        }
        """
    )
    total = len(tokens)
    for position, token in enumerate(tokens, start=1):
        token_headers = {
            "Authorization": f"Bearer {token}",
            "User-Agent": random.choice(user_agents),
        }
        token_client = Client(
            transport=RequestsHTTPTransport(
                url=graphql_url, headers=token_headers, use_json=True
            ),
            fetch_schema_from_transport=True,
        )

        try:
            result = token_client.execute(viewer_query)
            # Kept inside the try so an unexpected response shape is also
            # reported as a failure for this token.
            print(
                f"Token {position}/{total} is valid. Logged in as: {result['viewer']['login']}"
            )
        except Exception as exc:
            print(f"Token {position}/{total} failed with error: {exc}")


# Kick off the validation pass over all tokens
test_all_tokens()

In [173]:
# Define the GraphQL query used to mine pull-request timelines.
#
# Variables:
#   $keyword     - search string passed to `search(query: ...)`.
#   $first       - page size of each pull request's `timeline` connection.
#   $afterCursor - cursor for the `timeline` connection.
#
# NOTE(review): `$first` and `$afterCursor` page the *timeline* only; the
# search connection itself is fixed at `first: 10` and is never paged, even
# though its `pageInfo` is selected at the end of the query.
#
# Only `PullRequest` search results are expanded; each timeline node carries
# `__typename` plus the fields of whichever inline fragment matches. The
# `commentState`, `reviewState` and `teamAvatarUrl` names are aliases for
# `state` / `state` / `avatarUrl`.
query_template = gql(
    """
    query searchIssues($keyword: String!, $afterCursor: String, $first: Int) {
      search(query: $keyword, type: ISSUE, first: 10) {
        issueCount
        edges {
          cursor
          node {
            ... on PullRequest {
              id
              number
              title
              url
              timeline(first: $first, after: $afterCursor) {
                edges {
                  node {
                    __typename
                    ... on Commit {
                      additions
                      author { ... on GitActor { user { login } } }
                      authoredByCommitter
                      authoredDate
                      changedFilesIfAvailable
                      commitUrl
                      committedDate
                      committedViaWeb
                      deletions
                      id
                      message
                      oid
                    }
                    ... on IssueComment {
                      author { ... on User { login } }
                      editor { ... on User { login } }
                      body
                      id
                      url
                      createdAt
                      lastEditedAt
                    }
                    ... on PullRequestReviewComment {
                      author { ... on User { login } }
                      editor { ... on User { login } }
                      body
                      id
                      url
                      commentState: state
                      createdAt
                      lastEditedAt
                    }
                    ... on ReviewRequestedEvent {
                      actor { ... on User { login } }
                      id
                      createdAt
                    }
                    ... on AssignedEvent {
                      actor { ... on User { login url } }
                      id
                      assignee {
                        ... on User {
                          login
                          avatarUrl
                          url
                        }
                        ... on Bot {
                          login
                          avatarUrl
                          url
                        }
                        ... on Mannequin {
                          login
                          avatarUrl
                          url
                        }
                      }
                      createdAt
                    }
                    ... on UnassignedEvent {
                      actor { ... on User { login url } }
                      id
                      assignee {
                        ... on User {
                          login
                          avatarUrl
                          url
                        }
                        ... on Bot {
                          login
                          avatarUrl
                          url
                        }
                        ... on Mannequin {
                          login
                          avatarUrl
                          url
                        }
                      }
                      createdAt
                    }
                    ... on LabeledEvent {
                      actor { ... on User { login url } }
                      id
                      label { ... on Label { name }}
                      createdAt
                    }
                    ... on UnlabeledEvent {
                      actor { ... on User { login url } }
                      id
                      label { ... on Label { name }}
                      createdAt
                    }
                    ... on ClosedEvent { 
                      actor { ... on User { login url } }
                      id
                      url
                      createdAt
                    }
                    ... on MergedEvent { 
                      actor { ... on User { login url } }
                      id
                      url
                      createdAt
                    }
                    ... on PullRequestReview {
                      author { ... on User { login } }
                      reviewState: state
                      resourcePath
                      createdAt
                      lastEditedAt
                      url
                      body
                      id
                    }
                    ... on RenamedTitleEvent{
                      actor { ... on User { login url } }
                      previousTitle
                      currentTitle
                      id
                      createdAt
                    }
                    ... on ReopenedEvent {
                      actor { ... on User { login } }
                      createdAt
                    }

                    ... on ReviewDismissedEvent {
                      actor { ... on User { login } }
                      createdAt
                      dismissalMessage
                      review {
                        author { ... on User { login } }
                        state
                        resourcePath
                        createdAt
                        lastEditedAt
                        url
                        body
                        id
                      }
                    }

                    ... on ReviewRequestRemovedEvent {
                      actor { ... on User { login } }
                      createdAt
                      requestedReviewer {  
                        ... on User {
                          login
                          avatarUrl
                          url
                        }
                        ... on Bot {
                          login
                          avatarUrl
                          url
                        }
                        ... on Mannequin {
                          login
                          avatarUrl
                          url
                        } 
                        ... on Team {
                          name
                          teamAvatarUrl: avatarUrl
                          url
                        } 
                      }
                    }

                    ... on SubscribedEvent {
                      actor { ... on User { login } }
                      createdAt
                    }

                    ... on UnlockedEvent {
                      actor { ... on User { login } }
                      createdAt
                    }

                    ... on UnsubscribedEvent {
                      actor { ... on User { login } }
                      createdAt
                    }

                    ... on UserBlockedEvent {
                      actor { ... on User { login } }
                      createdAt
                      blockDuration
                    }
                  }
                }
                pageInfo {
                  endCursor
                  hasNextPage
                }
              }
            }
          }
        }
        pageInfo {
          endCursor
          hasNextPage
        }
      }
    }
    """
)

In [174]:
# def get_contributor_count(repo_owner, repo_name):
#     global current_token
#     max_retries = 3
#     retries = 0
#     while retries < max_retries:
#         try:
#             # Randomize User-Agent for each query
#             headers["User-Agent"] = random.choice(user_agents)
#             headers["Authorization"] = f"Bearer {current_token}"
#             url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/contributors?per_page=1&anon=true"
#             response = requests.get(url, headers=headers)
#             if response.status_code == 200:
#                 return int(response.headers.get("Link", "").split(",")[-1].split("&page=")[-1].split(">")[0]) if "Link" in response.headers else len(response.json())
#             elif response.status_code == 403:
#                 print(f"Rate limit exceeded, switching token... (Attempt {retries + 1}/{max_retries})")
#                 current_token = next(token_iterator)
#                 retries += 1
#             else:
#                 response.raise_for_status()
#         except Exception as e:
#             print(f"Error: {e}, retrying... (Attempt {retries + 1}/{max_retries})")
#             retries += 1
#     raise Exception("Max retries reached. Unable to complete the request.")

In [175]:
# One-off quota check: ask who the active token belongs to and how much of
# its rate limit is left
rate_limit_query = gql(
    """
    query {
      viewer {
        login
      }
      rateLimit {
        limit
        remaining
        used
        resetAt
      }
    }
    """
)

# Point the transport at the shared header dict before sending the request
transport.headers = headers
rate_limit_response = client.execute(rate_limit_query)
print(f"Rate limit: {rate_limit_response['rateLimit']}")

Rate limit: {'limit': 5000, 'remaining': 4953, 'used': 47, 'resetAt': '2025-02-12T10:05:49Z'}


In [176]:
# Quota probe used by execute_query before every main query: returns the
# login of the authenticated user and the `rateLimit` counters
# (limit / remaining / used / resetAt).
# NOTE(review): this rebinds `rate_limit_query` to the same query text that the
# rate-limit check cell above already assigned.
rate_limit_query = gql(
    """
    query {
      viewer {
        login
      }
      rateLimit {
        limit
        remaining
        used
        resetAt
      }
    }
    """
)


def execute_query(keyword, first=100, after_cursor=None):
    """Run `query_template`, rotating API tokens when the quota runs low.

    Each attempt picks a random User-Agent, asks `rate_limit_query` how many
    requests remain for the active token and, if fewer than 100 remain, walks
    through `token_iterator` until a token with at least 100 remaining is
    found (sleeping one hour once the walk is back at the starting token).
    Only then is the main query sent.

    Args:
        keyword: Value bound to the query's `$keyword` variable.
        first: Value bound to `$first`. It is halved (down to 1) after every
            exception whose text does not contain "API rate limit".
        after_cursor: Value bound to `$afterCursor`.

    Returns:
        Whatever `client.execute(query_template, ...)` returns.

    Side effects:
        Mutates the module-level `headers` dict, `transport.headers` and the
        global `current_token`; prints progress messages; may sleep for an hour.

    Exceptions derived from `Exception` raised inside the retry loop are caught
    here and turned into a retry, so they do not reach the caller.

    NOTE(review): once `first` has been reduced to 1 and another error occurs,
    the function sleeps one hour and calls itself recursively with `first`
    still at 1. There is no upper bound on the number of such rounds, so a
    permanently failing query is retried forever.
    """
    global current_token
    print(
        f"Executing query with keyword: {keyword}, first: {first}, afterCursor: {after_cursor}"
    )
    while True:
        try:
            # Randomize User-Agent for each query
            headers["User-Agent"] = random.choice(user_agents)
            transport.headers = headers
            # Check rate limit before executing the main query
            rate_limit_response = client.execute(rate_limit_query)
            remaining = rate_limit_response["rateLimit"]["remaining"]
            if remaining < 100:
                print(
                    f"Rate limit remaining ({remaining}) is below threshold. Switching token..."
                )
                # Set up to track whether we have cycled through all tokens
                all_tokens_checked = False
                initial_token = current_token

                while not all_tokens_checked:
                    # Switch to the next token
                    current_token = next(token_iterator)
                    headers["Authorization"] = f"Bearer {current_token}"
                    transport.headers = headers

                    # Check the rate limit of the new token
                    rate_limit_response = client.execute(rate_limit_query)
                    remaining = rate_limit_response["rateLimit"]["remaining"]

                    if remaining >= 100:
                        print(
                            f"Switched to a new token with sufficient rate limit ({remaining} remaining)."
                        )
                        break

                    # Check if we have cycled through all tokens
                    # (detected by token *value*, so the walk ends as soon as
                    # the token it started from comes round again)
                    if current_token == initial_token:
                        print("All tokens are below threshold. Waiting for 1 hour...")
                        time.sleep(3600)
                        all_tokens_checked = True

                # Go back to the top of the outer loop so the quota of the
                # now-active token is re-checked before the main query is sent.
                continue
            return client.execute(
                query_template,
                variable_values={
                    "keyword": keyword,
                    "first": first,
                    "afterCursor": after_cursor,
                },
            )
        except Exception as e:
            # This handler also covers failures of the rate-limit probe itself,
            # not only failures of the main query.
            if "API rate limit" in str(e):
                print(
                    f"Rate limit reached: {e}, switching token... (Attempt with first {first})"
                )
                current_token = next(token_iterator)
                headers["Authorization"] = f"Bearer {current_token}"
            else:
                # Any other error is treated as "response too large": retry
                # with half the page size until it reaches 1.
                if first > 1:
                    first = max(1, first // 2)
                    print(
                        f"Error: {e}, reducing number of results and retrying... (Attempt with first {first})"
                    )
                else:
                    # Already at the minimum page size: leave the loop and fall
                    # through to the long back-off below.
                    break
    print("Max retries reached. Sleeping for 60 minutes and switching token...")
    time.sleep(3600)
    current_token = next(token_iterator)
    headers["Authorization"] = f"Bearer {current_token}"
    transport.headers = headers
    # Recursive retry; `first` is whatever value the loop above left it at.
    return execute_query(keyword, first, after_cursor)

In [177]:
import pickle

# Start empty unless a checkpoint from an earlier run is present on disk
if not os.path.exists("pr_timeline_progress.pkl"):
    df = []
    start_index = 0
else:
    # NOTE(review): pickle.load can execute arbitrary code; only load
    # checkpoint files written by this notebook.
    with open("pr_timeline_progress.pkl", "rb") as f:
        progress_data = pickle.load(f)
        df = progress_data["df"]
        start_index = progress_data["start_index"]

In [178]:
# Start from scratch.
# NOTE(review): this unconditionally rebinds `df` and `start_index`, so any
# values restored from pr_timeline_progress.pkl are discarded when this cell runs.
df, start_index = [], 0

In [None]:
# Load the spam pull requests whose timelines are to be mined.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced by this project.
with open("progress.pkl", "rb") as f:
    data = pickle.load(f)
    spam_prs = data["df"]

index = start_index
mining_data = df  # alias: both names refer to the same list of results

# `index` is the position in `spam_prs`. It is what gets checkpointed, so it
# must advance with the loop (it previously stayed at `start_index` forever).
for index, spam_pr in enumerate(spam_prs):
    # Resume support: positions before `start_index` were already handled by
    # the run that wrote the checkpoint.
    if index < start_index:
        continue

    spam_pr_id = spam_pr.get("id")
    title = spam_pr.get("title")
    # Nothing to search for without a title, nothing to key the result on
    # without an id.
    if not title or not spam_pr_id:
        continue

    repository_name_with_owner = spam_pr.get("repository_name_with_owner")
    search_keyword = f"label:spam {title} in:title is:pr is:public archived:false repo:{repository_name_with_owner}"

    pr_timeline = []
    try:
        after_cursor = None
        while True:
            response = execute_query(
                search_keyword, first=10, after_cursor=after_cursor
            )
            if response["search"]["issueCount"] == 0:
                break
            edges = response["search"]["edges"]
            # Guard against a non-zero count with an empty page, which would
            # otherwise raise IndexError on edges[0] below.
            if not edges:
                break

            for edge in edges:
                pull_request = edge["node"]

                if not pull_request:
                    continue
                pr_timeline.extend(
                    item["node"] for item in pull_request["timeline"]["edges"]
                )

            # Pagination
            # NOTE(review): the cursor is applied to the timeline of *every*
            # matched pull request, but only the first match decides when to
            # stop. If the search matches several pull requests, their items
            # are merged into one list and longer timelines of later matches
            # may be cut short.
            first_edge = edges[0]
            if not first_edge or not first_edge["node"]:
                break
            page_info = first_edge["node"]["timeline"]["pageInfo"]
            if not page_info["hasNextPage"]:
                break
            after_cursor = page_info["endCursor"]

        mining_data.append(
            {
                "id": spam_pr_id,
                "url": spam_pr["url"],
                "title": spam_pr["title"],
                "timeline_count": len(pr_timeline),
                "timeline": pr_timeline,
            }
        )

        # Checkpoint after every pull request; the next run starts at index + 1.
        with open("pr_timeline_progress.pkl", "wb") as f:
            pickle.dump({"df": mining_data, "start_index": index + 1}, f)

    except Exception as e:
        print(f"Failed to retrieve data for pr '{title}': {e}")
        # Save progress before terminating; this entry is retried on resume.
        with open("pr_timeline_progress.pkl", "wb") as f:
            pickle.dump({"df": mining_data, "start_index": index}, f)
        raise

Executing query with keyword: label:spam Added Spider Man Game in:title is:pr is:public archived:false repo:pranjay-poddar/Dev-Geeks, first: 10, afterCursor: None
Executing query with keyword: label:spam Added Spider Man Game in:title is:pr is:public archived:false repo:pranjay-poddar/Dev-Geeks, first: 10, afterCursor: MTA
Executing query with keyword: label:spam Added Spider Man Game in:title is:pr is:public archived:false repo:pranjay-poddar/Dev-Geeks, first: 10, afterCursor: MjA
Executing query with keyword: label:spam Added Spider Man Game in:title is:pr is:public archived:false repo:pranjay-poddar/Dev-Geeks, first: 10, afterCursor: MzA
Executing query with keyword: label:spam Added Spider Man Game in:title is:pr is:public archived:false repo:pranjay-poddar/Dev-Geeks, first: 10, afterCursor: NDA
Executing query with keyword: label:spam Added Spider Man Game in:title is:pr is:public archived:false repo:pranjay-poddar/Dev-Geeks, first: 10, afterCursor: NTA


In [180]:
import pickle
import pandas as pd


def display_pkl_content(filepath, filename="pr_timeline"):
    """
    Export the records stored in a progress pickle to CSV and JSON files.

    Loads the pickle at ``filepath``, takes the value under its ``"df"`` key and
    writes it to ``<filename>.csv`` (through a pandas DataFrame, index column
    included) and to ``<filename>.json`` (indented by four spaces).

    Errors are reported with ``print`` and are not raised: a failure while
    reading the pickle or writing the CSV skips the remaining steps, and a
    failure while writing the JSON leaves an already written CSV in place.

    Args:
        filepath (str): The path to the pickle file to be read.
        filename (str): Output path without extension. Defaults to
            "pr_timeline", i.e. ``pr_timeline.csv`` and ``pr_timeline.json``
            in the current working directory.

    Returns:
        None
    """
    try:
        # NOTE(review): pickle.load can execute arbitrary code; only pass files
        # that were produced by this notebook.
        with open(filepath, "rb") as pkl_file:
            data = pickle.load(pkl_file)

        print(f"Content of {filepath}:\n")
        records = data["df"]
        pd.DataFrame(records).to_csv(f"{filename}.csv", index=True)
        print(f"Data written to {filename}.csv successfully.")
        try:
            # Separate handle name so the pickle handle above is not shadowed.
            with open(f"{filename}.json", "w", encoding="utf-8") as json_file:
                json.dump(records, json_file, indent=4)
            print(f"Data written to {filename}.json successfully.")
        except Exception as e:
            print(f"An error occurred: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")


# Export the checkpoint written by the mining loop to pr_timeline.csv / .json
filepath = "pr_timeline_progress.pkl"
display_pkl_content(filepath)

Content of pr_timeline_progress.pkl:

Data written to pr_timeline.csv successfully.
Data written to pr_timeline.json successfully.
