In [57]:
import requests
import json
import itertools
import os
from datetime import datetime
from gql import gql, Client
from gql.transport.requests import RequestsHTTPTransport
import pandas as pd
import random
from tqdm import tqdm
import time

# Read the API tokens (one per line) from a text file.
tokens_file = "./env/tokens.txt"
with open(tokens_file, "r") as file:
    # Strip surrounding whitespace and drop blank lines: otherwise an empty or
    # padded line would later be sent as a bogus "Bearer " credential.
    tokens = [line.strip() for line in file if line.strip()]

# itertools.cycle() over an empty list makes next() raise a bare StopIteration,
# so fail here with a message that names the actual problem.
if not tokens:
    raise ValueError(f"No tokens found in {tokens_file}")

# Create an iterator to cycle through the tokens; `current_token` is the one in use.
token_iterator = itertools.cycle(tokens)
current_token = next(token_iterator)

In [58]:
# Pool of browser User-Agent strings; one is drawn at random whenever headers are built.
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.1 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.1 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
]

# GitHub GraphQL endpoint used by every client in this notebook.
graphql_url = "https://api.github.com/graphql"

# Request headers shared with the transport: bearer auth with the token
# currently in use, plus a randomly chosen User-Agent.
headers = {}
headers["Authorization"] = f"Bearer {current_token}"
headers["User-Agent"] = random.choice(user_agents)

# Shared transport/client pair; later cells mutate `headers` and re-assign it
# to `transport.headers` to switch tokens.
transport = RequestsHTTPTransport(
    url=graphql_url,
    headers=headers,
    use_json=True,
)
client = Client(
    transport=transport,
    fetch_schema_from_transport=True,
)

In [None]:
# Token sanity check: try each configured token once.
def test_all_tokens():
    """Run a minimal `viewer { login }` query with every token and print the outcome.

    A dedicated transport and client are built per token so the shared
    module-level `client` is left untouched. Failures are printed, not raised.
    """
    test_query = gql(
        """
        {
          viewer {
            login
          }
        }
        """
    )
    total = len(tokens)
    for position, token in enumerate(tokens, start=1):
        token_headers = {
            "Authorization": f"Bearer {token}",
            "User-Agent": random.choice(user_agents),
        }
        token_client = Client(
            transport=RequestsHTTPTransport(
                url=graphql_url, headers=token_headers, use_json=True
            ),
            fetch_schema_from_transport=True,
        )

        try:
            response = token_client.execute(test_query)
            login = response["viewer"]["login"]
            print(f"Token {position}/{total} is valid. Logged in as: {login}")
        except Exception as e:
            print(f"Token {position}/{total} failed with error: {e}")


# Validate every token up front
test_all_tokens()

In [60]:
# GraphQL query for one user's contribution totals inside a single time window.
# Variables: $username (login), $start and $end (DateTime bounds passed to
# contributionsCollection as `from` / `to`).
# Returns the user's profile `url` plus the calendar total and the per-type
# counters listed below.
query_template = gql(
    """
      query($username: String!, $start: DateTime!, $end: DateTime!) {
        user(login: $username) {
          url
          contributionsCollection(from: $start, to: $end) {
            contributionCalendar {
              totalContributions
            }
            totalCommitContributions
            totalIssueContributions
            totalPullRequestContributions
            totalPullRequestReviewContributions
            totalRepositoriesWithContributedCommits
            totalRepositoriesWithContributedIssues
            totalRepositoriesWithContributedPullRequestReviews
            totalRepositoriesWithContributedPullRequests
            totalRepositoryContributions
            restrictedContributionsCount
          }
        }
      }
    """
)

In [None]:
# Make sure the shared transport sends the current auth / User-Agent headers.
transport.headers = headers

# One-off quota check: ask which account is authenticated and how much of its
# rate limit is left before starting the main crawl.
rate_limit_query = gql(
    """
    query {
      viewer {
        login
      }
      rateLimit {
        limit
        remaining
        used
        resetAt
      }
    }
    """
)
rate_limit_response = client.execute(rate_limit_query)
rate_limit_info = rate_limit_response["rateLimit"]
print(f"Rate limit: {rate_limit_info}")

In [62]:
# Query used by execute_query() to read the active token's remaining quota
# (`rateLimit.remaining`) before each main request.
# NOTE(review): this repeats the definition from the earlier rate-limit check
# cell verbatim; it is kept so this cell works even when that cell was skipped.
rate_limit_query = gql(
    """
    query {
      viewer {
        login
      }
      rateLimit {
        limit
        remaining
        used
        resetAt
      }
    }
    """
)


# A token with fewer remaining rate-limit points than this is treated as exhausted.
_MIN_REMAINING_QUOTA = 100
# Pause applied once every token in the cycle has been found exhausted.
_ALL_TOKENS_EXHAUSTED_WAIT_SECONDS = 3600


def _use_next_token():
    """Advance the shared token cycle and install the new token on the transport."""
    global current_token
    current_token = next(token_iterator)
    headers["Authorization"] = f"Bearer {current_token}"
    transport.headers = headers


def _remaining_quota():
    """Return `rateLimit.remaining` as reported for the token currently in use."""
    return client.execute(rate_limit_query)["rateLimit"]["remaining"]


def _rotate_to_token_with_quota():
    """Cycle through the tokens until one has enough quota left.

    If the cycle comes back to the token it started from without finding one,
    sleep for an hour and return; the caller then re-checks the quota.
    """
    initial_token = current_token
    while True:
        _use_next_token()
        remaining = _remaining_quota()
        if remaining >= _MIN_REMAINING_QUOTA:
            print(
                f"Switched to a new token with sufficient rate limit ({remaining} remaining)."
            )
            return
        if current_token == initial_token:
            print("All tokens are below threshold. Waiting for 1 hour...")
            time.sleep(_ALL_TOKENS_EXHAUSTED_WAIT_SECONDS)
            return


def execute_query(username, start=100, end=None, max_retries=3, retry_wait_seconds=3600):
    """Run `query_template` for one user and one time window, rotating tokens as needed.

    Parameters:
        username: GitHub login passed as `$username`.
        start, end: window bounds passed as `$start` / `$end`. The query declares
            both as required DateTime values, so the defaults (100 / None) are not
            usable; they are kept only so existing call signatures still work.
        max_retries: how many times a failed attempt (other than a rate-limit
            or GraphQL query error) is retried before the error is re-raised.
        retry_wait_seconds: pause before each such retry.

    Returns:
        The result of `client.execute` for the contributions query.

    Raises:
        TransportQueryError: immediately, when the server answers with GraphQL
            errors that are not rate-limit messages.
        Exception: the last error once `max_retries` retries have failed.

    Earlier behavior slept for an hour and recursed without bound on any
    error, so failures never reached the caller's own error handling.
    """
    # Function-scope import: gql is already a dependency of this notebook.
    from gql.transport.exceptions import TransportQueryError

    print(f"Executing query with username: {username}, start: {start}, end: {end}")
    failed_attempts = 0
    while True:
        try:
            # Randomize User-Agent for each attempt
            headers["User-Agent"] = random.choice(user_agents)
            transport.headers = headers

            # Check the quota first so the main query is not spent on a token
            # that is about to run out.
            remaining = _remaining_quota()
            if remaining < _MIN_REMAINING_QUOTA:
                print(
                    f"Rate limit remaining ({remaining}) is below threshold. Switching token..."
                )
                _rotate_to_token_with_quota()
                continue

            return client.execute(
                query_template,
                variable_values={
                    "username": username,
                    "start": start,
                    "end": end,
                },
            )
        except Exception as e:
            if "API rate limit" in str(e):
                print(
                    f"Rate limit reached: {e}, switching token... (Attempt with start {start} and end {end})"
                )
                _use_next_token()
                continue

            print(f"Error: {e}, stopped during (Attempt with start {start} and end {end})")

            # The server evaluated the query and rejected it; waiting an hour
            # will not change that, so let the caller decide what to do.
            # NOTE(review): assumes GitHub does not report transient failures
            # as GraphQL errors - confirm against observed error messages.
            if isinstance(e, TransportQueryError):
                raise

            failed_attempts += 1
            if failed_attempts > max_retries:
                raise

        print(
            f"Retry {failed_attempts}/{max_retries}: sleeping for {retry_wait_seconds} seconds and switching token..."
        )
        time.sleep(retry_wait_seconds)
        _use_next_token()

In [71]:
import pickle

# Resume from the checkpoint of a previous run when one exists; otherwise start empty.
progress_path = "seg_user_contribution_count_progress.pkl"
if not os.path.exists(progress_path):
    df, start_index = [], 0
else:
    # NOTE: pickle.load executes arbitrary code from the file; only load
    # checkpoints written by this notebook.
    with open(progress_path, "rb") as f:
        progress_data = pickle.load(f)
    df = progress_data["df"]
    start_index = progress_data["start_index"]

In [72]:
import datetime
def get_datetime_str(date: str):
    """Parse a `YYYY-MM-DDTHH:MM:SSZ` timestamp string into a naive `datetime`.

    Despite the name, the return value is a `datetime.datetime`, not a string.
    """
    timestamp_format = "%Y-%m-%dT%H:%M:%SZ"
    parsed = datetime.datetime.strptime(date, timestamp_format)
    return parsed

def get_serializable_date_str(date: datetime.datetime):
    """Format `date` as `YYYY-MM-DDTHH:MM:SSZ`.

    Returns None (after printing the error) when `date` cannot be formatted,
    for example when it is not a datetime.
    """
    try:
        formatted = date.strftime("%Y-%m-%dT%H:%M:%SZ")
    except Exception as e:
        print(f"Error: {e},\n date:{date}")
        return None
    return formatted

In [73]:
import pickle

# Collect, for every spam-labelled PR, the author's contribution totals before
# and after the moment the PR was labelled, and checkpoint after each PR.
#
# `df` (list of finished records) and `start_index` (number of PRs already
# consumed) come from the checkpoint-loading cell above; this cell no longer
# resets them, so an interrupted run resumes where it stopped.

CHECKPOINT_PATH = "seg_user_contribution_count_progress.pkl"
SPAM_PRS_PATH = "progress.pkl"
# Each request covers at most this much time; longer histories are split into
# consecutive windows.
QUERY_WINDOW = datetime.timedelta(days=365)

# (record-key prefix, path of keys inside `contributionsCollection`).
# The order here fixes the column order of the exported records.
CONTRIBUTION_FIELDS = [
    ("total_contributions_count", ("contributionCalendar", "totalContributions")),
    ("total_commit_contributions", ("totalCommitContributions",)),
    ("total_issue_contributions", ("totalIssueContributions",)),
    ("total_pull_request_contributions", ("totalPullRequestContributions",)),
    (
        "total_pull_request_review_contributions",
        ("totalPullRequestReviewContributions",),
    ),
    (
        "total_repositories_with_contributed_commits",
        ("totalRepositoriesWithContributedCommits",),
    ),
    (
        "total_repositories_with_contributed_issues",
        ("totalRepositoriesWithContributedIssues",),
    ),
    (
        "total_repositories_with_contributed_pull_request_reviews",
        ("totalRepositoriesWithContributedPullRequestReviews",),
    ),
    (
        "total_repositories_with_contributed_pull_requests",
        ("totalRepositoriesWithContributedPullRequests",),
    ),
    ("total_repository_contributions", ("totalRepositoryContributions",)),
    ("restricted_contributions_count", ("restrictedContributionsCount",)),
]


def _new_totals():
    """Return zeroed counters: all `*_before_spam_label` keys, then all `*_after_spam_label` keys."""
    return {
        f"{prefix}_{phase}_spam_label": 0
        for phase in ("before", "after")
        for prefix, _ in CONTRIBUTION_FIELDS
    }


def _read_contribution_field(collection, path):
    """Follow `path` (a tuple of keys) into the nested response dict."""
    value = collection
    for key in path:
        value = value[key]
    return value


def _save_checkpoint(records, next_index):
    """Persist the finished records and the index of the next PR to process."""
    with open(CHECKPOINT_PATH, "wb") as checkpoint_file:
        pickle.dump({"df": records, "start_index": next_index}, checkpoint_file)


# Every consumed PR adds at most one record, so a checkpoint holding more
# records than consumed PRs cannot be resumed without duplicating rows.
if len(df) > start_index:
    print(
        f"Checkpoint is inconsistent ({len(df)} records but start_index={start_index}); starting over."
    )
    df = []
    start_index = 0

# PR timestamps are parsed as naive values from "...Z" (UTC) strings, and the
# window end is serialized with a literal "Z", so "now" must be naive UTC too.
current_date = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)

# Load the input once and close the file before the long-running loop.
# NOTE: pickle.load executes arbitrary code from the file; only use trusted input.
with open(SPAM_PRS_PATH, "rb") as f:
    data = pickle.load(f)
spam_prs = data["df"]

for index, pr in enumerate(spam_prs):
    if index < start_index:
        continue  # already handled by a previous run

    pr_id = pr.get("id")
    username = pr.get("author_name")
    # Skip incomplete entries before touching their date fields.
    if not username or not pr_id:
        continue

    author_account_created_at = get_datetime_str(pr.get("author_account_created_at"))
    pr_labeled_spam_at = get_datetime_str(pr.get("labeled_spam_at"))

    totals = _new_totals()
    user_url = None  # stays None when no request for this user succeeds
    user_query_start_date = author_account_created_at
    is_contribution_before_spam_label = True

    # Walk from account creation to the spam label, then from the label to now,
    # one window at a time.
    # NOTE(review): the totalRepositoriesWith* values are summed per window; a
    # repository touched in several windows is presumably counted once per
    # window - confirm this is the intended metric.
    while True:
        phase_limit = (
            pr_labeled_spam_at if is_contribution_before_spam_label else current_date
        )
        user_query_end_date = min(user_query_start_date + QUERY_WINDOW, phase_limit)

        try:
            response = execute_query(
                username,
                start=get_serializable_date_str(user_query_start_date),
                end=get_serializable_date_str(user_query_end_date),
            )
            user = response["user"]
            user_url = user["url"]
            contributionsCollection = user["contributionsCollection"]

            # Read every field first so a malformed response adds nothing.
            window_counts = {
                prefix: _read_contribution_field(contributionsCollection, path)
                for prefix, path in CONTRIBUTION_FIELDS
            }
            phase = "before" if is_contribution_before_spam_label else "after"
            for prefix, count in window_counts.items():
                totals[f"{prefix}_{phase}_spam_label"] += count
        except Exception as e:
            # Keep what was gathered so far for this user; the record and the
            # checkpoint are written right after the loop.
            print(f"Error querying contributions for {username}: {e}")
            break

        if (
            user_query_end_date >= pr_labeled_spam_at
            and user_query_end_date >= current_date
        ):
            break

        # Reaching the label timestamp ends the "before" phase.
        if user_query_end_date == pr_labeled_spam_at:
            is_contribution_before_spam_label = False
        user_query_start_date = user_query_end_date + datetime.timedelta(seconds=1)

    record = {
        "username": username,
        "user_query_start_date": get_serializable_date_str(user_query_start_date),
        "user_query_end_date": get_serializable_date_str(user_query_end_date),
        "pr_labeled_spam_at": get_serializable_date_str(pr_labeled_spam_at),
        "is_contribution_before_spam_label": is_contribution_before_spam_label,
        "url": user_url,
    }
    record.update(totals)
    df.append(record)

    # Advance the in-memory cursor too, so re-running this cell in the same
    # kernel does not process the same PR twice.
    start_index = index + 1
    _save_checkpoint(df, start_index)

Executing query with username: luisAzcuaga, start: 2018-03-22T01:25:11Z, afterCursor: 2019-03-22T01:25:11Z
Executing query with username: luisAzcuaga, start: 2019-03-22T01:25:12Z, afterCursor: 2020-03-21T01:25:12Z
Executing query with username: luisAzcuaga, start: 2020-03-21T01:25:13Z, afterCursor: 2021-03-21T01:25:13Z
Executing query with username: luisAzcuaga, start: 2021-03-21T01:25:14Z, afterCursor: 2021-10-27T08:29:55Z
Executing query with username: luisAzcuaga, start: 2021-10-27T08:29:56Z, afterCursor: 2022-10-27T08:29:56Z
Executing query with username: luisAzcuaga, start: 2022-10-27T08:29:57Z, afterCursor: 2023-10-27T08:29:57Z
Executing query with username: luisAzcuaga, start: 2023-10-27T08:29:58Z, afterCursor: 2024-10-26T08:29:58Z
Executing query with username: luisAzcuaga, start: 2024-10-26T08:29:59Z, afterCursor: 2025-02-03T16:09:35Z
Executing query with username: EbhomenyeEmmanuel, start: 2018-08-11T15:08:57Z, afterCursor: 2019-08-11T15:08:57Z
Executing query with username: 

In [74]:
from fileinput import filename
import pickle
import pandas as pd


def display_pkl_content(filepath, output_basename="seg_user_contribution_count"):
    """
    Export the records stored in a pickle file to CSV and JSON.

    The pickle file must contain a mapping with a "df" entry holding the
    records. They are written to '<output_basename>.csv' (with the row index
    as first column) and '<output_basename>.json'.

    Errors are not raised: a failure while reading the pickle or writing the
    CSV is printed as "An unexpected error occurred: ...", and a failure while
    writing the JSON is printed as "An error occurred: ...".

    Parameters:
    filepath (str): The path to the pickle file.
    output_basename (str): Path of the output files without extension.
        Defaults to 'seg_user_contribution_count'.

    Example:
    display_pkl_content('/path/to/your/file.pkl')
    """

    try:
        # NOTE: pickle.load executes arbitrary code from the file; only use it
        # on files produced by this notebook.
        with open(filepath, "rb") as pkl_file:
            data = pickle.load(pkl_file)

        print(f"Content of {filepath}:\n")
        records = data["df"]
        pd.DataFrame(records).to_csv(f"{output_basename}.csv", index=True)
        print(f"Data written to {output_basename}.csv successfully.")
        try:
            with open(f"{output_basename}.json", "w") as json_file:
                json.dump(records, json_file, indent=4)
            print(f"Data written to {output_basename}.json successfully.")
        except Exception as e:
            print(f"An error occurred: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")



# Export the checkpoint written by the collection loop to CSV and JSON.
filepath = "seg_user_contribution_count_progress.pkl"  # path of the .pkl file to export
display_pkl_content(filepath)

Content of seg_user_contribution_count_progress.pkl:

Data written to seg_user_contribution_count.csv successfully.
Data written to seg_user_contribution_count.json successfully.
