In [42]:
import requests
import json
import itertools
import os
from datetime import datetime
from gql import gql, Client
from gql.transport.requests import RequestsHTTPTransport
import pandas as pd
import random
from tqdm import tqdm
import time

# Read tokens from a text file (one token per line)
tokens_file = "./env/tokens.txt"
with open(tokens_file, "r") as file:
    # Strip surrounding whitespace and drop blank lines so a stray empty line
    # never ends up in the rotation as an empty "Bearer " credential.
    tokens = [line.strip() for line in file if line.strip()]

if not tokens:
    # itertools.cycle([]) is empty, so next() below would otherwise fail with
    # a bare StopIteration that says nothing about the real cause.
    raise ValueError(f"No tokens found in {tokens_file}")

# Create an iterator to cycle through the tokens
token_iterator = itertools.cycle(tokens)
current_token = next(token_iterator)

In [43]:
# Pool of User-Agent strings used to randomize the User-Agent header
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.1 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.1 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
]

# Request headers: authenticate with the currently selected token and attach
# one randomly chosen User-Agent. This single dict object is shared with the
# transport created below.
headers = {}
headers["Authorization"] = f"Bearer {current_token}"
headers["User-Agent"] = random.choice(user_agents)

# GraphQL endpoint, HTTP transport and client
graphql_url = "https://api.github.com/graphql"
transport = RequestsHTTPTransport(
    url=graphql_url,
    headers=headers,
    use_json=True,
)
client = Client(
    transport=transport,
    fetch_schema_from_transport=True,
)

In [None]:
# Test all tokens to verify their validity
def test_all_tokens():
    """Run a minimal ``viewer { login }`` query once per token and print the outcome.

    Every token gets its own transport and client, built locally, so the
    module-level ``transport``/``client`` are not modified. A failing token is
    reported with the error text and the loop moves on to the next one;
    nothing is raised and nothing is returned.
    """
    test_query = gql(
        """
        {
          viewer {
            login
          }
        }
        """
    )

    def build_client(token):
        # One throwaway client per token, with a randomly chosen User-Agent.
        token_headers = {
            "Authorization": f"Bearer {token}",
            "User-Agent": random.choice(user_agents),
        }
        token_transport = RequestsHTTPTransport(
            url=graphql_url, headers=token_headers, use_json=True
        )
        return Client(transport=token_transport, fetch_schema_from_transport=True)

    for i, token in enumerate(tokens):
        token_client = build_client(token)
        try:
            response = token_client.execute(test_query)
        except Exception as e:
            print(f"Token {i+1}/{len(tokens)} failed with error: {e}")
        else:
            print(
                f"Token {i+1}/{len(tokens)} is valid. Logged in as: {response['viewer']['login']}"
            )


# Run the token validation
test_all_tokens()

In [45]:
# Define the GraphQL query.
# Variables: $username (String!), $start and $end (DateTime!).
# For the given user it requests the profile URL plus the contribution totals
# of the contributionsCollection between $start and $end.
# The lines starting with "#" INSIDE the string are GraphQL comments: they hold
# a disabled per-repository commit breakdown and are part of the string
# literal, not Python comments.
query_template = gql(
    """
      query($username: String!, $start: DateTime!, $end: DateTime!) {
        user(login: $username) {
          url
          contributionsCollection(from: $start, to: $end) {
            contributionCalendar {
              totalContributions
            }
            totalCommitContributions
            totalIssueContributions
            totalPullRequestContributions
            totalPullRequestReviewContributions
            totalRepositoriesWithContributedCommits
            totalRepositoriesWithContributedIssues
            totalRepositoriesWithContributedPullRequestReviews
            totalRepositoriesWithContributedPullRequests
            totalRepositoryContributions
            restrictedContributionsCount
            # move this to a separate query and limit start and end dates to the 1 before and after when the 'spam' PR was submitted
            # commitContributionsByRepository {
            #   repository {
            #     name
            #     owner {
            #       login
            #     }
            #   }
            #   contributions {
            #     totalCount
            #   }
            # }
          }
        }
      }
    """
)

In [46]:
# def get_contributor_count(repo_owner, repo_name):
#     global current_token
#     max_retries = 3
#     retries = 0
#     while retries < max_retries:
#         try:
#             # Randomize User-Agent for each query
#             headers["User-Agent"] = random.choice(user_agents)
#             headers["Authorization"] = f"Bearer {current_token}"
#             url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/contributors?per_page=1&anon=true"
#             response = requests.get(url, headers=headers)
#             if response.status_code == 200:
#                 return int(response.headers.get("Link", "").split(",")[-1].split("&page=")[-1].split(">")[0]) if "Link" in response.headers else len(response.json())
#             elif response.status_code == 403:
#                 print(f"Rate limit exceeded, switching token... (Attempt {retries + 1}/{max_retries})")
#                 current_token = next(token_iterator)
#                 retries += 1
#             else:
#                 response.raise_for_status()
#         except Exception as e:
#             print(f"Error: {e}, retrying... (Attempt {retries + 1}/{max_retries})")
#             retries += 1
#     raise Exception("Max retries reached. Unable to complete the request.")

In [None]:
# Point the transport at the shared headers dict so this check runs with the
# currently selected token and User-Agent.
transport.headers = headers
# Check rate limit before executing the main query.
# The query asks for the login of the authenticated user (viewer) and the
# rateLimit fields limit / remaining / used / resetAt.
rate_limit_query = gql(
    """
    query {
      viewer {
        login
      }
      rateLimit {
        limit
        remaining
        used
        resetAt
      }
    }
    """
)
# One-off execution: prints the whole rateLimit mapping for the current token.
rate_limit_response = client.execute(rate_limit_query)
print(f"Rate limit: {rate_limit_response['rateLimit']}")

In [48]:
# Rate-limit probe used by execute_query() below: returns the authenticated
# user's login and the rateLimit fields limit / remaining / used / resetAt.
# The query text is the same as in the rate-limit check cell above; defining it
# here again means execute_query() does not depend on that cell having run.
rate_limit_query = gql(
    """
    query {
      viewer {
        login
      }
      rateLimit {
        limit
        remaining
        used
        resetAt
      }
    }
    """
)


def execute_query(username, start=100, end=None, max_retries=3, retry_wait_seconds=3600):
    """Run ``query_template`` for one user and date window, rotating tokens as needed.

    Before each attempt the remaining quota of the current token is read with
    ``rate_limit_query``. If fewer than 100 points remain, the function walks
    through ``token_iterator`` until a token with enough quota is found; when
    a full cycle finds none it sleeps for one hour and checks again.

    Errors whose text contains "API rate limit" switch to the next token and
    retry immediately. Any other error counts as a failed attempt: the
    function waits ``retry_wait_seconds``, switches token and tries again, and
    re-raises the last error once ``max_retries`` attempts have failed.
    Previously such errors were retried forever through unbounded recursion.

    Args:
        username: GitHub login, sent as the ``$username`` variable.
        start: Sent as the ``$start`` variable (declared ``DateTime!`` in
            ``query_template``). NOTE(review): the default of 100 does not look
            like a DateTime; the caller in this notebook always passes an
            ISO-8601 string.
        end: Sent as the ``$end`` variable (declared ``DateTime!``).
        max_retries: Number of failed attempts (excluding rate-limit token
            switches) tolerated before the error is re-raised.
        retry_wait_seconds: Pause between failed attempts, in seconds.

    Returns:
        Whatever ``client.execute(query_template, ...)`` returns.

    Raises:
        Exception: The last non-rate-limit error after ``max_retries`` failed
            attempts.
    """
    global current_token
    print(
        f"Executing query with username: {username}, start: {start}, end: {end}"
    )
    failed_attempts = 0
    while True:
        try:
            # Randomize User-Agent for each query
            headers["User-Agent"] = random.choice(user_agents)
            transport.headers = headers
            # Check rate limit before executing the main query
            rate_limit_response = client.execute(rate_limit_query)
            remaining = rate_limit_response["rateLimit"]["remaining"]
            if remaining < 100:
                print(
                    f"Rate limit remaining ({remaining}) is below threshold. Switching token..."
                )
                # Set up to track whether we have cycled through all tokens
                all_tokens_checked = False
                initial_token = current_token

                while not all_tokens_checked:
                    # Switch to the next token
                    current_token = next(token_iterator)
                    headers["Authorization"] = f"Bearer {current_token}"
                    transport.headers = headers

                    # Check the rate limit of the new token
                    rate_limit_response = client.execute(rate_limit_query)
                    remaining = rate_limit_response["rateLimit"]["remaining"]

                    if remaining >= 100:
                        print(
                            f"Switched to a new token with sufficient rate limit ({remaining} remaining)."
                        )
                        break

                    # Back at the token we started from: every token is low.
                    if current_token == initial_token:
                        print("All tokens are below threshold. Waiting for 1 hour...")
                        time.sleep(3600)
                        all_tokens_checked = True

                # Re-check the quota of whichever token is now selected.
                continue
            return client.execute(
                query_template,
                variable_values={
                    "username": username,
                    "start": start,
                    "end": end,
                },
            )
        except Exception as e:
            if "API rate limit" in str(e):
                print(
                    f"Rate limit reached: {e}, switching token... (Attempt with start {start} and end {end})"
                )
                current_token = next(token_iterator)
                headers["Authorization"] = f"Bearer {current_token}"
                continue

            failed_attempts += 1
            print(f"Error: {e}, stopped during (Attempt with start {start} and end {end})")
            if failed_attempts >= max_retries:
                # Hand the error to the caller instead of sleeping forever on
                # something that will never succeed (e.g. an unknown user).
                print(f"Max retries reached ({failed_attempts}/{max_retries}). Giving up.")
                raise

        # Only reached after a non-rate-limit failure that may still be retried.
        print(
            f"Attempt {failed_attempts}/{max_retries} failed. "
            f"Sleeping for {retry_wait_seconds} seconds and switching token..."
        )
        time.sleep(retry_wait_seconds)
        current_token = next(token_iterator)
        headers["Authorization"] = f"Bearer {current_token}"
        transport.headers = headers

In [49]:
import pickle

# Resume from the checkpoint file when one exists; otherwise start with an
# empty result list at position 0.
if not os.path.exists("user_contribution_count_progress.pkl"):
    df = []
    start_index = 0
else:
    with open("user_contribution_count_progress.pkl", "rb") as f:
        progress_data = pickle.load(f)
    # The checkpoint is a dict holding the collected rows and the next index.
    df = progress_data["df"]
    start_index = progress_data["start_index"]

In [50]:
import datetime


def get_unique_authors_with_creation_time(pull_requests):
    """
    Collapses a list of pull requests into one record per author, keeping the
    earliest account-creation timestamp seen for that author.

    Args:
        pull_requests: A list of dictionaries representing pull requests,
            each with an 'author_name' key and an 'author_account_created_at'
            key holding a timestamp string formatted as "%Y-%m-%dT%H:%M:%SZ".

    Returns:
        A list of dictionaries (in order of first appearance), each containing:
            - "author_name": The unique author name.
            - "first_created_at": The earliest timestamp for that author as a
              timezone-aware (UTC) datetime object.
        Returns an empty list if pull_requests is empty or None.

    Pull requests with a missing author or timestamp, or with a timestamp that
    is not a string in the expected format, are reported with a warning and
    skipped.
    """

    if not pull_requests:  # Handle empty or None input
        return []

    timestamp_format = "%Y-%m-%dT%H:%M:%SZ"
    author_info = {}

    for pr in pull_requests:
        author_name = pr.get("author_name")
        created_at_str = pr.get("author_account_created_at")

        if not author_name or not created_at_str:
            print(
                f"Warning: Pull request missing 'author_name' or 'author_account_created_at': {pr}"
            )
            continue  # Skip this PR if it's missing required data

        try:
            created_at = datetime.datetime.strptime(
                created_at_str, timestamp_format
            ).replace(tzinfo=datetime.timezone.utc)
        except (ValueError, TypeError):
            # ValueError: wrong format. TypeError: value is not a string at all
            # (strptime only accepts str). Both are skipped, not fatal.
            print(f"Warning: Invalid date format: {created_at_str} for PR: {pr}")
            continue

        known = author_info.get(author_name)
        if known is None:
            author_info[author_name] = {
                "author_name": author_name,
                "first_created_at": created_at,
            }
        elif created_at < known["first_created_at"]:
            known["first_created_at"] = created_at

    return list(author_info.values())

In [69]:
# Reset the result list and the resume position to their empty defaults,
# replacing whatever values these two names held before this cell ran.
df, start_index = [], 0

In [None]:
import datetime

# Output column -> contributionsCollection field, in the order the columns
# appear in each collected row.
CONTRIBUTION_TOTAL_FIELDS = {
    "total_commit_contributions": "totalCommitContributions",
    "total_issue_contributions": "totalIssueContributions",
    "total_pull_request_contributions": "totalPullRequestContributions",
    "total_pull_request_review_contributions": "totalPullRequestReviewContributions",
    "total_repositories_with_contributed_commits": "totalRepositoriesWithContributedCommits",
    "total_repositories_with_contributed_issues": "totalRepositoriesWithContributedIssues",
    "total_repositories_with_contributed_pull_request_reviews": "totalRepositoriesWithContributedPullRequestReviews",
    "total_repositories_with_contributed_pull_requests": "totalRepositoriesWithContributedPullRequests",
    "total_repository_contributions": "totalRepositoryContributions",
    "restricted_contributions_count": "restrictedContributionsCount",
}


def _save_contribution_progress(rows, next_index, path="user_contribution_count_progress.pkl"):
    """Write the checkpoint ({"df": rows, "start_index": next_index}) to ``path``.

    The data is written to a temporary file first and then moved into place
    with os.replace, so an interrupted run cannot leave a truncated checkpoint.
    """
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "wb") as checkpoint:
        pickle.dump({"df": rows, "start_index": next_index}, checkpoint)
    os.replace(tmp_path, path)


user_contribution_data = df
current_year = datetime.datetime.now().year
all_contributions = {}  # not used in this cell

# Load the spam pull requests and close the file before the long-running loop.
# NOTE: pickle.load can execute arbitrary code; only open files you produced.
with open("progress.pkl", "rb") as f:
    data = pickle.load(f)
spam_prs = data["df"]
unique_users = get_unique_authors_with_creation_time(spam_prs)

# Resume support: skip every (username, year) that already has a row.
# start_index is deliberately not used for skipping, because the previous
# version of this loop never advanced its index and stored start_index + 1 in
# every checkpoint, so stored values do not reflect how many users were done.
already_collected = {
    (row["username"], row["year"]) for row in user_contribution_data
}

for index, user in enumerate(unique_users):
    username = user["author_name"]
    created_at = user["first_created_at"]
    # One query per calendar year from account creation up to the current year.
    for year in range(created_at.year, current_year + 1):
        if (username, year) in already_collected:
            continue
        try:
            response = execute_query(
                username,
                start=f"{year}-01-01T00:00:00Z",
                end=f"{year}-12-31T23:59:59Z",
            )
            contributionsCollection = response["user"]["contributionsCollection"]
            row = {
                "username": username,
                "year": year,
                "url": response["user"]["url"],
                "contributions_count": contributionsCollection[
                    "contributionCalendar"
                ]["totalContributions"],
                "account_created_at": created_at.strftime("%Y-%m-%dT%H:%M:%SZ"),
            }
            row.update(
                {
                    column: contributionsCollection[field]
                    for column, field in CONTRIBUTION_TOTAL_FIELDS.items()
                }
            )
            df.append(row)
            already_collected.add((username, year))
            # The user is complete once the last year of the range is stored.
            next_index = index + 1 if year == current_year else index
            _save_contribution_progress(user_contribution_data, next_index)

        except Exception as e:
            print(f"Failed to retrieve data for username '{username}': {e}")
            # Save usercontrib-progress before terminating
            _save_contribution_progress(df, index)
            raise

In [None]:
from fileinput import filename
import pickle
import pandas as pd


def _print_frame_preview(frame):
    """Print the first rows of ``frame`` as a Markdown table, then ``frame.info()``."""
    print(frame.head().to_markdown(index=False, numalign="left", stralign="left"))
    print("\nDataFrame Info:")
    frame.info()


def display_pkl_content(filepath):
    """
    Loads and displays the content of a .pkl (pickle) file in a Jupyter Notebook.
    Handles different data structures within the pickle file and provides informative output.

    What is printed depends on the type of the unpickled object:
        - DataFrame: a preview of the first rows and ``info()``.
        - list: the same preview if every item is a dict, otherwise the first
          10 items and the length.
        - dict: every key with its value (DataFrame preview, first 10 list
          items, or the plain value).
        - set: the first 10 items and the length.
        - anything else: its type and the value itself.

    Side effect: when the object is a dict that has a "df" key, that value is
    also exported to "user_contribution.csv" and
    "user_contribution_count.json" in the current working directory. Dicts
    without a "df" key are displayed without any export.

    Args:
        filepath: Path of the pickle file to open.

    Returns:
        None. Errors (missing file, unpickling failure, anything else) are
        printed rather than raised.
    """
    try:
        # NOTE: pickle.load can execute arbitrary code; only open trusted files.
        with open(filepath, "rb") as f:
            data = pickle.load(f)

        print(f"Content of {filepath}:\n")

        if isinstance(data, pd.DataFrame):
            print("Data is a Pandas DataFrame:")
            _print_frame_preview(data)
        elif isinstance(data, list):
            print("Data is a List:")
            if all(isinstance(item, dict) for item in data):
                # A list of dicts is shown as a table.
                _print_frame_preview(pd.DataFrame(data))
            else:
                print(data[:10])  # Print first 10 items if not dictionaries
                print(f"\nList Length: {len(data)}")
        elif isinstance(data, dict):
            print("Data is a Dictionary:")
            if "df" in data:
                # Export only when the checkpoint-style "df" entry exists, so
                # other dictionaries can still be displayed below.
                pd.DataFrame(data["df"]).to_csv("user_contribution.csv", index=True)
                json_path = "user_contribution_count.json"
                try:
                    with open(json_path, "w") as json_file:
                        json.dump(data["df"], json_file, indent=4)
                    print(f"Data written to {json_path} successfully.")
                except Exception as e:
                    print(f"An error occurred: {e}")
            for key, value in data.items():
                print(f"\nKey: {key}")
                if isinstance(value, pd.DataFrame):
                    print("Value is a Pandas DataFrame:")
                    _print_frame_preview(value)
                elif isinstance(value, list):
                    print("Value is a List:")
                    print(value[:10])
                    print(f"\nList Length: {len(value)}")
                else:
                    print(f"Value: {value}")
        elif isinstance(data, set):
            print("Data is a Set:")
            print(list(data)[:10])
            print(f"\nSet Length: {len(data)}")
        else:
            print(f"Data is of type: {type(data)}")
            print(data)

    except FileNotFoundError:
        print(f"Error: File not found at {filepath}")
    except pickle.UnpicklingError:
        print(
            f"Error: Could not unpickle data from {filepath}. The file might be corrupted or use a different pickle protocol."
        )
    except Exception as e:
        print(f"An unexpected error occurred: {e}")


# Example usage:
filepath = "user_contribution_count_progress.pkl"  # Replace with the actual path to your .pkl file
display_pkl_content(filepath)