In [18]:
import requests
import json
import itertools
import os
from datetime import datetime
from gql import gql, Client
from gql.transport.requests import RequestsHTTPTransport
import pandas as pd
import random
from tqdm import tqdm
import time

# Read tokens from a text file (one token per line).
tokens_file = "./env/tokens.txt"
with open(tokens_file, "r") as file:
    # Strip surrounding whitespace and drop blank lines: an empty or padded
    # entry would otherwise be sent as an invalid "Bearer " credential.
    tokens = [line.strip() for line in file if line.strip()]

# Fail with a clear message instead of the bare StopIteration that
# next(itertools.cycle([])) would raise on an empty token list.
if not tokens:
    raise ValueError(f"No tokens found in {tokens_file}")

# Create an iterator to cycle through the tokens
token_iterator = itertools.cycle(tokens)
current_token = next(token_iterator)

In [19]:
# Pool of browser User-Agent strings; one is drawn at random whenever headers are built
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.1 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.1 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0"
]

# Request headers: authenticate with the token currently selected from the cycle
headers = {}
headers["Authorization"] = f"Bearer {current_token}"
headers["User-Agent"] = random.choice(user_agents)

# GitHub GraphQL endpoint, plus the shared transport/client pair used by later cells.
# The transport is handed the `headers` dict itself, not a copy.
graphql_url = "https://api.github.com/graphql"
transport = RequestsHTTPTransport(
    url=graphql_url,
    headers=headers,
    use_json=True,
)
client = Client(
    transport=transport,
    fetch_schema_from_transport=True,
)

In [None]:
# Test all tokens to verify their validity
def test_all_tokens():
    """Run a minimal ``viewer { login }`` query once per token and print the outcome.

    Each token gets its own transport and client, so the shared module-level
    ``client`` is not touched. Any exception raised while executing the query
    or reading the response is caught and reported; nothing is returned.
    """
    test_query = gql(
        """
        {
          viewer {
            login
          }
        }
        """
    )
    for i, token in enumerate(tokens):
        # Dedicated headers/client for this token only
        token_headers = {
            "Authorization": f"Bearer {token}",
            "User-Agent": random.choice(user_agents)
        }
        token_client = Client(
            transport=RequestsHTTPTransport(url=graphql_url, headers=token_headers, use_json=True),
            fetch_schema_from_transport=True,
        )

        try:
            response = token_client.execute(test_query)
            print(f"Token {i+1}/{len(tokens)} is valid. Logged in as: {response['viewer']['login']}")
        except Exception as e:
            print(f"Token {i+1}/{len(tokens)} failed with error: {e}")


# Validate every configured token up front
test_all_tokens()

In [21]:
# Define the GraphQL query
#
# Parsed once here and reused for every page of results. Variables:
#   $keyword     - the search string passed to `search(query: ...)`
#   $afterCursor - pagination cursor (optional; omitted/None for the first page)
#   $first       - page size
# For each matching repository the query requests: name, description, up to 100
# topics, url, star/fork/watcher counts, open and closed issue counts, open and
# closed pull-request counts, the commit count of the default branch history,
# createdAt, pushedAt and the release count. `pageInfo` carries the cursor and
# the has-next-page flag used by the pagination loop.
query_template = gql(
    """
    query searchRepositories($keyword: String!, $afterCursor: String, $first: Int) {
      search(query: $keyword, type: REPOSITORY, first: $first, after: $afterCursor) {
        repositoryCount
        edges {
          cursor
          node {
            ... on Repository {
              name
              description
              repositoryTopics(first: 100) {
                            edges {
                                node {
                                    topic {
                                        name
                                    }
                                }
                            }
                        }
              url
              stargazers {
                totalCount
              }
              forks {
                totalCount
              }
              watchers {
                totalCount
              }
              issues(states: OPEN) {
                totalCount
              }
              issuesClosed: issues(states: CLOSED) {
                totalCount
              }
              pullRequests(states: OPEN) {
                totalCount
              }
              pullRequestsClosed: pullRequests(states: CLOSED) {
                totalCount
              }
              defaultBranchRef {
                target {
                  ... on Commit {
                    history {
                      totalCount
                    }
                  }
                }
              }
              createdAt
              pushedAt
              releases {
                totalCount
              }
            }
          }
        }
        pageInfo {
          endCursor
          hasNextPage
        }
      }
    }
    """
)


In [22]:
# def get_contributor_count(repo_owner, repo_name):
#     global current_token
#     max_retries = 3
#     retries = 0
#     while retries < max_retries:
#         try:
#             # Randomize User-Agent for each query
#             headers["User-Agent"] = random.choice(user_agents)
#             headers["Authorization"] = f"Bearer {current_token}"
#             url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/contributors?per_page=1&anon=true"
#             response = requests.get(url, headers=headers)
#             if response.status_code == 200:
#                 return int(response.headers.get("Link", "").split(",")[-1].split("&page=")[-1].split(">")[0]) if "Link" in response.headers else len(response.json())
#             elif response.status_code == 403:
#                 print(f"Rate limit exceeded, switching token... (Attempt {retries + 1}/{max_retries})")
#                 current_token = next(token_iterator)
#                 retries += 1
#             else:
#                 response.raise_for_status()
#         except Exception as e:
#             print(f"Error: {e}, retrying... (Attempt {retries + 1}/{max_retries})")
#             retries += 1
#     raise Exception("Max retries reached. Unable to complete the request.")

In [None]:
# Point the shared transport at the current `headers` dict before querying.
transport.headers = headers
# Check rate limit before executing the main query
# The query asks for the authenticated login plus the rate-limit fields
# (limit, remaining, used, resetAt) of the token currently in `headers`.
rate_limit_query = gql(
    """
    query {
      viewer {
        login
      }
      rateLimit {
        limit
        remaining
        used
        resetAt
      }
    }
    """
)
# One-off sanity check: print the rateLimit object returned for the current token.
rate_limit_response = client.execute(rate_limit_query)
print(f"Rate limit: {rate_limit_response['rateLimit']}")

In [24]:
# Rate-limit probe used by execute_query below. This repeats the definition
# from the previous cell so that this cell can be run on its own.
rate_limit_query = gql(
    """
    query {
      viewer {
        login
      }
      rateLimit {
        limit
        remaining
        used
        resetAt
      }
    }
    """
)
def execute_query(keyword, first=100, after_cursor=None):
    """Run the repository search query, rotating tokens and backing off as needed.

    Before each attempt the current token's remaining rate limit is checked;
    if it is below 100 the tokens are cycled until one with enough budget is
    found, sleeping one hour when a full cycle finds none.

    Args:
        keyword: Search string passed as the ``$keyword`` query variable.
        first: Page size to request. It is halved after every error that is
            not a rate-limit error, down to a minimum of 1.
        after_cursor: Pagination cursor, or None for the first page.

    Returns:
        The result of ``client.execute`` for ``query_template``.

    Notes:
        The function does not return until the query succeeds. Every
        ``Exception`` is handled by retrying: a token switch for rate-limit
        errors, a smaller page for anything else, and a one-hour sleep once
        retries are exhausted. The retry is a plain loop rather than a
        recursive call, so repeated sleeps cannot grow the call stack.
    """
    global current_token
    # Count of token switches triggered by "API rate limit" errors in this call.
    # Used to pause once every token has been tried, instead of spinning.
    rate_limit_switches = 0
    while True:
        try:
            # Randomize User-Agent for each query
            headers["User-Agent"] = random.choice(user_agents)
            transport.headers = headers
            # Check rate limit before executing the main query
            rate_limit_response = client.execute(rate_limit_query)
            remaining = rate_limit_response["rateLimit"]["remaining"]
            if remaining < 100:
                print(f"Rate limit remaining ({remaining}) is below threshold. Switching token...")
                # Remember where the cycle started so a full lap can be detected
                initial_token = current_token

                while True:
                    # Switch to the next token
                    current_token = next(token_iterator)
                    headers["Authorization"] = f"Bearer {current_token}"
                    transport.headers = headers

                    # Check the rate limit of the new token
                    rate_limit_response = client.execute(rate_limit_query)
                    remaining = rate_limit_response["rateLimit"]["remaining"]

                    if remaining >= 100:
                        print(f"Switched to a new token with sufficient rate limit ({remaining} remaining).")
                        break

                    # Back at the starting token: every token is below threshold
                    if current_token == initial_token:
                        print("All tokens are below threshold. Waiting for 1 hour...")
                        time.sleep(3600)
                        break

                # Re-run the rate-limit check with whichever token is now selected
                continue
            return client.execute(query_template, variable_values={"keyword": keyword, "first": first, "afterCursor": after_cursor})
        except Exception as e:
            if "API rate limit" in str(e):
                print(f"Rate limit reached: {e}, switching token... (Attempt with first {first})")
                current_token = next(token_iterator)
                headers["Authorization"] = f"Bearer {current_token}"
                rate_limit_switches += 1
                # Once every token has produced a rate-limit error, wait instead
                # of hammering the API in a tight loop.
                if rate_limit_switches >= len(tokens):
                    print("All tokens are rate limited. Waiting for 1 hour...")
                    time.sleep(3600)
                    rate_limit_switches = 0
            elif first > 1:
                first = max(1, first // 2)
                print(f"Error: {e}, reducing number of results and retrying... (Attempt with first {first})")
            else:
                # Already at the smallest page size: back off, rotate the token
                # and try again on the next loop iteration.
                print("Max retries reached. Sleeping for 60 minutes and switching token...")
                time.sleep(3600)
                current_token = next(token_iterator)
                headers["Authorization"] = f"Bearer {current_token}"
                transport.headers = headers

In [25]:
import pickle
# Resume from the checkpoint file when present; otherwise start from scratch.
# NOTE: `df` is a plain list of record dicts here, not a pandas DataFrame.
if not os.path.exists('progress.pkl'):
    df = []
    start_index = 0
else:
    with open('progress.pkl', 'rb') as checkpoint:
        progress_data = pickle.load(checkpoint)
        df = progress_data['df']
        start_index = progress_data['start_index']

In [31]:
import datetime
# template
#
# For every keyword: page through the search results, flatten each repository
# into a record, and checkpoint the accumulated records to progress.pkl.
# `start_index` (loaded from the checkpoint) is the number of keywords already
# completed; those are skipped. Delete progress.pkl (or reset `start_index`)
# when switching to a different `keywords` list.
keywords = ['spam']
index = start_index
se_fm_repository_data = df  # alias: both names refer to the same list object


def _repo_to_record(repo):
    """Flatten one Repository node of the search response into a flat dict."""
    # defaultBranchRef is nullable in GitHub's schema, so walk the chain
    # defensively and report 0 commits instead of raising a TypeError.
    branch_ref = repo.get("defaultBranchRef") or {}
    history = (branch_ref.get("target") or {}).get("history") or {}
    open_issues = repo["issues"]["totalCount"]
    closed_issues = repo["issuesClosed"]["totalCount"]
    open_prs = repo["pullRequests"]["totalCount"]
    closed_prs = repo["pullRequestsClosed"]["totalCount"]
    return {
        "name": repo["name"],
        "description": repo["description"],
        "url": repo["url"],
        "topics": [topic["node"]["topic"]["name"] for topic in repo["repositoryTopics"]["edges"]],
        "stars": repo["stargazers"]["totalCount"],
        "forks": repo["forks"]["totalCount"],
        "watchers": repo["watchers"]["totalCount"],
        "open_issues": open_issues,
        "closed_issues": closed_issues,
        "total_issues": open_issues + closed_issues,
        "open_pull_requests": open_prs,
        "closed_pull_requests": closed_prs,
        "total_pull_requests": open_prs + closed_prs,
        "commits": history.get("totalCount", 0),
        "created_at": repo["createdAt"],
        "last_commit": repo["pushedAt"],
        "releases": repo["releases"]["totalCount"]
    }


for index, keyword in enumerate(tqdm(keywords)):
    if index < start_index:
        # Already completed according to the checkpoint; re-fetching would
        # append duplicate rows.
        continue
    search_keyword = f'"{keyword}" in:name,readme,description,topics created:>=2024-11-30 fork:false is:public archived:false size:>0 stars:>=5 sort:stars-desc'
    # Rows are collected per keyword and merged into `df` only when the keyword
    # finishes, so a mid-keyword failure never checkpoints partial rows that
    # would be duplicated on resume.
    keyword_records = []
    try:
        after_cursor = None
        while True:
            response = execute_query(search_keyword, first=100, after_cursor=after_cursor)
            search = response['search']
            if search['repositoryCount'] == 0:
                break
            # Extract repositories
            keyword_records.extend(_repo_to_record(edge['node']) for edge in search['edges'])

            # Pagination
            page_info = search['pageInfo']
            if not page_info['hasNextPage']:
                break
            after_cursor = page_info['endCursor']

        df.extend(keyword_records)
        with open('progress.pkl', 'wb') as f:
            pickle.dump({'df': se_fm_repository_data, 'start_index': index + 1}, f)
        # Keep the in-kernel counter in step with the file so re-running this
        # cell does not re-fetch completed keywords.
        start_index = index + 1

    except Exception as e:
        print(f"Failed to retrieve data for keywords '{keyword}': {e}")
        # Save progress before terminating (completed keywords only)
        with open('progress.pkl', 'wb') as f:
            pickle.dump({'df': df, 'start_index': index}, f)
        raise


100%|██████████| 1/1 [00:10<00:00, 10.05s/it]


In [32]:
import pickle
import pandas as pd

def display_pkl_content(filepath):
    """
    Load a pickle file and print a short, type-dependent preview of its content.

    DataFrames (and top-level lists made up only of dicts, which are converted
    to a DataFrame) are shown as a Markdown table of the first rows followed by
    ``DataFrame.info()``. Other lists and sets show their first 10 items and
    their length. Dictionaries are previewed key by key. Anything else is
    printed with its type. A missing file, an unpickling error or any other
    exception is reported with a printed message rather than raised.

    NOTE: ``pickle.load`` can execute arbitrary code; only use on trusted files.
    """

    def show_frame(frame):
        # Shared table preview: first rows as Markdown, then the frame summary
        print(frame.head().to_markdown(index=False, numalign="left", stralign="left"))
        print("\nDataFrame Info:")
        frame.info()

    try:
        with open(filepath, 'rb') as handle:
            payload = pickle.load(handle)

        print(f"Content of {filepath}:\n")

        if isinstance(payload, pd.DataFrame):
            print("Data is a Pandas DataFrame:")
            show_frame(payload)
            return

        if isinstance(payload, list):
            print("Data is a List:")
            has_non_dict = any(not isinstance(entry, dict) for entry in payload)
            if has_non_dict:
                # Mixed or scalar list: show the first 10 items and the size
                print(payload[:10])
                print(f"\nList Length: {len(payload)}")
            else:
                # List of records: render as a table
                show_frame(pd.DataFrame(payload))
            return

        if isinstance(payload, dict):
            print("Data is a Dictionary:")
            for key in payload:
                entry = payload[key]
                print(f"\nKey: {key}")
                if isinstance(entry, pd.DataFrame):
                    print("Value is a Pandas DataFrame:")
                    show_frame(entry)
                elif isinstance(entry, list):
                    print("Value is a List:")
                    print(entry[:10])
                    print(f"\nList Length: {len(entry)}")
                else:
                    print(f"Value: {entry}")
            return

        if isinstance(payload, set):
            print("Data is a Set:")
            print(list(payload)[:10])
            print(f"\nSet Length: {len(payload)}")
            return

        # Fallback for any other type
        print(f"Data is of type: {type(payload)}")
        print(payload)

    except FileNotFoundError:
        print(f"Error: File not found at {filepath}")
    except pickle.UnpicklingError:
        print(f"Error: Could not unpickle data from {filepath}. The file might be corrupted or use a different pickle protocol.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

# Example usage: preview the checkpoint file written by the scraping loop.
filepath = 'progress.pkl'  # Replace with the actual path to your .pkl file
display_pkl_content(filepath)

Content of progress.pkl:

Data is a Dictionary:

Key: df
Value is a List:
[{'name': 'Solana-Raydium-Bundler', 'description': 'Raydium bundler, raydium bundler with jito, Raydium bundler, Raydium bundler', 'url': 'https://github.com/g0drlc/Solana-Raydium-Bundler', 'topics': ['raydium', 'raydium-bundler-bot', 'spl-token'], 'stars': 131, 'forks': 113, 'watchers': 1, 'open_issues': 0, 'closed_issues': 0, 'total_issues': 0, 'open_pull_requests': 0, 'closed_pull_requests': 0, 'total_pull_requests': 0, 'commits': 5, 'created_at': '2024-12-05T08:48:15Z', 'last_commit': '2024-12-23T14:42:11Z', 'releases': 0}, {'name': 'tirreno', 'description': 'Open source security user analytics platform. Get started - free.', 'url': 'https://github.com/TirrenoTechnologies/tirreno', 'topics': ['analytics', 'fraud-detection', 'fraud-management', 'fraud-prevention', 'intelligence', 'intranet', 'php', 'privacy', 'uba', 'web-analytics', 'self-hosted', 'behavior-analytics', 'php-project', 'rules-engine', 'audit-tra