In [118]:
import requests
import json
import itertools
import os
from datetime import datetime
from gql import gql, Client
from gql.transport.requests import RequestsHTTPTransport
import pandas as pd
import random
from tqdm import tqdm
import time

# Read tokens from a text file (one token per line)
tokens_file = "./env/tokens.txt"
with open(tokens_file, "r") as file:
    # Strip surrounding whitespace and drop blank lines so that a trailing
    # newline or stray spaces never yield an empty "Bearer " credential.
    tokens = [line.strip() for line in file if line.strip()]

if not tokens:
    # next() on itertools.cycle([]) raises a bare StopIteration; fail early
    # with a message that points at the actual problem instead.
    raise ValueError(f"No tokens found in {tokens_file}")

# Create an iterator to cycle through the tokens
token_iterator = itertools.cycle(tokens)
current_token = next(token_iterator)

In [119]:
# Pool of browser-like User-Agent strings; one is drawn at random per request
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.1 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.1 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
]

# Request headers, initially authenticated with the first token of the rotation
headers = {}
headers["Authorization"] = f"Bearer {current_token}"
headers["User-Agent"] = random.choice(user_agents)

# GitHub GraphQL endpoint and the transport/client pair shared by later cells
graphql_url = "https://api.github.com/graphql"
transport = RequestsHTTPTransport(
    headers=headers,
    url=graphql_url,
    use_json=True,
)
client = Client(fetch_schema_from_transport=True, transport=transport)

In [None]:
# Check every configured token against the GraphQL endpoint
def test_all_tokens():
    """Run a minimal ``viewer`` query with each token and print the outcome.

    Every token gets its own transport/client and a randomly chosen
    User-Agent. A failing token is reported with ``print``; nothing is raised.
    """
    viewer_query = gql(
        """
        {
          viewer {
            login
          }
        }
        """
    )
    total = len(tokens)
    for position, token in enumerate(tokens, start=1):
        token_headers = {
            "Authorization": f"Bearer {token}",
            "User-Agent": random.choice(user_agents),
        }
        token_client = Client(
            transport=RequestsHTTPTransport(
                url=graphql_url, headers=token_headers, use_json=True
            ),
            fetch_schema_from_transport=True,
        )

        try:
            login = token_client.execute(viewer_query)["viewer"]["login"]
        except Exception as e:
            print(f"Token {position}/{total} failed with error: {e}")
        else:
            print(f"Token {position}/{total} is valid. Logged in as: {login}")


# Run the token validation
test_all_tokens()

In [121]:
# GraphQL query that pages through the members of one organization.
#
# Variables:
#   $keyword     - organization login to look up
#   $first       - page size for membersWithRole
#   $afterCursor - pagination cursor (omit / null for the first page)
#
# Selected fields: the total member count, each member's login and createdAt,
# and pageInfo (hasNextPage / endCursor) for pagination.
#
# NOTE(review): despite the operation name "GetUserJoinDate", `createdAt` is
# selected on the member node itself, so it is presumably the account creation
# time rather than the date the user joined the organization - confirm against
# the GitHub GraphQL schema.
query_template = gql(
    """
    query GetUserJoinDate($keyword: String!, $afterCursor: String, $first: Int) {
      organization(login: $keyword) {
        membersWithRole(first: $first, after: $afterCursor) { 
          totalCount
          edges {
            node {
              login
              createdAt
            }
          }
          pageInfo {
            hasNextPage
            endCursor
          }
        }
      }
    }
    """
)

In [122]:
# def get_contributor_count(repo_owner, repo_name):
#     global current_token
#     max_retries = 3
#     retries = 0
#     while retries < max_retries:
#         try:
#             # Randomize User-Agent for each query
#             headers["User-Agent"] = random.choice(user_agents)
#             headers["Authorization"] = f"Bearer {current_token}"
#             url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/contributors?per_page=1&anon=true"
#             response = requests.get(url, headers=headers)
#             if response.status_code == 200:
#                 return int(response.headers.get("Link", "").split(",")[-1].split("&page=")[-1].split(">")[0]) if "Link" in response.headers else len(response.json())
#             elif response.status_code == 403:
#                 print(f"Rate limit exceeded, switching token... (Attempt {retries + 1}/{max_retries})")
#                 current_token = next(token_iterator)
#                 retries += 1
#             else:
#                 response.raise_for_status()
#         except Exception as e:
#             print(f"Error: {e}, retrying... (Attempt {retries + 1}/{max_retries})")
#             retries += 1
#     raise Exception("Max retries reached. Unable to complete the request.")

In [None]:
# Query returning the authenticated login together with the token's
# current rate-limit figures (limit / remaining / used / resetAt)
rate_limit_query = gql(
    """
    query {
      viewer {
        login
      }
      rateLimit {
        limit
        remaining
        used
        resetAt
      }
    }
    """
)

# Point the shared transport at the current headers, then run a one-off check
transport.headers = headers
rate_limit_response = client.execute(rate_limit_query)
print("Rate limit:", rate_limit_response["rateLimit"])

In [124]:
# Rate-limit probe used by execute_query before every real request.
# It asks for the authenticated login and the token's rateLimit figures
# (limit, remaining, used, resetAt). This re-declares the same query text as
# the one-off rate-limit check cell earlier in the notebook, so this cell does
# not depend on that cell having been run.
rate_limit_query = gql(
    """
    query {
      viewer {
        login
      }
      rateLimit {
        limit
        remaining
        used
        resetAt
      }
    }
    """
)


def execute_query(keyword, first=100, after_cursor=None):
    """Fetch one page of organization members, rotating tokens when needed.

    Before each attempt the current token's remaining rate-limit budget is
    read with ``rate_limit_query``. If fewer than 100 points remain, the token
    pool is cycled until a token with enough budget is found; after one full
    unsuccessful cycle the function sleeps for an hour and re-checks.

    Errors raised while querying are handled as follows:

    * messages containing ``"API rate limit"`` switch to the next token; once
      as many consecutive rate-limit errors as there are tokens have occurred,
      the function sleeps for an hour instead of retrying in a tight loop;
    * any other error halves ``first`` (never below 1) and retries; when
      ``first`` is already 1 the function sleeps for an hour, switches token
      and starts over.

    The function therefore never gives up: it returns only on success.

    Args:
        keyword: Organization login, sent as ``$keyword``.
        first: Page size, sent as ``$first``.
        after_cursor: Pagination cursor, sent as ``$afterCursor``
            (``None`` for the first page).

    Returns:
        The result of ``client.execute(query_template, ...)``.
    """
    global current_token

    def _rotate_token():
        """Advance to the next token and apply it to the shared headers."""
        global current_token
        current_token = next(token_iterator)
        headers["Authorization"] = f"Bearer {current_token}"
        transport.headers = headers

    def _remaining_budget():
        """Return the ``remaining`` rate-limit value of the active token."""
        return client.execute(rate_limit_query)["rateLimit"]["remaining"]

    token_count = max(1, len(tokens))

    # Outer loop replaces the former tail recursion: each pass is one full
    # round of attempts, followed by an hour of sleep if the round failed.
    while True:
        print(
            f"Executing query with keyword: {keyword}, first: {first}, afterCursor: {after_cursor}"
        )
        consecutive_rate_limit_errors = 0
        while True:
            try:
                # Randomize User-Agent for each query
                headers["User-Agent"] = random.choice(user_agents)
                transport.headers = headers

                # Check rate limit before executing the main query
                remaining = _remaining_budget()
                if remaining < 100:
                    print(
                        f"Rate limit remaining ({remaining}) is below threshold. Switching token..."
                    )
                    initial_token = current_token
                    while True:
                        _rotate_token()
                        remaining = _remaining_budget()
                        if remaining >= 100:
                            print(
                                f"Switched to a new token with sufficient rate limit ({remaining} remaining)."
                            )
                            break
                        # Back at the starting token: the whole pool is exhausted
                        if current_token == initial_token:
                            print(
                                "All tokens are below threshold. Waiting for 1 hour..."
                            )
                            time.sleep(3600)
                            break
                    # Re-run the budget check with whichever token is now active
                    continue

                return client.execute(
                    query_template,
                    variable_values={
                        "keyword": keyword,
                        "first": first,
                        "afterCursor": after_cursor,
                    },
                )
            except Exception as e:
                if "API rate limit" in str(e):
                    print(
                        f"Rate limit reached: {e}, switching token... (Attempt with first {first})"
                    )
                    _rotate_token()
                    consecutive_rate_limit_errors += 1
                    if consecutive_rate_limit_errors >= token_count:
                        # Every token has just been rejected; back off instead
                        # of hammering the API in a busy loop.
                        print(
                            "All tokens hit the API rate limit. Waiting for 1 hour..."
                        )
                        time.sleep(3600)
                        consecutive_rate_limit_errors = 0
                elif first > 1:
                    consecutive_rate_limit_errors = 0
                    first = max(1, first // 2)
                    print(
                        f"Error: {e}, reducing number of results and retrying... (Attempt with first {first})"
                    )
                else:
                    # Page size cannot shrink further; end this round
                    break

        print("Max retries reached. Sleeping for 60 minutes and switching token...")
        time.sleep(3600)
        _rotate_token()

In [125]:
import datetime


def get_unique_authors_with_organizations(pull_requests):
    """Collapse pull-request records into one entry per author.

    Args:
        pull_requests: Iterable of dicts. Each record is read through
            ``"author_name"``, ``"author_account_created_at"``
            (``%Y-%m-%dT%H:%M:%SZ``) and the first key starting with
            ``"author_organizations_as_at"`` (a list of organizations).
            ``None`` or an empty iterable yields ``[]``.

    Returns:
        A list with one dict per author containing ``"author_name"``,
        ``"first_created_at"`` (earliest parsed timestamp, UTC-aware) and
        ``"organizations"`` (organizations seen on any of the author's
        records, de-duplicated, in first-seen order).

    Records with a missing author/date or an unparsable date are reported
    with ``print`` and skipped.
    """
    # Local import: other cells bind the name ``datetime`` to both the class
    # and the module, so do not rely on whichever ran last.
    import datetime as _dt

    if not pull_requests:  # Handle empty or None input
        return []

    author_info = {}

    for pr in pull_requests:
        author_name = pr.get("author_name")
        created_at_str = pr.get("author_account_created_at")
        # Find the key that matches partially with 'author_organizations_as_at'
        organizations_key = next(
            (key for key in pr.keys() if key.startswith("author_organizations_as_at")),
            None,
        )
        # A missing key or an explicit None both mean "no organizations"
        organizations = pr.get(organizations_key) or []

        if not author_name or not created_at_str:
            print(f"Warning: Pull request missing 'author_name' or 'created_at': {pr}")
            continue  # Skip this PR if it's missing required data

        try:
            created_at = _dt.datetime.strptime(
                created_at_str, "%Y-%m-%dT%H:%M:%SZ"
            ).replace(tzinfo=_dt.timezone.utc)
        except (ValueError, TypeError):
            print(f"Warning: Invalid date format: {created_at_str} for PR: {pr}")
            continue

        entry = author_info.get(author_name)
        if entry is None:
            entry = {
                "author_name": author_name,
                "first_created_at": created_at,
                # Fresh list so the input record is never aliased or mutated
                "organizations": [],
            }
            author_info[author_name] = entry
        elif created_at < entry["first_created_at"]:
            entry["first_created_at"] = created_at

        # Organizations are plain (unhashable) dicts, so de-duplicate by
        # equality rather than with a set.
        known_organizations = entry["organizations"]
        for organization in organizations:
            if organization not in known_organizations:
                known_organizations.append(organization)

    return list(author_info.values())

In [None]:
# Run parameter: True selects the spam dataset, False the non-spam dataset
is_mining_spam = True

print(f"param: {is_mining_spam}")

In [None]:

# Pickle files for the selected dataset: pull-request data (input) and
# organization-member progress (output of the mining loop)
if is_mining_spam:
    pr_data_pkl_filename = "progress.pkl"
    orgs_data_pkl_filename = "user_orgs_progress.pkl"
else:
    pr_data_pkl_filename = "non-spam-progress.pkl"
    orgs_data_pkl_filename = "non-spam-user_orgs_progress.pkl"

In [None]:
import pickle

# Resume from a previously saved progress file when one exists,
# otherwise start with an empty result list at position 0
if not os.path.exists(orgs_data_pkl_filename):
    df = []
    start_index = 0
else:
    with open(orgs_data_pkl_filename, "rb") as f:
        progress_data = pickle.load(f)
        df = progress_data["df"]
        start_index = progress_data["start_index"]

In [127]:
# Start from scratch: empty result list, position 0.
# Running this cell discards any progress currently held in `df` / `start_index`.
df, start_index = [], 0

In [None]:
import datetime
from os import close
import pickle

# Load the pull-request records and derive the set of organizations to mine.
# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this project.
with open(pr_data_pkl_filename, "rb") as f:
    data = pickle.load(f)
    prs = data["df"]
users = get_unique_authors_with_organizations(prs)

unique_organizations = set()
for user in users:
    for org in user["organizations"]:
        unique_organizations.add(org["login"])

# Iterate in sorted order: set iteration order is not stable across kernel
# restarts, so a saved position would otherwise be meaningless.
ordered_organizations = sorted(unique_organizations)

index = start_index
org_member_data = df
for index, organization_login in enumerate(ordered_organizations):
    if index < start_index:
        continue  # Already processed in an earlier run

    # Rows are staged per organization and only merged into `df` once all
    # pages were fetched, so a failure mid-pagination leaves no partial
    # (and later duplicated) data behind.
    org_rows = []
    try:
        after_cursor = None
        while True:
            response = execute_query(
                organization_login, first=10, after_cursor=after_cursor
            )
            if response.get("organization") is None:
                print(
                    f"Warning: organization with login {organization_login} not found, skipping..."
                )
                break

            members_with_role = response["organization"].get("membersWithRole")
            if members_with_role is None or members_with_role.get("totalCount", 0) == 0:
                print(
                    f"Warning: organization with login {organization_login} has no member, skipping..."
                )
                break

            # Extract members; tolerate null edges/nodes in the response
            for edge in members_with_role.get("edges") or []:
                node = (edge or {}).get("node") or {}
                org_rows.append(
                    {
                        "organization_login": organization_login,
                        "member_login": node.get("login"),
                        # NOTE(review): this is the member node's `createdAt`,
                        # presumably the account creation time rather than the
                        # organization join date - confirm before relying on it.
                        "joined_at": node.get("createdAt"),
                    }
                )

            # Pagination
            page_info = members_with_role["pageInfo"]
            if page_info["hasNextPage"]:
                after_cursor = page_info["endCursor"]
            else:
                break

        df.extend(org_rows)
        # Checkpoint: the next run resumes with the following organization
        with open(orgs_data_pkl_filename, "wb") as f:
            pickle.dump({"df": org_member_data, "start_index": index + 1}, f)

    except Exception as e:
        print(
            f"Failed to retrieve data for organization '{organization_login}': {e}"
        )
        # Save progress before terminating; the failing organization is retried
        # from its first page on resume.
        with open(orgs_data_pkl_filename, "wb") as f:
            pickle.dump({"df": df, "start_index": index}, f)
        raise

In [None]:
import pickle
import pandas as pd


def display_pkl_content(filepath):
    """
    Export the ``"df"`` entry of a pickle file to a CSV and a JSON file.

    The output base name is ``user_orgs`` when ``is_mining_spam`` is truthy and
    ``non-spam-user_orgs`` otherwise; ``.csv`` and ``.json`` are appended to it.

    Args:
        filepath (str): The path to the pickle file to be read.

    Returns:
        None. Failures are not raised: a problem while writing the JSON file is
        printed as "An error occurred", any other problem as
        "An unexpected error occurred".
    """

    try:
        with open(filepath, "rb") as pkl_file:
            data = pickle.load(pkl_file)

        print(f"Content of {filepath}:\n")
        if is_mining_spam:
            filename = "user_orgs"
        else:
            filename = "non-spam-user_orgs"

        frame = pd.DataFrame(data["df"])
        frame.to_csv(f"{filename}.csv", index=True)
        print(f"Data written to {filename}.csv successfully.")

        try:
            with open(f"{filename}.json", "w") as json_file:
                json.dump(data["df"], json_file, indent=4)
            print(f"Data written to {filename}.json successfully.")
        except Exception as e:
            print(f"An error occurred: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")


# Export the organization-member progress file to CSV / JSON
filepath = orgs_data_pkl_filename
try:
    display_pkl_content(filepath)
except Exception as e:
    # Last-resort guard so the cell itself never fails
    print(f"Error in cell: {e}")

In [None]:
from collections import defaultdict
from fileinput import filename

# Attach the mined organization-membership rows to every pull request of the
# matching author and export the result as CSV and JSON.
df = []

try:
    # NOTE: pickle.load can execute arbitrary code; only load files produced
    # by this project.
    with open(pr_data_pkl_filename, "rb") as f:
        data = pickle.load(f)
        prs = data["df"]

    with open(orgs_data_pkl_filename, "rb") as f:
        org_data = pickle.load(f)
        organizations_with_members = org_data["df"]

    # Grouping by 'member_login'
    orgs_grouped_by_members = defaultdict(list)
    for entry in organizations_with_members:
        orgs_grouped_by_members[entry["member_login"]].append(entry)

    # Convert defaultdict to a normal dict
    orgs_grouped_by_members = dict(orgs_grouped_by_members)

    for pull_request in prs:
        author_name = pull_request.get("author_name")
        author_account_created_at = pull_request.get("author_account_created_at")
        # Reuse the record's own organizations key; when the record has none,
        # fall back to the bare prefix instead of emitting a column literally
        # named "None".
        organizations_key = next(
            (
                key
                for key in pull_request.keys()
                if key.startswith("author_organizations_as_at")
            ),
            "author_organizations_as_at",
        )
        if author_name is None or author_account_created_at is None:
            continue
        df.append(
            {
                **pull_request,
                organizations_key: orgs_grouped_by_members.get(author_name, []),
            }
        )

    # Built without nesting double quotes inside a double-quoted f-string,
    # which is a SyntaxError on Python versions before 3.12.
    filename = (
        "spam_data" if is_mining_spam else "non_spam_data"
    ) + "_with_org_join_date"

    pd.DataFrame(df).to_csv(f"{filename}.csv", index=True)
    print(f"Data written to {filename}.csv successfully.")

    with open(f"{filename}.json", "w") as f:
        json.dump(df, f, indent=4)
    print(f"Data written to {filename}.json successfully.")
except Exception as e:
    # Best-effort export: report the problem without failing the cell
    print(f"An error occurred: {e}")