In [1]:
import os
import time
from datetime import datetime

import pandas as pd
import requests
from dotenv import load_dotenv
from tqdm import tqdm

In [2]:
# Load environment variables from the .env file
load_dotenv()

# DIR_NAME selects the processed-data subfolder. Fail fast when it is missing:
# otherwise the f-string below would render None as the literal folder "None"
# and the notebook would silently read from / write to the wrong directory.
dir_name = os.getenv('DIR_NAME')
if not dir_name:
    raise RuntimeError("DIR_NAME is not set; define it in the environment or in the .env file")

base_path = f'../../data/processed/{dir_name}'
os.makedirs(base_path, exist_ok=True)

In [3]:
def _rate_limit_wait_seconds(response):
    """Return the seconds to wait before retrying a rate-limited response, else None."""
    if response.status_code not in (403, 429):
        return None

    response_headers = response.headers

    # Secondary rate limits announce the back-off period directly.
    retry_after = response_headers.get('Retry-After')
    if retry_after is not None:
        try:
            return max(int(retry_after), 0)
        except ValueError:
            # Non-numeric Retry-After: fall through to the quota headers.
            pass

    # Primary rate limit: the quota is exhausted until the advertised reset time.
    if response_headers.get('X-RateLimit-Remaining') == '0':
        try:
            reset_at = int(response_headers.get('X-RateLimit-Reset', ''))
        except ValueError:
            return None
        # One extra second so the retry lands after the reset, not on it.
        return max(reset_at - time.time(), 0) + 1

    return None


def get_repository_data(user_ids):
    """
    Retrieves repository data for a list of user IDs from the GitHub API.

    Every page of ``/user/{id}/repos`` is fetched for each user. A
    rate-limited response (403/429 carrying rate-limit headers) is retried
    after waiting for the period GitHub advertises, a bounded number of times
    per user. Any other non-200 response is reported and the remaining pages
    of that user are skipped, so a deleted account or bad credentials can no
    longer trap the loop forever. Repositories already collected for that
    user are kept.

    Args:
        user_ids (list): List of GitHub user IDs.

    Returns:
        dict: Column-oriented dictionary (one equally long list per key) with
            the keys 'id', 'owner_id', 'language', 'created_at',
            'updated_at', 'stargazers_count' and 'forks_count'. The two date
            columns are formatted as 'YYYY-MM-DD'.
    """

    token = os.getenv('GITHUB_TOKEN')
    headers = {'Authorization': f'Token {token}', 'Accept': 'application/vnd.github.v3+json'}

    date_keys = ('created_at', 'updated_at')
    max_rate_limit_waits = 3  # Per user; prevents waiting forever on a persistent 403

    # Dictionary to store repository data
    repositories = {
        'id': [],
        'owner_id': [],
        'language': [],
        'created_at': [],
        'updated_at': [],
        'stargazers_count': [],
        'forks_count': [],
    }

    uid_progress_bar = tqdm(total=len(user_ids), desc="Progress Bar")

    try:
        for u_id in user_ids:

            url = f"https://api.github.com/user/{u_id}/repos"
            # Rebuilt for every user: the pagination branch below clears the
            # params, which must not leak into the next user's first request.
            params = {
                'per_page': 100,  # Number of items per page
                'page': 1,  # Initial page
            }
            waits_left = max_rate_limit_waits

            while True:

                # Explicit timeout so a stalled connection fails instead of hanging
                response = requests.get(url, headers=headers, params=params, timeout=1000)

                if response.status_code != 200:
                    wait_seconds = _rate_limit_wait_seconds(response)
                    if wait_seconds is not None and waits_left > 0:
                        waits_left -= 1
                        print('Rate limited:', response.status_code, f'(waiting {wait_seconds:.0f}s before retrying)')
                        time.sleep(wait_seconds)
                        continue

                    print('Request Error:', response.status_code, f'(user {u_id}; skipping remaining pages)')
                    break

                for repo in response.json():
                    # Build the full row first so that a missing field cannot
                    # leave the columns with different lengths.
                    row = {}
                    for key in repositories:
                        if key == 'owner_id':
                            row[key] = repo['owner']['id']
                        elif key in date_keys:
                            parsed = datetime.strptime(repo[key], "%Y-%m-%dT%H:%M:%SZ")
                            row[key] = parsed.strftime('%Y-%m-%d')
                        else:
                            row[key] = repo[key]

                    for key, value in row.items():
                        repositories[key].append(value)

                if 'next' not in response.links:
                    break

                url = response.links['next']['url']  # Next page of results
                params = {}  # The "next" URL already carries the paging query

            uid_progress_bar.update(1)
    finally:
        # Release the progress bar even if a request or parse error escapes
        uid_progress_bar.close()

    return repositories


In [4]:
# Read the user_id column of the stored CSV straight into a plain Python list
users_ids = pd.read_csv(f"{base_path}/users_ids.csv", encoding='latin1')['user_id'].tolist()

# Fetch the repositories of every listed user from the GitHub API
final_data = get_repository_data(users_ids)


Progress Bar: 100%|██████████| 1755/1755 [47:30<00:00,  1.62s/it] 


In [5]:
# Turn the column-oriented result into a DataFrame and write it out as CSV,
# without the index column
file_name = f"{base_path}/repos_data.csv"
users_data = pd.DataFrame.from_dict(final_data)
users_data.to_csv(path_or_buf=file_name, encoding='latin1', index=False)