In [1]:
import os
import requests
import pandas as pd
import numpy as np
from tqdm import tqdm
from dotenv import load_dotenv
from collections import Counter

In [2]:
# Load environment variables from .env file
load_dotenv()

True

In [3]:
dir_name = os.getenv('DIR_NAME')

users_ids = pd.read_csv(f"../../data/processed/{dir_name}/users_ids.csv", encoding='latin1')
users_ids = users_ids['user_id'].to_list()

In [4]:
def get_repositories_data(user_ids):
    """
    Retrieves repositories data for a given list of user IDs.

    Parameters:
    - user_ids (list): List of user IDs.

    Returns:
    - repositories (dict): Dictionary containing repositories data.
        - 'user_id' (list): List of user IDs.
        - 'has_moz_owner' (list): List indicating whether a repository has a Mozilla owner.
        - 'fav_lang' (list): List of favorite programming languages for each user.
        - 'fav_topic' (list): List of favorite topics for each user.
    """
    
    token = os.getenv('GITHUB_TOKEN')
    headers = {'Authorization': f'Token {token}', 'Accept': 'application/vnd.github.v3+json'}
    
    params = {
        'per_page': 100,  # Number of items per page
        'page': 1,  # Initial page
    }

    repositories = {
        'user_id': [],
        'has_moz_owner': [],
        'fav_lang': [],
        'fav_topic': [],
    }
    
    owner_ids = []
    languages = []
    topics = []

    total_id_list = len(users_ids)
    progress_bar = tqdm(total=total_id_list, desc="Processing") # Create a tqdm instance

    for user_id in user_ids:
        
        url = f"https://api.github.com/user/{user_id}/starred"
        
        while True:

            response = requests.get(url, headers=headers, params=params, timeout=600)
            # use "timeout=600" to prevent "RemoteDisconnected" error

            if response.status_code == 200:

                data = response.json()
                
                if len(data) > 0 :
                    
                    for repo in data:

                        owner_ids.append(repo['owner']['id'])
                        languages.append(repo['language'])

                        for topic in repo['topics']:
                            topics.append(topic)

                    if 'next' in response.links:
                        url = response.links['next']['url'] 
                        params = {}
                        
                    else:
                        repositories['user_id'].append(user_id)
                        repositories['has_moz_owner'].append(np.intersect1d(np.array(owner_ids), np.array(user_ids)).size > 0) 
                        
                        if len(languages) > 0:
                            repositories['fav_lang'].append(Counter(languages).most_common(1)[0][0])
                        else:
                            repositories['fav_lang'].append(None)
                        
                        if len(topics) > 0:
                            repositories['fav_topic'].append(Counter(topics).most_common(1)[0][0])
                        else:
                            repositories['fav_topic'].append(None)

                        owner_ids = []
                        languages = []
                        topics = []

                        break
                else: 
                    break
                        
            else:
                print('Request Error:', response.status_code)
                break
                
        progress_bar.update(1)     # Update the progress bar
    
    progress_bar.close() # Close the progress bar
    return repositories


# Example usage
final_data = get_repositories_data(users_ids)
print(f'final_data: {final_data}')


Processing:  77%|███████▋  | 1336/1727 [50:58<07:57,  1.22s/it]  

Request Error: 404


Processing: 100%|██████████| 1727/1727 [1:01:00<00:00,  2.12s/it]

final_data: {'user_id': [404506, 705308, 755840, 858200, 943711, 1241552, 1554715, 1572774, 1692858, 2129497, 3595515, 3806576, 3826227, 3955395, 4475142, 4643304, 4716778, 4808798, 5019560, 5045783, 6130991, 6144022, 6535422, 6688438, 6787828, 6823987, 6915391, 6969148, 6995340, 7068535, 7589719, 7609797, 7691358, 7803333, 8029526, 8167004, 8342748, 8362985, 8641905, 8768392, 8783921, 8825214, 8840165, 8881986, 8969075, 8999039, 9043223, 9259308, 9362127, 9496788, 9781670, 9819921, 10023220, 10654641, 10864249, 10880501, 10888023, 10922795, 11079693, 11230500, 11247633, 11255406, 11262620, 11368305, 11584640, 11927641, 11987675, 12021468, 12033296, 12100397, 12243288, 12692919, 12964071, 13021754, 13303659, 13402312, 13482373, 13621066, 13769987, 13836260, 13968552, 13984100, 14114666, 14132620, 14326204, 14907689, 14910228, 15047907, 15192264, 15213215, 15256665, 15413104, 16305023, 16611541, 17209308, 17267747, 17307013, 17637439, 17989477, 18194047, 18216969, 18285082, 18400142, 18




In [5]:
users_data = pd.DataFrame(final_data)
users_data.to_csv(f"../../data/processed/{dir_name}/starred.csv", index=False, encoding='latin1')