In [2]:
import os
import requests
import pandas as pd
import numpy as np
from tqdm import tqdm
from dotenv import load_dotenv
from collections import Counter

In [3]:
# Load environment variables from the .env file. Later cells read DIR_NAME
# and GITHUB_TOKEN through os.getenv, so this must run before them.
# The call is the last expression of the cell, so its return value is shown
# as the cell output.
load_dotenv()

True

In [4]:
# Name of the processed-data sub-directory, taken from the environment.
# NOTE: when DIR_NAME is not set this is None and the path below ends in the
# literal segment "None".
dir_name = os.environ.get('DIR_NAME')

# Directory that holds this notebook's input and output CSV files; it is
# created on first use and left untouched when it already exists.
base_path = f'../../data/processed/{dir_name}'
os.makedirs(name=base_path, exist_ok=True)

In [5]:
def _is_fatal_response(response):
    """Return True when the same error would repeat for every remaining user.

    That is the case for rejected credentials (401) and for GitHub rate
    limiting, which is reported as 403/429 together with an exhausted
    ``X-RateLimit-Remaining`` header or a ``Retry-After`` header.
    """
    status = response.status_code
    if status == 401:
        return True
    if status in (403, 429):
        limit_headers = response.headers
        return (
            limit_headers.get('X-RateLimit-Remaining') == '0'
            or 'Retry-After' in limit_headers
        )
    return False


def _most_common(values):
    """Return the most frequent element of ``values``, or None when it is empty."""
    if not values:
        return None
    return Counter(values).most_common(1)[0][0]


def get_repositories_data(user_ids, recorded_data):
    """
    Retrieves starred-repository data for a given list of user IDs.

    Every page of ``/user/{id}/starred`` is fetched for each user that is not
    already listed in ``recorded_data``; the pages are then summarised into
    one row per user.

    Parameters:
    - user_ids (list): Full list of user IDs. It is used both as the work list
      and as the reference set for ``has_moz_owner``.
    - recorded_data (list): User IDs that were stored by a previous run and
      must be skipped.

    Returns:
    - repositories (dict): Dictionary containing repositories data.
        - 'user_id' (list): List of user IDs.
        - 'has_moz_owner' (list): Whether at least one starred repository is
          owned by an ID from ``user_ids``.
        - 'fav_lang' (list): Most frequent language among the starred
          repositories (None when no repository reports a language).
        - 'fav_topic' (list): Most frequent topic among the starred
          repositories (None when no repository has topics).

    Notes:
    - Users without starred repositories produce no row.
    - A user whose request fails with a non-fatal HTTP error (e.g. 404) is
      skipped and produces no row, so a later run retries it.
    - On a network error, rejected credentials or rate limiting the function
      stops early and returns the rows collected so far.
    """

    token = os.getenv('GITHUB_TOKEN')
    headers = {'Accept': 'application/vnd.github.v3+json'}
    if token:
        # Without a token the request is sent unauthenticated instead of
        # with the invalid header value "Token None".
        headers['Authorization'] = f'Token {token}'

    repositories = {
        'user_id': [],
        'has_moz_owner': [],
        'fav_lang': [],
        'fav_topic': [],
    }

    # Reference set for 'has_moz_owner'. It has to be built from the complete
    # list: already-recorded users are still valid owners.
    known_ids = set(user_ids)

    # Remove IDs already written to the file (and repeated IDs).
    recorded_ids = set(recorded_data)
    pending_ids = [uid for uid in dict.fromkeys(user_ids) if uid not in recorded_ids]

    progress_bar = tqdm(total=len(pending_ids), desc="Processing")  # Create a tqdm instance

    try:
        for user_id in pending_ids:

            url = f"https://api.github.com/user/{user_id}/starred"
            # Fresh query parameters for every user: they are dropped while
            # following 'next' links, which already embed the query string.
            params = {
                'per_page': 100,  # Number of items per page
                'page': 1,  # Initial page
            }

            owner_ids = []
            languages = []
            topics = []
            fetched_all = False

            while True:
                try:
                    response = requests.get(url, headers=headers, params=params, timeout=1000)
                except requests.exceptions.Timeout:
                    print(f"Request timed out.")
                    return repositories
                except requests.exceptions.RequestException as error:
                    # Keep what has been collected instead of losing it to a traceback.
                    print(f"Request failed: {error}")
                    return repositories

                if response.status_code != 200:
                    print('Request Error:', response.status_code)
                    if _is_fatal_response(response):
                        return repositories
                    # Any other error only concerns this user: skip it rather
                    # than repeating the same request forever.
                    break

                for repo in response.json():
                    owner_ids.append(repo['owner']['id'])
                    # Repositories without a detected language report None,
                    # which must not compete for the favourite language.
                    if repo['language'] is not None:
                        languages.append(repo['language'])
                    topics.extend(repo.get('topics') or [])

                next_link = response.links.get('next')
                if next_link is None:
                    fetched_all = True
                    break

                url = next_link['url']
                params = None

            # Only complete, non-empty listings are turned into a row.
            if fetched_all and owner_ids:
                repositories['user_id'].append(user_id)
                repositories['has_moz_owner'].append(
                    any(owner_id in known_ids for owner_id in owner_ids)
                )
                repositories['fav_lang'].append(_most_common(languages))
                repositories['fav_topic'].append(_most_common(topics))

            progress_bar.update(1)  # Update the progress bar
    finally:
        progress_bar.close()  # Close the progress bar on every exit path

    return repositories



def check_existing_data(file_path, id_columns=('id', 'user_id')):
    """
    Returns the user IDs already stored in a previously written CSV file.

    Parameters:
    - file_path (str): Path of the CSV file written by an earlier run.
    - id_columns (tuple): Candidate names of the ID column; the first one
      present in the file is used. The results file written by this notebook
      names it 'user_id'.

    Returns:
    - id_list (list): Distinct IDs found in the file, or an empty list when
      the file does not exist.

    Raises:
    - KeyError: If the file contains none of the candidate columns.
    """
    if not os.path.exists(file_path):
        return []

    # Same encoding as the one used when the file is written.
    data = pd.read_csv(file_path, encoding='latin1')

    id_column = next((name for name in id_columns if name in data.columns), None)
    if id_column is None:
        raise KeyError(
            f"None of the ID columns {tuple(id_columns)} found in {file_path}"
        )

    return data[id_column].dropna().unique().tolist()


In [6]:
# IDs to process, taken from the 'user_id' column of users_ids.csv.
users_ids = pd.read_csv(f"{base_path}/users_ids.csv", encoding='latin1')['user_id'].to_list()

# Results file; IDs that a previous run already stored there are passed along
# so that they are not fetched again.
filename = f"{base_path}/starred.csv"
recorded_users = check_existing_data(filename)

final_data = get_repositories_data(users_ids, recorded_users)
stared_data = pd.DataFrame(final_data)

Processing: 100%|██████████| 1768/1768 [52:45<00:00,  1.79s/it]  


In [7]:
# Save the extracted data
if os.path.exists(filename):
    # Append the new rows to the ones stored by earlier runs.
    existing_data = pd.read_csv(filename, encoding='latin1')
    updated_data = pd.concat([existing_data, stared_data], ignore_index=True)

    # Re-running this cell would otherwise append the same rows again. Keep a
    # single row per user, preferring the most recently fetched one. The
    # column check avoids collapsing rows of a file with a different layout.
    if 'user_id' in existing_data.columns and 'user_id' in stared_data.columns:
        updated_data = (
            updated_data
            .drop_duplicates(subset='user_id', keep='last')
            .reset_index(drop=True)
        )
else:
    updated_data = stared_data


updated_data.to_csv(filename, index=False, encoding='latin1')

print('Done')


Done
