In [1]:
import os
import requests
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv
from datetime import datetime

In [2]:
# Load environment variables from the .env file so that the os.getenv() calls
# in this notebook (DIR_NAME, GITHUB_TOKEN) can read them
load_dotenv()

True

In [3]:
# Resolve the processed-data directory for this run from the DIR_NAME environment variable
dir_name = os.getenv('DIR_NAME')
if not dir_name:
    # Without this check the path silently becomes '../../data/processed/None'
    # and a stray directory is created before the CSV read fails.
    raise RuntimeError("DIR_NAME is not set; define it in the environment or in the .env file")

base_path = f'../../data/processed/{dir_name}'
os.makedirs(base_path, exist_ok=True)

# Read the 'user_id' column of users_ids.csv straight into a plain Python list
users_ids = pd.read_csv(f'{base_path}/users_ids.csv', encoding='latin1')['user_id'].to_list()

In [4]:
def _match_city_id(location, cities):
    """Return the index of the first name in `cities` contained in `location`, or '' if none matches."""
    if not location:
        # The API can return a null location; there is nothing to match against.
        return ''

    lowered = location.lower()
    for index, name in enumerate(cities):
        if name in lowered:
            return index
    return ''


def _format_date(timestamp):
    """Convert a '%Y-%m-%dT%H:%M:%SZ' timestamp string to a '%Y-%m-%d' date string."""
    return datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%SZ").strftime('%Y-%m-%d')


def get_users_data(id_list, timeout=30):
    """
    Retrieves data for GitHub users based on their IDs.

    One request is sent per ID. Processing stops at the first request that
    fails (network error or non-200 status); whatever was collected up to
    that point is still returned.

    Args:
        id_list (list): List of GitHub user IDs.
        timeout (float): Seconds to wait for each HTTP request before giving up.

    Returns:
        tuple: A pair ``(user, unknown_province)`` of column-oriented dicts.
            ``user`` has the keys "id", "city_id", "followers", "following",
            "created_at" and "updated_at"; "city_id" is the index of the matched
            name in the internal city list, or '' when nothing matched.
            ``unknown_province`` has the keys "user_id" and "location" and lists
            the users whose location matched no known name.
    """

    token = os.getenv('GITHUB_TOKEN')
    headers = {'Authorization': f'Token {token}', 'Accept': 'application/vnd.github.v3+json'}

    # Matched by substring against the lower-cased location; the list index is the city ID.
    city = ['maputo', 'matola', 'gaza', 'inhambane', 'manica', 'sofala', 'tete', 'zambézia', 'nampula', 'niassa', 'cabo delgado']

    user = {
        "id": [],
        "city_id": [],
        "followers": [],
        "following": [],
        "created_at": [],
        "updated_at": [],
    }

    unknown_province = {
        "user_id": [],
        "location": [],
    }

    progress_bar = tqdm(total=len(id_list), desc="Processing") # Create a tqdm instance

    try:
        for i in id_list:
            url = f"https://api.github.com/user/{i}"

            try:
                # The timeout keeps a stalled connection from hanging the whole run.
                response = requests.get(url, headers=headers, timeout=timeout)
            except requests.RequestException as error:
                # Treat network failures like a bad status: report and keep what we have.
                print('Request Error:', error)
                break

            if response.status_code != 200:
                print('Request Error:', response.status_code)
                break

            data = response.json()
            location = data.get('location')
            city_id = _match_city_id(location, city)

            # Build the whole record first so a missing field cannot leave
            # the columns of `user` with different lengths.
            record = {
                "id": data["id"],
                "city_id": city_id,
                "followers": data["followers"],
                "following": data["following"],
                "created_at": _format_date(data["created_at"]),
                "updated_at": _format_date(data["updated_at"]),
            }
            for key, value in record.items():
                user[key].append(value)

            if city_id == '':
                unknown_province['user_id'].append(i)
                unknown_province['location'].append(location)

            progress_bar.update(1)     # Update the progress bar
    finally:
        progress_bar.close() # Close the progress bar even if an error escapes

    return user, unknown_province


# Fetch the profile data for every collected user ID
final_data, unknown_province_data = get_users_data(users_ids)

# Create a DataFrame with the new data
filename = f"{base_path}/users_data.csv"
up_filename = f"{base_path}/unknown_province.csv"

users_data = pd.DataFrame(final_data)
up_data = pd.DataFrame(unknown_province_data)

# The 'location' column is free text typed by users and may contain characters
# latin1 cannot encode. errors='replace' writes '?' for those instead of raising
# UnicodeEncodeError and losing the results of the whole crawl.
users_data.to_csv(filename, index=False, encoding='latin1', errors='replace')
up_data.to_csv(up_filename, index=False, encoding='latin1', errors='replace')

print('Done')


Processing: 100%|██████████| 1734/1734 [26:05<00:00,  1.11it/s] 

Done



