# Extração de dados do Github

Pesquisando por iniciativas/projetos que utilizam Dados Abertos Governamentais através da [API do Github](https://developer.github.com/v3/)

In [1]:
import logging
import os
import sys
import time

import pandas as pd
import requests

In [2]:
# Search terms (Portuguese) sent one at a time to the GitHub repository search.
search_strings = [
    'dados abertos',
    'dados abertos brasil',
    'dados abertos governo',
    'dados abertos governamentais',
    'dados governamentais',
    'dados publicos abertos',
    'dados do governo',
    'analise de dados do governo',
    'analise de dados governamentais',
    'portal de dados do governo',
    'portal de dados governamentais',
    'portal publico do governo',
    'portal de dados abertos do governo',
]

Para ter acesso a alguns recursos da API do GitHub, como o aumento do limite de requisições, é preciso se autenticar. Informações sobre autenticação podem ser encontradas [aqui](https://developer.github.com/v3/#authentication).

In [9]:
# (user, token) pair used for HTTP basic auth on every GitHub request.
# Values are read from the GITHUB_USER / GITHUB_TOKEN environment variables so
# that a real token never has to be written into the notebook; the original
# placeholders remain as the fallback when the variables are not set.
credentials = (
    os.environ.get('GITHUB_USER', '<user_name>'),
    os.environ.get('GITHUB_TOKEN', '<token>'),
)

In [10]:
# Rate limits as seen by an unauthenticated client (no `auth` argument is sent).
t = requests.get('https://api.github.com/rate_limit')
t.json()

{'rate': {'limit': 60, 'remaining': 57, 'reset': 1579879098},
 'resources': {'core': {'limit': 60, 'remaining': 57, 'reset': 1579879098},
  'graphql': {'limit': 0, 'remaining': 0, 'reset': 1579882078},
  'integration_manifest': {'limit': 5000,
   'remaining': 5000,
   'reset': 1579882078},
  'search': {'limit': 10, 'remaining': 10, 'reset': 1579878538},
  'source_import': {'limit': 5, 'remaining': 5, 'reset': 1579878538}}}

In [11]:
# Same endpoint, this time authenticated with `credentials`, to compare the limits.
t = requests.get('https://api.github.com/rate_limit', auth=credentials)
t.json()

{'rate': {'limit': 5000, 'remaining': 5000, 'reset': 1579882080},
 'resources': {'core': {'limit': 5000, 'remaining': 5000, 'reset': 1579882080},
  'graphql': {'limit': 5000, 'remaining': 5000, 'reset': 1579882080},
  'integration_manifest': {'limit': 5000,
   'remaining': 5000,
   'reset': 1579882080},
  'search': {'limit': 30, 'remaining': 30, 'reset': 1579878540},
  'source_import': {'limit': 100, 'remaining': 100, 'reset': 1579878540}}}

Verificando limitação de extração de dados da API

In [12]:
# Ask for page 35 of a broad repository search to see how the API answers
# once the requested page lies beyond the results it is willing to serve.
page_35 = 'https://api.github.com/search/repositories?q=stars%3A%3E1&sort=stars&order=desc&page=35'
t = requests.get(page_35, auth=credentials)
t.json()

{'documentation_url': 'https://developer.github.com/v3/search/',
 'message': 'Only the first 1000 search results are available'}

Informações sobre a ferramenta de pesquisa da API podem ser encontradas [aqui](https://developer.github.com/v3/search/)

In [13]:
# Repository search endpoint; the search term is appended right after "q=".
url_base = 'https://api.github.com/search/repositories?q='

Podemos adicionar uma ordenação aos resultados, como a quantidade de _stars_ em ordem decrescente.

In [14]:
# Query-string suffix: order the results by star count, highest first.
sort = '&sort=stars&order=desc'

## Extraindo informações gerais

In [15]:
# Column order of the one-row frames built for each repository found by the
# search; the last entry records when the row was extracted.
columns = [
    'id',
    'full_name',
    'description',
    'owner_type',
    'owner_api_url',
    'owner_url',
    'url',
    'api_url',
    'fork',
    'created_at',
    'updated_at',
    'size',
    'stargazers_count',
    'language',
    'has_issues',
    'has_wiki',
    'forks_count',
    'forks',
    'open_issues',
    'watchers',
    'timestamp_extract',
]


In [16]:
def add_result(item):
    """Build a one-row DataFrame describing a single repository.

    Parameters
    ----------
    item : dict
        One entry of the ``items`` list of a repository search response.

    Returns
    -------
    pandas.DataFrame
        A single row laid out according to the module-level ``columns`` list.
        Fields absent from ``item`` become ``None``; ``timestamp_extract`` is
        the current Unix time in whole seconds, stored as a string.
    """
    # `owner` may be missing or null; an empty dict makes the three owner
    # fields fall back to None instead of raising AttributeError.
    owner = item.get('owner') or {}

    row = [
        item.get('id'),
        item.get('full_name'),
        item.get('description'),
        owner.get('type'),
        owner.get('url'),
        owner.get('html_url'),
        item.get('html_url'),
        item.get('url'),
        item.get('fork'),
        item.get('created_at'),
        item.get('updated_at'),
        item.get('size'),
        item.get('stargazers_count'),
        item.get('language'),
        item.get('has_issues'),
        item.get('has_wiki'),
        item.get('forks_count'),
        item.get('forks'),
        item.get('open_issues'),
        item.get('watchers'),
        str(int(time.time())),
    ]

    return pd.DataFrame([row], columns=columns)

In [17]:
def extract_results(data, results):
    """Append every repository of a search response to ``results``.

    Parameters
    ----------
    data : dict
        Decoded JSON body of a repository search response.
    results : pandas.DataFrame
        Frame accumulated so far.

    Returns
    -------
    pandas.DataFrame
        ``results`` followed by one row per entry of ``data['items']``.  When
        the response has no ``items`` (for example an error body), ``results``
        is returned unchanged.
    """
    # Error bodies carry no 'items' key; treat that like an empty page
    # instead of failing while iterating over None.
    items = data.get('items') or []
    if not items:
        return results

    # Build all rows first and concatenate once: concatenating inside the
    # loop copies the whole accumulated frame for every new row.
    new_rows = [add_result(item) for item in items]
    return pd.concat([results] + new_rows, ignore_index=True, sort=False)

In [18]:
# Empty accumulator for the search results.  The layout is derived from
# `columns` (so the two can never drift apart) plus the two counters that are
# filled in later, once commits and contributors have been extracted.
repositories_df = pd.DataFrame(columns=columns + ['commits', 'contributors'])

In [19]:
def _get_with_rate_limit(url, max_attempts=3):
    """GET ``url`` with the notebook credentials, waiting out an exhausted quota.

    When GitHub answers 403/429 with ``X-RateLimit-Remaining: 0`` the call
    sleeps until the instant announced in ``X-RateLimit-Reset`` and tries
    again, for at most ``max_attempts`` requests in total.  The last response
    received is returned whether or not it succeeded.
    """
    for attempt in range(max_attempts):
        response = requests.get(url, auth=credentials)
        exhausted = response.headers.get('X-RateLimit-Remaining') == '0'
        if response.status_code not in (403, 429) or not exhausted:
            return response
        if attempt < max_attempts - 1:
            reset_at = int(response.headers.get('X-RateLimit-Reset', 0))
            # One extra second so the retry lands after the window has reset.
            wait = max(reset_at - time.time(), 0) + 1
            print("\nLimite de requisições atingido. Aguardando {0:.0f}s...".format(wait))
            time.sleep(wait)
    return response


def scroll_pages(url, repositories_df):
    """Run one repository search and collect every page of its results.

    Parameters
    ----------
    url : str
        Full search URL for the first page.
    repositories_df : pandas.DataFrame
        Frame accumulated so far.

    Returns
    -------
    pandas.DataFrame
        ``repositories_df`` extended with the repositories found.  If the
        first request fails the frame is returned unchanged; if a later page
        fails, paging stops and the rows gathered so far are kept.
    """
    print('\nPrimeira requisição')

    results = _get_with_rate_limit(url)
    data = results.json()
    total = data.get('total_count') if isinstance(data, dict) else None

    # An error body has no 'total_count'; bail out instead of crashing later.
    if results.status_code != 200 or total is None:
        print(">>> Requisição falhou (status {0}). Nenhum resultado extraído.".format(results.status_code))
        return repositories_df

    print(">>> Foram encontrados {0} resultados. Extraindo...".format(total))

    repositories_df = extract_results(data, repositories_df)

    # Follow the "next" relation of the Link header until the API stops
    # offering one, rather than guessing the page count from the total.
    next_link = results.links.get('next')

    while next_link:
        next_url = next_link.get('url')

        print("\nNext url: {0}".format(next_url))

        results = _get_with_rate_limit(next_url)
        if results.status_code != 200:
            print(">>> Requisição falhou (status {0}). Interrompendo paginação.".format(results.status_code))
            break

        repositories_df = extract_results(results.json(), repositories_df)
        next_link = results.links.get('next')

    return repositories_df

In [None]:
%%time

# NOTE(review): a repository matched by more than one search string is
# presumably appended once per match — confirm whether de-duplication is wanted.
for string in search_strings:
    # Query URL = search endpoint + search term + sort parameters.
    url = url_base + string + sort
    print("\nExtraindo repositórios para a string: '{0}'".format(string))
    # scroll_pages returns the accumulated frame, so rebind it on every pass.
    repositories_df = scroll_pages(url, repositories_df)

In [None]:
repositories_df.describe()

## Extraindo Commits e Contributors

In [None]:
def extract_commits(url_repo):
    """Count the commits of a repository by paging through its commits list.

    Parameters
    ----------
    url_repo : str
        API URL of the repository; ``/commits`` is appended to it.

    Returns
    -------
    int or None
        Number of commits listed, or ``None`` when any page does not answer
        with HTTP 200 (409 for an empty repository, but also error responses,
        whose JSON body is a dict and must not be counted with ``len``).
    """
    next_url = url_repo + '/commits'
    # 100 is the largest page size the API accepts: fewer round trips than
    # with the default page size.
    params = {'per_page': 100}
    commits = 0

    while next_url:
        results = requests.get(next_url, params=params, auth=credentials)

        if results.status_code != 200:
            return None

        commits += len(results.json())

        next_url = results.links.get('next', {}).get('url')
        # The "next" URL already carries its own query string.
        params = None

    return commits

In [None]:
def extract_contributors(url_repo):
    """Count the contributors of a repository by paging through its contributors list.

    Parameters
    ----------
    url_repo : str
        API URL of the repository; ``/contributors`` is appended to it.

    Returns
    -------
    int or None
        Number of contributors listed, or ``None`` when any page does not
        answer with HTTP 200 (204 for an empty repository, but also error
        responses, whose JSON body is a dict and must not be counted with
        ``len``).
    """
    next_url = url_repo + '/contributors'
    # 100 is the largest page size the API accepts: fewer round trips than
    # with the default page size.
    params = {'per_page': 100}
    contributors = 0

    while next_url:
        results = requests.get(next_url, params=params, auth=credentials)

        # Covers the body-less 204 as well as every error status.
        if results.status_code != 200:
            return None

        contributors += len(results.json())

        next_url = results.links.get('next', {}).get('url')
        # The "next" URL already carries its own query string.
        params = None

    return contributors

In [None]:
%%time
# Query each repository once, even when several search strings found it.
urls = repositories_df['api_url'].drop_duplicates()

for url in urls:

    print('\n>>> ', url)

    commits = extract_commits(url)
    contributors = extract_contributors(url)

    print("Tem {0} Commits - {1} Contributors".format(commits,contributors))

    # The mask selects every row of this repository, duplicates included.
    same_repo = repositories_df["api_url"] == url
    repositories_df.loc[same_repo, 'commits'] = commits
    repositories_df.loc[same_repo, 'contributors'] = contributors

In [None]:
repositories_df.describe()

Conferindo valores nulos

In [None]:
repositories_df.loc[repositories_df['commits'].isnull()][['api_url', 'commits', 'contributors']]

In [None]:
repositories_df.loc[repositories_df['contributors'].isnull()][['api_url', 'commits', 'contributors']]

In [None]:
# Output path whose file name carries the current Unix time in whole seconds.
file_path = '../data/repositories_{0}.csv'.format(int(time.time()))
file_path

In [None]:
repositories_df.to_csv(file_path, index=False)

## Extraindo contribuidores dos repositórios

In [None]:
# todo: Extrair locations

In [None]:
# Column order of the contributors table: four repository fields, five
# contributor fields and the extraction timestamp.
columns_contributors = [
    'repo_id',
    'repo_name',
    'repo_url',
    'repo_api_url',
    'contributor_id',
    'contributor_login',
    'contributor_type',
    'contributor_url',
    'contributor_api_url',
    'timestamp_extract',
]

In [None]:
def scroll_results(results):
    """Reduce raw contributor records to the fields kept by this notebook.

    Parameters
    ----------
    results : iterable of dict
        Contributor records as decoded from the API response.

    Returns
    -------
    list of dict
        One dict per record with the keys ``contributor_id``,
        ``contributor_login``, ``contributor_type``, ``contributor_url`` and
        ``contributor_api_url``; a field absent from the record is ``None``.
    """
    # (output key, key in the API record), in output order.
    field_map = (
        ('contributor_id', 'id'),
        ('contributor_login', 'login'),
        ('contributor_type', 'type'),
        ('contributor_url', 'html_url'),
        ('contributor_api_url', 'url'),
    )

    return [
        {target: record.get(source, None) for target, source in field_map}
        for record in results
    ]

In [None]:
def get_contributors(url):
    """Fetch every contributor listed at ``url``, following pagination.

    Parameters
    ----------
    url : str
        URL of a repository's contributors list.

    Returns
    -------
    list of dict or None
        Contributors in the shape produced by ``scroll_results``.  ``None``
        when the first request does not answer with HTTP 200 (204 for an
        empty repository, or an error status).  If a later page fails, paging
        stops and the contributors gathered so far are returned.
    """
    results = requests.get(url, auth=credentials)

    # Compare status codes by value (`is` tests object identity).  Anything
    # other than 200 has no contributor list in its body: 204 has no body at
    # all and error statuses carry a dict describing the problem.
    if results.status_code != 200:
        return None

    list_contributors = scroll_results(results.json())

    next_link = results.links.get('next')

    while next_link:
        next_url = next_link.get('url')

        print('\t> Extraindo da próxima página: {0}'.format(next_url))

        results = requests.get(next_url, auth=credentials)
        if results.status_code != 200:
            print('\t> Requisição falhou (status {0}). Interrompendo paginação.'.format(results.status_code))
            break

        list_contributors.extend(scroll_results(results.json()))
        next_link = results.links.get('next')

    return list_contributors

In [None]:
def add_contributor(repo, contributor):
    """Build a one-row DataFrame pairing a repository with one contributor.

    Parameters
    ----------
    repo : dict
        Repository fields keyed ``repo_id``, ``repo_name``, ``repo_url`` and
        ``repo_api_url``.
    contributor : dict
        Contributor fields keyed ``contributor_id``, ``contributor_login``,
        ``contributor_type``, ``contributor_url`` and ``contributor_api_url``.

    Returns
    -------
    pandas.DataFrame
        A single row laid out according to ``columns_contributors``; missing
        keys become ``None`` and the last cell is the current Unix time in
        whole seconds, stored as a string.
    """
    repo_keys = ('repo_id', 'repo_name', 'repo_url', 'repo_api_url')
    contributor_keys = (
        'contributor_id',
        'contributor_login',
        'contributor_type',
        'contributor_url',
        'contributor_api_url',
    )

    row = [repo.get(key, None) for key in repo_keys]
    row += [contributor.get(key, None) for key in contributor_keys]
    row.append(str(int(time.time())))

    return pd.DataFrame([row], columns=columns_contributors)

In [None]:
def save_contributors(contributors_df, repo, contributors):
    """Append one row per contributor of ``repo`` to ``contributors_df``.

    Parameters
    ----------
    contributors_df : pandas.DataFrame
        Frame accumulated so far.
    repo : dict
        Repository fields, as expected by ``add_contributor``.
    contributors : iterable of dict
        Contributor records, as expected by ``add_contributor``.

    Returns
    -------
    pandas.DataFrame
        ``contributors_df`` followed by the new rows; returned unchanged when
        ``contributors`` is empty.
    """
    new_rows = [add_contributor(repo, contributor) for contributor in contributors]
    if not new_rows:
        return contributors_df

    # A single concatenation: doing it once per contributor would copy the
    # whole accumulated frame on every iteration.
    return pd.concat([contributors_df] + new_rows, ignore_index=True, sort=False)

In [None]:
def search_contributors(repositories_df):
    """Collect the contributors of every repository in ``repositories_df``.

    Parameters
    ----------
    repositories_df : pandas.DataFrame
        Must provide the columns ``api_url``, ``id``, ``full_name`` and ``url``.

    Returns
    -------
    pandas.DataFrame
        One row per (repository, contributor) pair, laid out according to
        ``columns_contributors``.  Repositories for which no contributors are
        returned add no rows.

    Notes
    -----
    Every entry of the ``api_url`` column is processed, so a URL that appears
    on several rows is fetched once per occurrence.
    """
    contributors_df = pd.DataFrame(columns=columns_contributors)

    for api_url in repositories_df['api_url']:
        print('\nExtraindo contribuidores de: {0}'.format(api_url))

        # Rows describing this repository; the first match supplies the fields.
        matches = repositories_df["api_url"] == api_url
        repo = {
            'repo_id': repositories_df.loc[matches, 'id'].values[0],
            'repo_name': repositories_df.loc[matches, 'full_name'].values[0],
            'repo_url': repositories_df.loc[matches, 'url'].values[0],
            'repo_api_url': api_url,
        }

        contributors = get_contributors(api_url + '/contributors')

        # None or an empty list: nothing to record for this repository.
        if not contributors:
            continue

        contributors_df = save_contributors(contributors_df, repo, contributors)

    return contributors_df

In [None]:
%%time
result_contributors = search_contributors(repositories_df)

In [None]:
result_contributors.describe()

In [None]:
result_contributors.head()

In [None]:
# Output path whose file name carries the current Unix time in whole seconds.
file_path = '../data/contributors_{0}.csv'.format(int(time.time()))
file_path

In [None]:
result_contributors.to_csv(file_path, index=False)