# Pesquisando por iniciativas/projetos com Dados Abertos Governamentais no Github

In [1]:
import os
import time

import pandas as pd
import requests

TODO:

**Mapeamento das principais bibliotecas**

- [API da Câmara dos deputados](https://dadosabertos.camara.leg.br/swagger/api.html)

In [2]:
# Placeholder for library-specific search strings; nothing has been added yet.
search_strings_libraries = list()

Devido à estrutura de pesquisa do GitHub, as palavras-chave utilizadas para o 
news não foram tão eficientes, visto que é preciso adicionar as palavras de busca de 
forma a conter uma das palavras ou conter todas as palavras na sequência especificada.

Sendo assim:
- '"projeto" and "dados governo"',
- '"projeto" and "dados governamentais"',
- '"projeto" and "monitora" and "dados" and "governo"'

Não foram buscas efetivas, porém as palavras-chave iniciais sim.

In [3]:
# Free-text queries submitted to the GitHub repository search, in this order.
search_strings = [
    # open data in general
    'dados abertos',
    'dados abertos brasil',
    'dados abertos governo',
    'dados abertos governamentais',
    # government data
    'dados governamentais',
    'dados publicos abertos',
    'dados do governo',
    # analysis of government data
    'analise de dados do governo',
    'analise de dados governamentais',
    # government data portals
    'portal de dados do governo',
    'portal de dados governamentais',
    'portal publico do governo',
    'portal de dados abertos do governo',
]

'consumir' e 'publicação' não retornaram resultados e, por isso, foram retirados a fim de adicionar palavras que retornem resultados melhores:
- 'consumir dados abertos do governo'
- 'consumir dados abertos governamentais'
- 'publicação de dados abertos do governo',
- 'publicação de dados governamentais',

In [4]:
# Query-string suffix: most-starred repositories first.
sort = '&sort=stars&order=desc'
# Repository search endpoint; the search text is appended after `q=`.
url_base = 'https://api.github.com/search/repositories?q='

# SECURITY: never commit a personal access token to the notebook. The token that
# used to be hardcoded here must be considered leaked and should be revoked.
# Credentials are now read from the environment (GITHUB_USER / GITHUB_TOKEN).
# When either one is missing, `credentials` is None and requests are sent
# unauthenticated.
_github_user = os.environ.get('GITHUB_USER')
_github_token = os.environ.get('GITHUB_TOKEN')
credentials = (_github_user, _github_token) if _github_user and _github_token else None

Verificando o limite de requisições

In [5]:
# Query the rate_limit endpoint with the configured credentials and show the
# JSON answer (per-resource limit / remaining / reset values).
t = requests.get('https://api.github.com/rate_limit', auth=credentials)
t.json()

{'rate': {'limit': 5000, 'remaining': 5000, 'reset': 1573184151},
 'resources': {'core': {'limit': 5000, 'remaining': 5000, 'reset': 1573184151},
  'graphql': {'limit': 5000, 'remaining': 5000, 'reset': 1573184151},
  'integration_manifest': {'limit': 5000,
   'remaining': 5000,
   'reset': 1573184151},
  'search': {'limit': 30, 'remaining': 30, 'reset': 1573180611}}}

In [6]:
# Column layout of the one-row frames built for each repository search hit.
# The order matters: rows are assembled positionally against this list.
columns = [
    # repository identity
    'id',
    'full_name',
    'description',
    # owner
    'owner_type',
    'owner_api_url',
    'owner_url',
    # repository links
    'url',
    'api_url',
    # repository metadata
    'fork',
    'created_at',
    'updated_at',
    'size',
    'stargazers_count',
    'language',
    'has_issues',
    'has_wiki',
    'forks_count',
    'forks',
    'open_issues',
    'watchers',
    # extraction bookkeeping
    'timestamp_extract',
]


In [7]:
def add_result(item):
    """Turn one repository entry of a search response into a one-row DataFrame.

    Parameters
    ----------
    item : dict
        One element of the ``items`` list of a repository search response.

    Returns
    -------
    pandas.DataFrame
        A single row laid out according to the module-level ``columns`` list.
        Fields absent from ``item`` become ``None``; ``timestamp_extract`` is
        the current Unix time in whole seconds, stored as a string.
    """
    # `owner` can be missing or null; fall back to an empty dict so the
    # owner-derived columns become None instead of raising AttributeError.
    owner = item.get('owner') or {}

    # Values are positional and must follow the order of `columns`.
    row = [
        item.get('id'),
        item.get('full_name'),
        item.get('description'),
        owner.get('type'),             # owner_type
        owner.get('url'),              # owner_api_url
        owner.get('html_url'),         # owner_url
        item.get('html_url'),          # url
        item.get('url'),               # api_url
        item.get('fork'),
        item.get('created_at'),
        item.get('updated_at'),
        item.get('size'),
        item.get('stargazers_count'),
        item.get('language'),
        item.get('has_issues'),
        item.get('has_wiki'),
        item.get('forks_count'),
        item.get('forks'),
        item.get('open_issues'),
        item.get('watchers'),
        str(int(time.time())),         # timestamp_extract
    ]

    return pd.DataFrame([row], columns=columns)

In [8]:
def extract_results(data, results):
    """Append every repository found in one search response page to ``results``.

    Parameters
    ----------
    data : dict
        Decoded JSON body of a search response. Error payloads, which carry no
        ``items`` key, are tolerated and contribute no rows.
    results : pandas.DataFrame
        Rows accumulated so far.

    Returns
    -------
    pandas.DataFrame
        ``results`` followed by one row per item, with a fresh 0..n-1 index.
        ``results`` itself is returned unchanged when the page has no items.
    """
    # A missing or null `items` key means there is nothing to add.
    items = data.get('items') or []
    if not items:
        return results

    # Build all rows first and concatenate once, instead of re-copying the
    # accumulated frame for every single item.
    frames = [results]
    frames.extend(add_result(item) for item in items)

    return pd.concat(frames, ignore_index=True, sort=False)

Verificando limitação de extração de dados da API

In [11]:
# Request page 35 of a very broad search (repositories with more than one star)
# to see how the API answers when paging that deep into a result set.
page_35 = 'https://api.github.com/search/repositories?q=stars%3A%3E1&sort=stars&order=desc&page=35'
t = requests.get(page_35, auth=credentials)
t.json()

{'documentation_url': 'https://developer.github.com/v3/search/',
 'message': 'Only the first 1000 search results are available'}

In [12]:
# Empty accumulator for the search results: the per-hit columns plus two
# extra columns (`commits`, `contributors`) that start out without values.
repositories_df = pd.DataFrame(
    columns=[
        # repository identity
        'id',
        'full_name',
        'description',
        # owner
        'owner_type',
        'owner_api_url',
        'owner_url',
        # repository links
        'url',
        'api_url',
        # repository metadata
        'fork',
        'created_at',
        'updated_at',
        'size',
        'stargazers_count',
        'language',
        'has_issues',
        'has_wiki',
        'forks_count',
        'forks',
        'open_issues',
        'watchers',
        # extraction bookkeeping
        'timestamp_extract',
        # filled in by a later step
        'commits',
        'contributors',
    ]
)

In [14]:
def scroll_pages(url, repositories_df):
    """Run one repository search and collect every page of its results.

    Parameters
    ----------
    url : str
        Full search URL for the first page.
    repositories_df : pandas.DataFrame
        Rows accumulated so far.

    Returns
    -------
    pandas.DataFrame
        ``repositories_df`` extended with the repositories of every page that
        could be fetched. If a request does not answer with HTTP 200, paging
        stops and whatever was collected up to that point is returned.
    """
    print('\nPrimeira requisição')

    results = requests.get(url, auth=credentials)
    data = dict(results.json())

    # Error answers carry a message instead of `total_count` / `items`.
    if results.status_code != 200:
        print(">>> Requisição falhou ({0}): {1}".format(results.status_code, data.get('message')))
        return repositories_df

    total = data.get('total_count', 0)

    print(">>> Foram encontrados {0} resultados. Extraindo...".format(total))

    repositories_df = extract_results(data, repositories_df)

    # Follow the `next` relation of the Link header until the server stops
    # advertising one, rather than deriving a page count from a fixed page size.
    next_link = results.links.get('next')

    while next_link:
        next_url = next_link.get('url')

        print("\nNext url: {0}".format(next_url))

        results = requests.get(next_url, auth=credentials)
        data = dict(results.json())

        if results.status_code != 200:
            print(">>> Requisição falhou ({0}): {1}".format(results.status_code, data.get('message')))
            break

        repositories_df = extract_results(data, repositories_df)
        next_link = results.links.get('next')

    return repositories_df

In [15]:
%%time

# Run every search string through the paginated search, accumulating the rows
# in `repositories_df` (the frame is rebound on each iteration).
for string in search_strings:
    # Query URL = search endpoint + search text + sort parameters.
    url = url_base + string + sort
    print("\nExtraindo repositórios para a string: '{0}'".format(string))
    repositories_df = scroll_pages(url, repositories_df)


Extraindo repositórios para a string: 'dados abertos'

Primeira requisição
>>> Foram encontrados 370 resultados. Extraindo...

Next url: https://api.github.com/search/repositories?q=dados+abertos&sort=stars&order=desc&page=2

Next url: https://api.github.com/search/repositories?q=dados+abertos&sort=stars&order=desc&page=3

Next url: https://api.github.com/search/repositories?q=dados+abertos&sort=stars&order=desc&page=4

Next url: https://api.github.com/search/repositories?q=dados+abertos&sort=stars&order=desc&page=5

Next url: https://api.github.com/search/repositories?q=dados+abertos&sort=stars&order=desc&page=6

Next url: https://api.github.com/search/repositories?q=dados+abertos&sort=stars&order=desc&page=7

Next url: https://api.github.com/search/repositories?q=dados+abertos&sort=stars&order=desc&page=8

Next url: https://api.github.com/search/repositories?q=dados+abertos&sort=stars&order=desc&page=9

Next url: https://api.github.com/search/repositories?q=dados+abertos&sort=stars&

In [17]:
# Summary statistics of the rows collected so far.
repositories_df.describe()

Unnamed: 0,id,full_name,description,owner_type,owner_api_url,owner_url,url,api_url,fork,created_at,...,language,has_issues,has_wiki,forks_count,forks,open_issues,watchers,timestamp_extract,commits,contributors
count,570,570,540,570,570,570,570,570,570,570,...,422,570,570,570,570,570,570,570,0.0,0.0
unique,420,420,388,2,362,362,420,420,1,420,...,22,2,2,16,16,14,25,30,0.0,0.0
top,65902460,thenets/ckanext-dadosabertos,Plugin / Tema do Portal de Dados Abertos do Go...,User,https://api.github.com/users/dadosgovbr,https://github.com/dadosgovbr,https://github.com/prodest/ckanext-data_es_theme,https://api.github.com/repos/prodest/ckanext-d...,False,2016-08-17T11:33:20Z,...,Python,True,True,0,0,0,0,1573180746,,
freq,6,6,6,461,16,16,6,6,570,6,...,93,554,554,424,424,488,377,30,,


## Extraindo Commits e Contributors

In [18]:
def extract_commits(url_repo):
    """Count the commits of a repository by walking its paginated commit list.

    Parameters
    ----------
    url_repo : str
        API URL of the repository; ``/commits`` is appended to it.

    Returns
    -------
    int or None
        Total number of commits over all pages, or ``None`` when any request
        does not answer with HTTP 200 (for example 409 for an empty repository).
    """
    next_url = url_repo + '/commits'
    commits = 0

    while next_url:
        results = requests.get(next_url, auth=credentials)

        # Status codes must be compared by value (`==` / `!=`), never with `is`.
        # A non-200 body is an error object, not a list of commits, so its
        # length must not be added to the count.
        if results.status_code != 200:
            return None

        commits += len(results.json())

        # Keep going while the Link header advertises a next page.
        next_url = results.links.get('next', {}).get('url')

    return commits

In [19]:
def extract_contributors(url_repo):
    """Count the contributors of a repository by walking its paginated list.

    Parameters
    ----------
    url_repo : str
        API URL of the repository; ``/contributors`` is appended to it.

    Returns
    -------
    int or None
        Total number of contributors over all pages, or ``None`` when any
        request does not answer with HTTP 200 (for example 204, which has no
        body, for an empty repository).
    """
    next_url = url_repo + '/contributors'
    contributors = 0

    while next_url:
        results = requests.get(next_url, auth=credentials)

        # Compare the status by value; `is` on integers only works by accident
        # of the interpreter's small-integer cache. Any non-200 answer has no
        # contributor list to count.
        if results.status_code != 200:
            return None

        contributors += len(results.json())

        # Keep going while the Link header advertises a next page.
        next_url = results.links.get('next', {}).get('url')

    return contributors

In [None]:
# Fill the `commits` and `contributors` columns of `repositories_df`.
# The same repository can be present in several rows, so iterate over the
# distinct API URLs: each repository is looked up once and the values are
# written to every matching row.
urls = repositories_df['api_url'].unique()

for url in urls:

    print('\n>>> ', url)

    # Only the two counting helpers talk to the API here; the repository
    # document itself is not needed for these columns.
    commits = extract_commits(url)
    contributors = extract_contributors(url)

    print("Tem {0} Commits - {1} Contributors".format(commits,contributors))

    same_repo = repositories_df["api_url"] == url
    repositories_df.loc[same_repo, 'commits'] = commits
    repositories_df.loc[same_repo, 'contributors'] = contributors


>>>  https://api.github.com/repos/CamaraDosDeputados/dados-abertos
Tem 28 Commits - 4 Contributors

>>>  https://api.github.com/repos/dadosgovbr/catalogos-dados-brasil
Tem 42 Commits - 4 Contributors

>>>  https://api.github.com/repos/prefeiturasp/dados-educacao
Tem 18 Commits - 2 Contributors

>>>  https://api.github.com/repos/dadosgovbr/aplicativos-dados-brasil
Tem 40 Commits - 5 Contributors

>>>  https://api.github.com/repos/dadosgovbr/kit
Tem 361 Commits - 6 Contributors

>>>  https://api.github.com/repos/mapaslivres/localidades
Tem 69 Commits - 5 Contributors

>>>  https://api.github.com/repos/odufrn/odufrn-downloader
Tem 237 Commits - 6 Contributors

>>>  https://api.github.com/repos/brasilopen/brasilopen
Tem 10 Commits - 2 Contributors

>>>  https://api.github.com/repos/vitorbaptista/dados-abertos-camara.gov.br
Tem 31 Commits - 1 Contributors

>>>  https://api.github.com/repos/jonny-data/conheca-seu-vereador
Tem 70 Commits - 6 Contributors

>>>  https://api.github.com/repos/da

Tem 85 Commits - 2 Contributors

>>>  https://api.github.com/repos/vgeorge/estados-brasileiros
Tem 1 Commits - 1 Contributors

>>>  https://api.github.com/repos/paraibatransparente/dados
Tem 95 Commits - 2 Contributors

>>>  https://api.github.com/repos/CodeForCuritiba/Analises-Jupyter-Notebook
Tem 2 Commits - 1 Contributors

>>>  https://api.github.com/repos/grstavares/DadosAbertosTSE
Tem 5 Commits - 1 Contributors

>>>  https://api.github.com/repos/febr-team/febr-data
Tem 53 Commits - 1 Contributors

>>>  https://api.github.com/repos/brunousml/Maquiavel
Tem 7 Commits - 1 Contributors

>>>  https://api.github.com/repos/chris-redfield/open-up-with-simplicity
Tem 83 Commits - 3 Contributors

>>>  https://api.github.com/repos/fredbortolato/dados-abertos-alesp
Tem 15 Commits - 1 Contributors

>>>  https://api.github.com/repos/erickos/Case_BibliotecaUFRN_DadosAbertos
Tem 10 Commits - 1 Contributors

>>>  https://api.github.com/repos/Ermesoml/Dados-Abertos-Camara-Legislativa
Tem 32 Commits 

Tem 8 Commits - 1 Contributors

>>>  https://api.github.com/repos/GabrielLimaSnT/ProjetoOpeDadosAbertos
Tem 2 Commits - None Contributors

>>>  https://api.github.com/repos/georgemaia/dadosabertos
Tem 31 Commits - 1 Contributors

>>>  https://api.github.com/repos/petersonjr/dados_abertos
Tem 1 Commits - 1 Contributors

>>>  https://api.github.com/repos/GabrielLimaSnT/ProjetoDadosAbertosImpacta
Tem 74 Commits - 3 Contributors

>>>  https://api.github.com/repos/natalialionel/Dados-Abertos-Desafios-e-Oportunidades
Tem 1 Commits - 1 Contributors

>>>  https://api.github.com/repos/mhbsti/dados-abertos-curitiba
Tem 28 Commits - 1 Contributors

>>>  https://api.github.com/repos/transparencia-mg/blog-dados-abertos
Tem 4 Commits - 1 Contributors

>>>  https://api.github.com/repos/alexvlima/MTur-Dados-Turismo-por-UF
Tem 2 Commits - 1 Contributors

>>>  https://api.github.com/repos/thyall/Dados_UFRN
Tem 23 Commits - 3 Contributors

>>>  https://api.github.com/repos/dcarvalho/analise_dados_abertos

Tem 18 Commits - 1 Contributors

>>>  https://api.github.com/repos/febr-team/febr-catalog
Tem 47 Commits - 1 Contributors

>>>  https://api.github.com/repos/bozoh/meu-legislador
Tem 25 Commits - 1 Contributors

>>>  https://api.github.com/repos/marceloandriolli/dados_solarimetricos_brasil
Tem 3 Commits - 1 Contributors

>>>  https://api.github.com/repos/ggdrn/ClimMapView
Tem 10 Commits - 1 Contributors

>>>  https://api.github.com/repos/crislanio/Dados_abertos_PortalDaTransparencia
Tem 5 Commits - 1 Contributors

>>>  https://api.github.com/repos/mauriciovll/Dados-Abertos-SC-versao-TESTE-
Tem 1 Commits - 1 Contributors

>>>  https://api.github.com/repos/mateushtoledo/sysvis
Tem 3 Commits - 1 Contributors

>>>  https://api.github.com/repos/DeehSlash/EstruturaDeDadosHash
Tem 1 Commits - 1 Contributors

>>>  https://api.github.com/repos/danielCavalcanti553/angular-api-dados-iffar
Tem 5 Commits - 1 Contributors

>>>  https://api.github.com/repos/georgemaia/dadosabertos-mobilidade-rn-natal


Tem 1 Commits - 1 Contributors

>>>  https://api.github.com/repos/juliafealves/leda-tabela-hash-aberto
Tem 5 Commits - 0 Contributors

>>>  https://api.github.com/repos/BrunoMoriB/stockmarket-manager
Tem 23 Commits - 1 Contributors

>>>  https://api.github.com/repos/hiperorganicos/openlab_datavis
Tem 5 Commits - 1 Contributors

>>>  https://api.github.com/repos/code-like-a-girl/hacker-cidadao-3.0
Tem 2 Commits - 1 Contributors

>>>  https://api.github.com/repos/NastyaCodingBunny/PYTHON---SIMULADOR-DE-CADASTRO
Tem 1 Commits - 1 Contributors

>>>  https://api.github.com/repos/wharborges/analiseES
Tem 2 Commits - None Contributors

>>>  https://api.github.com/repos/VRPazdeJesus/multas-ibama
Tem 2 Commits - 1 Contributors

>>>  https://api.github.com/repos/decosoares/diariasservidoresalagoas


In [None]:
# `resultados` is read by this cell and by later cells but is not assigned in
# any cell of the notebook, so a fresh kernel fails here with NameError.
# Define it from `repositories_df`, keeping one row per repository id.
# NOTE(review): the original definition is not available; confirm that
# dropping repeated repositories is the intended content of `resultados`.
resultados = repositories_df.drop_duplicates(subset='id').reset_index(drop=True)
resultados.head(5)

In [None]:
# Output path for the repositories CSV, tagged with the current Unix time in
# whole seconds; the bare name on the last line displays it.
file_path = '../data/repositories_{0}.csv'.format(int(time.time()))
file_path

In [None]:
# Write the repositories table to `file_path` without the index column.
# `resultados` must already exist in the kernel when this cell runs.
resultados.to_csv(file_path, index=False)

## Extraindo contribuidores dos repositórios

In [None]:
# Column layout of the contributors table: repository fields first, then the
# contributor fields, then the extraction timestamp. Rows are assembled
# positionally against this list, so the order matters.
columns_contributors = [
    # repository
    'repo_id',
    'repo_name',
    'repo_url',
    'repo_api_url',
    # contributor
    'contributor_id',
    'contributor_login',
    'contributor_type',
    'contributor_url',
    'contributor_api_url',
    # extraction bookkeeping
    'timestamp_extract',
]

In [None]:
def scroll_results(results):
    """Reduce raw contributor entries to the fields kept in the contributors table.

    Parameters
    ----------
    results : iterable of dict
        Contributor entries as decoded from one response page.

    Returns
    -------
    list of dict
        One dict per entry with the keys ``contributor_id``,
        ``contributor_login``, ``contributor_type``, ``contributor_url`` and
        ``contributor_api_url``; fields absent from an entry are ``None``.
    """
    # (output key, key in the raw entry)
    field_map = (
        ('contributor_id', 'id'),
        ('contributor_login', 'login'),
        ('contributor_type', 'type'),
        ('contributor_url', 'html_url'),
        ('contributor_api_url', 'url'),
    )

    return [
        {target: entry.get(source, None) for target, source in field_map}
        for entry in results
    ]

In [None]:
def get_contributors(url):
    """Fetch every contributor of a repository, following pagination.

    Parameters
    ----------
    url : str
        Contributors endpoint of the repository (first page).

    Returns
    -------
    list of dict or None
        The contributor dicts produced by ``scroll_results`` for all pages.
        ``None`` when the first request does not answer with HTTP 200 (for
        example 204, which has no body, for an empty repository). If a later
        page fails, the contributors gathered so far are returned.
    """
    results = requests.get(url, auth=credentials)

    # Compare the status by value, not with `is`. A non-200 answer carries no
    # contributor list, so it must not reach `scroll_results`.
    if results.status_code != 200:
        return None

    list_contributors = scroll_results(results.json())

    # Keep going while the Link header advertises a next page.
    next_link = results.links.get('next')

    while next_link:
        next_url = next_link.get('url')

        print('\t> Extraindo da próxima página: {0}'.format(next_url))

        results = requests.get(next_url, auth=credentials)
        if results.status_code != 200:
            break

        list_contributors = list_contributors + scroll_results(results.json())
        next_link = results.links.get('next')

    return list_contributors

In [None]:
def add_contributor(repo, contributor):
    """Build a one-row DataFrame pairing a repository with one contributor.

    Parameters
    ----------
    repo : dict
        Repository fields keyed ``repo_id``, ``repo_name``, ``repo_url`` and
        ``repo_api_url``.
    contributor : dict
        Contributor fields keyed ``contributor_id``, ``contributor_login``,
        ``contributor_type``, ``contributor_url`` and ``contributor_api_url``.

    Returns
    -------
    pandas.DataFrame
        A single row laid out according to ``columns_contributors``; missing
        keys become ``None`` and the last value is the current Unix time in
        whole seconds, as a string.
    """
    repo_keys = ('repo_id', 'repo_name', 'repo_url', 'repo_api_url')
    contributor_keys = (
        'contributor_id',
        'contributor_login',
        'contributor_type',
        'contributor_url',
        'contributor_api_url',
    )

    # Positional values, in the order expected by `columns_contributors`.
    row = [repo.get(key, None) for key in repo_keys]
    row += [contributor.get(key, None) for key in contributor_keys]
    row.append(str(int(time.time())))

    return pd.DataFrame([row], columns=columns_contributors)

In [None]:
def save_contributors(contributors_df, repo, contributors):
    """Append one row per contributor of ``repo`` to ``contributors_df``.

    Parameters
    ----------
    contributors_df : pandas.DataFrame
        Contributor rows accumulated so far.
    repo : dict
        Repository fields, passed through to ``add_contributor``.
    contributors : iterable of dict
        Contributors of that repository.

    Returns
    -------
    pandas.DataFrame
        ``contributors_df`` followed by the new rows, with a fresh 0..n-1
        index. ``contributors_df`` itself is returned when there is nothing
        to add.
    """
    new_rows = [add_contributor(repo, contributor) for contributor in contributors]
    if not new_rows:
        return contributors_df

    # One concat for the whole batch instead of re-copying the accumulated
    # frame once per contributor.
    return pd.concat([contributors_df] + new_rows,
                     ignore_index=True,
                     sort=False)

In [None]:
def search_contributors(repositories_df, limit=5):
    """Collect the contributors of the repositories listed in ``repositories_df``.

    Parameters
    ----------
    repositories_df : pandas.DataFrame
        Repository table; the columns ``api_url``, ``id``, ``full_name`` and
        ``url`` are read.
    limit : int or None, optional
        How many entries of the ``api_url`` column to process, taken from the
        top. Defaults to 5, the value that used to be hard-coded; pass
        ``None`` to process every repository.

    Returns
    -------
    pandas.DataFrame
        One row per (repository, contributor) pair, laid out according to
        ``columns_contributors``. Repositories for which no contributors are
        returned add no rows.
    """
    contributors_df = pd.DataFrame(columns=columns_contributors)
    urls = repositories_df['api_url']

    for url in urls[:limit]:
        print('\nExtraindo contribuidores de: {0}'.format(url))

        # Rows describing this repository; computed once and reused for the
        # three lookups. The first matching row supplies the metadata.
        same_repo = repositories_df["api_url"] == url

        repo = {
            'repo_id': repositories_df.loc[same_repo, 'id'].values[0],
            'repo_name': repositories_df.loc[same_repo, 'full_name'].values[0],
            'repo_url': repositories_df.loc[same_repo, 'url'].values[0],
            'repo_api_url': url,
        }

        contributors = get_contributors(url + '/contributors')

        # None / empty list: nothing to store for this repository.
        if contributors:
            contributors_df = save_contributors(contributors_df, repo, contributors)

    return contributors_df

In [None]:
%%time
# Build the contributors table for the repositories held in `resultados`.
# `resultados` must already exist in the kernel when this cell runs.
result_contributors = search_contributors(resultados)

In [None]:
# Summary statistics of the collected contributor rows.
result_contributors.describe()

In [None]:
# Output path for the contributors CSV, tagged with the current Unix time in
# whole seconds; the bare name on the last line displays it.
file_path = '../data/contributors_{0}.csv'.format(int(time.time()))
file_path

In [None]:
# Write the contributors table to `file_path` without the index column.
result_contributors.to_csv(file_path, index=False)