# Extração de dados do Github

Pesquisando por iniciativas/projetos que utilizam Dados Abertos Governamentais através da [API do Github](https://developer.github.com/v3/)

In [3]:
import logging
import os
import sys
import time

import pandas as pd
import requests

In [16]:
# Free-text queries sent one by one to the repository search endpoint.
search_strings = [
    'dados abertos',
    'dados abertos brasil',
    'dados abertos governo',
    'dados abertos governamentais',
    'dados governamentais',
    'dados publicos abertos',
    'dados do governo',
    'analise de dados do governo',
    'analise de dados governamentais',
    'portal de dados do governo',
    'portal de dados governamentais',
    'portal publico do governo',
    'portal de dados abertos do governo',
]

Para ter acesso a alguns recursos da API do GitHub, como um limite maior de requisições, é preciso se autenticar. Informações sobre autenticação podem ser encontradas [aqui](https://developer.github.com/v3/#authentication).

In [7]:
# (user, token) pair passed as HTTP basic auth to every API request.
# Values come from the GITHUB_USER / GITHUB_TOKEN environment variables so a
# real token never has to be typed into (and saved with) the notebook; the
# original placeholders remain as fallback when the variables are not set.
credentials = (
    os.environ.get('GITHUB_USER', '<user_name>'),
    os.environ.get('GITHUB_TOKEN', '<token>'),
)

Limite de requisições sem autenticação

In [8]:
# Query the rate-limit endpoint without credentials to see the anonymous quotas
t = requests.get('https://api.github.com/rate_limit')
t.json()

{'resources': {'core': {'limit': 60, 'remaining': 49, 'reset': 1580241507},
  'search': {'limit': 10, 'remaining': 10, 'reset': 1580239727},
  'graphql': {'limit': 0, 'remaining': 0, 'reset': 1580243267},
  'integration_manifest': {'limit': 5000,
   'remaining': 5000,
   'reset': 1580243267},
  'source_import': {'limit': 5, 'remaining': 5, 'reset': 1580239727}},
 'rate': {'limit': 60, 'remaining': 49, 'reset': 1580241507}}

Limite de requisições com autenticação

In [9]:
# Same endpoint, this time sending the credentials, to see the authenticated quotas
t = requests.get('https://api.github.com/rate_limit', auth=credentials)
t.json()

{'resources': {'core': {'limit': 5000, 'remaining': 4983, 'reset': 1580240230},
  'search': {'limit': 30, 'remaining': 30, 'reset': 1580239728},
  'graphql': {'limit': 5000, 'remaining': 5000, 'reset': 1580243268},
  'integration_manifest': {'limit': 5000,
   'remaining': 5000,
   'reset': 1580243268},
  'source_import': {'limit': 100, 'remaining': 100, 'reset': 1580239728}},
 'rate': {'limit': 5000, 'remaining': 4983, 'reset': 1580240230}}

Verificando limitação de extração de dados da API

In [10]:
# Ask for page 35 of a star-sorted repository search to check how deep the
# search results can be paged
page_35 = 'https://api.github.com/search/repositories?q=stars%3A%3E1&sort=stars&order=desc&page=35'
t = requests.get(page_35, auth=credentials)
t.json()

{'message': 'Only the first 1000 search results are available',
 'documentation_url': 'https://developer.github.com/v3/search/'}

Informações sobre a ferramenta de pesquisa da API podem ser encontradas [aqui](https://developer.github.com/v3/search/)

In [11]:
# Repository search endpoint; the query text is appended right after "q="
url_base = 'https://api.github.com/search/repositories?q='

Podemos adicionar uma ordenação aos resultados, como a quantidade de _stars_ em ordem decrescente.

In [12]:
# Query-string suffix: order the results by star count, highest first
sort = '&sort=stars&order=desc'

## Extraindo informações gerais

In [18]:
def extract_results(data):
    """Flatten one page of a repository search response into a list of dicts.

    Args:
        data: Decoded JSON body of a search request. Repositories are read
            from its ``items`` key.

    Returns:
        A list with one dict per repository, holding the selected repository
        fields, the owner fields (``owner_type``, ``owner_api_url``,
        ``owner_url``), the licence name and ``timestamp_extract`` (the
        extraction time as a string of whole epoch seconds). The list is empty
        when ``items`` is missing or null, as in an error payload that only
        carries a ``message``.
    """
    items_list = []

    # `or []` covers both a missing key and an explicit null
    for item in data.get('items') or []:

        # Either sub-object may be null; fall back to an empty dict so the
        # lookups below yield None instead of raising
        owner = item.get('owner') or {}
        license_info = item.get('license') or {}

        item_dict = {
                'id': item.get('id'),
                'full_name': item.get('full_name', None),
                'description': item.get('description', None),
                'owner_type': owner.get('type', None),
                'owner_api_url': owner.get('url', None),
                'owner_url': owner.get('html_url', None),
                'api_url': item.get('url', None),
                'url': item.get('html_url', None),
                'fork': item.get('fork', None),
                'created_at': item.get('created_at', None),
                'updated_at': item.get('updated_at', None),
                'pushed_at': item.get('pushed_at', None),
                'size': item.get('size', None),
                'stargazers_count': item.get('stargazers_count', None),
                'language': item.get('language', None),
                'has_issues': item.get('has_issues', None),
                'has_wiki': item.get('has_wiki', None),
                'forks_count': item.get('forks_count', None),
                'forks': item.get('forks', None),
                'open_issues': item.get('open_issues', None),
                # 'subscribers_count': item.get('subscribers_count', None),
                'license': license_info.get('name', None),
                'timestamp_extract': str(time.time()).split('.')[0]
        }

        items_list.append(item_dict)

    return items_list

In [19]:
def scroll_pages(url):
    """Run a repository search and collect the results of every page.

    The first page is requested from ``url``; further pages are reached by
    following the ``next`` relation of the response's ``Link`` header until
    there is none.

    Args:
        url: Full search URL, including the query string.

    Returns:
        A flat list with one dict per repository, as built by
        ``extract_results``.

    Raises:
        requests.HTTPError: If any page is answered with an error status.
    """
    results = requests.get(url, auth=credentials)
    # Fail with the HTTP error itself rather than with a TypeError further down
    results.raise_for_status()
    data = results.json()
    total = data.get('total_count', None)

    print(">>> Foram encontrados {0} resultados. Extraindo...".format(total))

    items_list = extract_results(data)

    # Let the Link header drive the loop instead of a page count derived from
    # `total`, which may be absent
    next_link = results.links.get('next')

    while next_link:
        results = requests.get(next_link.get('url'), auth=credentials)
        results.raise_for_status()

        # extend, not append: every page contributes rows, not one nested list
        items_list.extend(extract_results(results.json()))
        next_link = results.links.get('next')

    return items_list

In [20]:
%%time

items_list = []
repositories_df = None

for string in search_strings:
    url = url_base + string + sort
    print("\nPesquisando repositórios para a string: '{0}'".format(string))
    
    items_list = items_list + scroll_pages(url)
        
repositories_df = pd.DataFrame(items_list)


Pesquisando repositórios para a string: 'portal de dados do governo'
>>> Foram encontrados 10 resultados. Extraindo...

Pesquisando repositórios para a string: 'portal de dados governamentais'
>>> Foram encontrados 2 resultados. Extraindo...

Pesquisando repositórios para a string: 'portal publico do governo'
>>> Foram encontrados 1 resultados. Extraindo...

Pesquisando repositórios para a string: 'portal de dados abertos do governo'
>>> Foram encontrados 5 resultados. Extraindo...
CPU times: user 99.8 ms, sys: 7.09 ms, total: 107 ms
Wall time: 2.72 s


In [32]:
# A repository can match several search strings; keep one row per (id, api_url)
repositories_df = repositories_df.drop_duplicates(subset=['id', 'api_url'])

In [33]:
# Summary statistics of the repositories table
repositories_df.describe()

Unnamed: 0,forks,forks_count,id,open_issues,size,stargazers_count
count,13.0,13.0,13.0,13.0,13.0,13.0
mean,0.615385,0.615385,126288300.0,0.538462,3618.615385,1.846154
std,1.38675,1.38675,67529950.0,0.877058,4677.442384,3.760456
min,0.0,0.0,24131580.0,0.0,1.0,0.0
25%,0.0,0.0,65902460.0,0.0,136.0,0.0
50%,0.0,0.0,135181900.0,0.0,3004.0,0.0
75%,1.0,1.0,195294600.0,1.0,4531.0,2.0
max,5.0,5.0,215799700.0,2.0,16753.0,13.0


Quantidade de colunas:

In [54]:
# Number of columns currently in repositories_df
len(repositories_df.columns)

28

In [34]:
# Licence name stored for each repository (None when the item carried no licence)
repositories_df['license']

0     GNU Affero General Public License v3.0
1     GNU Affero General Public License v3.0
2            GNU General Public License v3.0
3                                       None
4            GNU General Public License v3.0
5            GNU General Public License v3.0
6                                       None
7     GNU Affero General Public License v3.0
8                                       None
9                                       None
10                                      None
11                                      None
12                        Apache License 2.0
Name: license, dtype: object

## Extraindo _Commits_, _Contributors_ e dados do _Owner_

In [36]:
def extract_commits(url_repo):
    """Count the commits of a repository by paging through its commit list.

    Args:
        url_repo: API URL of the repository; ``/commits`` is appended to it.

    Returns:
        The total number of commits listed, or None when the API answers 409
        (empty repository).

    Raises:
        requests.HTTPError: For any other error status. Without this check an
            error body, which is a JSON object, would be measured with
            ``len()`` and reported as a commit count.
    """
    commits_url = url_repo + '/commits'
    # Ask for the largest page size to spend fewer requests of the quota
    results = requests.get(commits_url, params={'per_page': 100}, auth=credentials)

    # Empty repository
    if results.status_code == 409:
        return None
    results.raise_for_status()

    commits = len(results.json())
    next_link = results.links.get('next')

    # The "next" URL already carries the page size chosen above
    while next_link:
        results = requests.get(next_link.get('url'), auth=credentials)
        results.raise_for_status()
        commits = commits + len(results.json())
        next_link = results.links.get('next')

    return commits

In [37]:
def extract_contributors(url_repo):
    """Count the contributors of a repository by paging through the list.

    Args:
        url_repo: API URL of the repository; ``/contributors`` is appended.

    Returns:
        The total number of contributors listed, or None when the API answers
        204 (empty repository).

    Raises:
        requests.HTTPError: For any error status. Without this check an error
            body, which is a JSON object, would be measured with ``len()``
            and reported as a contributor count.
    """
    contributors_url = url_repo + '/contributors'
    # Ask for the largest page size to spend fewer requests of the quota
    results = requests.get(contributors_url, params={'per_page': 100}, auth=credentials)

    # Empty repository: no body to decode
    if results.status_code == 204:
        return None
    results.raise_for_status()

    contributors = len(results.json())
    next_link = results.links.get('next')

    # The "next" URL already carries the page size chosen above
    while next_link:
        results = requests.get(next_link.get('url'), auth=credentials)
        results.raise_for_status()
        contributors = contributors + len(results.json())
        next_link = results.links.get('next')

    return contributors

In [38]:
def extract_owner_data(owner_api_url):
    """Fetch an owner profile and keep only the fields of interest.

    Args:
        owner_api_url: API URL of the repository owner.

    Returns:
        A dict with the keys ``owner_location``, ``owner_email``,
        ``owner_blog`` and ``owner_name``; a field missing from the response
        is stored as None.
    """
    profile = requests.get(owner_api_url, auth=credentials).json()

    # (output key, key in the API response)
    wanted = (
        ('owner_location', 'location'),
        ('owner_email', 'email'),
        ('owner_blog', 'blog'),
        ('owner_name', 'name'),
    )

    return {target: profile.get(source, None) for target, source in wanted}

In [49]:
%%time
urls = repositories_df['api_url']

for url in urls:

    owner_api_url = repositories_df.loc[repositories_df["api_url"] == url]['owner_api_url'].item()

    owner_data = extract_owner_data(owner_api_url)
    commits = extract_commits(url)
    contributors = extract_contributors(url)

    repositories_df.loc[repositories_df["api_url"] == url, 'commits'] = commits
    repositories_df.loc[repositories_df["api_url"] == url, 'contributors'] = contributors
    repositories_df.loc[repositories_df["api_url"] == url, 'owner_location'] = owner_data.get('owner_location', None)
    repositories_df.loc[repositories_df["api_url"] == url, 'owner_email'] = owner_data.get('owner_email', None)
    repositories_df.loc[repositories_df["api_url"] == url, 'owner_blog'] = owner_data.get('owner_blog', None)
    repositories_df.loc[repositories_df["api_url"] == url, 'owner_name'] = owner_data.get('owner_name', None)

CPU times: user 6.68 s, sys: 51.7 ms, total: 6.73 s
Wall time: 35.4 s


In [51]:
# Summary statistics again, now including the commits and contributors columns
repositories_df.describe()

Unnamed: 0,forks,forks_count,id,open_issues,size,stargazers_count,commits,contributors
count,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0
mean,0.615385,0.615385,126288300.0,0.538462,3618.615385,1.846154,42.230769,1.307692
std,1.38675,1.38675,67529950.0,0.877058,4677.442384,3.760456,77.436161,0.85485
min,0.0,0.0,24131580.0,0.0,1.0,0.0,1.0,1.0
25%,0.0,0.0,65902460.0,0.0,136.0,0.0,7.0,1.0
50%,0.0,0.0,135181900.0,0.0,3004.0,0.0,15.0,1.0
75%,1.0,1.0,195294600.0,1.0,4531.0,2.0,22.0,1.0
max,5.0,5.0,215799700.0,2.0,16753.0,13.0,260.0,4.0


Agora devemos ter mais 6 colunas: `commits`, `contributors`, `owner_location`, `owner_email`, `owner_blog` e `owner_name`.

In [52]:
# Column count of repositories_df, to compare with the count taken before the enrichment
len(repositories_df.columns)

28

In [50]:
# Preview the first rows of the repositories table
repositories_df.head()

Unnamed: 0,api_url,created_at,description,fork,forks,forks_count,full_name,has_issues,has_wiki,id,...,stargazers_count,timestamp_extract,updated_at,url,commits,contributors,owner_location,owner_email,owner_blog,owner_name
0,https://api.github.com/repos/dadosgovbr/ckanex...,2016-11-22T11:22:49Z,Plugin / Tema do Portal Brasileiro de Dados Ab...,False,5,5,dadosgovbr/ckanext-dadosgovbr,True,True,74465492,...,13,1580239698,2019-06-05T18:18:52Z,https://github.com/dadosgovbr/ckanext-dadosgovbr,159.0,4.0,Brazil,,dados.gov.br,dados.gov.br
1,https://api.github.com/repos/thenets/ckanext-d...,2016-08-17T11:33:20Z,Plugin / Tema do Portal de Dados Abertos do Go...,False,1,1,thenets/ckanext-dadosabertos,True,False,65902460,...,2,1580239698,2017-08-28T09:53:00Z,https://github.com/thenets/ckanext-dadosabertos,22.0,2.0,Brazil,luiz@thenets.org,http://thenets.org,Luiz Felipe F M Costa
2,https://api.github.com/repos/mtrpires/raspafam...,2014-09-17T05:45:14Z,Raspador de dados do Portal da Transparência d...,False,0,0,mtrpires/raspafamilia,True,True,24131578,...,2,1580239698,2018-03-07T00:04:05Z,https://github.com/mtrpires/raspafamilia,7.0,1.0,Brazil/USA,mtrpires@outlook.com,http://bitcount.com.br,Marco Túlio Pires
3,https://api.github.com/repos/edsonlead/gastos-...,2018-05-28T15:56:42Z,"Raspagem de dados, do portal da transparência,...",False,1,1,edsonlead/gastos-gov-federal,True,True,135181861,...,1,1580239698,2018-06-02T01:41:16Z,https://github.com/edsonlead/gastos-gov-federal,25.0,1.0,Brazil,edsonlead@gmail.com,http://edsonlead.com,Edsonlead
4,https://api.github.com/repos/Macmod/PortalTran...,2019-07-17T00:29:43Z,Extrator de dados do portal da transparência d...,False,0,0,Macmod/PortalTransparenciaBR,True,True,197286043,...,0,1580239698,2019-07-17T14:12:56Z,https://github.com/Macmod/PortalTransparenciaBR,9.0,1.0,,zz.mcmd@gmail.com,,Artur Marzano


Conferindo valores nulos

In [55]:
# Rows without a commit count (extract_commits returns None on a 409 response)
repositories_df.loc[repositories_df['commits'].isnull()][['api_url', 'commits', 'contributors']]

Unnamed: 0,api_url,commits,contributors


In [56]:
# Rows without a contributor count (extract_contributors returns None on a 204 response)
repositories_df.loc[repositories_df['contributors'].isnull()][['api_url', 'commits', 'contributors']]

Unnamed: 0,api_url,commits,contributors


In [None]:
# Output path stamped with the current time in whole epoch seconds
file_path = ''.join([
    '../data/repositories_',
    str(time.time()).split('.')[0],
    '.csv',
])
file_path

In [None]:
# Write the repositories table to file_path as CSV, leaving the index out
repositories_df.to_csv(file_path, index=False)

## Extraindo contribuidores dos repositórios

In [4]:
def get_contributors(data):
    """Turn a list of contributor objects into flat records.

    Args:
        data: Iterable of dicts, one per contributor.

    Returns:
        A list with one dict per input item, holding ``contributor_id``,
        ``contributor_login``, ``contributor_type``, ``contributor_url``,
        ``contributor_api_url`` (None for any field the item lacks) and
        ``timestamp_extract``, the extraction time as a string of whole
        epoch seconds.
    """
    # (output key, key in the source item), in output order
    field_map = (
        ('contributor_id', 'id'),
        ('contributor_login', 'login'),
        ('contributor_type', 'type'),
        ('contributor_url', 'html_url'),
        ('contributor_api_url', 'url'),
    )

    def to_record(entry):
        record = {target: entry.get(source, None) for target, source in field_map}
        record['timestamp_extract'] = str(time.time()).split('.')[0]
        return record

    return [to_record(entry) for entry in data]

In [None]:
def scroll_contributors(results, repo_data=None):
    """Fetch every contributor of a repository, following pagination.

    Args:
        results: URL of the repository's contributors endpoint. The parameter
            keeps its original name so existing calls stay valid, but what it
            receives is a URL, not a response object.
        repo_data: Optional dict of repository fields. When given, its entries
            are copied into every contributor record.

    Returns:
        A list of contributor dicts as built by ``get_contributors``, or None
        when the API answers 204 (empty repository).

    Raises:
        requests.HTTPError: If any page is answered with an error status.
    """
    # Use the argument; the previous code read a global named `url` instead
    contributors_url = results
    response = requests.get(contributors_url, auth=credentials)

    # Compare by value: `is` tests object identity, not equality
    if response.status_code == 204:
        return None
    response.raise_for_status()

    list_contributors = get_contributors(response.json())
    next_link = response.links.get('next')

    while next_link:
        response = requests.get(next_link.get('url'), auth=credentials)
        response.raise_for_status()
        list_contributors = list_contributors + get_contributors(response.json())
        next_link = response.links.get('next')

    if repo_data:
        # Repository fields first, then the contributor's own fields
        list_contributors = [
            {**repo_data, **contributor} for contributor in list_contributors
        ]

    return list_contributors

In [1]:
def search_contributors(repositories_df):
    """Collect the contributors of every repository into one DataFrame.

    Args:
        repositories_df: DataFrame with at least the columns ``api_url``,
            ``id``, ``full_name`` and ``url``.

    Returns:
        A DataFrame with one row per (repository, contributor) pair. Each row
        carries the repository fields ``repo_id``, ``repo_name``, ``repo_url``
        and ``repo_api_url`` next to the contributor fields. It is empty when
        no contributor was found.
    """
    urls = repositories_df['api_url']
    list_contributors_all_repo = []

    for url in urls:
        print('\nExtraindo contribuidores de: {0}'.format(url))

        # Select the repository's row once and read the fields from it
        repo_row = repositories_df.loc[repositories_df["api_url"] == url]
        repo_data = {
                'repo_id': repo_row['id'].values[0],
                'repo_name': repo_row['full_name'].values[0],
                'repo_url': repo_row['url'].values[0],
                'repo_api_url': url,
            }

        url_contributors = url + '/contributors'
        # Single-argument call: the repository fields are attached here, so
        # this does not depend on the callee accepting a second parameter
        contributors = scroll_contributors(url_contributors)

        if contributors:
            list_contributors_all_repo.extend(
                {**repo_data, **contributor} for contributor in contributors
            )

    # pd.DataFrame: the previous spelling `pd.Dataframe` does not exist
    contributors_df = pd.DataFrame(list_contributors_all_repo)

    return contributors_df

In [None]:
%%time
# Gather the contributors of every repository in repositories_df
contributors_df = search_contributors(repositories_df)

In [None]:
# Summary statistics of the contributors table
contributors_df.describe()

In [None]:
# Preview the first rows of the contributors table
contributors_df.head()

In [None]:
# Output path stamped with the current time in whole epoch seconds
file_path = ''.join([
    '../data/contributors_',
    str(time.time()).split('.')[0],
    '.csv',
])
file_path

In [None]:
# Write the contributors table to file_path as CSV, leaving the index out.
# The table is named contributors_df; `result_contributors` is never defined.
contributors_df.to_csv(file_path, index=False)