## Entendendo como extrair informações sobre repositórios através da API do GitHub

Links: 
- [Search](https://developer.github.com/v3/search/)
- [Searching for repositories](https://help.github.com/en/articles/searching-for-repositories#search-by-repository-name-description-or-contents-of-the-readme-file)
- Lista de qualifiers:  https://help.github.com/en/articles/searching-code
- Documentação para search: https://developer.github.com/v3/search/

In [69]:
import os
import time

import pandas as pd
import requests

Montando url de busca para os repositórios com maior quantidade de estrelas.
Usando como consulta `stars:>1` para indicar que são todos os repositórios acima de 1 estrela, o que não tem impacto negativo para a busca, pois existem cerca de 3.305.358 repositórios e só vamos analisar 2.500.

In [70]:
q = 'q=stars:>1'
sort = '&sort=stars&order=desc'
url_base = 'https://api.github.com/search/repositories?'
url_final = url_base+q+sort
url_final

'https://api.github.com/search/repositories?q=stars:>1&sort=stars&order=desc'

In [94]:
colunas=['id',
         'owner_type', 
           'owner_url',
           'owner_html_url',
           'html_url',
           'url',
           'fork',
           'created_at',
           'updated_at',
           'size',
           'stargazers_count',
           'language',
           'has_issues',
           'has_wiki',
           'forks_count',
           'forks',
           'open_issues',
           'watchers']

In [95]:
resultados = pd.DataFrame(columns=['id',
                                   'owner_type', 
                                   'owner_url',
                                   'owner_html_url',
                                   'html_url',
                                   'url',
                                   'fork',
                                   'created_at',
                                   'updated_at',
                                   'size',
                                   'stargazers_count',
                                   'language',
                                   'has_issues',
                                   'has_wiki',
                                   'forks_count',
                                   'forks',
                                   'open_issues',
                                   'watchers',
                                   'commits',
                                   'contributors',
                                   'readme',])
resultados.head()

Unnamed: 0,id,owner_type,owner_url,owner_html_url,html_url,url,fork,created_at,updated_at,size,...,language,has_issues,has_wiki,forks_count,forks,open_issues,watchers,commits,contributors,readme


In [96]:
def add_resultado(item):
    """Build a one-row DataFrame from a single repository search result.

    Parameters
    ----------
    item : dict
        One entry of the ``items`` list of a search response.

    Returns
    -------
    pandas.DataFrame
        A single row whose columns are exactly ``colunas``. Keys that are
        missing from ``item`` (or from its ``owner`` object) become ``None``.
    """
    # The owner object may be absent or null; fall back to an empty dict so the
    # owner_* columns come out as None instead of raising AttributeError.
    owner = item.get('owner') or {}

    # Columns that are read from the nested owner object rather than from the
    # top level of the item.
    campos_owner = {
        'owner_type': owner.get('type'),
        'owner_url': owner.get('url'),
        'owner_html_url': owner.get('html_url'),
    }

    # Every other column shares its name with the item key it is read from.
    # Building the row from `colunas` keeps values and headers aligned.
    linha = [
        campos_owner[coluna] if coluna in campos_owner else item.get(coluna)
        for coluna in colunas
    ]

    return pd.DataFrame([linha], columns=colunas)

In [97]:
def extrair_dados(data, resultados):
    """Append every repository of one search response to ``resultados``.

    Parameters
    ----------
    data : dict
        Decoded JSON body of a search response.
    resultados : pandas.DataFrame
        Rows accumulated so far.

    Returns
    -------
    pandas.DataFrame
        ``resultados`` followed by one row per entry of ``data['items']``,
        re-indexed from 0. When the payload has no items (for example an
        error body without an ``items`` key) ``resultados`` is returned
        unchanged.
    """
    # Error payloads carry no 'items' key; treat that like an empty page
    # instead of failing on iteration over None.
    itens = data.get('items') or []
    if not itens:
        return resultados

    # Build all rows first and concatenate once; concatenating inside the loop
    # copies the accumulated frame on every iteration.
    linhas = [add_resultado(item) for item in itens]
    return pd.concat([resultados, *linhas], ignore_index=True, sort=False)

In [98]:
def get_total_paginas(data):
    """Compute how many pages are needed to cover every search result.

    The page size is taken from the number of items in ``data``, so ``data``
    should be a full (not final, partially filled) page.

    Parameters
    ----------
    data : dict
        Decoded JSON body of a search response, read through its
        ``total_count`` and ``items`` keys.

    Returns
    -------
    int
        ``total_count`` divided by the page size, rounded up; ``0`` when the
        page is empty or either key is missing.
    """
    total_registros = data.get('total_count') or 0
    itens_por_pagina = len(data.get('items') or [])

    if itens_por_pagina:
        # Ceiling division: a trailing partial page still counts as a page.
        total_paginas = -(-total_registros // itens_por_pagina)
    else:
        # Empty page: avoid dividing by zero.
        total_paginas = 0

    print('Total de registros:{0} , Registros por página:{1}, Total de Páginas:{2}'.format(
        total_registros, itens_por_pagina, total_paginas))

    return total_paginas

In [99]:
def percorrendo_paginas(url, max_paginas=34):
    """Walk the pages of a GitHub search and collect the repositories found.

    The first page is requested from ``url``; each following page is taken
    from the ``next`` link of the previous response.

    Parameters
    ----------
    url : str
        Search URL of the first page.
    max_paginas : int, optional
        Maximum number of pages to request, including the first one.
        Defaults to 34, the value previously hard-coded in the loop.
        NOTE(review): the search API documents a cap of 1,000 results per
        query, which would make 34 pages of 30 items the most that can be
        fetched; confirm before raising this.

    Returns
    -------
    pandas.DataFrame
        One row per repository, with the search columns filled and the
        ``commits``, ``contributors`` and ``readme`` columns left empty.

    Raises
    ------
    requests.HTTPError
        If the first page cannot be fetched. Failures on later pages stop
        the crawl and return what was collected so far.
    """
    # Unauthenticated requests are limited to 10 per minute (30 per minute when
    # authenticated), so pause for a minute after every 10 requests.
    requisicoes_por_janela = 10
    pausa_segundos = 60

    resultados = pd.DataFrame(columns=[
        'id',
        'owner_type', 'owner_url', 'owner_html_url',
        'html_url', 'url',
        'fork', 'created_at', 'updated_at', 'size',
        'stargazers_count', 'language',
        'has_issues', 'has_wiki',
        'forks_count', 'forks', 'open_issues', 'watchers',
        'commits', 'contributors', 'readme',
    ])

    print('Extraindo página:1')
    results = requests.get(url)
    # Nothing has been collected yet, so surface an HTTP error right away.
    results.raise_for_status()
    data = dict(results.json())
    resultados = extrair_dados(data, resultados)

    for iteracao in range(1, max_paginas):

        if iteracao % requisicoes_por_janela == 0:
            print("sleep 1 minuto")
            time.sleep(pausa_segundos)

        # The last page (and error responses) carry no 'next' link.
        next_link = results.links.get('next')
        if not next_link:
            print('Sem link para a próxima página; encerrando.')
            break

        next_url = next_link.get('url')
        print("----------> next url: {0}".format(next_url))

        print('Extraindo página:{0}'.format(iteracao+1))
        results = requests.get(next_url)
        print('Status:{0}'.format(results))

        if not results.ok:
            # Keep the pages already collected instead of failing on an error
            # body that has no 'items'.
            print('Resposta com erro; encerrando com os dados já coletados.')
            break

        data = dict(results.json())
        print('Data keys:{0}'.format(data.keys()))

        resultados = extrair_dados(data, resultados)

    return resultados

In [100]:
%%time
# Run the crawl starting at url_final. The crawl sleeps for 60 seconds after
# every tenth request, so this cell takes several minutes.
resultados = percorrendo_paginas(url_final)

Extraindo página:1
----------> next url: https://api.github.com/search/repositories?q=stars%3A%3E1&sort=stars&order=desc&page=2
Extraindo página:2
Status:<Response [200]>
Data keys:dict_keys(['total_count', 'incomplete_results', 'items'])
----------> next url: https://api.github.com/search/repositories?q=stars%3A%3E1&sort=stars&order=desc&page=3
Extraindo página:3
Status:<Response [200]>
Data keys:dict_keys(['total_count', 'incomplete_results', 'items'])
----------> next url: https://api.github.com/search/repositories?q=stars%3A%3E1&sort=stars&order=desc&page=4
Extraindo página:4
Status:<Response [200]>
Data keys:dict_keys(['total_count', 'incomplete_results', 'items'])
----------> next url: https://api.github.com/search/repositories?q=stars%3A%3E1&sort=stars&order=desc&page=5
Extraindo página:5
Status:<Response [200]>
Data keys:dict_keys(['total_count', 'incomplete_results', 'items'])
----------> next url: https://api.github.com/search/repositories?q=stars%3A%3E1&sort=stars&order=desc

In [131]:
# Per-column summary (count, unique, top, freq) of the collected rows.
resultados.describe()

Unnamed: 0,id,owner_type,owner_url,owner_html_url,html_url,url,fork,created_at,updated_at,size,...,language,has_issues,has_wiki,forks_count,forks,open_issues,watchers,commits,contributors,readme
count,1020,1020,1020,1020,1020,1020,1020,1020,1020,1020,...,902,1020,1020,1020,1020,1020,1020,0.0,0.0,0.0
unique,990,2,839,839,990,990,1,990,972,978,...,40,2,2,924,924,470,963,0.0,0.0,0.0
top,32531480,Organization,https://api.github.com/users/google,https://github.com/google,https://github.com/gfwlist/gfwlist,https://api.github.com/repos/binux/pyspider,False,2014-06-27T05:29:02Z,2019-07-01T20:25:54Z,485,...,JavaScript,True,True,1719,1719,2,13137,,,
freq,2,615,16,16,2,2,1020,2,2,3,...,332,972,686,6,6,20,4,,,


In [132]:
# Save the collected rows to CSV without the index column.
# NOTE(review): the describe() output in this notebook shows 1020 rows but only
# 990 unique ids, so the saved file contains repeated repositories; consider
# de-duplicating on 'id' before saving.
resultados.to_csv('../dados/repositorios_com_id.csv', index=False)

In [133]:
# Show the column names of the collected frame.
resultados.columns.values

array(['id', 'owner_type', 'owner_url', 'owner_html_url', 'html_url',
       'url', 'fork', 'created_at', 'updated_at', 'size',
       'stargazers_count', 'language', 'has_issues', 'has_wiki',
       'forks_count', 'forks', 'open_issues', 'watchers', 'commits',
       'contributors', 'readme'], dtype=object)

## Extraindo através da biblioteca PyGithub

Links:
- https://github.com/PyGithub/PyGithub

In [38]:
from github import Github
from github import Repository
from github import ContentFile

In [57]:
# Read the GitHub access token from the GITHUB_TOKEN environment variable so
# the credential is never stored in the notebook. The value is None when the
# variable is not set (presumably the client then runs unauthenticated; confirm).
token = os.environ.get('GITHUB_TOKEN')

In [58]:
# PyGithub client created from the token defined in the previous cell.
g = Github(token)

In [60]:
# Same search as the raw REST call earlier in the notebook: repositories with
# more than one star, sorted by stars in descending order.
repos = g.search_repositories('stars:>1', sort='stars', order='desc')

In [None]:
# A bare expression inside a loop is not displayed by the notebook, so print
# each name explicitly.
for repo in repos:
    print(repo.full_name)

In [62]:
# Load a previously saved extraction and preview its first rows.
# NOTE(review): this reads 'repositorios.csv', not the 'repositorios_com_id.csv'
# written earlier in this notebook; the preview has no 'id' column, so this is
# presumably an older export. Confirm which file is intended.
resultados_csv = pd.read_csv('../dados/repositorios.csv')
resultados_csv.head()

Unnamed: 0,owner_type,owner_url,owner_html_url,html_url,url,fork,created_at,updated_at,size,stargazers_count,language,has_issues,has_wiki,forks_count,forks,open_issues,watchers,commits,contributors,readme
0,Organization,https://api.github.com/users/freeCodeCamp,https://github.com/freeCodeCamp,https://github.com/freeCodeCamp/freeCodeCamp,https://api.github.com/repos/freeCodeCamp/free...,False,2014-12-24T17:49:19Z,2019-07-01T19:10:15Z,117391,303611,JavaScript,True,False,22169,22169,1491,303611,,,
1,User,https://api.github.com/users/996icu,https://github.com/996icu,https://github.com/996icu/996.ICU,https://api.github.com/repos/996icu/996.ICU,False,2019-03-26T07:31:14Z,2019-07-01T16:02:38Z,59169,246365,Rust,False,False,21333,21333,16695,246365,,,
2,Organization,https://api.github.com/users/vuejs,https://github.com/vuejs,https://github.com/vuejs/vue,https://api.github.com/repos/vuejs/vue,False,2013-07-29T03:24:51Z,2019-07-01T18:52:14Z,27673,142614,JavaScript,True,True,20575,20575,307,142614,,,
3,Organization,https://api.github.com/users/twbs,https://github.com/twbs,https://github.com/twbs/bootstrap,https://api.github.com/repos/twbs/bootstrap,False,2011-07-29T21:19:00Z,2019-07-01T19:09:47Z,143718,134321,JavaScript,True,False,65901,65901,364,134321,,,
4,Organization,https://api.github.com/users/facebook,https://github.com/facebook,https://github.com/facebook/react,https://api.github.com/repos/facebook/react,False,2013-05-24T16:15:54Z,2019-07-01T19:21:06Z,143031,131888,JavaScript,True,True,24377,24377,740,131888,,,


In [63]:
# Fetch one repository by its numeric id; the cell output shows that this id
# resolves to freeCodeCamp/freeCodeCamp.
repo = g.get_repo(28457823)
repo

Repository(full_name="freeCodeCamp/freeCodeCamp")

In [64]:
# Request the repository's collaborators.
# NOTE(review): iterating this result fails with HTTP 403 ("Must have push
# access to view repository collaborators") in the cell below. If the goal is
# the 'contributors' column, a contributors listing is presumably what is
# needed instead; confirm.
co = repo.get_collaborators()

In [66]:
# Print the login of each collaborator.
# NOTE(review): as the recorded output shows, this raises GithubException 403
# because the token has no push access to the repository.
for c in co:
    print(c.login)

GithubException: 403 {'message': 'Must have push access to view repository collaborators.', 'documentation_url': 'https://developer.github.com/v3/repos/collaborators/#list-collaborators'}