## Entendendo como extrair informações sobre repositórios através da api do GitHub

Links: 
- [Search](https://developer.github.com/v3/search/)
- [Searching for repositories](https://help.github.com/en/articles/searching-for-repositories#search-by-repository-name-description-or-contents-of-the-readme-file)
- Lista de qualifiers:  https://help.github.com/en/articles/searching-code
- Documentação para search: https://developer.github.com/v3/search/

In [16]:
import os
import time
from datetime import datetime

import pandas as pd
import requests

Montando url de busca para os repositórios com maior quantidade de estrelas.
Usando como consulta `stars:>1` para indicar que são todos os repositórios acima de 1 estrela, o que não tem impacto negativo para a busca, pois existem cerca de 3.305.358 repositórios e só vamos analisar 2.500.

In [17]:
# Pieces of the search request: endpoint, query (repositories with more than
# one star) and ordering (most starred first).
url_base = 'https://api.github.com/search/repositories?'
q = 'q=stars:>1'
sort = '&sort=stars&order=desc'

# Glue the pieces together and display the resulting URL.
url_final = ''.join([url_base, q, sort])
url_final

'https://api.github.com/search/repositories?q=stars:>1&sort=stars&order=desc'

In [18]:
# Names of the repository fields kept from each search result, listed in the
# same order in which add_resultado builds its row.
colunas = [
    'id',
    'owner_type', 'owner_url', 'owner_html_url',
    'html_url', 'url',
    'fork',
    'created_at', 'updated_at',
    'size', 'stargazers_count', 'language',
    'has_issues', 'has_wiki',
    'forks_count', 'forks', 'open_issues', 'watchers',
]

In [19]:
# Empty frame that will hold the collected repositories.
resultados = pd.DataFrame(
    columns=[
        # Fields copied from the search response.
        'id',
        'owner_type', 'owner_url', 'owner_html_url',
        'html_url', 'url',
        'fork',
        'created_at', 'updated_at',
        'size', 'stargazers_count', 'language',
        'has_issues', 'has_wiki',
        'forks_count', 'forks', 'open_issues', 'watchers',
        # Extra columns that the search rows do not fill.
        'commits', 'contributors', 'readme',
    ]
)
resultados.head()

Unnamed: 0,id,owner_type,owner_url,owner_html_url,html_url,url,fork,created_at,updated_at,size,...,language,has_issues,has_wiki,forks_count,forks,open_issues,watchers,commits,contributors,readme


In [20]:
def add_resultado(item):
    """Convert one repository entry of a search response into a one-row DataFrame.

    Parameters
    ----------
    item : dict
        A single element of the ``items`` list of a repository search response.

    Returns
    -------
    pandas.DataFrame
        One row whose columns are given by the module-level ``colunas`` list.
        Fields missing from ``item`` are stored as ``None``.
    """
    # ``owner`` may be absent or null; fall back to an empty dict so the
    # owner_* columns become None instead of raising AttributeError.
    owner = item.get('owner') or {}

    # Values must stay in the same order as ``colunas``.
    linha = [
        item.get('id'),
        owner.get('type'),
        owner.get('url'),
        owner.get('html_url'),
        item.get('html_url'),
        item.get('url'),
        item.get('fork'),
        item.get('created_at'),
        item.get('updated_at'),
        item.get('size'),
        item.get('stargazers_count'),
        item.get('language'),
        item.get('has_issues'),
        item.get('has_wiki'),
        item.get('forks_count'),
        item.get('forks'),
        item.get('open_issues'),
        item.get('watchers'),
    ]

    return pd.DataFrame([linha], columns=colunas)

In [21]:
def extrair_dados(data, resultados):
    """Append every repository found in one search response to ``resultados``.

    Parameters
    ----------
    data : dict
        Decoded JSON body of a search response.
    resultados : pandas.DataFrame
        Rows accumulated so far.

    Returns
    -------
    pandas.DataFrame
        ``resultados`` followed by one row per entry of ``data['items']``.
        When the response carries no items, ``resultados`` is returned
        unchanged.
    """
    # Error payloads (e.g. "Only the first 1000 search results are available")
    # have no 'items' key; treat them as an empty page instead of iterating None.
    itens = data.get('items') or []
    if not itens:
        return resultados

    # Build all rows first and concatenate once, instead of copying the whole
    # accumulated frame again for every single item.
    linhas = [add_resultado(item) for item in itens]
    return pd.concat([resultados, *linhas], ignore_index=True, sort=False)

In [22]:
def get_total_paginas(data):
    """Compute how many pages are needed to list every search result.

    The page size is taken from the number of items in ``data`` and the total
    from ``data['total_count']``.

    Parameters
    ----------
    data : dict
        Decoded JSON body of a search response.

    Returns
    -------
    int
        Number of pages, rounded up so that a partially filled last page is
        counted; 0 when the response contains no items.
    """
    itens_por_pagina = len(data.get('items') or [])
    total_registros = data.get('total_count') or 0

    if itens_por_pagina:
        # Ceiling division written with integer arithmetic only.
        total_paginas = -(-total_registros // itens_por_pagina)
    else:
        # An empty page would otherwise cause a division by zero.
        total_paginas = 0

    print('Total de registros:{0} , Registros por página:{1}, Total de Páginas:{2}'.format(
        total_registros, itens_por_pagina, total_paginas))

    return total_paginas

In [23]:
def percorrendo_paginas(url, max_paginas=34):
    """Walk the paginated search results starting at ``url`` and collect them.

    The first page is requested from ``url``; every following page is taken
    from the ``next`` link of the previous response.

    Parameters
    ----------
    url : str
        URL of the first page of results.
    max_paginas : int, optional
        Maximum number of pages to request, first page included. The default
        of 34 keeps the number of pages the notebook originally requested.

    Returns
    -------
    pandas.DataFrame
        One row per repository collected. If a later page cannot be fetched,
        the rows gathered up to that point are returned.

    Raises
    ------
    requests.HTTPError
        If the request for the first page fails.
    """
    resultados = pd.DataFrame(columns=[
        'id',
        'owner_type', 'owner_url', 'owner_html_url',
        'html_url', 'url',
        'fork',
        'created_at', 'updated_at',
        'size', 'stargazers_count', 'language',
        'has_issues', 'has_wiki',
        'forks_count', 'forks', 'open_issues', 'watchers',
        'commits', 'contributors', 'readme',
    ])

    print('Extraindo página:1')
    results = requests.get(url)
    # Nothing has been collected yet, so fail loudly if the first page is an error.
    results.raise_for_status()
    data = dict(results.json())
    resultados = extrair_dados(data, resultados)

    for iteracao in range(1, max_paginas):
        print("\n>>>>>>>> Iteracao:{0}".format(iteracao+1))
        print("Tempo atual:{0}".format(datetime.now()))

        # Unauthenticated requests are limited to 10 per minute by the API
        # (authenticated ones to 30), so pause after every 10th request.
        if iteracao % 10 == 0:
            print("sleep 1 minuto")
            time.sleep(60)

        # The last available page has no 'next' link; stop instead of crashing.
        next_link = results.links.get('next')
        if next_link is None:
            print("Sem próxima página; encerrando.")
            break
        next_url = next_link.get('url')
        print("Next url extraída: {0}".format(next_url))

        print('Extraindo página:{0}'.format(iteracao+1))
        results = requests.get(next_url)
        print('Status:{0}'.format(results))

        # An error response carries no items; keep what was already collected.
        if not results.ok:
            print("Requisição falhou; encerrando com os dados já extraídos.")
            break

        data = dict(results.json())
        resultados = extrair_dados(data, resultados)

    return resultados

In [None]:
%%time
# Run the crawl starting from the search URL built above and time the cell.
resultados = percorrendo_paginas(url_final)

In [20]:
# Summary statistics for the collected repositories.
resultados.describe()

Unnamed: 0,id,owner_type,owner_url,owner_html_url,html_url,url,fork,created_at,updated_at,size,...,language,has_issues,has_wiki,forks_count,forks,open_issues,watchers,commits,contributors,readme
count,1020,1020,1020,1020,1020,1020,1020,1020,1020,1020,...,907,1020,1020,1020,1020,1020,1020,0.0,0.0,0.0
unique,812,2,691,691,812,812,1,812,800,801,...,36,2,2,752,752,399,780,0.0,0.0,0.0
top,69662720,Organization,https://api.github.com/users/google,https://github.com/google,https://github.com/froala/design-blocks,https://api.github.com/repos/mxgmn/WaveFunctio...,False,2010-09-22T06:16:55Z,2019-07-03T19:58:43Z,358204,...,JavaScript,True,True,1016,1016,2,21086,,,
freq,4,625,21,21,4,4,1020,4,5,4,...,323,966,678,7,7,21,7,,,


In [23]:
# Persist the collected repositories as CSV, leaving out the index column.
resultados.to_csv('../dados/repositorios_812.csv', index=False)

"Only the first 1000 search results are available"

In [26]:
# Request page 35 of the same search directly and display the decoded JSON body.
page_35 = requests.get("https://api.github.com/search/repositories?q=stars%3A%3E1&sort=stars&order=desc&page=35")
page_35.json()

{'documentation_url': 'https://developer.github.com/v3/search/',
 'message': 'Only the first 1000 search results are available'}

In [27]:
# Column names of the collected frame.
resultados.columns.values

array(['id', 'owner_type', 'owner_url', 'owner_html_url', 'html_url',
       'url', 'fork', 'created_at', 'updated_at', 'size',
       'stargazers_count', 'language', 'has_issues', 'has_wiki',
       'forks_count', 'forks', 'open_issues', 'watchers', 'commits',
       'contributors', 'readme'], dtype=object)

## Extraindo através da biblioteca PyGithub

Links:
- https://github.com/PyGithub/PyGithub

In [24]:
from github import Github
from github import Repository
from github import ContentFile

In [75]:
# Read the access token from the GITHUB_TOKEN environment variable so that a
# real credential never has to be written into the notebook; the placeholder
# is kept as the fallback value.
token = os.environ.get('GITHUB_TOKEN', 'token')

# PyGithub client used by the cells below.
g = Github(token)

In [36]:
# Load repository data from a CSV file and preview the first rows.
resultados_csv = pd.read_csv('../dados/repositorios_com_id.csv')
resultados_csv.head()

Unnamed: 0,id,owner_type,owner_url,owner_html_url,html_url,url,fork,created_at,updated_at,size,...,language,has_issues,has_wiki,forks_count,forks,open_issues,watchers,commits,contributors,readme
0,28457823,Organization,https://api.github.com/users/freeCodeCamp,https://github.com/freeCodeCamp,https://github.com/freeCodeCamp/freeCodeCamp,https://api.github.com/repos/freeCodeCamp/free...,False,2014-12-24T17:49:19Z,2019-07-01T20:27:35Z,117391,...,JavaScript,True,False,22169,22169,1489,303613,,,
1,177736533,User,https://api.github.com/users/996icu,https://github.com/996icu,https://github.com/996icu/996.ICU,https://api.github.com/repos/996icu/996.ICU,False,2019-03-26T07:31:14Z,2019-07-01T16:02:38Z,59169,...,Rust,False,False,21334,21334,16695,246365,,,
2,11730342,Organization,https://api.github.com/users/vuejs,https://github.com/vuejs,https://github.com/vuejs/vue,https://api.github.com/repos/vuejs/vue,False,2013-07-29T03:24:51Z,2019-07-01T18:52:14Z,27673,...,JavaScript,True,True,20575,20575,307,142614,,,
3,2126244,Organization,https://api.github.com/users/twbs,https://github.com/twbs,https://github.com/twbs/bootstrap,https://api.github.com/repos/twbs/bootstrap,False,2011-07-29T21:19:00Z,2019-07-01T19:09:47Z,143722,...,JavaScript,True,False,65902,65902,365,134321,,,
4,10270250,Organization,https://api.github.com/users/facebook,https://github.com/facebook,https://github.com/facebook/react,https://api.github.com/repos/facebook/react,False,2013-05-24T16:15:54Z,2019-07-01T19:57:38Z,143031,...,JavaScript,True,True,24380,24380,740,131890,,,


In [37]:
# Describe the ids as strings to get count/unique/top/freq rather than
# numeric statistics.
resultados_csv.id.astype(str).describe()

count         1020
unique         990
top       49594603
freq             2
Name: id, dtype: object

In [42]:
# Series with the repository ids read from the CSV.
id_repos = resultados_csv.id

In [None]:
# Print the commit and contributor counts of every collected repository.
# unique(): the id column contains repeated values, and each repetition would
# spend extra API requests on a repository that was already printed.
for id_repo in id_repos.unique():
    # pandas yields numpy integers; convert to a built-in int, which is what
    # PyGithub's get_repo expects for a numeric repository id.
    repo = g.get_repo(int(id_repo))
    print("\nNome do repositório:{0}".format(repo.name))
    print(repo.get_commits().totalCount)
    print(repo.get_contributors().totalCount)

In [47]:
# Look the repository up by its full "owner/name" and show how many
# contributors it has. The count attribute is ``totalCount``, as used by the
# other cells of this notebook.
repo = g.get_repo("freeCodeCamp/freeCodeCamp")
repo.get_contributors().totalCount

419

In [51]:
# Keep the contributors listing of the current repository for inspection.
c = repo.get_contributors()

In [56]:
# Number of contributors in the listing.
c.totalCount

419

Pegando readme

In [None]:
# Repositories are addressed by their full "owner/name"; the owner alone does
# not identify a repository.
repo = g.get_repo("freeCodeCamp/freeCodeCamp")
contents = repo.get_contents("README.md")
# decoded_content is a bytes object; decode it so the README is printed as
# text rather than as a b'...' literal.
print(contents.decoded_content.decode("utf-8"))

In [57]:
import requests
from bs4 import BeautifulSoup as bs

# Download the repository's HTML page and print its raw markup.
p = requests.get('https://github.com/freeCodeCamp/freeCodeCamp')
print(p.text)







<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="utf-8">
  <link rel="dns-prefetch" href="https://github.githubassets.com">
  <link rel="dns-prefetch" href="https://avatars0.githubusercontent.com">
  <link rel="dns-prefetch" href="https://avatars1.githubusercontent.com">
  <link rel="dns-prefetch" href="https://avatars2.githubusercontent.com">
  <link rel="dns-prefetch" href="https://avatars3.githubusercontent.com">
  <link rel="dns-prefetch" href="https://github-cloud.s3.amazonaws.com">
  <link rel="dns-prefetch" href="https://user-images.githubusercontent.com/">



  <link crossorigin="anonymous" media="all" integrity="sha512-67V2J9Se2CifJlftk9/cExHGvxd7N9b9EdGnQEpszu99Ogeecilu9jIDxoCkx3zNLfB9ArraXW0J03qyVmN0Uw==" rel="stylesheet" href="https://github.githubassets.com/assets/frameworks-e7318add1f7e055d040edb0f75aaa0ba.css" />
  <link crossorigin="anonymous" media="all" integrity="sha512-MRlTIqIyb8caK5+o8llXVntXovciHyAM4qE3kWU2S7SIjAPDxYp4mE0jQp4kP5UYegy+lG9y1I6Vlsdz

In [62]:
# Parse the downloaded HTML and print the first <span> element of the page.
s = bs(p.text, 'html.parser')
print(s.span)

<span class="Bump-link-symbol float-right text-normal text-gray-light">→</span>


In [74]:
# Search for a <span> with the given class and hovercard attribute; the
# captured output shows that nothing matched (None).
print(s.find('span', attrs={'class': 'num text-emphasized', 'data-hovercard-type':"contributors"}))

None
