# Downloading github repositories 

### Notes

Links:
-  "https://github.com/search?p=1&q=extension%3Aipynb+nbformat_minor&type=Code&utf8=%E2%9C%93"

### Import

In [1]:
# imports
import requests
import re
import os
import json
import itertools

from bs4 import BeautifulSoup
from tqdm import tqdm_notebook
from git import Repo

# import git    # add 'git' module through anaconda navigator + pip install gitpython

### Functions

In [22]:
def try_link_getter(ul_list):
    """
    Collect the href of the first anchor found in every child of the given lists.

    Parameters
    ----------
    ul_list : iterable
        Iterable of unordered lists (ul's); each one is iterated and every
        child is inspected through its ``.a`` attribute.

    Returns
    -------
    list
        The ``href`` values that were found, in iteration order. Children
        without an anchor, and anchors without a ``href``, are skipped.
    """
    links = []
    for ul in ul_list:
        for line in ul:
            try:
                href = line.a.get('href')
            except AttributeError:
                # `line` has no `.a` attribute, or `.a` is None / has no `.get`
                continue
            if href is not None:
                links.append(href)
    return links


def scan_page_not_found(repositories_list, timeout=30):
    """
    Find the urls that do not lead to an existing GitHub page.

    Every url is requested once and the title of the returned page is inspected.

    Parameters
    ----------
    repositories_list : iterable of str
        Urls to check.
    timeout : float, optional
        Seconds to wait for each response before giving up (default 30).

    Returns
    -------
    list of str
        Urls whose request failed, whose page title contains
        'Page not found · GitHub', or whose page title could not be read.
    """
    error_list = []
    # tqdm_notebook is the progress bar this notebook imports; the bare name
    # `tqdm` is never imported, so `tqdm.tqdm` raised a NameError here.
    for url in tqdm_notebook(list(repositories_list)):
        try:
            r = requests.get(url, timeout=timeout)
        except requests.RequestException:
            # unreachable or timed-out url: report it instead of aborting the scan
            error_list.append(url)
            continue
        soup = BeautifulSoup(r.text, features="html.parser")
        try:
            if 'Page not found · GitHub' in soup.title.string:
                error_list.append(url)
        except (AttributeError, TypeError):
            # no <title> tag (AttributeError) or a title without text (TypeError)
            error_list.append(url)
    return error_list


def clone_repositories(repositories_list,error_list,folder_dir):
    """
    Clone every repository url into ``<cwd>/<folder_dir>/<repository name>``.

    Parameters
    ----------
    repositories_list : iterable of str
        Repository urls. The repository name is the fifth '/'-separated part
        of the url, as in ``https://github.com/<owner>/<name>``.
    error_list : iterable of str
        Urls that must be skipped.
    folder_dir : str
        Folder, relative to the current working directory, that receives the clones.

    Returns
    -------
    None
        A failed clone is printed and the loop continues with the next url.

    Notes
    -----
    Repositories of different owners that share a name map to the same
    folder, so only the first of them is cloned.
    """
    target_root = os.path.join(os.getcwd(), folder_dir)
    skip = set(error_list)  # constant-time membership test inside the loop
    for url in tqdm_notebook(repositories_list):
        if url in skip:
            continue
        repo_name = url.split('/')[4]
        # os.path.join keeps the path valid on every OS, and the very same path
        # is used for the "already downloaded" check and for the clone target.
        repo_dir = os.path.join(target_root, repo_name)
        if os.path.exists(repo_dir): # skip already downloaded repo
            continue
        try:
            Repo.clone_from(url,repo_dir)
        except Exception:
            print('Failed for:',repo_name,url)


def count_ipynb(folder):
    """
    Count the ``.ipynb`` files in a folder and all of its sub-folders.

    Parameters
    ----------
    folder : str
        Folder to scan, relative to the current working directory.

    Returns
    -------
    int or str
        Number of files whose name ends with ".ipynb", or the message
        "this repository doesn't exist" when the folder cannot be listed.
    """
    # os.path.join instead of a hard-coded '\\' so the path is valid on every OS
    path = os.path.join(os.getcwd(), folder)
    try:
        os.listdir(path)  # only used to check that the folder can be read
    except OSError:
        return "this repository doesn't exist"

    return sum(
        file.endswith(".ipynb")
        for _root, _dirs, files in os.walk(path)
        for file in files
    )

In [24]:
# clone_repos('test_master','https://github.com/JuliaLang/IJulia','repos')

## Webscraping <i>A gallery of interesting Jupyter Notebooks</i>

#### Scraping of repositories greatly improved: from 65 repositories found to 221!

In [25]:
# Download the gallery wiki page and parse its html.
url = "https://github.com/jupyter/jupyter/wiki/A-gallery-of-interesting-Jupyter-Notebooks"
r  = requests.get(url)
r.raise_for_status()  # stop here on an HTTP error instead of parsing an error page
data = r.text
# Name the parser explicitly (the same one scan_page_not_found uses) so the
# result does not depend on which parsers happen to be installed.
soup = BeautifulSoup(data, features="html.parser")

In [26]:
# The wiki text sits in the <div class="markdown-body">; per the original
# author's note there is only one such div, hence the [0].
body = soup.find_all("div", {"class": "markdown-body"})[0]
all_links_in_body = {anchor.get('href') for anchor in body.find_all('a')}  # unique hrefs
remaining = set()
len(all_links_in_body)

740

#### Difference between `re.match` and `re.findall`
`re.match` only matches the pattern at the start of the string. `re.findall`, however, returns all occurrences of the pattern anywhere in the string.

In [27]:
# Raw strings with an escaped dot: an unescaped '.' matches any character,
# so "github.com" would also accept e.g. "githubXcom".
r1 = re.compile(r".*github\.com/[a-zA-Z0-9_-]+/[a-zA-Z0-9_-]+$") # link ends right after github.com/<owner>/<repo>
r2 = re.compile(r".*github\.com/[a-zA-Z0-9_-]+/[a-zA-Z0-9_-]+")  # link contains github.com/<owner>/<repo>, anything may follow
r3 = re.compile(r"github\.com/[a-zA-Z0-9_-]+/[a-zA-Z0-9_-]+")    # only the github.com/<owner>/<repo> part itself

# links that already end at github.com/<owner>/<repo>
github_repositories = set(filter(r1.match, all_links_in_body)) # author's note: there are 9 duplicates, 112 -> 103 correct urls

all_github_links = set(filter(r2.match, all_links_in_body))
github_repo_remaining = all_github_links - github_repositories # github links with something after <owner>/<repo>
# print(len(all_github_links),len(github_repositories),len(github_repo_remaining))

# cut every longer link back to its github.com/<owner>/<repo> part(s)
created_github_repos = [r3.findall(line) for line in github_repo_remaining] # list of lists with all matches within a single url
created_github_repos = itertools.chain.from_iterable(created_github_repos) # flatten the list of lists
created_github_repos = set(['https://'+url for url in created_github_repos]) # add 'https://' to create a valid url
# print(len(created_github_repos))


github_repositories.update(created_github_repos)
len(github_repositories)#,github_repositories

# this still includes false links like 'https://github.com/downloads/notebooks'
# that originally came from 'http://nbviewer.ipython.org/url/jakevdp.github.com/downloads/notebooks/XKCD_plots.ipynb'

127

Source of the itertools snippet - https://stackoverflow.com/questions/952914/how-to-make-a-flat-list-out-of-list-of-lists

In [28]:
# every link of the page that did not match the github.com/<owner>/<repo> pattern
remaining = all_links_in_body.difference(all_github_links)
(len(all_links_in_body), len(remaining))

(740, 607)

In [29]:
# Links that end right after github.com/<name>: a single path part, so not a
# <owner>/<repo> repository link. Raw string with the '.' escaped so that it
# only matches a literal dot.
r4 = re.compile(r".*github\.com/[a-zA-Z0-9_-]+$")

# NOTE(review): the original comment here ("There are 9 duplicates 112->103")
# is identical to the one in the repository cell — verify it applies here.
github_users = set(filter(r4.match, remaining))
len(github_users)#,github_users

63

In [8]:
# take the github.com/<name> links out of the set of links that are still left
remaining = remaining.difference(github_users)

(len(all_links_in_body), len(remaining))

(740, 544)

In [30]:
r5 = re.compile(".*/github/") # link contains a '/github/' path part
r6 = re.compile("github/[a-zA-Z0-9_-]+/[a-zA-Z0-9_-]+")    # github/<owner>/<repo>

github_links2 = {link for link in remaining if r5.match(link)}

# Build a repository url from the first github/<owner>/<repo> hit of each link;
# links without such a hit are left out. [6:] cuts the leading 'github' off the hit.
created_github_repos2 = {
    'https://github.com'+hits[0][6:]
    for hits in map(r6.findall, github_links2)
    if hits
}


github_repositories.update(created_github_repos2)
(len(github_links2), len(created_github_repos2))

# from collections import Counter
# Counter(['https://github.com'+r6.findall(line)[0][6:] for line in ipynb_links if r6.findall(line)!=[]])
# multiple links to ipynb files from the same repository (therefore a different url but the same created url to the repo)

(166, 125)

# Result of scraping the gallery page
The body had a total of 740 unique links, of which 441 could not be used to clone a repository or to download a file. Examples are a link within the gallery page itself, a link to a user profile on GitHub, a link to a web page that explains notebooks but contains no actual ipynb file, etc.

In [46]:
# Whatever is left after removing the '/github/' links was not turned into a repository url.
remaining = remaining.difference(github_links2)
print(
    len(remaining),
    'not usable links out of',
    len(all_links_in_body),
    'links in the body',
)

# test
# remaining

441 not usable links out of 740 links in the body


In [44]:
# Number of unique repository urls collected, next to the number of links in the body
print(
    len(github_repositories),
    'github unique repository links out of',
    len(all_links_in_body),
    'links in the body',
)

# print(github_repositories)

221 github unique repository links out of 740 links in the body


In [50]:
# Of the links that are left, keep those with "ipynb" somewhere after their first character.
# NOTE(review): the '.' in front of 'ipynb' is unescaped and matches any character —
# confirm whether a literal '.ipynb' was meant.
r7 = re.compile(".*.ipynb")
remaining_ipynb_links = {link for link in remaining if r7.match(link)}
print(len(remaining_ipynb_links))

# test
# remaining_ipynb_links

33


### Clone repositories from URLs

In [13]:
# Collect the urls that scan_page_not_found flags: pages titled 'Page not found · GitHub'
# and pages whose title cannot be read.
error_list = scan_page_not_found(github_repositories)

    
# if soup.title.string == 'Page not found · GitHub · GitHub': # misses the url https://github.com/yoavram/CS1001

HBox(children=(IntProgress(value=0, max=221), HTML(value='')))

9 https://github.com/blog/2012
173584 https://github.com/downloads/notebooks
173727 https://github.com/cfangmeier/Small
173557 https://github.com/JuliaLang/IJulia
173703 https://github.com/yoavram/CS1001
173509 https://github.com/raw/master
173751 https://github.com/Arn-O/py-gridmancer
173717 https://github.com/jakevdp/jakevdp
173530 https://github.com/tree/master
173731 https://github.com/kernc/backtesting
173842 https://github.com/GaelVaroquaux/nilearn_course
173770 https://github.com/carljv/cython_testing
173772 https://github.com/lgiordani/blog_source



In [None]:
# create folder in which to put all the repositories (if it does not exist)
os.makedirs('repos', exist_ok=True)  # does nothing when the folder is already there

In [5]:
# Text that alternates "<number> <url>"; it matches the recorded output of the
# 'Page not found' scan (presumably pasted here so the scan need not be re-run).
s= "9 https://github.com/blog/2012 173584 https://github.com/downloads/notebooks 173727 https://github.com/cfangmeier/Small 173557 https://github.com/JuliaLang/IJulia 173703 https://github.com/yoavram/CS1001 173509 https://github.com/raw/master 173751 https://github.com/Arn-O/py-gridmancer 173717 https://github.com/jakevdp/jakevdp 173530 https://github.com/tree/master 173731 https://github.com/kernc/backtesting 173842 https://github.com/GaelVaroquaux/nilearn_course 173770 https://github.com/carljv/cython_testing 173772 https://github.com/lgiordani/blog_source"
# Keep the urls by what they are (tokens starting with 'https://') rather than
# by a length threshold that only happens to exclude the numbers.
error_list = [token for token in s.split() if token.startswith('https://')]
error_list

['https://github.com/blog/2012',
 'https://github.com/downloads/notebooks',
 'https://github.com/cfangmeier/Small',
 'https://github.com/JuliaLang/IJulia',
 'https://github.com/yoavram/CS1001',
 'https://github.com/raw/master',
 'https://github.com/Arn-O/py-gridmancer',
 'https://github.com/jakevdp/jakevdp',
 'https://github.com/tree/master',
 'https://github.com/kernc/backtesting',
 'https://github.com/GaelVaroquaux/nilearn_course',
 'https://github.com/carljv/cython_testing',
 'https://github.com/lgiordani/blog_source']

In [51]:
# test
# clone_repositories(github_repositories,error_list,'repos')

In [36]:
# Count the .ipynb files inside the 'repos' folder and its sub-folders
count_ipynb('repos')

3108