In [204]:
!pip install python-gitlab
# Docs: https://python-gitlab.readthedocs.io/en/stable/api-usage.html
!pip install beautifulsoup4
# Docs: https://www.crummy.com/software/BeautifulSoup/bs4/doc/



In [205]:
#Imports in one place to not cloud up work
import requests
import gitlab
from bs4 import BeautifulSoup
import functools
import re
import time
import random
import json
import concurrent.futures

In [206]:
# Source page: https://gitlab.com/explore/projects/starred
# Get the top n starred projects.
# The GitLab API does not support this, so we must use web scraping.

# Caching to avoid rate limiting: each distinct URL is requested at most once
# per kernel session, later calls are answered from memory.
@functools.cache
def get_https_text(url):
    """Fetch ``url`` with an HTTP GET and return the response body as text.

    Successful calls are memoised by ``url``. The HTTP status code is not
    inspected, so the body of an error page is returned (and cached) just
    like a normal page.

    Args:
        url: Absolute URL to download.

    Returns:
        The decoded response body.

    Raises:
        requests.exceptions.RequestException: On connection failures or when
            the server does not answer within the timeout. Exceptions are not
            cached, so a later call with the same URL retries the request.
    """
    # requests never times out by default; without an explicit limit a single
    # stalled connection would block the caller (or a worker thread) forever.
    return requests.get(url, timeout=30).text

def get_starred_projects_html(page_num):
    """Return the HTML of page ``page_num`` of the explore listing, sorted by stars descending."""
    listing_url = f"https://gitlab.com/explore/projects/starred?sort=stars_desc&page={page_num}"
    return get_https_text(listing_url)

In [71]:
# Last listing page to request (the original note states that 50 is the max page).
MAX_STARRED_PAGE = 50

projects = []
# range() excludes its stop value, so +1 is needed for page MAX_STARRED_PAGE
# itself to be fetched; pages that contain no project rows simply add nothing.
for page in range(1, MAX_STARRED_PAGE + 1):
    page_html = get_starred_projects_html(page)
    good_soup = BeautifulSoup(page_html, 'html.parser')  # https://www.youtube.com/watch?v=gkXzeZ0KE5Q
    # Each project on the listing page is rendered as an <li class="project-row">
    projects.extend(good_soup.find_all("li", {"class": 'project-row'}))
    time.sleep(0.5)  # Don't get rate-limited by GitLab

In [79]:
# Matches every character that is not an ASCII digit (used to strip
# separators and labels from scraped counters).
non_number_re = re.compile('[^0-9]')


def extract_proj_details(proj):
    """Turn one scraped project row into a small dict.

    Args:
        proj: Parsed HTML element of a project row; it must support
            ``find(tag, attrs)`` and the returned elements must expose
            ``.text`` and ``.attrs``.

    Returns:
        dict with the keys 'name', 'namespace' and 'url' (in that order).
        Star and fork counts could also be scraped from the row, but they are
        not always present in the HTML, so they are taken from the API later.
    """
    project_name = proj.find('span', {'class': 'project-name'}).text

    # The last three characters of the namespace text are cut off before the
    # surrounding whitespace is stripped.
    raw_namespace = proj.find('span', {'class': 'namespace-name'}).text
    owner = raw_namespace[:-3].strip()

    # The first anchor in the row carries the site-relative project path
    project_path = proj.find('a').attrs['href']

    return dict(name=project_name, namespace=owner, url=project_path)
# Sanity check: run the extractor on the first scraped row and display the result
extract_proj_details(projects[0])

{'name': 'GitLab FOSS',
 'namespace': 'GitLab.org',
 'url': '/gitlab-org/gitlab-foss'}

In [80]:
# Convert every scraped row into a plain dict, then show how many there are
# together with the first five entries.
top_projects = list(map(extract_proj_details, projects))
len(top_projects), top_projects[:5]

(400,
 [{'name': 'GitLab FOSS',
   'namespace': 'GitLab.org',
   'url': '/gitlab-org/gitlab-foss'},
  {'name': 'GitLab', 'namespace': 'GitLab.org', 'url': '/gitlab-org/gitlab'},
  {'name': 'inkscape', 'namespace': 'Inkscape', 'url': '/inkscape/inkscape'},
  {'name': 'OpenRGB',
   'namespace': 'Adam Honse',
   'url': '/CalcProgrammer1/OpenRGB'},
  {'name': 'gitlab-runner',
   'namespace': 'GitLab.org',
   'url': '/gitlab-org/gitlab-runner'}])

In [211]:
def extract_gl_data_from_project(proj):
    """Collect API attributes and scraped page statistics for one GitLab project.

    Args:
        proj: dict with a 'url' key holding the site-relative project path
            including its leading slash (e.g. '/gitlab-org/gitlab-foss').

    Returns:
        dict containing the project's API attributes plus:
          * 'access_time': ``time.time()`` taken just before the API lookup,
          * 'groups', 'languages', 'issue_stats',
            'top_20_repository_contibutors': present only when the
            corresponding API call succeeded,
          * 'from_page_stats_*': values scraped from the project page, if the
            stats navigation element was found,
          * 'num_merge_requests_*': counters scraped from the merge request
            page, for each state whose counter could be parsed,
          * 'html': the raw HTML of both scraped pages with their URLs.

    Raises:
        Whatever ``gl.projects.get`` or ``get_https_text`` raise; only the
        optional API lookups are best-effort.
    """
    gl = gitlab.Gitlab()
    time.sleep(random.randint(1, 10))  # Give gitlab a little break
    access_time = time.time()
    gl_proj = gl.projects.get(proj['url'][1:])  # [1:] drops the leading '/'
    proj_data = gl_proj.attributes
    proj_data['access_time'] = access_time

    # Optional API lookups. Depending on the project some of them fail (for
    # example with an authentication error), in which case the key is simply
    # left out of the result. Every call happens inside the try below; an
    # unguarded call here would let the very error we want to tolerate escape.
    # NOTE: the key 'top_20_repository_contibutors' is misspelt but kept as-is
    # so the structure of the saved data does not change.
    optional_lookups = {
        'groups': lambda: [group.attributes for group in gl_proj.groups.list(all=True)],
        'languages': lambda: gl_proj.languages(),
        'issue_stats': lambda: gl_proj.issues_statistics.get().attributes['statistics'],
        'top_20_repository_contibutors': lambda: gl_proj.repository_contributors(
            per_page=20, order_by='commits', sort='desc'
        ),
    }
    for key, lookup in optional_lookups.items():
        try:
            proj_data[key] = lookup()
        except Exception:
            # Best effort: skip this field. ``Exception`` (rather than a bare
            # except) keeps KeyboardInterrupt/SystemExit working.
            continue

    main_url = f'https://gitlab.com{proj["url"]}'
    mr_url = f'{main_url}/-/merge_requests'

    # get branch/commit/tag counts through requests and bs4 as its not in gitlab api
    proj_page_text = get_https_text(main_url)
    main_soup = BeautifulSoup(proj_page_text, 'html.parser')
    stats_nav = main_soup.find('nav', {'class': 'project-stats'})
    if stats_nav is not None:
        stat_values = [x.text for x in stats_nav.find_all('strong', {'class': 'project-stat-value'})]
        # Values are matched to names purely by position; zip() stops at the
        # shorter sequence. NOTE(review): assumes the page always lists the
        # stats in this order - confirm against the live markup.
        stat_names = ['commits', 'branches', 'tags', 'files', 'storage', 'releases']
        for stat_name, stat_value in zip(stat_names, stat_values):
            proj_data[f'from_page_stats_{stat_name}'] = stat_value

    # Get MR stats through requests & bs4 as its not in api
    mr_page_text = get_https_text(mr_url)
    mr_soup = BeautifulSoup(mr_page_text, 'html.parser')
    for mr_state in ['opened', 'merged', 'closed', 'all']:
        state_link = mr_soup.find('a', {'id': f'state-{mr_state}'})
        if not state_link:
            continue
        spans = state_link.find_all('span')
        # The counter is read from the last <span> of the tab link; keep only
        # its digits. A missing span or a digit-free text is skipped instead
        # of raising IndexError / ValueError.
        digits = non_number_re.sub('', spans[-1].text) if spans else ''
        if digits:
            proj_data[f'num_merge_requests_{mr_state}'] = int(digits)

    # Save raw data too
    proj_data['html'] = {
        'main': {
            'url': main_url,
            'html': proj_page_text,
        },
        'merge_requests': {
            'url': mr_url,
            'html': mr_page_text,
        }
    }

    return proj_data

In [212]:
# Thread and not Process because network bound not CPU bound
# The work is network bound, so a thread pool (default worker count) is used;
# map() yields the results in the same order as top_projects.
with concurrent.futures.ThreadPoolExecutor() as pool:
    all_data = [*pool.map(extract_gl_data_from_project, top_projects)]

In [213]:
# Show the number of collected records and the last five of them.
# NOTE(review): every record embeds the raw HTML of two pages under 'html',
# so this preview produces a very large output.
len(all_data), all_data[-5:]

(400,
 [{'id': 20975946,
   'description': 'Découvrez les tutoriels ansible en français ici : https://youtu.be/kzmvwc2q_z0',
   'name': 'Tutoriels Ansible',
   'name_with_namespace': 'Xavier / Tutoriels Ansible',
   'path': 'presentation-ansible-fr',
   'path_with_namespace': 'xavki/presentation-ansible-fr',
   'created_at': '2020-09-07T08:03:28.393Z',
   'default_branch': 'master',
   'tag_list': ['ansible', 'docker'],
   'topics': ['ansible', 'docker'],
   'ssh_url_to_repo': 'git@gitlab.com:xavki/presentation-ansible-fr.git',
   'http_url_to_repo': 'https://gitlab.com/xavki/presentation-ansible-fr.git',
   'web_url': 'https://gitlab.com/xavki/presentation-ansible-fr',
   'readme_url': 'https://gitlab.com/xavki/presentation-ansible-fr/-/blob/master/README.md',
   'avatar_url': 'https://gitlab.com/uploads/-/system/project/avatar/20975946/Ansible_logo-700x700.png',
   'forks_count': 32,
   'star_count': 43,
   'last_activity_at': '2022-01-14T16:44:29.733Z',
   'namespace': {'id': 390894

In [214]:
# Persist everything that was collected as a single JSON array in gitlab.json
# (the file is created or overwritten in the current working directory).
with open('gitlab.json', 'w') as out_file:
    json.dump(all_data, out_file)