In [11]:
!pip install python-gitlab
# Docs: https://python-gitlab.readthedocs.io/en/stable/api-usage.html
!pip install beautifulsoup4
# Docs: https://www.crummy.com/software/BeautifulSoup/bs4/doc/



In [67]:
# Imports in one place so they don't clutter the rest of the notebook
import requests
import gitlab
from bs4 import BeautifulSoup
import functools
import re
import time

In [68]:
# Source page: https://gitlab.com/explore/projects/starred
# Get the top n starred projects.
# The GitLab API does not support this, so we must use web scraping.

# Cached so that re-running cells does not re-download pages (avoids rate limiting).
@functools.cache
def get_starred_projects_html(page_num):
    """Fetch one page of GitLab's explore listing sorted by stars (descending).

    Args:
        page_num: Page index, sent as the ``page`` query parameter.

    Returns:
        The ``requests.Response`` for the page; its ``.text`` holds the HTML.

    Raises:
        requests.HTTPError: If GitLab answers with a 4xx/5xx status. Raising
            instead of returning keeps ``functools.cache`` from memoising a
            failed response (e.g. a rate-limit page) for the rest of the session.
        requests.Timeout: If the server does not answer within the timeout.
    """
    response = requests.get(
        "https://gitlab.com/explore/projects/starred",
        params={"sort": "stars_desc", "page": page_num},
        timeout=30,  # requests has no default timeout and could otherwise hang forever
    )
    response.raise_for_status()
    return response

In [71]:
# Scrape each page of the starred-projects listing and collect the raw
# <li class="project-row"> elements.
MAX_PAGE = 50  # 50 is the max page
projects = []
# range() excludes its stop value, so use MAX_PAGE + 1 to actually include the last page
for page in range(1, MAX_PAGE + 1):
    r = get_starred_projects_html(page)
    good_soup = BeautifulSoup(r.text, 'html.parser')  # https://www.youtube.com/watch?v=gkXzeZ0KE5Q
    projects.extend(good_soup.find_all("li", {"class": 'project-row'}))
    time.sleep(0.5)  # Don't get rate-limited by GitLab

In [79]:
# Matches any non-digit character. Only needed if star/fork counts are ever
# parsed from the HTML (currently they are not, see below).
non_number_re = re.compile('[^0-9]')


def extract_proj_details(proj):
    """Turn one scraped ``project-row`` element into a plain project dict.

    Args:
        proj: Parsed HTML element exposing ``find(tag, attrs)``; it must contain
            ``project-name`` and ``namespace-name`` spans and at least one link.

    Returns:
        Dict with the keys ``'name'``, ``'namespace'`` and ``'url'`` (the
        ``href`` of the first ``<a>`` inside the element).

    Star and fork counts could also be read from this markup, but they are not
    always present in the HTML, so they are better taken from the API.
    """
    def span_text(css_class):
        # Text content of the first <span> carrying the given class.
        return proj.find('span', {'class': css_class}).text

    name = span_text('project-name')
    # Drop the last three characters (presumably a trailing " / " separator --
    # TODO confirm against the page markup), then trim surrounding whitespace.
    namespace = span_text('namespace-name')[:-3].strip()
    url = proj.find('a').attrs['href']

    return {'name': name, 'namespace': namespace, 'url': url}
# Spot-check the parser on the first scraped row
extract_proj_details(projects[0])

{'name': 'GitLab FOSS',
 'namespace': 'GitLab.org',
 'url': '/gitlab-org/gitlab-foss'}

In [80]:
# Parse every scraped row, then show the total count and the first five entries
top_projects = list(map(extract_proj_details, projects))
len(top_projects), top_projects[:5]

(400,
 [{'name': 'GitLab FOSS',
   'namespace': 'GitLab.org',
   'url': '/gitlab-org/gitlab-foss'},
  {'name': 'GitLab', 'namespace': 'GitLab.org', 'url': '/gitlab-org/gitlab'},
  {'name': 'inkscape', 'namespace': 'Inkscape', 'url': '/inkscape/inkscape'},
  {'name': 'OpenRGB',
   'namespace': 'Adam Honse',
   'url': '/CalcProgrammer1/OpenRGB'},
  {'name': 'gitlab-runner',
   'namespace': 'GitLab.org',
   'url': '/gitlab-org/gitlab-runner'}])

In [3]:
# NOTE(review): gitlab is already imported in the imports cell near the top;
# the repeated import here is harmless.
import gitlab

# anonymous read-only access for public resources (GitLab.com)
gl = gitlab.Gitlab()
# Echo the client object as the cell output
gl

<gitlab.client.Gitlab at 0x22fa1359d60>

In [158]:
def extract_gl_data_from_project(proj):
    """Collect GitLab API data and project-page statistics for one project.

    Args:
        proj: Dict with a ``'url'`` key holding the project's site-relative
            path with a leading slash (e.g. ``'/gitlab-org/gitlab'``).

    Returns:
        The project's attribute dict from the GitLab API, extended with
        ``'groups'``, ``'languages'``, ``'top_20_repository_contibutors'`` and,
        when the project page contains a stats bar, ``'from_page_stats_*'``
        entries holding the raw text of each statistic.

    Raises:
        requests.HTTPError: If fetching the project's web page fails.
    """
    # Drop the leading "/" so the remainder is the "namespace/project" path.
    gl_proj = gl.projects.get(proj['url'][1:])
    proj_data = gl_proj.attributes
    proj_data['groups'] = [group.attributes for group in gl_proj.groups.list(all=True)]
    proj_data['languages'] = gl_proj.languages()
    # NOTE(review): the key is misspelled ("contibutors"); kept unchanged because
    # code consuming this dict may already rely on it.
    proj_data['top_20_repository_contibutors'] = gl_proj.repository_contributors(per_page=20, order_by='commits', sort='desc')

    # get branch/commit/tag counts through requests and bs4 as its not in gitlab api
    page_response = requests.get(f'https://gitlab.com{proj["url"]}', timeout=30)
    page_response.raise_for_status()  # fail loudly instead of parsing an error page
    page_soup = BeautifulSoup(page_response.text, 'html.parser')
    stats_nav = page_soup.find('nav', {'class': 'project-stats'})
    if stats_nav is None:
        # No stats bar on this page: return the API data without page stats
        # rather than crashing on None.
        return proj_data

    stat_values = [x.text for x in stats_nav.find_all('strong', {'class': 'project-stat-value'})]
    stat_names = ['commits', 'branches', 'tags', 'files', 'storage', 'releases']
    # NOTE(review): values are paired with names purely by position; if a page
    # omits one of the statistics the labels will shift -- verify against the markup.
    for stat_name, stat_value in zip(stat_names, stat_values):
        proj_data[f'from_page_stats_{stat_name}'] = stat_value

    return proj_data

In [159]:
# Enrich only the first 50 scraped projects with API and project-page data
top_proj_data = list(map(extract_gl_data_from_project, top_projects[:50]))

In [160]:
# Show how many projects were enriched, plus the last five records
len(top_proj_data), top_proj_data[-5:]

(50,
 [{'id': 7898047,
   'description': "Wireshark's official Git repository.",
   'name': 'wireshark',
   'name_with_namespace': 'Wireshark Foundation / wireshark',
   'path': 'wireshark',
   'path_with_namespace': 'wireshark/wireshark',
   'created_at': '2018-08-10T20:58:07.700Z',
   'default_branch': 'master',
   'tag_list': ['packet capture', 'protocol analysis', 'tshark', 'wireshark'],
   'topics': ['packet capture', 'protocol analysis', 'tshark', 'wireshark'],
   'ssh_url_to_repo': 'git@gitlab.com:wireshark/wireshark.git',
   'http_url_to_repo': 'https://gitlab.com/wireshark/wireshark.git',
   'web_url': 'https://gitlab.com/wireshark/wireshark',
   'readme_url': 'https://gitlab.com/wireshark/wireshark/-/blob/master/README.md',
   'avatar_url': 'https://gitlab.com/uploads/-/system/project/avatar/7898047/wsicon180.png',
   'forks_count': 551,
   'star_count': 453,
   'last_activity_at': '2022-03-18T17:24:49.215Z',
   'namespace': {'id': 3421856,
    'name': 'Wireshark Foundation',