Outline:
- We're going to scrape https://github.com/topics
- We'll get a list of topics. For each topic, we'll get topic title, topic page URL and topic description
- For each topic, we'll get the top 25 repositories in the topic from the topic page
- For each repository, we'll grab the repo name, username, stars and repo URL
- For each topic, we'll create a CSV file (`data/<topic title>.csv`) in the following format:
- username,repo_name,stars,repo_link
- mrdoob,three.js,69700,https://github.com/mrdoob/three.js
- libgdx,libgdx,18300,https://github.com/libgdx/libgdx

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

# Site root; relative hrefs scraped from the pages are appended to this.
base_url = 'https://github.com'
topics_url = 'https://github.com/topics'

# Download the topics index page once so it can be inspected below.
response = requests.get(topics_url)
html = response.text

# Keep a local copy of the raw page. The encoding is given explicitly because
# the platform default (e.g. cp1252 on Windows) may be unable to encode
# non-ASCII characters in the page and would raise UnicodeEncodeError.
with open('web.html', 'w', encoding='utf-8') as file:
    file.write(html)

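# To export only the topic titles, descriptions and links to a CSV file,
# first collect them with get_topic_info (defined below), then build a DataFrame:
# topic_titles, topic_descriptions, topic_links = get_topic_info(topics_url)
# dic = {'title':topic_titles,'description':topic_descriptions,'links':topic_links}
# topic_df = pd.DataFrame(dic)
# topic_df.to_csv('topics.csv',index=None)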

In [2]:
def scrape_topics_repos():
    """Scrape every topic on the GitHub topics page and save one CSV per topic.

    The topics index is read through ``get_topic_info``; each topic page is
    then turned into a DataFrame by ``get_topic_repo`` and written to
    ``data/<topic title>.csv`` (the ``data`` directory is created if missing).

    Raises:
        IndexError: if fewer topic titles than topic links were scraped, so
            some topic would have no title to name its CSV file after.
    """
    topics_url = 'https://github.com/topics'

    # get_topic_info downloads the page itself, so no separate request is
    # made here. Descriptions are not needed for the per-topic CSV files.
    topic_titles, _topic_descriptions, topic_links = get_topic_info(topics_url)

    # Detect the mismatch before spending one request per topic on it.
    if len(topic_titles) < len(topic_links):
        raise IndexError(
            'scraped %d topic titles for %d topic links'
            % (len(topic_titles), len(topic_links))
        )

    repos_dfs = [get_topic_repo(topic_link) for topic_link in topic_links]

    os.makedirs('data', exist_ok=True)

    for topic_title, repos_df in zip(topic_titles, repos_dfs):
        repos_df.to_csv(os.path.join('data', topic_title + '.csv'), index=None)


In [3]:
def get_topic_info(topics_url):
    """Scrape topic titles, descriptions and page URLs from a topics listing.

    Args:
        topics_url: URL of the listing page to download.

    Returns:
        A tuple ``(topic_titles, topic_descriptions, topic_links)`` of three
        lists of strings. Titles and descriptions are whitespace-stripped;
        links are the scraped ``href`` values prefixed with the module-level
        ``base_url``. The three lists come from independent searches and are
        assumed (not checked) to line up index-for-index.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    response = requests.get(topics_url)
    # An error page contains none of the tags searched for below and would
    # otherwise be reported as "no topics" instead of as a failure.
    response.raise_for_status()
    doc = BeautifulSoup(response.text, 'html.parser')

    # The class strings are matched as written in the page markup; if the
    # results come back empty, re-check them against the current HTML.
    topic_title_tags = doc.find_all('p', {'class': 'f3 lh-condensed mb-0 mt-1 Link--primary'})
    topic_description_tags = doc.find_all('p', {'class': 'f5 color-fg-muted mb-0 mt-1'})
    topic_link_tags = doc.find_all('a', {'class': 'd-flex no-underline'})

    # Titles are stripped like descriptions: callers use them as file names.
    topic_titles = [topic_tag.text.strip() for topic_tag in topic_title_tags]
    topic_descriptions = [description_tag.text.strip() for description_tag in topic_description_tags]
    topic_links = [base_url + link_tag['href'] for link_tag in topic_link_tags]

    return (topic_titles, topic_descriptions, topic_links)

In [4]:
def parse_star_count(x):
    """Convert a displayed star count such as ``'69.7k'`` or ``'532'`` to an int.

    Surrounding whitespace and thousands separators (``','``) are ignored.
    A trailing ``k``/``K`` multiplies by 1,000 and ``m``/``M`` by 1,000,000.

    Args:
        x: the star count text as shown on the page.

    Returns:
        The star count as an ``int``.

    Raises:
        ValueError: if ``x`` is empty/whitespace or is not a number.
    """
    multipliers = {'k': 1000, 'm': 1000000}

    text = x.strip().replace(',', '')
    if not text:
        raise ValueError('empty star count')

    suffix = text[-1].lower()
    if suffix in multipliers:
        # round() rather than int(): the float product can land just below
        # the intended whole number, and int() would truncate it by one.
        return round(float(text[:-1]) * multipliers[suffix])
    return int(text)

In [5]:
def get_repo_info(repo_tags,star_tags):
    """Pull owner, repository name, repository URL and star count out of tags.

    Args:
        repo_tags: elements whose first ``<a>`` holds the owner name and whose
            second ``<a>`` holds the repository name and its ``href``.
        star_tags: elements whose text is a star count accepted by
            ``parse_star_count``.

    Returns:
        A tuple ``(usernames, repo_names, repo_links, stars)`` of four lists.
        Links are the ``href`` values prefixed with the module-level
        ``base_url``.
    """
    # Collect the anchors of every repository heading once, up front.
    anchor_groups = []
    for heading in repo_tags:
        anchor_groups.append(heading.find_all('a'))

    usernames = []
    for anchors in anchor_groups:
        owner_anchor = anchors[0]
        usernames.append(owner_anchor.text.strip())

    repo_names = []
    for anchors in anchor_groups:
        name_anchor = anchors[1]
        repo_names.append(name_anchor.text.strip())

    repo_links = []
    for anchors in anchor_groups:
        repo_links.append(base_url + anchors[1]['href'])

    stars = []
    for counter in star_tags:
        stars.append(parse_star_count(counter.text))

    return (usernames, repo_names, repo_links, stars)
    

In [6]:
def get_topic_repo(topic_link):
    """Download one topic page and return its listed repositories as a table.

    Args:
        topic_link: URL of the topic page to download.

    Returns:
        A ``pandas.DataFrame`` with the columns ``username``, ``repo_name``,
        ``stars`` and ``repo_link``, one row per repository found on the page.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    response = requests.get(topic_link)
    # Without this check an error page would parse to zero repositories and
    # be returned as an empty table instead of being reported as a failure.
    response.raise_for_status()
    topic_doc = BeautifulSoup(response.text, 'html.parser')

    repo_tags = topic_doc.find_all('h3', {'class': 'f3 color-fg-muted text-normal lh-condensed'})
    star_tags = topic_doc.find_all('a', {'class': 'social-count js-social-count'})
    usernames, repo_names, repo_links, stars = get_repo_info(repo_tags, star_tags)

    # Key insertion order fixes the column order of the resulting DataFrame.
    topic_repos_dict = {
        'username': usernames,
        'repo_name': repo_names,
        'stars': stars,
        'repo_link': repo_links,
    }

    return pd.DataFrame(topic_repos_dict)

In [7]:
# Run the whole scrape: writes one CSV per topic into the data/ directory.
scrape_topics_repos()