# Top repositories for github topics


### import library 

In [103]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

### Get a list of topics ( title, description, url)

In [104]:
#  parse information from the web
def get_topics_page():
    topics_url = 'https://github.com/topics'
    res = requests.get(topics_url)
    # load page fail
    if res.status_code != 200:
        raise Exception('Failed to load page {}'.format(topics_url))   
    # get resource code
    page_contents = res.text
    doc = BeautifulSoup(page_contents, 'html.parser')
    return doc

In [105]:
doc = get_topics_page()

In [106]:
# get the title of topics
def get_topics_title(doc):
    #read title of topics
    topics_title_tags = doc.find_all('p',class_="f3 lh-condensed mb-0 mt-1 Link--primary")
    # create list of titte
    topics_title = []
    for title in topics_title_tags:
        topics_title.append(title.text)
    return topics_title

In [107]:
topics_title = get_topics_title(doc)
topics_title[:5]

['3D', 'Ajax', 'Algorithm', 'Amp', 'Android']

In [108]:
# get the description of topics
def get_desc_topics(doc):
    #read description of topics
    topics_desc_tags = doc.find_all('p',class_="f5 color-text-secondary mb-0 mt-1")
    # create list of desc
    topics_desc = []
    for desc in topics_desc_tags:
        topics_desc.append(desc.text.strip())
    return topics_desc

In [109]:
topics_desc = get_desc_topics(doc)
topics_desc[:5]

['3D modeling is the process of virtually developing the surface and structure of a 3D object.',
 'Ajax is a technique for creating interactive web applications.',
 'Algorithms are self-contained sequences that carry out a variety of tasks.',
 'Amp is a non-blocking concurrency framework for PHP.',
 'Android is an operating system built by Google designed for mobile devices.']

In [110]:
def get_url_topics(doc):
     #read url of topics
    topics_url_tags = doc.find_all('a',class_="d-flex no-underline")
    # create list of url
    topics_url = []
    base_url = 'https://github.com/'
    for url in topics_url_tags:
        url = base_url + url['href']
        topics_url.append(url)
    return topics_url

In [111]:
topics_url = get_url_topics(doc)
topics_url[:5]

['https://github.com//topics/3d',
 'https://github.com//topics/ajax',
 'https://github.com//topics/algorithm',
 'https://github.com//topics/amphp',
 'https://github.com//topics/android']

In [112]:
def get_topics(doc):        
    topics_dict = {
        'title': get_topics_title(doc),
        'description': get_desc_topics(doc),
        'url': get_url_topics(doc)
    }
    return pd.DataFrame(topics_dict)

In [113]:
topics_df = get_topics(get_topics_page())
topics_df.head()

Unnamed: 0,title,description,url
0,3D,3D modeling is the process of virtually develo...,https://github.com//topics/3d
1,Ajax,Ajax is a technique for creating interactive w...,https://github.com//topics/ajax
2,Algorithm,Algorithms are self-contained sequences that c...,https://github.com//topics/algorithm
3,Amp,Amp is a non-blocking concurrency framework fo...,https://github.com//topics/amphp
4,Android,Android is an operating system built by Google...,https://github.com//topics/android


In [114]:
def scrape_topics(): 
    topics_df = get_topics(get_topics_page())
    
    # create csv file
    print('Scraping list of topics')
    topics_df.to_csv('data/{}.csv'.format('topics'), index= None)

### Get repositories from a topic page

In [115]:
#  parse information from the web
def get_topic_page(topic_url):
    res = requests.get(topic_url)
    # load page fail
    if res.status_code != 200:
        raise Exception('Failed to load page {}'.format(topic_url))   
    # get resource code
    page_contents = res.text
    doc = BeautifulSoup(page_contents, 'html.parser')
    return doc

In [116]:
# run example
doc = get_topic_page('https://github.com/topics/3d')

In [117]:
# convert to float
def parseToFloat(star):
    star_str = star.strip()
    if(star_str[-1] == 'k'):
        return int(float(star_str[:-1]) * 1000)   

In [118]:
# get infomation of repo (username, repo_name, stars, repo_url)
def get_repo_infor(h3_tag, star):
    base_url = 'https://github.com/'
    topic_repo_a = h3_tag.find_all('a')
    username = topic_repo_a[0].text.strip()
    repo_name = topic_repo_a[1].text.strip()
    stars = base_url + topic_repo_a[1]['href']
    repo_url = parseToFloat(star.text.strip())
    return username, repo_name, repo_url, stars

In [119]:
# get repositories of a topic
def get_topic_repos(doc):
    # Get the h1 tags containing repo title, repo URL and username
    topic_repo_tags = doc.find_all('h3', class_="f3 color-text-secondary text-normal lh-condensed")
    # Get star tags
    topic_repo_star = doc.find_all(class_="social-count float-none")

    topic_repos_dic = {
        'username':[],
        'repo_name': [],
        'stars': [],
        'repo_url': []
    }
    # Get repo info
    for i in range(len(topic_repo_tags) ):
        repo_infor = get_repo_infor(topic_repo_tags[i], topic_repo_star[i])
        topic_repos_dic['username'].append(repo_infor[0])
        topic_repos_dic['repo_name'].append(repo_infor[1])
        topic_repos_dic['repo_url'].append(repo_infor[2])
        topic_repos_dic['stars'].append(repo_infor[3])
    return pd.DataFrame(topic_repos_dic)

In [120]:
# example
topic_url = topics_df['url'][0]
doc = get_topic_page(topic_url)
topic_repos_df = get_topic_repos(doc)    
topic_repos_df.head()

Unnamed: 0,username,repo_name,stars,repo_url
0,mrdoob,three.js,https://github.com//mrdoob/three.js,74900
1,libgdx,libgdx,https://github.com//libgdx/libgdx,19100
2,pmndrs,react-three-fiber,https://github.com//pmndrs/react-three-fiber,15300
3,BabylonJS,Babylon.js,https://github.com//BabylonJS/Babylon.js,15000
4,aframevr,aframe,https://github.com//aframevr/aframe,13100


In [121]:
# scrape repositories of a topic
def scrap_repo(topic_url, topic_title):
    fileName = '{}.csv'.format(topic_title)
    if os.path.exists(fileName):
        print("The file {} already exists. Skipping...".format(fileName))
        return 
    doc = get_topic_page(topic_url)
    topic_repos_df = get_topic_repos(doc)
    topic_repos_df.to_csv(fileName, index=None)

In [122]:

# scape repositories of the topics
def scrap_repo_topics(topics_df):
    for index, row in topics_df.iterrows():
        print('Scraping top repositories for "{}"'.format(row['title']))
        scrap_repo(row['url'], 'data/{}.csv'.format(row['title']))


In [123]:
os.makedirs('data', exist_ok=True)

scrape_topics()

scrap_repo_topics(topics_df)

Scraping list of topics
Scraping top repositories for "3D"
The file data/3D.csv.csv already exists. Skipping...
Scraping top repositories for "Ajax"
The file data/Ajax.csv.csv already exists. Skipping...
Scraping top repositories for "Algorithm"
The file data/Algorithm.csv.csv already exists. Skipping...
Scraping top repositories for "Amp"
The file data/Amp.csv.csv already exists. Skipping...
Scraping top repositories for "Android"
The file data/Android.csv.csv already exists. Skipping...
Scraping top repositories for "Angular"
The file data/Angular.csv.csv already exists. Skipping...
Scraping top repositories for "Ansible"
The file data/Ansible.csv.csv already exists. Skipping...
Scraping top repositories for "API"
The file data/API.csv.csv already exists. Skipping...
Scraping top repositories for "Arduino"
The file data/Arduino.csv.csv already exists. Skipping...
Scraping top repositories for "ASP.NET"
The file data/ASP.NET.csv.csv already exists. Skipping...
Scraping top repositorie