In [1]:
# Import from the public IPython.display module; IPython.core.display is an
# internal path whose `display` re-export is deprecated.
from IPython.display import display, HTML

# Inject a CSS rule so the notebook container spans the full browser width.
display(HTML("<style>.container {width: 100% !important; }</style>"))

# Top Repositories for Github Topics

## Project Outline :
- In this notebook, I'll be scraping "https://github.com/topics"
- I'll get the list of topics. For each topic, I'll get the topic title, topic page URL and topic description
- For each topic, I'll get the top 25 repositories in the topic from the topic page
- For each repository, I'll grab the repo name, username, stars and repo URL
- For each topic, I'll create a CSV file

## Importing Libraries

In [6]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd 
import os
from bs4 import BeautifulSoup

In [3]:
#!pip install requests --upgrade --quiet
import requests

## Scrape the list of topics from Github

- Use requests to download a page
- Use BS4 to parse and extract the information
- Convert to Pandas dataframe

Let's write a function to download the page

In [94]:
def get_topics_page(timeout=30):
    '''
    Download the GitHub topics index page and return it as a parsed document.

    Parameters
    ----------
    timeout : float or tuple, optional
        Value forwarded to ``requests.get(..., timeout=...)``. Without it a
        stalled connection would block the notebook indefinitely.

    Returns
    -------
    bs4.BeautifulSoup
        The page parsed with the 'html.parser' backend.

    Raises
    ------
    Exception
        If the response status code is anything other than 200.
    '''
    topics_url = 'https://github.com/topics'
    response = requests.get(topics_url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f'Failed to load the page {topics_url}')
    return BeautifulSoup(response.text, 'html.parser')
# Fetch the topics index once; `doc` is reused by the example cells below.
doc = get_topics_page()
# Peek at the first two anchor tags to confirm the page was downloaded and parsed.
doc.find_all('a')[:2]

[<a class="px-2 py-4 color-bg-info-inverse color-text-white show-on-focus js-skip-to-content" href="#start-of-content">Skip to content</a>,
 <a aria-label="Homepage" class="mr-4" data-ga-click="(Logged out) Header, go to homepage, icon:logo-wordmark" href="https://github.com/">
 <svg aria-hidden="true" class="octicon octicon-mark-github color-text-white" height="32" version="1.1" viewbox="0 0 16 16" width="32"><path d="M8 0C3.58 0 0 3.58 0 8c0 3.54 2.29 6.53 5.47 7.59.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52-.01-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27.68 0 1.36.09 2 .27 1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55.38A8.013 8.013 0 0016 8c0-4.42-3.58-8-8-8z" fill-ru

Let's create some helper functions to parse the information from the page

In [95]:
def get_topic_title(doc):
    '''
    Return the text of every topic-title <p> tag found in the parsed page.

    `doc` is the parsed topics page; the result is a list of strings in the
    order the tags appear in the document.
    '''
    title_tags = doc.find_all("p", {'class': "f3 lh-condensed mb-0 mt-1 Link--primary"})
    titles = []
    for title_tag in title_tags:
        titles.append(title_tag.text)
    return titles

In [96]:
# Call the helper by its defined name (get_topic_title); the previous name
# did not exist and raised NameError on a fresh kernel.
titles = get_topic_title(doc)
print(len(titles))
titles[:3]

30


['3D', 'Ajax', 'Algorithm']

In [97]:
def get_topic_description(doc):
    '''
    Return the whitespace-stripped text of every topic-description <p> tag.

    `doc` is the parsed topics page; the result is a list of strings in the
    order the tags appear in the document.
    '''
    description_class = 'f5 color-text-secondary mb-0 mt-1'
    descriptions = []
    for desc_tag in doc.find_all('p', {'class': description_class}):
        descriptions.append(desc_tag.text.strip())
    return descriptions

In [98]:
# Call the helper by its defined name (get_topic_description); the previous
# name did not exist and raised NameError on a fresh kernel.
topic_desc = get_topic_description(doc)
print(len(topic_desc))
topic_desc[:3]

30


['3D modeling is the process of virtually developing the surface and structure of a 3D object.',
 'Ajax is a technique for creating interactive web applications.',
 'Algorithms are self-contained sequences that carry out a variety of tasks.']

In [99]:
def get_topic_url(doc):
    '''
    Return the absolute URL of every topic link found in the parsed page.

    Each matching <a> tag's href is appended to the GitHub base URL; the
    result is a list of strings in document order.
    '''
    base_url = 'https://github.com'
    link_tags = doc.find_all('a', {'class': 'd-flex no-underline'})
    urls = []
    for link_tag in link_tags:
        urls.append(base_url + link_tag['href'])
    return urls

In [100]:
# Call the helper by its defined name (get_topic_url); the previous name did
# not exist and raised NameError on a fresh kernel.
topic_url = get_topic_url(doc)
print(len(topic_url))
topic_url[:3]

30


['https://github.com/topics/3d',
 'https://github.com/topics/ajax',
 'https://github.com/topics/algorithm']

**Now let's put it all together into a single function**

In [101]:
def scrape_topics():
    '''
    Scrape the GitHub topics index into a DataFrame.

    The page is downloaded and parsed by get_topics_page(), so the request and
    status-check logic lives in one place. The helper functions then extract
    the title, description and URL lists from the parsed page.

    Returns
    -------
    pandas.DataFrame
        One row per topic with the columns 'title', 'description' and 'url'.

    Raises
    ------
    Exception
        Propagated from get_topics_page() when the page cannot be loaded.
    '''
    doc = get_topics_page()

    # The three lists are matched up by position when the frame is built.
    return pd.DataFrame({
        'title' : get_topic_title(doc),
        'description' : get_topic_description(doc),
        'url' : get_topic_url(doc)
    })

## Get the top repositories from the topic page

- Now we have a dataframe having topic title, description and url in it
- So, let's create a function that will iterate over the dataframe and will get the top repos for each topic in the dataframe

In [102]:
def get_topic_page(topic_url, timeout=30):
    '''
    Download a single topic page and return it as a parsed document.

    Parameters
    ----------
    topic_url : str
        URL of the topic page to download.
    timeout : float or tuple, optional
        Value forwarded to ``requests.get(..., timeout=...)``. Without it a
        stalled connection would block the whole scraping loop indefinitely.

    Returns
    -------
    bs4.BeautifulSoup
        The page parsed with the 'html.parser' backend.

    Raises
    ------
    Exception
        If the response status code is anything other than 200.
    '''
    response = requests.get(topic_url, timeout=timeout)
    # Anything other than a plain 200 is treated as a failed download.
    if response.status_code != 200:
        raise Exception(f'Failed to load the page {topic_url}')
    # Parse the HTML with Beautiful Soup.
    return BeautifulSoup(response.text, 'html.parser')

In [108]:
def parse_star_count(stars_str):
    '''
    Convert a star-count label such as '857', '1,234' or '32.3k' to an int.

    Parameters
    ----------
    stars_str : str
        The label text; surrounding whitespace and thousands separators
        (commas) are ignored. A trailing 'k' or 'K' multiplies by 1000.

    Returns
    -------
    int
        The star count.

    Raises
    ------
    ValueError
        If the label is empty or is not a number in the supported format.
    '''
    text = stars_str.strip().replace(',', '')
    if not text:
        raise ValueError('Star count is empty')
    if text[-1] in ('k', 'K'):
        # round() rather than int(): float('32.3') * 1000 is
        # 32299.999999999996, which int() would truncate to 32299.
        return round(float(text[:-1]) * 1000)
    return int(text)

In [109]:
def get_repo_info(h3_tag, star_tags ):
    '''
    Extract one repository's details from its heading tag and its star tag.

    The first <a> inside `h3_tag` supplies the username and the second supplies
    the repository name and relative link. `star_tags` is the single tag whose
    text holds the star count for this repository.

    Returns a tuple: (username, repo_name, stars, repo_url).
    '''
    links = h3_tag.find_all('a')
    owner = links[0].text.strip()
    repo_link = links[1]
    name = repo_link.text.strip()
    # hrefs on the page are site-relative, so prefix the GitHub host.
    url = 'https://github.com' + repo_link['href']
    star_total = parse_star_count(star_tags.text.strip())
    return owner, name, star_total, url

In [110]:
def get_topic_repos(topic_doc):
    '''
    Build a DataFrame of the repositories listed on a parsed topic page.

    Repository headings (<h3>) and star counters (<a>) are looked up separately
    and paired by position. The returned frame has the columns
    'User Name', 'Repo Name', 'Stars' and 'Repo URL'.
    '''
    column_names = ('User Name', 'Repo Name', 'Stars', 'Repo URL')

    heading_tags = topic_doc.find_all('h3', {'class' : 'f3 color-text-secondary text-normal lh-condensed'})
    star_tags = topic_doc.find_all('a' , {'class' : 'social-count float-none'})

    # One (username, repo name, stars, url) tuple per heading; the star tag at
    # the same position belongs to that heading.
    repo_rows = [
        get_repo_info(heading_tags[pos], star_tags[pos])
        for pos in range(len(heading_tags))
    ]

    # Turn the row tuples into one list per column.
    columns = {
        name: [row[col_pos] for row in repo_rows]
        for col_pos, name in enumerate(column_names)
    }
    return pd.DataFrame(columns)

In [111]:
def scrape_topic(topic_url, path):
    '''
    Scrape one topic page and save its repositories as a CSV file.

    Parameters
    ----------
    topic_url : str
        URL of the topic page to scrape.
    path : str
        Destination CSV path. If a file already exists there, nothing is
        downloaded and the file is left untouched.

    Returns
    -------
    None
    '''
    if os.path.exists(path):
        print(f'The file {path} already exists, skipping...')
        # Actually skip: without this return the page was downloaded again and
        # the existing file overwritten despite the message.
        return None
    topic_df = get_topic_repos(get_topic_page(topic_url))
    return topic_df.to_csv(path, index=False)

In [112]:
def scrape_topics_repos():
    '''
    Run the whole pipeline: scrape the topic list, then every topic's repos.

    Takes no arguments. The topic list comes from scrape_topics(); a 'data'
    directory is created if it is missing, and scrape_topic() is called once
    per topic with the target path data/<title>.csv. Progress is printed as
    it goes.
    '''
    print('Scraping list of topics')
    topics_df = scrape_topics()
    os.makedirs('data', exist_ok=True)

    for _, topic_row in topics_df.iterrows():
        title = topic_row['title']
        print(f"Scarping top repos for {title}")
        scrape_topic(topic_row['url'], f"data/{title}.csv")

In [113]:
# Run the full pipeline: one CSV per topic is written under the data/ directory.
scrape_topics_repos()

Scraping list of topics
Scarping top repos for 3D
Scarping top repos for Ajax
Scarping top repos for Algorithm
Scarping top repos for Amp
Scarping top repos for Android
Scarping top repos for Angular
Scarping top repos for Ansible
Scarping top repos for API
Scarping top repos for Arduino
Scarping top repos for ASP.NET
Scarping top repos for Atom
Scarping top repos for Awesome Lists
Scarping top repos for Amazon Web Services
Scarping top repos for Azure
Scarping top repos for Babel
Scarping top repos for Bash
Scarping top repos for Bitcoin
Scarping top repos for Bootstrap
Scarping top repos for Bot
Scarping top repos for C
Scarping top repos for Chrome
Scarping top repos for Chrome extension
Scarping top repos for Command line interface
Scarping top repos for Clojure
Scarping top repos for Code quality
Scarping top repos for Code review
Scarping top repos for Compiler
Scarping top repos for Continuous integration
Scarping top repos for COVID-19
Scarping top repos for C++
