### Use the requests library to download webpages

In [1]:
import requests

In [2]:
topics_url = 'https://github.com/topics'

In [3]:
response = requests.get(topics_url)

In [4]:
response.status_code  #Indicates whether response was successful or not

200

In [5]:
len(response.text)

166232

In [6]:
page_contents = response.text
page_contents[:1000]

'\n\n<!DOCTYPE html>\n<html lang="en" data-color-mode="auto" data-light-theme="light" data-dark-theme="dark"  data-a11y-animated-images="system" data-a11y-link-underlines="true">\n\n\n  <head>\n    <meta charset="utf-8">\n  <link rel="dns-prefetch" href="https://github.githubassets.com">\n  <link rel="dns-prefetch" href="https://avatars.githubusercontent.com">\n  <link rel="dns-prefetch" href="https://github-cloud.s3.amazonaws.com">\n  <link rel="dns-prefetch" href="https://user-images.githubusercontent.com/">\n  <link rel="preconnect" href="https://github.githubassets.com" crossorigin>\n  <link rel="preconnect" href="https://avatars.githubusercontent.com">\n\n  \n\n  <link crossorigin="anonymous" media="all" rel="stylesheet" href="https://github.githubassets.com/assets/light-b92e9647318f.css" /><link crossorigin="anonymous" media="all" rel="stylesheet" href="https://github.githubassets.com/assets/dark-5d486a4ede8e.css" /><link data-color-theme="dark_dimmed" crossorigin="anonymous" med

### Use Beautiful Soup to parse and extract information

In [7]:
from bs4 import BeautifulSoup
doc = BeautifulSoup(page_contents,'html.parser')

In [8]:
selection_class = 'f3 lh-condensed mb-0 mt-1 Link--primary'
topic_title_tags = doc.find_all('p',{'class':selection_class})
topic_title_tags

[<p class="f3 lh-condensed mb-0 mt-1 Link--primary">3D</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Ajax</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Algorithm</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Amp</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Android</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Angular</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Ansible</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">API</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Arduino</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">ASP.NET</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Atom</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Awesome Lists</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Amazon Web Services</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Azure</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Babel</p>,
 <p class="f3 lh-condensed m

In [9]:
topic_title_tags[:5]

[<p class="f3 lh-condensed mb-0 mt-1 Link--primary">3D</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Ajax</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Algorithm</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Amp</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Android</p>]

In [10]:
selection_class = 'f3 lh-condensed mb-0 mt-1 Link--primary'
topic_title_tags = doc.find_all('p',{'class':selection_class})
desc_selector = 'f5 color-fg-muted mb-0 mt-1'
topic_desc_tags = doc.find_all('p',{'class': desc_selector})

In [11]:
topic_desc_tags[:5]

[<p class="f5 color-fg-muted mb-0 mt-1">
           3D refers to the use of three-dimensional graphics, modeling, and animation in various industries.
         </p>,
 <p class="f5 color-fg-muted mb-0 mt-1">
           Ajax is a technique for creating interactive web applications.
         </p>,
 <p class="f5 color-fg-muted mb-0 mt-1">
           Algorithms are self-contained sequences that carry out a variety of tasks.
         </p>,
 <p class="f5 color-fg-muted mb-0 mt-1">
           Amp is a non-blocking concurrency library for PHP.
         </p>,
 <p class="f5 color-fg-muted mb-0 mt-1">
           Android is an operating system built by Google designed for mobile devices.
         </p>]

In [12]:

topic_link_tags = doc.find_all('a',{'class':'no-underline flex-grow-0'})
len(topic_link_tags)

30

In [13]:
topic0_url = "https://github.com/" + topic_link_tags[0]['href']
topic0_url

'https://github.com//topics/3d'

In [14]:
topic_titles = [tag.text for tag in topic_title_tags]
topic_titles

['3D',
 'Ajax',
 'Algorithm',
 'Amp',
 'Android',
 'Angular',
 'Ansible',
 'API',
 'Arduino',
 'ASP.NET',
 'Atom',
 'Awesome Lists',
 'Amazon Web Services',
 'Azure',
 'Babel',
 'Bash',
 'Bitcoin',
 'Bootstrap',
 'Bot',
 'C',
 'Chrome',
 'Chrome extension',
 'Command line interface',
 'Clojure',
 'Code quality',
 'Code review',
 'Compiler',
 'Continuous integration',
 'COVID-19',
 'C++']

In [15]:
topic_descs = [tag.text.strip() for tag in topic_desc_tags ]
topic_descs[:5]

['3D refers to the use of three-dimensional graphics, modeling, and animation in various industries.',
 'Ajax is a technique for creating interactive web applications.',
 'Algorithms are self-contained sequences that carry out a variety of tasks.',
 'Amp is a non-blocking concurrency library for PHP.',
 'Android is an operating system built by Google designed for mobile devices.']

In [16]:
base_url = 'https://github.com/'
topic_url = [base_url+tag['href'] for tag in topic_link_tags]
topics_url

'https://github.com/topics'

In [17]:
import pandas as pd
topics_dict = {'title':topic_titles,'description':topic_descs,'url':topic_url}

In [18]:
topics_df = pd.DataFrame(topics_dict)
topics_df

Unnamed: 0,title,description,url
0,3D,3D refers to the use of three-dimensional grap...,https://github.com//topics/3d
1,Ajax,Ajax is a technique for creating interactive w...,https://github.com//topics/ajax
2,Algorithm,Algorithms are self-contained sequences that c...,https://github.com//topics/algorithm
3,Amp,Amp is a non-blocking concurrency library for ...,https://github.com//topics/amphp
4,Android,Android is an operating system built by Google...,https://github.com//topics/android
5,Angular,Angular is an open source web application plat...,https://github.com//topics/angular
6,Ansible,Ansible is a simple and powerful automation en...,https://github.com//topics/ansible
7,API,An API (Application Programming Interface) is ...,https://github.com//topics/api
8,Arduino,Arduino is an open source platform for buildin...,https://github.com//topics/arduino
9,ASP.NET,ASP.NET is a web framework for building modern...,https://github.com//topics/aspnet


## Create CSV files with extracted information

In [19]:
topics_df.to_csv('topics.csv',index = None)

## Getting information out of a topic page

In [20]:
topic_page_url = topic_url[0]
topic_page_url

'https://github.com//topics/3d'

In [21]:
response = requests.get(topic_page_url)

In [22]:
topic_doc = BeautifulSoup(response.text,'html.parser')

In [23]:
h3_selection_class = 'f3 color-fg-muted text-normal lh-condensed'
repo_tags = topic_doc.find_all('h3',{'class': h3_selection_class})

In [24]:
a_tags = repo_tags[0].find_all('a')
a_tags[0].text.strip()

'mrdoob'

In [25]:
a_tags[1].text.strip()

'three.js'

In [26]:
base_url = 'https://github.com/'
repo_url = base_url + a_tags[1]['href']
repo_url

'https://github.com//mrdoob/three.js'

In [27]:

star_tags = topic_doc.find_all('span',{'class':'Counter js-social-count'})
star_tags[0].text

'95.1k'

In [28]:
def parse_star_count(stars_str):
    stars_str = stars_str.strip()
    if stars_str[-1] == 'k':
        return int(float(stars_str[:-1])*1000)
parse_star_count(star_tags[0].text.strip())

95100

In [29]:
def get_repo_info(h3_tag,star_tag):
    #returns all info for repo 
    a_tags = h3_tag.find_all('a')
    username = a_tags[0].text.strip()
    repo_name = a_tags[1].text.strip() 
    repo_url = base_url + a_tags[1]['href']
    stars = parse_star_count(star_tag.text.strip())
    return username,repo_name,stars,repo_url

In [30]:
get_repo_info(repo_tags[0],star_tags[0])

('mrdoob', 'three.js', 95100, 'https://github.com//mrdoob/three.js')

In [31]:
topic_repos_dict = {'username':[],'repo_name':[],'stars':[],'repo_url':[]}
for i in range(len(repo_tags)):
    repo_info = get_repo_info(repo_tags[i],star_tags[i])
    topic_repos_dict['username'].append(repo_info[0])
    topic_repos_dict['repo_name'].append(repo_info[1])
    topic_repos_dict['stars'].append(repo_info[2])
    topic_repos_dict['repo_url'].append(repo_info[3])

# Final Code

In [59]:
import os

def get_topic_page(topic_url):
    #Download the page
    response = requests.get(topic_url)
    if response.status_code != 200:
        raise Exception('Failed to load page{}'.format(topic_url))
    #Parse using bs
    topic_doc = BeautifulSoup(response.text,'html.parser')
    return topic_doc

def get_repo_info(h3_tag,star_tag):
    #returns all info for repo 
    a_tags = h3_tag.find_all('a')
    username = a_tags[0].text.strip()
    repo_name = a_tags[1].text.strip() 
    repo_url = base_url + a_tags[1]['href']
    stars = parse_star_count(star_tag.text.strip())
    return username,repo_name,stars,repo_url

def get_topic_repos(topic_doc):
    
    #Get h3 tags containing repo url,title,username
    h3_selection_class = 'f3 color-fg-muted text-normal lh-condensed'
    repo_tags = topic_doc.find_all('h3',{'class': h3_selection_class})
    #Get star tags,like
    star_tags = topic_doc.find_all('span',{'class':'Counter js-social-count'})
    topic_repos_dict = {'username':[],'repo_name':[],'stars':[],'repo_url':[]}
    #Get repo info
    for i in range(len(repo_tags)):
        repo_info = get_repo_info(repo_tags[i],star_tags[i])
        topic_repos_dict['username'].append(repo_info[0])
        topic_repos_dict['repo_name'].append(repo_info[1])
        topic_repos_dict['stars'].append(repo_info[2])
        topic_repos_dict['repo_url'].append(repo_info[3])
    return pd.DataFrame(topic_repos_dict,index=None)

def scrape_topic(topic_url,path):
    # fname = topic_name + '.csv'
    if os.path.exists(path):
        print("The file {} already exists.Skipping....".format(path))
        return
    topics_df = get_topic_repos(get_topic_page(topic_url))
    topics_df.to_csv(path,index = None)

In [33]:
url4 = topic_url[4]

In [34]:
topic4_doc = get_topic_page(url4)
topic4_repos = get_topic_repos(topic4_doc)
topic4_repos

Unnamed: 0,username,repo_name,stars,repo_url
0,flutter,flutter,157000,https://github.com//flutter/flutter
1,facebook,react-native,112000,https://github.com//facebook/react-native
2,justjavac,free-programming-books-zh_CN,105000,https://github.com//justjavac/free-programming...
3,Genymobile,scrcpy,92500,https://github.com//Genymobile/scrcpy
4,Hack-with-Github,Awesome-Hacking,70100,https://github.com//Hack-with-Github/Awesome-H...
5,Solido,awesome-flutter,48800,https://github.com//Solido/awesome-flutter
6,google,material-design-icons,48800,https://github.com//google/material-design-icons
7,wasabeef,awesome-android-ui,47600,https://github.com//wasabeef/awesome-android-ui
8,square,okhttp,44600,https://github.com//square/okhttp
9,android,architecture-samples,43200,https://github.com//android/architecture-samples


In [35]:
get_topic_repos(get_topic_page(topic_url[5]))

Unnamed: 0,username,repo_name,stars,repo_url
0,justjavac,free-programming-books-zh_CN,105000,https://github.com//justjavac/free-programming...
1,angular,angular,90800,https://github.com//angular/angular
2,storybookjs,storybook,80700,https://github.com//storybookjs/storybook
3,leonardomso,33-js-concepts,59300,https://github.com//leonardomso/33-js-concepts
4,ionic-team,ionic-framework,49500,https://github.com//ionic-team/ionic-framework
5,prettier,prettier,46800,https://github.com//prettier/prettier
6,Asabeneh,30-Days-Of-JavaScript,38900,https://github.com//Asabeneh/30-Days-Of-JavaSc...
7,SheetJS,sheetjs,33700,https://github.com//SheetJS/sheetjs
8,angular,angular-cli,26200,https://github.com//angular/angular-cli
9,angular,components,23700,https://github.com//angular/components


# Write a single function to :

- 1.Get the list of topics from the topics page
- 2.Get the list of top repos from the individual topic pages
- 3.For each topic, create a CSV of the top repos for the topic

In [60]:
def get_topic_titles(doc):
    selection_class = 'f3 lh-condensed mb-0 mt-1 Link--primary'
    topic_title_tags = doc.find_all('p',{'class':selection_class})
    topic_titles = [tag.text for tag in topic_title_tags]
    return topic_titles
    
def get_topic_desc(doc):
    desc_selector = 'f5 color-fg-muted mb-0 mt-1'
    topic_desc_tags = doc.find_all('p',{'class': desc_selector})
    topic_descs = [tag.text.strip() for tag in topic_desc_tags ]
    return topic_descs

def get_topic_url(doc):
    topic_link_tags = doc.find_all('a',{'class':'no-underline flex-grow-0'})
    topic_url = [base_url+tag['href'] for tag in topic_link_tags]
    return topic_url

def scrape_topics():
    topics_url = 'https://github.com/topics'
    response = requests.get(topics_url)
    if response.status_code != 200:
        raise Exception('Failed to load page{}'.format(topics_url))
    topics_dict = {
        'title': get_topic_titles(doc),'description': get_topic_desc(doc),
        'url':get_topic_url(doc)
    }
    return pd.DataFrame(topics_dict)

In [61]:
def scrape_topics_repos():
    print("Scraping list of topics")
    topics_df = scrape_topics()

    os.makedirs('data',exist_ok= True)
    for index,row in topics_df.iterrows():
        print("Scraping top repositories for {}".format(row['title']))
        scrape_topic(row['url'],'data/{}.csv'.format(row['title']) )

In [62]:
scrape_topics_repos()

Scraping list of topics
Scraping top repositories for 3D
Scraping top repositories for Ajax
Scraping top repositories for Algorithm
Scraping top repositories for Amp
Scraping top repositories for Android
Scraping top repositories for Angular
Scraping top repositories for Ansible
Scraping top repositories for API
Scraping top repositories for Arduino
Scraping top repositories for ASP.NET
Scraping top repositories for Atom
Scraping top repositories for Awesome Lists
Scraping top repositories for Amazon Web Services
Scraping top repositories for Azure
Scraping top repositories for Babel
Scraping top repositories for Bash
Scraping top repositories for Bitcoin
Scraping top repositories for Bootstrap
Scraping top repositories for Bot
Scraping top repositories for C
Scraping top repositories for Chrome
Scraping top repositories for Chrome extension
Scraping top repositories for Command line interface
Scraping top repositories for Clojure
Scraping top repositories for Code quality
Scraping top