# Web Scraping GitHub Topics

In [28]:
import os
import requests
import pandas as pd

In [11]:
# Starting point of the crawl: the GitHub topics listing page.
# The URL is printed so it shows up in the cell output.
topic_url = 'https://github.com/topics'
print(topic_url)

https://github.com/topics


In [12]:
# Download the topics listing page. The bare expression on the last line
# makes the notebook display the HTTP status code as the cell result.
response = requests.get(topic_url)
response.status_code

200

In [14]:
# Keep the response body as text and preview only its first 1000
# characters so the cell output stays readable.
page_contents = response.text
page_contents[:1000]

'\n\n<!DOCTYPE html>\n<html lang="en" data-color-mode="auto" data-light-theme="light" data-dark-theme="dark"  data-a11y-animated-images="system" data-a11y-link-underlines="true">\n\n\n\n  <head>\n    <meta charset="utf-8">\n  <link rel="dns-prefetch" href="https://github.githubassets.com">\n  <link rel="dns-prefetch" href="https://avatars.githubusercontent.com">\n  <link rel="dns-prefetch" href="https://github-cloud.s3.amazonaws.com">\n  <link rel="dns-prefetch" href="https://user-images.githubusercontent.com/">\n  <link rel="preconnect" href="https://github.githubassets.com" crossorigin>\n  <link rel="preconnect" href="https://avatars.githubusercontent.com">\n\n  \n\n  <link crossorigin="anonymous" media="all" rel="stylesheet" href="https://github.githubassets.com/assets/light-b92e9647318f.css" /><link crossorigin="anonymous" media="all" rel="stylesheet" href="https://github.githubassets.com/assets/dark-5d486a4ede8e.css" /><link data-color-theme="dark_dimmed" crossorigin="anonymous" m

In [15]:
# Save the downloaded HTML to 'webpage.html' (UTF-8), relative to the
# current working directory, overwriting any existing file of that name.
with open('webpage.html','w',encoding = 'utf-8') as f:
    f.write(page_contents)

In [16]:
from bs4 import BeautifulSoup

In [17]:
# Parse the downloaded HTML with the 'html.parser' backend; `doc` is the
# parsed topics page that the exploratory cells below search.
doc = BeautifulSoup(page_contents,'html.parser')

In [18]:
# Collect every <p> tag in the page.
# NOTE(review): a later cell reassigns `topic_title_tags` with a
# class-filtered search, so this unfiltered result is exploratory only.
topic_title_tags = doc.find_all('p')

In [19]:
# Narrow the search to <p> tags matching the class string below; the same
# class string is what get_topic_titles uses to find topic titles.
select_class="f3 lh-condensed mb-0 mt-1 Link--primary"
topic_title_tags = doc.find_all('p',{'class': select_class})

In [21]:
# Find the <p> tags matching the class string below; the same class string
# is what get_topic_descs uses to find topic descriptions.
desc_selector = 'f5 color-fg-muted mb-0 mt-1'
topic_desc_tags = doc.find_all('p',{'class': desc_selector})

In [22]:
# Find the <a> tags matching the class string below; their 'href'
# attributes are used in the next cell to build topic page URLs.
topic_link_tags = doc.find_all('a' ,{'class':'no-underline flex-1 d-flex flex-column'})

In [23]:
# Build an absolute URL for the first topic by prefixing the GitHub host
# to the link's 'href' value, then print it as a sanity check.
topic0_url = "https://github.com" + topic_link_tags[0]['href']
print(topic0_url)

https://github.com/topics/3d


In [29]:
def get_topic_page(topic_url, timeout=30):
    """Download a page and return it as a parsed BeautifulSoup document.

    Args:
        topic_url: URL of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, `requests.get` can block indefinitely on a stalled
            connection, which would freeze the whole scraping loop.

    Returns:
        A BeautifulSoup document built from the response text with the
        'html.parser' backend.

    Raises:
        Exception: If the response status code is not 200.
        requests.exceptions.RequestException: Propagated from `requests`
            on connection problems or when the timeout expires.
    """
    # Download the page, bounded by the timeout.
    response = requests.get(topic_url, timeout=timeout)
    # Anything other than HTTP 200 is treated as a failed download.
    if response.status_code != 200:
        raise Exception('Faild to load page {}'.format(topic_url))
    # Parse the body and hand the document back to the caller.
    return BeautifulSoup(response.text, 'html.parser')

def get_repo_info(h3_tag, base_url='https://github.com'):
    """Extract the owner, repository name and repository URL from a heading.

    Args:
        h3_tag: Element exposing `find_all('a')`. The first returned link is
            read as the owner and the second as the repository.
        base_url: Prefix prepended to the repository link's 'href'
            (presumably a site-relative path - verify against the page
            markup). It is a defaulted parameter because the function
            previously read a module-level `base_url` that is not defined
            in the visible notebook cells, which raises NameError on a
            fresh kernel.

    Returns:
        A `(username, repo_name, repo_url)` tuple of strings.

    Raises:
        IndexError: If the heading contains fewer than two links.
    """
    a_tags = h3_tag.find_all('a')
    owner_link = a_tags[0]
    repo_link = a_tags[1]
    username = owner_link.text.strip()
    repo_name = repo_link.text.strip()
    repo_url = base_url + repo_link['href']
    return username, repo_name, repo_url


def get_topic_repos(topic_doc):
    """Build a table of the repositories listed on a parsed topic page.

    Args:
        topic_doc: Parsed topic page exposing `find_all`.

    Returns:
        A pandas DataFrame with the columns 'username', 'repo_name' and
        'repo_url', one row per matching <h3> heading, in page order.
    """
    # Repository headings are the <h3> tags carrying this class string.
    heading_class = 'f3 color-fg-muted text-normal lh-condensed'
    headings = topic_doc.find_all('h3', {'class': heading_class})

    usernames = []
    repo_names = []
    repo_urls = []
    # Split each heading into its three fields, one column list per field.
    for heading in headings:
        username, repo_name, repo_url = get_repo_info(heading)
        usernames.append(username)
        repo_names.append(repo_name)
        repo_urls.append(repo_url)

    return pd.DataFrame({
        'username': usernames,
        'repo_name': repo_names,
        'repo_url': repo_urls,
    })

def scrape_topic(topic_url , path):
    """Scrape one topic page into a CSV file unless that file already exists.

    Args:
        topic_url: URL of the topic page to download.
        path: Destination CSV path. If something already exists there, a
            message is printed and nothing is downloaded.

    Returns:
        None.
    """
    if not os.path.exists(path):
        # Download, parse and tabulate the page, then write it without
        # the DataFrame index.
        parsed_page = get_topic_page(topic_url)
        repos_table = get_topic_repos(parsed_page)
        repos_table.to_csv(path, index=None)
        return
    print('The file {} already exists.skipping ...'.format(path))

In [30]:
def get_topic_titles(doc):
    """Return the text of every topic-title paragraph in `doc`.

    Args:
        doc: Parsed page exposing `find_all`.

    Returns:
        List of the raw `.text` values (not stripped) of the matching <p>
        tags, in the order `find_all` returns them.
    """
    title_class = "f3 lh-condensed mb-0 mt-1 Link--primary"
    return [title_tag.text for title_tag in doc.find_all('p', {'class': title_class})]


def get_topic_descs(doc):
    """Return the whitespace-stripped text of every topic description in `doc`.

    Args:
        doc: Parsed page exposing `find_all`.

    Returns:
        List of stripped `.text` values of the matching <p> tags, in the
        order `find_all` returns them.
    """
    description_class = 'f5 color-fg-muted mb-0 mt-1'
    matches = doc.find_all('p', {'class': description_class})
    return [desc_tag.text.strip() for desc_tag in matches]

def get_topic_urls(doc):
    """Return the absolute URL of every topic link found in `doc`.

    Args:
        doc: Parsed page exposing `find_all`.

    Returns:
        List of strings, each the GitHub host followed by the 'href' value
        of a matching <a> tag, in the order `find_all` returns them.
    """
    link_class = 'no-underline flex-1 d-flex flex-column'
    github_root = 'https://github.com'
    return [github_root + link['href'] for link in doc.find_all('a', {'class': link_class})]

def scrape_topics():
    """Download the GitHub topics listing and tabulate the topics on it.

    Returns:
        A pandas DataFrame with the columns 'title', 'description' and
        'url', built from the freshly downloaded page.

    Raises:
        Exception: If the topics page does not return HTTP 200 (raised by
            get_topic_page).
        ValueError: Raised by pandas if the three extracted lists differ
            in length.
    """
    topic_url = 'https://github.com/topics'
    # Parse the page downloaded by this call. The function previously
    # fetched the page but then read the notebook-global `doc`, so it
    # returned stale data, or raised NameError on a fresh kernel.
    topics_doc = get_topic_page(topic_url)
    topics_dict = {
        'title' : get_topic_titles(topics_doc),
        'description' : get_topic_descs(topics_doc),
        'url' : get_topic_urls(topics_doc)
    }
    return pd.DataFrame(topics_dict)

In [31]:
def scrape_topics_repos():
    """Scrape the topic list, then write each topic's repositories to a CSV.

    One file per topic is written to 'data/<title>.csv'. The 'data'
    directory is created if missing, and scrape_topic skips any topic
    whose file already exists.
    """
    print('scraping list of topics')
    topic_table = scrape_topics()
    os.makedirs('data', exist_ok=True)
    # Walk titles and URLs together; each pair is one row of the table.
    for title, url in zip(topic_table['title'], topic_table['url']):
        print('scraping top repositories for "{}"'.format(title))
        csv_path = 'data/{}.csv'.format(title)
        scrape_topic(url, csv_path)
        

In [32]:
# Run the full pipeline: scrape the topic list, then each topic's
# repositories. Topics whose CSV already exists under 'data/' are skipped.
scrape_topics_repos()

scraping list of topics
scraping top repositories for "3D"
The file data/3D.csv already exists.skipping ...
scraping top repositories for "Ajax"
The file data/Ajax.csv already exists.skipping ...
scraping top repositories for "Algorithm"
The file data/Algorithm.csv already exists.skipping ...
scraping top repositories for "Amp"
The file data/Amp.csv already exists.skipping ...
scraping top repositories for "Android"
The file data/Android.csv already exists.skipping ...
scraping top repositories for "Angular"
The file data/Angular.csv already exists.skipping ...
scraping top repositories for "Ansible"
The file data/Ansible.csv already exists.skipping ...
scraping top repositories for "API"
The file data/API.csv already exists.skipping ...
scraping top repositories for "Arduino"
The file data/Arduino.csv already exists.skipping ...
scraping top repositories for "ASP.NET"
The file data/ASP.NET.csv already exists.skipping ...
scraping top repositories for "Atom"
The file data/Atom.csv alre