### ETL of top-repository data from GitHub

#### A simple web content scraping process using Python

### Steps
### Scrape github.com/topics
#### Get the list of topics; for each topic, get the topic title, topic page URL and topic description
#### For each topic get the first 25 repositories in the topic from the top page
#### For each repo, grab the repo name, username, stars and the repo URL
#### For each topic create a CSV file in this format

#### Repo name, username, stars, repo URL

#### Extract Data

In [38]:
# import libraries
import os

import requests
from bs4 import BeautifulSoup

##### Using requests library to scrape website and download webpages 
##### BeautifulSoup to parse the contents

In [None]:
def get_topics_page():
    """Download the GitHub topics index page and parse it.

    Returns:
        BeautifulSoup: the parsed HTML of https://github.com/topics,
        built with the 'lxml' parser.

    Raises:
        Exception: if the response status code is anything other than 200.
    """
    topics_url = 'https://github.com/topics'
    response = requests.get(topics_url)
    if response.status_code != 200:
        # Report the URL that was actually requested. The message used to
        # format an undefined name, which masked HTTP failures as NameError.
        raise Exception('Failed to load page {}'.format(topics_url))
    return BeautifulSoup(response.text, 'lxml')

In [None]:
#file = get_topics_page()
#file.find('a')

In [57]:
# parse information from the webpage 
# define helper functions to get the list of all topic titles, topic descriptions and topic urls from the topics_page

In [None]:
#to get list of titles
def get_topic_titles(file):
    """Return the text of every topic-title paragraph found in `file`.

    `file` is the parsed topics page; it only needs to offer `find_all`.
    The titles are returned in document order, exactly as stored in each
    tag's `.text` (no whitespace trimming is applied).
    """
    title_class = 'f3 lh-condensed mb-0 mt-1 Link--primary'
    return [p_tag.text for p_tag in file.find_all('p', {'class': title_class})]
    #print(topic_titles)
   


In [73]:
# Needs `file` to hold the parsed topics page; the cell that assigns it
# (`file = get_topics_page()`) is commented out earlier in this notebook.
topics = get_topic_titles(file)

In [74]:
topics[0]

'3D'

In [None]:
# to get list of all descriptions
def get_topic_descs(file):
    """Return the whitespace-stripped text of every topic description in `file`.

    `file` is the parsed topics page; it only needs to offer `find_all`.
    Descriptions are returned in document order.
    """
    paragraphs = file.find_all('p', {'class': 'f5 color-fg-muted mb-0 mt-1'})
    return [paragraph.text.strip() for paragraph in paragraphs]
    #print(topic_descs[:5])
    

In [62]:
# Needs `file` to hold the parsed topics page (see the commented-out
# `file = get_topics_page()` cell earlier in this notebook).
descs = get_topic_descs(file)

In [71]:
descs[0]

'3D refers to the use of three-dimensional graphics, modeling, and animation in various industries.'

In [88]:
# to get list of all urls
def get_topic_url(file):
    """Return the absolute URL of every topic linked from `file`.

    `file` is the parsed topics page; it only needs to offer `find_all`.
    Each matching anchor's `href` is appended to 'https://github.com',
    and the URLs are returned in document order.
    """
    site_root = 'https://github.com'
    link_class = 'no-underline flex-1 d-flex flex-column'
    anchors = file.find_all('a', {'class': link_class})
    return [site_root + anchor['href'] for anchor in anchors]
    #print(topic_urls)

In [68]:
# Needs `file` to hold the parsed topics page (see the commented-out
# `file = get_topics_page()` cell earlier in this notebook).
urls = get_topic_url(file)

In [72]:
urls[0]

'https://github.com/topics/3d'

In [None]:
# scrape topics and create a single pandas dataframe

def scrape_topics():
    """Scrape the GitHub topics page into a single DataFrame.

    Returns:
        pandas.DataFrame: one row per topic with the columns 'title',
        'description' and 'url', filled by get_topic_titles,
        get_topic_descs and get_topic_url respectively.

    Raises:
        Exception: if the topics page does not answer with status 200.
    """
    topics_url = 'https://github.com/topics'
    response = requests.get(topics_url)

    if response.status_code != 200:
        # Format the URL that was requested; the previous message used an
        # undefined name and raised NameError instead of this exception.
        raise Exception('Failed to load page {}'.format(topics_url))
    page = BeautifulSoup(response.text, 'lxml')

    topic_dict = {
        # The helper is named get_topic_titles (plural); the singular
        # spelling used before does not exist in this file.
        'title': get_topic_titles(page),
        'description': get_topic_descs(page),
        'url': get_topic_url(page),
    }

    return pd.DataFrame(topic_dict)

#### Transform

##### To get the top repositories from topics_page

In [None]:
# get topic page
def get_topic_page(topic_url):
    """Download a single topic page and parse it.

    Args:
        topic_url: full URL of the topic page to fetch,
            e.g. 'https://github.com/topics/3d'.

    Returns:
        BeautifulSoup: the parsed HTML of the page ('lxml' parser).

    Raises:
        Exception: if the response status code is anything other than 200.
    """
    # Request the URL passed in by the caller. The body previously used an
    # undefined name here, so every call failed with NameError.
    response = requests.get(topic_url)
    if response.status_code != 200:
        raise Exception('Failed to load page {}'.format(topic_url))

    return BeautifulSoup(response.text, 'lxml')


In [None]:
# Fetch and parse the page of the '3d' topic.
page = get_topic_page('https://github.com/topics/3d')

In [None]:
# get repo details
def get_repo_detail(h3_tag, star_ratings):
    """Extract one repository's details from its heading and star tags.

    Args:
        h3_tag: heading tag whose first <a> holds the username and whose
            second <a> holds the repository name and relative link.
        star_ratings: tag whose text is the repository's star count.

    Returns:
        tuple: (username, repo_name, repo_url, stars), where repo_url is
        the second anchor's href prefixed with 'https://github.com'.

    Raises:
        IndexError: if `h3_tag` contains fewer than two <a> tags.
    """
    # Defined locally: the only other `base_urls` in this file is a local
    # variable of get_topic_url, so it is not visible from here.
    base_url = 'https://github.com'

    a_tags = h3_tag.find_all('a')
    username = a_tags[0].text.strip()
    repo_name = a_tags[1].text.strip()
    repo_url = base_url + a_tags[1]['href']
    # NOTE(review): parse_star_count is not defined in the visible part of
    # this file; it must be provided elsewhere before this is called.
    stars = parse_star_count(star_ratings.text.strip())
    return username, repo_name, repo_url, stars


In [None]:
# NOTE(review): in this file `repo_user_tags` and `star_rating` are only
# assigned inside get_repo_info; they must also exist at module level for
# this call to run - confirm.
get_repo_detail(repo_user_tags[0], star_rating[0])

In [79]:
#import pandas
import pandas as pd

In [85]:
def get_repo_info(topic_page):
    """Build a DataFrame describing every repository listed on a topic page.

    The page is searched for the <h3> headings that carry the username and
    repository link, and for the <span> counters that carry the star count.
    Heading number i is paired with counter number i and handed to
    get_repo_detail; the resulting tuples fill the columns 'username',
    'repo_name', 'repo_url' and 'stars', in that order.
    """
    headings = topic_page.find_all(
        'h3', {'class': 'f3 color-fg-muted text-normal lh-condensed'})
    star_tags = topic_page.find_all(
        'span', {'class': 'Counter js-social-count'})

    # Index into star_tags (rather than zip) so a page with fewer star
    # counters than headings still fails loudly with IndexError.
    details = [
        get_repo_detail(heading, star_tags[position])
        for position, heading in enumerate(headings)
    ]

    column_names = ('username', 'repo_name', 'repo_url', 'stars')
    columns = {
        name: [detail[slot] for detail in details]
        for slot, name in enumerate(column_names)
    }
    return pd.DataFrame(columns)

#### Load data

In [90]:
def scrape_topic(topic_url, topic_name):
    """Scrape one topic page and save its repositories as a CSV file.

    The output file is named `<topic_name>.csv`. If a file with that name
    already exists, a message is printed and nothing is downloaded or
    written.

    Args:
        topic_url: URL of the topic page to scrape.
        topic_name: topic title, used as the CSV file name without extension.

    Returns:
        None.
    """
    fname = topic_name + '.csv'
    if os.path.exists(fname):
        print("This file {} already exist, skipping...".format(fname))
        return
    topic_df = get_repo_info(get_topic_page(topic_url))
    # index=False is the documented boolean spelling of "omit the row index".
    topic_df.to_csv(fname, index=False)

##### Getting everything together

In [143]:
# to scrape all the top repos for the topics on first page
def scrape_topics_repos():
    """Scrape every topic on the first topics page, then each topic's repos.

    Calls scrape_topics to get the table of topics, then calls scrape_topic
    once per row with that row's 'url' and 'title', printing a progress
    line before each topic.
    """
    print('Scraping topics from github')
    all_topics = scrape_topics()
    for _, topic in all_topics.iterrows():
        title = topic['title']
        print('Scraping github for top repos "{}"'.format(title))
        scrape_topic(topic['url'], title)
    

In [None]:
#scrape_topics_repos()

In [None]:
#pd.read_csv('data/Android.csv')

In [None]:
# to scrape / write an image file

In [None]:
#img_tag = file.find_all('img') 
#img_url = img_tag[0]['src']


In [None]:
#r = requests.get(img_url)
# ( r = requests.get('https://repository-images.githubusercontent.com/123456789') )


In [None]:
#with open('image.jpg', 'wb') as f:
#    f.write(r.content)
