# Web Scraping for Top Repos in GitHub Topics

 ### Information
 
 Web scraping is the process of extracting and parsing data from websites in an automated fashion using a computer program. It's a useful technique for creating datasets for research and learning.

### Project outline:

- We're going to scrape https://github.com/topics
- We'll get a list of topics. For each topic, we'll get topic title, topic page URL and topic description
- For each topic, we'll get the top 25 repositories in the topic from the topic page
- For each repository, we'll grab the repo name, username, stars and repo URL 
- After the data is collected, each topic's repositories are stored in the `data` folder as a CSV file named after the topic title (`<title>.csv`)

### Installing  required libraries

In [None]:
!pip install requests --quiet

In [None]:
!pip install beautifulsoup4 --quiet

In [None]:
!pip install pandas --quiet

### importing required libraries

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

### Using the requests library to download the web page

In [None]:
def scrap_topic_url(url, timeout=30):
    """Download a web page and return its HTML as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        The body of the response as text.

    Raises:
        Exception: If the server answers with a status code other than 200.
            ``requests.get`` may additionally raise its own exceptions,
            for example when the timeout expires.
    """
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        # Name the URL and the status code rather than the Response object.
        raise Exception(
            f"Failed to load web page {url} "
            f"(status code {response.status_code})"
        )
    return response.text

### Using BeautifulSoup to parse and extract information from the web page

In [None]:
def topic_tags_info(doc):
    """Find the tags that describe each topic on the topics page.

    Args:
        doc: Parsed topics page; only its ``find_all`` method is used.

    Returns:
        A 3-tuple ``(title tags, description tags, link tags)``: the ``<p>``
        tags matched by the title class, the ``<p>`` tags matched by the
        description class and the ``<a>`` tags matched by the link class.
    """
    # (tag name, CSS class string) pairs, listed in the order they are returned.
    lookups = (
        ('p', 'f3 lh-condensed mb-0 mt-1 Link--primary'),
        ('p', 'f5 color-fg-muted mb-0 mt-1'),
        ('a', 'no-underline flex-1 d-flex flex-column'),
    )
    title_tags, description_tags, link_tags = (
        doc.find_all(tag_name, {'class': css_class})
        for tag_name, css_class in lookups
    )
    return title_tags, description_tags, link_tags

### Scraping topic information (Title, Description, URL)

In [None]:
def topic_info(topic_p_tags, topic_desc, topic_link_tags):
    """Extract title, description and URL for every topic.

    Args:
        topic_p_tags: Tags whose ``.text`` is the topic title.
        topic_desc: Tags whose ``.text`` is the topic description.
        topic_link_tags: Tags whose ``href`` attribute is the topic page path,
            relative to https://github.com.

    Returns:
        A dict with the keys ``'Title'``, ``'Description'`` and ``'URL'``,
        each mapping to a list built from the corresponding tags.
    """
    base_url = 'https://github.com'

    # Titles are stripped just like descriptions: scrap_repo_info uses the
    # title as a file name, so stray whitespace or newlines from the markup
    # must not leak into it.
    topic_titles = [tag.text.strip() for tag in topic_p_tags]
    topic_description = [tag.text.strip() for tag in topic_desc]
    topic_url = [base_url + tag['href'] for tag in topic_link_tags]

    return {
        'Title': topic_titles,
        'Description': topic_description,
        'URL': topic_url,
    }

### Creating csv file with extracted information using pandas

In [None]:
def create_topic_csv(topic_information):
    """Save the topic information as ``topics.csv`` in the working directory.

    Args:
        topic_information: Column data handed to ``pd.DataFrame``.
    """
    # index=None keeps the row index out of the CSV file.
    pd.DataFrame(topic_information).to_csv('topics.csv', index=None)
    print("Created topics.csv file")

## Getting information out of topic page
- Open each topic's page on GitHub
- Collect the username, repo_name, repo_url and stars of the top repos in each topic

### Checking topics pages 

In [None]:
def get_topic_page(list_topic_url, timeout=30):
    """Download a topic page and parse it with BeautifulSoup.

    Args:
        list_topic_url: URL of the topic page to download.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        The page parsed by ``BeautifulSoup`` with the ``'html.parser'`` backend.

    Raises:
        Exception: If the server answers with a status code other than 200.
            ``requests.get`` may additionally raise its own exceptions,
            for example when the timeout expires.
    """
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(list_topic_url, timeout=timeout)

    if response.status_code != 200:
        # This runs once per topic, so say which page failed and how.
        raise Exception(
            f'Failed to load web page {list_topic_url} '
            f'(status code {response.status_code})'
        )

    return BeautifulSoup(response.text, 'html.parser')


### getting information about the tags needed to extract the information
[![rHq2Hx.png](https://i.im.ge/2022/06/11/rHq2Hx.png)](https://im.ge/i/rHq2Hx)

In [None]:
def get_tags(get_topic, i):
    """Find the repository heading tags and the star counter tags on a topic page.

    Args:
        get_topic: Parsed topic page; only its ``find_all`` method is used.
        i: Accepted for the caller's convenience; not used by this function.

    Returns:
        A 2-tuple ``(h3 tags, star span tags)``.
    """
    # The <h3> headings hold the anchors for username, repo name and repo URL.
    repo_headings = get_topic.find_all(
        'h3', {'class': 'f3 color-fg-muted text-normal lh-condensed'}
    )

    # The star count is read from <span> tags selected by their id.
    star_spans = get_topic.find_all('span', {'id': 'repo-stars-counter-star'})

    return repo_headings, star_spans

### Collecting each repo information

In [None]:
def get_repo_info(h3_anchor_tags, star_tag, i):
    """Extract the details of one repository.

    Args:
        h3_anchor_tags: Anchor tags of one repository heading; index 0 is the
            owner link and index 1 is the repository link.
        star_tag: Sequence of star counter tags for the whole page.
        i: Position of this repository's entry in ``star_tag``.

    Returns:
        A tuple ``(username, repo_name, repo_url, star_count)`` where
        ``star_count`` is an ``int``.

    Raises:
        IndexError: If fewer than two anchors are given or ``i`` is out of
            range for ``star_tag``.
        ValueError: If the star title is not a number once commas are removed.
    """
    owner_anchor, repo_anchor = h3_anchor_tags[0], h3_anchor_tags[1]

    username = owner_anchor.text.strip()
    repo_name = repo_anchor.text.strip()
    repo_url = 'https://github.com' + repo_anchor['href']

    # The title may contain comma thousands separators (e.g. "1,234"), which
    # int() does not accept, so drop them before converting.
    star_count = int(star_tag[i]['title'].replace(',', ''))

    return username, repo_name, repo_url, star_count

### scraping topics
- Scrape the repo info from each topic URL
- Save the information as `<title>.csv`
- Store each file in the `data` folder

In [None]:
def scrap_repo_info(dict_topics):
    """Scrape the repositories of every topic and save one CSV file per topic.

    For each topic the page is downloaded with ``get_topic_page``, the
    repository tags are located with ``get_tags`` and each repository is
    decoded with ``get_repo_info``. The result is written to
    ``data/<title>.csv`` with the columns ``username``, ``repo_name``,
    ``repo_url`` and ``star_count``.

    Topics whose CSV file already exists are skipped before any network
    request is made.

    Args:
        dict_topics: Mapping with the parallel lists ``'Title'`` and ``'URL'``.
    """
    print('Scriping all topics')

    # The output directory is needed by every topic, so create it once up front.
    os.makedirs('data', exist_ok=True)

    topics = zip(dict_topics['Title'], dict_topics['URL'])
    for index, (title, topic_url) in enumerate(topics):
        fname = os.path.join('data', title + '.csv')

        # Check before downloading so an existing file costs no request.
        if os.path.exists(fname):
            print(f'file {title}.csv already exist. skipping..')
            continue

        print(f"scraping top repo for {title}")
        topic_page = get_topic_page(topic_url)
        h3_tags, star_values = get_tags(topic_page, index)

        # One (username, repo_name, repo_url, star_count) tuple per repository;
        # the position in h3_tags selects the matching star counter.
        repo_rows = [
            get_repo_info(h3_tag.find_all('a'), star_values, position)
            for position, h3_tag in enumerate(h3_tags)
        ]

        topic_df = pd.DataFrame(
            repo_rows,
            columns=['username', 'repo_name', 'repo_url', 'star_count'],
        )
        topic_df.to_csv(fname, index=False)


### combining all functions together

In [None]:
def scrap(url):
    """Run the complete scraping pipeline starting from the page at *url*.

    Downloads the page, parses it, extracts the topic information, writes it
    with ``create_topic_csv`` and then scrapes every topic's repositories
    with ``scrap_repo_info``.
    """
    page_html = scrap_topic_url(url)
    parsed_page = BeautifulSoup(page_html, 'html.parser')

    title_tags, description_tags, link_tags = topic_tags_info(parsed_page)
    topics = topic_info(title_tags, description_tags, link_tags)

    create_topic_csv(topics)
    scrap_repo_info(topics)

In [None]:
# GitHub topics index page; the starting point handed to scrap().
url = "https://github.com/topics"

In [None]:
# Run the full pipeline: writes topics.csv and one CSV per topic under data/.
scrap(url)