# Scraping Top Repositories For Trending Github Topics

In [1]:
# Importing required libraries
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

## Project Outline

- In this project we are going to scrape https://github.com/topics
- We are going to extract the trending GitHub topics, and for each topic we will extract the topic title, topic description and topic URL
- For each topic we will extract the top 20 repositories
- For each repository, we'll grab the repository name, its owner's username, the number of stars the repository received, and the repository URL
- For each topic we'll create a CSV file.

In [2]:
# Root of the site; relative hrefs scraped from the pages are appended to it.
# (No surrounding whitespace: the value is concatenated into URLs verbatim.)
base_url = 'https://github.com'
# Listing page that enumerates the topics.
main_url = 'https://github.com/topics'

In [3]:
def get_all_topics(main_url, num_pages=6):
    """Scrape the paginated topics listing and collect every topic found.

    Args:
        main_url: URL of the topics listing. Each page is requested by
            adding a ``page`` query parameter to it.
        num_pages: How many listing pages to fetch, starting at page 1.
            Defaults to 6, the page count this notebook was written against.

    Returns:
        A tuple ``(topic_titles, topic_description, topic_urls)`` of three
        lists, filled in page order:
        1. topic_titles      - text of each topic title tag
        2. topic_description - stripped text of each topic description tag
        3. topic_urls        - ``base_url`` joined with each topic link's href

    Raises:
        Exception: if any page responds with a status code other than 200.
    """
    topic_titles = []
    topic_description = []
    topic_urls = []

    # Guard against stray whitespace in the configured base URL so it never
    # leaks into the collected links.
    root_url = base_url.strip()

    for page in range(1, num_pages + 1):
        # Sending the GET request to the topics page; the timeout keeps a
        # stalled connection from blocking the scrape indefinitely.
        response = requests.get(main_url, params={'page': page}, timeout=30)

        # If the GET request was not successful, stop rather than return
        # partial data.
        if response.status_code != 200:
            raise Exception(f'Content did not get loaded for page no. {page}')

        # doc holds the parsed page from which the information is extracted
        doc = BeautifulSoup(response.text, 'html.parser')

        # Title of each topic
        topic_title_tags = doc.find_all('p', {'class': 'f3 lh-condensed mb-0 mt-1 Link--primary'})
        topic_titles.extend(tag.text for tag in topic_title_tags)

        # Description of each topic
        topic_desc_tags = doc.find_all('p', {'class': 'f5 color-fg-muted mb-0 mt-1'})
        topic_description.extend(tag.text.strip() for tag in topic_desc_tags)

        # Link of each topic (hrefs are site-relative, hence the prefix)
        topic_link_tags = doc.find_all('a', {'class': 'no-underline flex-1 d-flex flex-column'})
        topic_urls.extend(root_url + tag['href'] for tag in topic_link_tags)

    return topic_titles, topic_description, topic_urls

In [4]:
# Calling the get_all_topics function: scrapes the topics listing once and
# keeps the three returned lists (titles, descriptions, URLs) for later cells.
topic_titles, topic_description, topic_urls = get_all_topics(main_url)

In [89]:
topic_titles

['3D',
 'Ajax',
 'Algorithm',
 'Amp',
 'Android',
 'Angular',
 'Ansible',
 'API',
 'Arduino',
 'ASP.NET',
 'Atom',
 'Awesome Lists',
 'Amazon Web Services',
 'Azure',
 'Babel',
 'Bash',
 'Bitcoin',
 'Bootstrap',
 'Bot',
 'C',
 'Chrome',
 'Chrome extension',
 'Command line interface',
 'Clojure',
 'Code quality',
 'Code review',
 'Compiler',
 'Continuous integration',
 'COVID-19',
 'C++',
 'Cryptocurrency',
 'Crystal',
 'C#',
 'CSS',
 'Data structures',
 'Data visualization',
 'Database',
 'Deep learning',
 'Dependency management',
 'Deployment',
 'Django',
 'Docker',
 'Documentation',
 '.NET',
 'Electron',
 'Elixir',
 'Emacs',
 'Ember',
 'Emoji',
 'Emulator',
 'ESLint',
 'Ethereum',
 'Express',
 'Firebase',
 'Firefox',
 'Flask',
 'Font',
 'Framework',
 'Front end',
 'Game engine',
 'Git',
 'GitHub API',
 'Go',
 'Google',
 'Gradle',
 'GraphQL',
 'Gulp',
 'Hacktoberfest',
 'Haskell',
 'Homebrew',
 'Homebridge',
 'HTML',
 'HTTP',
 'Icon font',
 'iOS',
 'IPFS',
 'Java',
 'JavaScript',
 'Je

### Creating a CSV file to store all three of the github-topics information

In [5]:
# Binding the 3 topic lists into a dictionary; the keys become the CSV
# column headers.
all_topics_dict = {'title': topic_titles, 'description': topic_description, 'url': topic_urls}

# Converting the dictionary into a dataframe (one row per topic)
all_topics_df = pd.DataFrame(all_topics_dict)

# Writing the dataframe to all_topics.csv in the current working directory.
# NOTE(review): index=None is presumably meant to omit the row index, as
# index=False would - confirm against the pandas to_csv documentation.
all_topics_df.to_csv('all_topics.csv', index=None)

In [6]:
# Exploration: fetch the page of the first scraped topic.
res = requests.get(topic_urls[0])

In [7]:
# Parse the fetched topic page so its tags can be searched.
repo = BeautifulSoup(res.text, 'html.parser')

In [8]:
# Collect the <h3> headings with this class; their anchors are read below
# and in get_all_repo for the repository username, name and href.
repo_tags = repo.find_all('h3', {'class': 'f3 color-fg-muted text-normal lh-condensed'})

In [9]:
# How many repository headings were matched on the topic page.
len(repo_tags)

20

In [39]:
# The second anchor of a heading carries the site-relative repository path.
print(repo_tags[0].find_all('a')[1]['href'])

/mrdoob/three.js


In [50]:
# Collect the <span> counters with this class; their text is the star count.
star_tags = repo.find_all('span', {'class': 'Counter js-social-count'})

In [56]:
# Star count of the first repository, as displayed text (may be abbreviated).
star_tags[0].text

'90.6k'

In [61]:
def get_star_num(str):
    """Convert a displayed star counter such as ``'90.6k'`` into an integer.

    Surrounding whitespace, thousands separators (``,``) and letter case are
    ignored. A trailing ``k`` multiplies by 1,000 and a trailing ``m`` by
    1,000,000; anything else is parsed as a plain integer.

    Args:
        str: The counter text. (The parameter name shadows the builtin; it is
            kept so existing keyword callers continue to work.)

    Returns:
        The star count as an ``int``.

    Raises:
        ValueError: if the text is empty or is not a recognisable number.
    """
    multipliers = {'k': 1_000, 'm': 1_000_000}
    text = str.strip().lower().replace(',', '')

    if text and text[-1] in multipliers:
        # round() rather than int() truncation: the float product can land
        # just below the intended whole number.
        return int(round(float(text[:-1]) * multipliers[text[-1]]))
    return int(text)

In [103]:
def create_csv(repo_names, repo_usernames, stars, repo_urls, topic_title_value):
    """Write one topic's repository details to ``<topic title>.csv``.

    The file is created in the current working directory with the columns
    'Repository Name', 'Repository Username', 'Stars Received' and
    'Repository URL', one row per repository and no index column.

    Args:
        repo_names: Repository names.
        repo_usernames: Usernames, aligned with ``repo_names``.
        stars: Star counts, aligned with ``repo_names``.
        repo_urls: Repository URLs, aligned with ``repo_names``.
        topic_title_value: Topic title used as the file name. Path separators
            in it are replaced with ``_`` so the title cannot be mistaken for
            a directory path.

    Raises:
        ValueError: raised by pandas if the four lists differ in length.
    """
    all_repo_df = pd.DataFrame({
        'Repository Name': repo_names,
        'Repository Username': repo_usernames,
        'Stars Received': stars,
        'Repository URL': repo_urls,
    })

    # A title like "CI/CD" would otherwise point into a non-existent folder.
    safe_title = f'{topic_title_value}'.replace('/', '_').replace('\\', '_')

    all_repo_df.to_csv(f'{safe_title}.csv', index=False)

In [107]:
def _fetch_topic_page(topic_url, max_retries, retry_delay):
    """Return the HTML of ``topic_url``, or None if every attempt fails."""
    for attempt in range(1, max_retries + 1):
        try:
            res = requests.get(topic_url, timeout=30)
            if res.status_code == 200:
                return res.text
            print(f'Request to {topic_url} failed with status code {res.status_code}')
        except requests.exceptions.RequestException as err:
            print(f'Request to {topic_url} failed: {err}')

        # Back off before the next attempt; no point sleeping after the last.
        if attempt < max_retries:
            print(f'Let me sleep for {retry_delay} seconds before retrying...')
            time.sleep(retry_delay)
    return None


def _parse_topic_repos(html):
    """Extract (names, usernames, stars, urls) lists from a topic page."""
    repo = BeautifulSoup(html, 'html.parser')

    # Guard against stray whitespace in the configured base URL.
    root_url = base_url.strip()

    # Each heading holds two anchors: [0] the username, [1] the repository
    # (its text is the name and its href the site-relative URL).
    repo_tags = repo.find_all('h3', {'class': 'f3 color-fg-muted text-normal lh-condensed'})

    repo_names = []
    repo_usernames = []
    repo_urls = []
    for tag in repo_tags:
        links = tag.find_all('a')
        repo_usernames.append(links[0].text.strip())
        repo_names.append(links[1].text.strip())
        repo_urls.append(root_url + links[1]['href'])

    # star_tags contains the star counter of each repository
    star_tags = repo.find_all('span', {'class': 'Counter js-social-count'})
    stars = [get_star_num(tag.text.strip()) for tag in star_tags]

    return repo_names, repo_usernames, stars, repo_urls


def get_all_repo(topic_urls, topic_titles, max_retries=3, retry_delay=5):
    """Scrape the top repositories of every topic and write one CSV per topic.

    For each topic URL the page is fetched and the following details are
    extracted for the repositories listed on it:
    1. Repository Name
    2. Repository Username
    3. No. of stars the repository received
    4. Repository URL

    The details are handed to ``create_csv`` together with the topic title.
    This is a best-effort scrape: a topic that cannot be fetched or parsed is
    reported on stdout and skipped, and the remaining topics are still
    processed.

    Args:
        topic_urls: URLs of the topic pages.
        topic_titles: Topic titles, aligned with ``topic_urls``.
        max_retries: Attempts made per topic page before giving up on it.
        retry_delay: Seconds to sleep between attempts.

    Returns:
        None. Output is the CSV files written by ``create_csv``.
    """
    for topic_url, topic_title_value in zip(topic_urls, topic_titles):
        html = _fetch_topic_page(topic_url, max_retries, retry_delay)
        if html is None:
            print(f'Skipping {topic_url}: no successful response after {max_retries} attempts')
            continue

        try:
            repo_names, repo_usernames, stars, repo_urls = _parse_topic_repos(html)
            create_csv(repo_names, repo_usernames, stars, repo_urls, topic_title_value)
        except (IndexError, KeyError, ValueError, OSError) as err:
            # Unexpected page layout, unparsable star count, mismatched
            # column lengths or an unwritable file: report it and move on.
            print(f'Skipping {topic_url}: {type(err).__name__}: {err}')

In [108]:
# Scrape the repositories of every collected topic; writes one CSV per topic.
get_all_repo(topic_urls, topic_titles)