In [3]:
from bs4 import BeautifulSoup
from cleantext import clean
import requests
import csv
import xlwings

Opens a CSV file and writes the headers for the columns.

In [4]:
def start_csv_file(filename="C:/Users/john_/projects/GithubWebScrape/trending_repositories.csv"):
    """Create (or truncate) the CSV file and write the column header row.

    Args:
        filename: Path of the CSV file to create. Defaults to the location
            that was previously hard-coded, so existing zero-argument calls
            behave exactly as before.

    NOTE(review): the default is an absolute path, while append_to_csv()
    targets the relative path 'trending_repositories.csv'. Both only refer to
    the same file when the working directory is the project folder -- confirm.
    """
    headers = ('Repository', 'Description', 'Primary Programming Language', 'Stargazers', 'Forks', 'Stared Today')
    # The context manager closes the file even if writerow() raises.
    # newline='' lets the csv module control line endings itself.
    with open(file=filename, mode='w', newline='') as csv_file:
        csv.writer(csv_file).writerow(headers)

Opens the CSV file in append mode and writes a row of data.

In [5]:
def append_to_csv(link, description, prog_lang, stars, forks, stars_today,
                  filename='trending_repositories.csv'):
    """Append one repository row to the CSV file.

    Args:
        link: Repository URL.
        description: Repository description text.
        prog_lang: Primary programming language.
        stars: Stargazer count, written as given.
        forks: Fork count, written as given.
        stars_today: Stars gained today, written as given.
        filename: Path of the CSV file to append to. Defaults to the relative
            path that was previously hard-coded, so existing calls are
            unaffected.

    The values are written in the order listed above, as a single row.
    """
    row = (link, description, prog_lang, stars, forks, stars_today)
    # The context manager closes the file even if writerow() raises.
    with open(file=filename, mode='a', newline='') as csv_file:
        csv.writer(csv_file).writerow(row)

URL of the page to scrape.

In [6]:
# GitHub trending page; the query string asks for since=daily and
# spoken_language_code=en.
url = 'https://github.com/trending?since=daily&spoken_language_code=en'

Starts a hidden Excel instance so that the workbook does not appear on screen while the macro runs.

In [7]:
# Start an Excel application instance with its window hidden (visible=False).
# NOTE(review): the returned App object is not assigned to a name and nothing
# in this cell quits it -- confirm the hidden Excel process is cleaned up.
xlwings.App(visible=False)

<App [excel] 11200>

Get the HTML content of the page

In [8]:
# A timeout keeps this cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops the notebook on an HTTP error instead of parsing
# an error page as if it were the trending page.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_text = response.text

Create a BeautifulSoup object to parse the HTML

In [9]:
# Parse the downloaded markup with the lxml parser.
soup = BeautifulSoup(markup=html_text, features='lxml')

Find all the repository articles on the page

In [10]:
# Collect every <article> element that carries the 'Box-row' CSS class.
repositories = soup.find_all('article', class_='Box-row')

Start the CSV file with headers

In [11]:
# Opens the CSV in write mode (truncating any existing content) and writes
# the header row.
start_csv_file()

For each repository in repositories:
    try:
        Extract relevant information from the repository
    except:
        Print error message if there is insufficient data for a repository
    else:
        Print the extracted information and append it to the CSV file

In [12]:
# Prefix used to turn each repository's href into a full URL. Defined once,
# outside the loop, because it never changes.
url_prefix = 'https://github.com'

for repository in repositories:
    try:
        link = url_prefix + repository.h2.a['href']
        description = repository.find('p', class_='col-9 color-fg-muted my-1 pr-4').text.strip()
        # Clean the text with emoji removal on and lower-casing off.
        description = clean(description, no_emoji=True, lower=False)
        prog_lang = repository.find('span', {'itemprop': 'programmingLanguage'}).text.strip()
        # The first matching link is treated as the star count, the second as the fork count.
        stars_forks = repository.find_all('a', class_='Link Link--muted d-inline-block mr-3')
        stars = stars_forks[0].text.strip()
        forks = stars_forks[1].text.strip()
        stars_today = repository.find('span', class_='d-inline-block float-sm-right').text.strip()
        # Keep only the text before the first space. partition() returns the
        # whole string when there is no space, whereas a find()-based slice
        # would silently drop the last character in that case.
        sub_stars_today = stars_today.partition(' ')[0]
    except (AttributeError, TypeError, KeyError, IndexError):
        # A missing tag, attribute or list entry surfaces as one of these
        # lookup errors. Anything else (e.g. KeyboardInterrupt) is allowed to
        # propagate instead of being swallowed.
        print('\n\nInsufficient data.. Skipping repository.\n')
    else:
        print('\n'+link)
        print('Discription: ' + description)
        print('Primary Programming Language: ' + prog_lang)
        print('Stargazers: ' + stars)
        print('Fork: ' + forks)
        print('Stars Gained Today: ' + sub_stars_today)
        append_to_csv(link, description, prog_lang, stars, forks, sub_stars_today)


https://github.com/coqui-ai/TTS
Discription: - a deep learning toolkit for Text-to-Speech, battle-tested in research and production
Primary Programming Language: Python
Stargazers: 16,709
Fork: 2,042
Stars Gained Today: 1,285

https://github.com/AUTOMATIC1111/stable-diffusion-webui
Discription: Stable Diffusion web UI
Primary Programming Language: Python
Stargazers: 102,437
Fork: 20,455
Stars Gained Today: 115

https://github.com/godotengine/godot-demo-projects
Discription: Demonstration and Template Projects
Primary Programming Language: GDScript
Stargazers: 3,869
Fork: 1,336
Stars Gained Today: 65

https://github.com/ripienaar/free-for-dev
Discription: A list of SaaS, PaaS and IaaS offerings that have free tiers of interest to devops and infradev
Primary Programming Language: HTML
Stargazers: 73,155
Fork: 8,030
Stars Gained Today: 30

https://github.com/SerenityOS/serenity
Discription: The Serenity Operating System
Primary Programming Language: C++
Stargazers: 25,980
Fork: 2,828
Sta

Open the Excel workbook and execute the macro

In [13]:
wb = xlwings.Book('trending_repositories.xlsm')
# Grab the hosting Excel instance now; it is needed after the book is closed.
excel_app = wb.app
try:
    # Run the workbook's Sheet1.ImportCSVData macro, then save the workbook.
    macro = wb.macro('Sheet1.ImportCSVData')
    macro()
    wb.save()
finally:
    # Close the workbook even if the macro or the save fails, so it is not
    # left open in Excel.
    wb.close()
    # A hidden Excel instance cannot be closed by the user, so quit it here
    # rather than leaving the process running. Visible instances are left
    # alone. Note that quit() does not save any other workbooks open in it.
    if not excel_app.visible:
        excel_app.quit()