In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import WebDriverException
import pandas as pd
import time
from bs4 import BeautifulSoup
import threading
from concurrent.futures import ThreadPoolExecutor
import multiprocessing

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Read the project spreadsheet, pull out the 'Project URL' column as a plain
# Python list, and keep only the first 100 entries for this run.
project_list = (
    pd.read_excel('project_5000Up.xlsx')['Project URL']
    .tolist()[:100]
)

In [3]:
# Headless mode for Chrome.
# Every command-line switch is listed exactly once, in the order it is passed
# to Chrome. (The previous version added --disable-gpu and
# --disable-extensions twice.)
CHROME_ARGUMENTS = (
    "--disable-extensions",
    "--disable-gpu",
    "--headless",
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "--disable-renderer-backgrounding",
    "--disable-background-timer-throttling",
    "--disable-backgrounding-occluded-windows",
    "--disable-client-side-phishing-detection",
    "--disable-crash-reporter",
    "--disable-oopr-debug-crash-dump",
    "--no-crash-upload",
    "--disable-low-res-tiling",
    "--log-level=3",
    "--silent",
    "--blink-settings=imagesEnabled=false",
)

# Shared options object; scrape_project passes it to every webdriver.Chrome().
chrome_options = Options()
for chrome_argument in CHROME_ARGUMENTS:
    chrome_options.add_argument(chrome_argument)

#proxy_server_url = "104.251.224.95"
#chrome_options.add_argument(f'--proxy-server={proxy_server_url}')

In [4]:
def scrape_prs(project_url, driver):
    """Read the open and closed pull-request counts from a repo's /pulls page.

    The page is loaded up to 10 times. An attempt succeeds when both the
    "open" and the "closed" filter links are present in the rendered HTML;
    otherwise the function sleeps 10 seconds and tries again.

    Args:
        project_url: Repository URL. Its first 19 characters are dropped to
            obtain ``OWNER/REPO``, so it is assumed to start with
            ``https://github.com/``.
        driver: Selenium WebDriver used to load the page.

    Returns:
        ``[open_prs, close_prs]`` as the first whitespace-separated token of
        each link's text, or ``[None, None]`` when no attempt succeeded.
    """
    repo_path = project_url[19:]
    pulls_page = project_url + "/pulls"

    for _attempt in range(10):
        driver.get(pulls_page)
        # Block until the <body> element is visible (10 s timeout).
        WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located((By.TAG_NAME, 'body'))
        )
        page = BeautifulSoup(driver.page_source, "html.parser")

        open_link = page.find(href=f"/{repo_path}/pulls?q=is%3Aopen+is%3Apr")
        closed_link = page.find(href=f"/{repo_path}/pulls?q=is%3Apr+is%3Aclosed")

        # Either link missing: pause, then reload the page.
        if open_link is None or closed_link is None:
            time.sleep(10)
            continue

        open_count = open_link.text.split()[0]
        closed_count = closed_link.text.split()[0]
        print(f"{project_url}: {open_count} open_prs and {closed_count} close_prs")
        return [open_count, closed_count]

    print(f"{project_url}: open_prs and close_prs not found")
    return [None, None]

In [5]:
def scrape_owner(project_url, driver):
    """Scrape profile-level facts about the account that owns a repository.

    Loads ``https://github.com/<owner>`` in ``driver`` and reads four values
    from the rendered HTML, printing each one as it is resolved.

    Args:
        project_url: Repository URL. Its first 19 characters are dropped to
            obtain ``OWNER/REPO``, so it is assumed to start with
            ``https://github.com/``.
        driver: Selenium WebDriver used to load the profile page.

    Returns:
        ``[verified, followers, members, repositories]``. Each entry is the
        first whitespace-separated token of the matching element's text, or
        ``None`` when the element is missing or has no text.
    """
    # Upper bound on reloads while the member counter is present but empty;
    # the previous implementation could loop forever.
    max_member_reloads = 10

    creator = project_url[19:].split('/')[0]
    owner_url = f"https://github.com/{creator}"

    def first_token(tag):
        """Return the first whitespace-separated token of ``tag.text``, or None."""
        if tag is None:
            return None
        tokens = tag.text.split()
        return tokens[0] if tokens else None

    def load_profile(settle_seconds=0):
        """Load the owner page and return it parsed, optionally pausing first."""
        driver.get(owner_url)
        # Block until the <body> element is visible (10 s timeout).
        WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located((By.TAG_NAME, 'body'))
        )
        if settle_seconds:
            # Pause BEFORE snapshotting so the counter has time to be filled.
            time.sleep(settle_seconds)
        return BeautifulSoup(driver.page_source, "html.parser")

    soup = load_profile()

    # Verified Repo Owner
    verified = first_token(soup.find('summary', {'title': 'Label: Verified'}))
    print(f"{project_url} owner status: {verified}")

    # Number of Owner Followers
    followers = first_token(
        soup.find('a', class_='Link--secondary no-underline no-wrap')
    )
    print(f"{project_url} followers: {followers}")

    # Number of Owner Members: the counter element can be present but empty,
    # in which case the page is reloaded a bounded number of times.
    members_tag = soup.find('span', class_='Counter js-profile-member-count')
    reloads = 0
    while (
        members_tag is not None
        and members_tag.text == ""
        and reloads < max_member_reloads
    ):
        reloads += 1
        soup = load_profile(settle_seconds=5)
        members_tag = soup.find('span', class_='Counter js-profile-member-count')
    members = first_token(members_tag)
    print(f"{project_url} members: {members}")

    # Number of Other Repositories by Owner: fall back to the first generic
    # Counter span. find() returns None when there is none, where indexing
    # find_all(...)[0] raised IndexError.
    repositories_tag = soup.find('span', class_='Counter js-profile-repository-count')
    if repositories_tag is None:
        repositories_tag = soup.find('span', class_='Counter')
    repositories = first_token(repositories_tag)
    print(f"{project_url} repositories: {repositories}")

    return [verified, followers, members, repositories]

In [6]:
def scrape_insight(project_url, driver):
    """Read the active pull-request and active issue counts from /pulse.

    Args:
        project_url: Repository URL; ``/pulse`` is appended to it.
        driver: Selenium WebDriver used to load the page.

    Returns:
        ``[active_prs, active_issues]``: the first whitespace-separated token
        of the first and second ``div.mt-2`` elements respectively, or
        ``None`` for any element that is missing or has no text.
    """
    # Active prs and active issues
    insight_url = f"{project_url}/pulse"
    driver.get(insight_url)

    # Block until the <body> element is visible (10 s timeout).
    WebDriverWait(driver, 10).until(
        EC.visibility_of_element_located((By.TAG_NAME, 'body'))
    )

    soup = BeautifulSoup(driver.page_source, "html.parser")
    summary_blocks = soup.find_all('div', class_='mt-2')

    def first_token_at(index):
        """First token of summary_blocks[index], or None if absent or empty."""
        # Guard the index: the page may render fewer than two blocks, which
        # previously raised IndexError.
        if index >= len(summary_blocks):
            return None
        tokens = summary_blocks[index].text.split()
        return tokens[0] if tokens else None

    active_prs = first_token_at(0)
    active_issues = first_token_at(1)

    print(f"{project_url}: {active_prs} Active pull requests, {active_issues} Active issues")
    return [active_prs, active_issues]

In [7]:
def scrape_issues(project_url, driver):
    """Scrape issue, label and milestone counts for a repository.

    The /issues page is loaded up to 10 times (10 s pause after each failed
    attempt) until the "open issues" filter link is found. Label and
    milestone counters are read from that page when available. Only the
    counters that could not be read there are fetched from the dedicated
    /labels and /milestones pages.

    Args:
        project_url: Repository URL. Its first 19 characters are dropped to
            obtain ``OWNER/REPO``, so it is assumed to start with
            ``https://github.com/``.
        driver: Selenium WebDriver used to load the pages.

    Returns:
        ``[open_issues, closed_issues, num_labels, num_milestones]``; each
        entry is a string token, or ``None`` when it could not be found.
    """
    project = project_url[19:]
    issue_url = project_url + "/issues"

    def first_token(tag):
        """Return the first whitespace-separated token of ``tag.text``, or None."""
        if tag is None:
            return None
        tokens = tag.text.split()
        return tokens[0] if tokens else None

    def load(url):
        """Load ``url`` in the driver and return the parsed page."""
        driver.get(url)
        # Block until the <body> element is visible (10 s timeout).
        WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located((By.TAG_NAME, 'body'))
        )
        return BeautifulSoup(driver.page_source, "html.parser")

    # Start from "unknown" so a failed scrape never leaks a raw Tag object
    # into the returned row.
    open_issues = closed_issues = num_labels = num_milestones = None

    for _attempt in range(10):
        soup = load(issue_url)

        open_tag = soup.find(href=f"/{project}/issues?q=is%3Aopen+is%3Aissue")
        if open_tag is None:
            time.sleep(10)
            continue

        open_issues = first_token(open_tag)
        closed_issues = first_token(
            soup.find(href=f"/{project}/issues?q=is%3Aissue+is%3Aclosed")
        )

        # Each counter sits in a <span> inside its navigation link; either
        # the link or the span may be absent.
        labels_link = soup.find(href=f"/{project}/labels")
        if labels_link is not None:
            num_labels = first_token(labels_link.find("span"))
        milestones_link = soup.find(href=f"/{project}/milestones")
        if milestones_link is not None:
            num_milestones = first_token(milestones_link.find("span"))
        break

    # Fallbacks. The previous check `type(num_labels) != int` was always true
    # (the value is a str, a Tag or None), so both pages were fetched for
    # every project even when the counters had already been read.
    if num_labels is None:
        soup = load(project_url + '/labels')
        num_labels = first_token(soup.find('span', class_='js-labels-count'))

    if num_milestones is None:
        soup = load(project_url + '/milestones')
        num_milestones = first_token(soup.find('a', class_='btn-link selected'))

    print(f"{project_url}, Open issues: {open_issues}, Closed issues: {closed_issues}, Labels: {num_labels}, Milestones: {num_milestones}")
    return [open_issues, closed_issues, num_labels, num_milestones]

In [8]:
####### IDEA: Merge Two Pandas DataFrames using Outer Join Merge ###########
# Potential Scrapes:
# ../issues => Open/closed issues, # of labels, # of milestones
# ../pulls
# ../actions => # of workflow runs
# ../pulse => Active pr and Active issues
# ../network/dependencies
# Sponsored
# Watches
# 
# Ones in Bash Script:
# Commits
# Tags/Releases
# Branches
# Languages
# Contributors

def scrape_page(project_url, driver):
    """Scrape sponsorship, watcher, workflow-run and dependent counts.

    Visits, in order: the repository front page, the owner's
    ``/sponsors/<owner>`` page (only when the front page links to it), the
    repository's ``/actions`` page and its ``/network/dependents`` page.

    Args:
        project_url: Repository URL. Its first 19 characters are dropped to
            obtain ``OWNER/REPO``, so it is assumed to start with
            ``https://github.com/``.
        driver: Selenium WebDriver used to load the pages.

    Returns:
        ``[sponsored, current_sponsors, past_sponsors, num_watches, workflow,
        dependents]``. ``sponsored`` is ``"Yes"`` or ``"No"``. The sponsor
        counts are the integer ``0`` when not applicable, otherwise string
        tokens. Any value that could not be located on its page is ``None``.
    """
    def load(url):
        """Load ``url`` in the driver and return the parsed page."""
        driver.get(url)
        # Block until the <body> element is visible (10 s timeout).
        WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located((By.TAG_NAME, 'body'))
        )
        return BeautifulSoup(driver.page_source, "html.parser")

    def token_at(tag, index, default=None):
        """Return token ``index`` of ``tag.text``, or ``default`` if unavailable."""
        if tag is None:
            return default
        tokens = tag.text.split()
        return tokens[index] if len(tokens) > index else default

    # Get the OWNER/REPO
    project = project_url[19:]
    creator = project.split('/')[0]

    soup = load(project_url)

    # Watchers: the count is the <strong> inside the watchers link; either
    # element may be missing, so do not chain the lookups blindly.
    watchers_link = soup.find(href=f"/{project}/watchers")
    watchers_count = watchers_link.find("strong") if watchers_link is not None else None
    num_watches = watchers_count.text if watchers_count is not None else None

    sponsored = "Yes" if soup.find(href=f"/sponsors/{creator}") is not None else "No"

    current_sponsors = 0
    past_sponsors = 0
    if sponsored == "Yes":
        soup = load(f"https://github.com/sponsors/{creator}")

        current_heading = soup.find(lambda tag: tag.name == 'h4' and 'Current sponsors' in tag.get_text())
        past_heading = soup.find(lambda tag: tag.name == 'h4' and 'Past sponsors' in tag.get_text())

        if current_heading is None:
            # No "Current sponsors" heading: read the leading token of the
            # summary paragraph instead and report no past sponsors.
            current_sponsors = token_at(
                soup.find('p', class_='f3-light color-fg-muted mb-3'), 0
            )
        else:
            # The count is read as the third token of the heading text.
            current_sponsors = token_at(current_heading, 2)
            # A missing "Past sponsors" heading is reported as 0 rather than
            # raising AttributeError.
            past_sponsors = token_at(past_heading, 2, default=0)

    print(f"{project_url}: {current_sponsors} Current sponsors, {past_sponsors} Past sponsors")

    # Number of Workflow Runs
    soup = load(project_url + "/actions")
    workflow = token_at(
        soup.find(lambda tag: tag.name == 'strong' and 'workflow runs' in tag.get_text()),
        0,
    )

    # Number of Dependent Repos
    soup = load(project_url + "/network/dependents")
    dependents = token_at(soup.find('a', class_='btn-link selected'), 0)

    print(f"{project_url} {dependents} dependents, {workflow} workflows")

    return [sponsored, current_sponsors, past_sponsors, num_watches, workflow, dependents]

In [26]:
def scrape_project(project_url):
    """Collect every scraped feature for one repository into a flat row.

    Starts a dedicated Chrome instance configured with ``chrome_options``,
    runs the individual scrapers against it with a one-second pause between
    them, and always shuts the browser down afterwards.

    Args:
        project_url: Repository URL passed unchanged to each scraper.

    Returns:
        A list starting with ``project_url`` followed by the results of
        ``scrape_prs``, ``scrape_owner``, ``scrape_insight``,
        ``scrape_issues`` and ``scrape_page``, in that order.

    Raises:
        Whatever the scrapers raise; the browser is still closed first.
    """
    driver = webdriver.Chrome(options=chrome_options)
    try:
        prs = scrape_prs(project_url, driver)
        time.sleep(1)
        owner = scrape_owner(project_url, driver)
        time.sleep(1)
        insight = scrape_insight(project_url, driver)
        time.sleep(1)
        issues = scrape_issues(project_url, driver)
        time.sleep(1)
        page = scrape_page(project_url, driver)
    finally:
        # Without this, any scraper exception left the Chrome process behind.
        driver.quit()

    return [project_url] + prs + owner + insight + issues + page

In [27]:
def scrape_project_list(project_list):
    """Scrape every URL in ``project_list`` on a pool of 10 threads.

    Args:
        project_list: Iterable of repository URLs.

    Returns:
        A list with one ``scrape_project`` result per URL, in input order.
    """
    with ThreadPoolExecutor(max_workers=10) as executor:
        # map() yields results in submission order; list() drains it fully.
        return list(executor.map(scrape_project, project_list))

In [28]:
start_time = time.time()

# Split the URL list into at most `num_processes` contiguous chunks.
# Ceiling division keeps the chunk count within the process count (floor
# division turned 100 URLs into 33/33/33/1). max(1, ...) keeps the slice step
# non-zero for lists shorter than the process count, which previously raised
# ValueError in range().
num_processes = 3
list_len = len(project_list)
sub_len = max(1, -(-list_len // num_processes))
sublists = [project_list[i:i + sub_len] for i in range(0, list_len, sub_len)]

# The context manager releases the worker processes once map() has returned.
with multiprocessing.Pool(processes=num_processes) as pool:
    results = pool.map(scrape_project_list, sublists)

# Flatten the per-chunk results back into one list of project rows.
projects = [row for chunk in results for row in chunk]
end_time = time.time()

elapsed_time = end_time - start_time
print(f"elapsed time is {elapsed_time}")

https://github.com/996icu/996.ICU: 3 open_prs and 1,945 close_prs
https://github.com/ripienaar/free-for-dev: 1 open_prs and 3,154 close_prs
https://github.com/fatedier/frp: 4 open_prs and 831 close_prshttps://github.com/AUTOMATIC1111/stable-diffusion-webui: 11 open_prs and 2,844 close_prs

https://github.com/jwasham/coding-interview-university: 11 open_prs and 938 close_prs
https://github.com/codecrafters-io/build-your-own-x: 138 open_prs and 242 close_prs
https://github.com/microsoft/Web-Dev-For-Beginners: 55 open_prs and 780 close_prs
https://github.com/Chalarangelo/30-seconds-of-code: 0 open_prs and 1,604 close_prs
https://github.com/kamranahmedse/developer-roadmap: 323 open_prs and 2,693 close_prshttps://github.com/papers-we-love/papers-we-love: 1 open_prs and 509 close_prs

https://github.com/labuladong/fucking-algorithm: 20 open_prs and 448 close_prs
https://github.com/521xueweihan/HelloGitHub: 2 open_prs and 40 close_prs
https://github.com/nvbn/thefuck: 83 open_prs and 605 close

In [31]:
# One row per project. Column order mirrors how scrape_project concatenates
# its pieces: URL, pull requests, owner, insight, issues, page.
projects_df = pd.DataFrame(
    data=projects,
    columns=[
        'Project URL',
        # scrape_prs
        'Open Pull Requests', 'Closed Pull Requests',
        # scrape_owner
        'Verified Owner', 'Followers of Owner', 'Members of Owner', 'Repos of Owner',
        # scrape_insight
        'Active Pull Requests', 'Active Issues',
        # scrape_issues
        'Open Issues', 'Closed Issues', 'Number of Labels', 'Number of Milestones',
        # scrape_page
        'Sponsored', 'Current Sponsors', 'Past Sponsors',
        'Number of Watches', 'Number of Workflow Runs', 'Number of Dependents',
    ],
)

In [32]:
# Display the assembled feature table as the cell's output.
projects_df

Unnamed: 0,Project URL,Open Pull Requests,Closed Pull Requests,Verified Owner,Followers of Owner,Members of Owner,Repos of Owner,Active Pull Requests,Active Issues,Open Issues,Closed Issues,Number of Labels,Number of Milestones,Sponsored,Current Sponsors,Past Sponsors,Number of Watches,Number of Workflow Runs,Number of Dependents
0,https://github.com/freeCodeCamp/freeCodeCamp,78,35716,Verified,22k,82,220,45,29,274,17490,44,4,Yes,55,163,8.5k,134121,25
1,https://github.com/EbookFoundation/free-progra...,15,9859,,6.8k,10,33,4,3,28,1043,73,0,Yes,13,46,9.7k,15500,21
2,https://github.com/sindresorhus/awesome,24,1982,,65.8k,,1.1k,2,2,24,313,8,0,Yes,259,1177,7.7k,256,24
3,https://github.com/public-apis/public-apis,246,2883,,,,1,4,0,2,563,21,0,No,0,0,4.1k,554,2
4,https://github.com/jwasham/coding-interview-un...,11,938,,21.8k,,31,0,0,45,370,12,0,Yes,2,15,8.6k,14,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,https://github.com/florinpop17/app-ideas,341,323,,13.3k,,90,0,0,102,42,7,0,No,0,0,1.8k,,19
96,https://github.com/doocs/advanced-java,0,118,,3.6k,401,15,0,0,4,141,13,0,No,0,0,2.6k,939,0
97,https://github.com/FortAwesome/Font-Awesome,29,654,Verified,1.1k,14,39,0,7,5469,13801,135,8,No,0,0,1.4k,,719634
98,https://github.com/spring-projects/spring-boot,39,6133,Verified,10k,32,82,2,64,602,32903,49,9,No,0,0,3.4k,5851,19


# Append the scraped rows below the existing contents of "Sheet1" without
# repeating the header. If the workbook does not exist yet, write it fresh.
try:
    with pd.ExcelWriter(
        "project_HTMLfeatures.xlsx",
        engine="openpyxl",
        mode="a",
        if_sheet_exists="overlay",
    ) as writer:
        projects_df.to_excel(
            writer,
            sheet_name="Sheet1",
            header=False,
            index=False,
            # Start writing at the sheet's current max_row.
            startrow=writer.sheets["Sheet1"].max_row,
        )
except FileNotFoundError:
    projects_df.to_excel("project_HTMLfeatures.xlsx", index=False)