In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import WebDriverException
import pandas as pd
import time
from bs4 import BeautifulSoup
import threading
from concurrent.futures import ThreadPoolExecutor
import multiprocessing

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Read the project spreadsheet, take its 'Project URL' column as a plain
# list, and keep only the first 100 URLs for this run.
project_list = pd.read_excel('project_5000Up.xlsx')['Project URL'].tolist()[:100]

In [3]:
# Headless mode for Chrome.
# Command-line switches passed to every Chrome instance the scrapers start.
# Each switch appears exactly once (the previous list added --disable-gpu and
# --disable-extensions twice).
CHROME_ARGUMENTS = (
    "--disable-extensions",
    "--disable-gpu",
    "--headless",
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "--disable-renderer-backgrounding",
    "--disable-background-timer-throttling",
    "--disable-backgrounding-occluded-windows",
    "--disable-client-side-phishing-detection",
    "--disable-crash-reporter",
    "--disable-oopr-debug-crash-dump",
    "--no-crash-upload",
    "--disable-low-res-tiling",
    "--log-level=3",
    "--silent",
    "--blink-settings=imagesEnabled=false",
)

chrome_options = Options()
for chrome_argument in CHROME_ARGUMENTS:
    chrome_options.add_argument(chrome_argument)

# Optional proxy, currently disabled:
#proxy_server_url = "104.251.224.95"
#chrome_options.add_argument(f'--proxy-server={proxy_server_url}')

In [4]:
def scrape_prs(project_url):
    """Scrape the open and closed pull-request counts of one repository.

    Loads ``<project_url>/pulls`` in a Chrome session built from the shared
    ``chrome_options`` and looks for the two filter links (open / closed).
    The page is reloaded up to 10 times, sleeping 10 seconds after every
    unsuccessful attempt, until both links are present and non-empty.

    Args:
        project_url: Repository URL. Everything after its first 19 characters
            (the length of ``https://github.com/``) is used as ``OWNER/REPO``
            when building the link hrefs to search for.

    Returns:
        ``[project_url, open_prs, close_prs]``. The counts are the first
        whitespace-separated token of each link's text (strings), or ``None``
        for both when the links were not found in any attempt.

    Raises:
        Any exception raised by Selenium while loading or waiting for the
        page propagates to the caller; the browser is closed first.
    """
    project = project_url[19:]
    pull_url = project_url + "/pulls"
    open_href = f"/{project}/pulls?q=is%3Aopen+is%3Apr"
    closed_href = f"/{project}/pulls?q=is%3Apr+is%3Aclosed"

    driver = webdriver.Chrome(options=chrome_options)
    try:
        for _ in range(10):
            driver.get(pull_url)
            # Wait until the <body> element is visible before reading the HTML.
            WebDriverWait(driver, 10).until(
                EC.visibility_of_element_located((By.TAG_NAME, 'body'))
            )
            soup = BeautifulSoup(driver.page_source, "html.parser")

            open_link = soup.find(href=open_href)
            closed_link = soup.find(href=closed_href)
            # A link that exists but has no text yet is treated like a missing
            # link, so it triggers a retry instead of an IndexError.
            open_tokens = open_link.text.split() if open_link is not None else []
            closed_tokens = closed_link.text.split() if closed_link is not None else []

            if open_tokens and closed_tokens:
                open_prs = open_tokens[0]
                close_prs = closed_tokens[0]
                print(f"{project_url}: {open_prs} open_prs and {close_prs} close_prs")
                return [project_url, open_prs, close_prs]

            time.sleep(10)

        print(f"{project_url}: open_prs and close_prs not found")
        return [project_url, None, None]
    finally:
        # Always release the browser, including when a page load or wait raises.
        driver.quit()

In [5]:
def scrape_owner(project_url):
    """Scrape profile-level facts about the owner of a repository.

    The owner name is the first path segment of ``OWNER/REPO`` (the part of
    ``project_url`` after its first 19 characters, i.e. after
    ``https://github.com/``). The owner's profile page is loaded in Chrome
    and four elements are looked up: the "Verified" label, the followers
    link, the member counter and the repository counter.

    If the member counter element exists but its text is empty, the page is
    reloaded (up to 10 times, with a 5-second pause before each re-read) until
    the counter is filled in or disappears.

    Args:
        project_url: Repository URL of the form ``https://github.com/OWNER/REPO``.

    Returns:
        ``[project_url, verified, followers, members, repositories]``. Each
        value is the first whitespace-separated token of the matching
        element's text, or ``None`` when the element is missing or empty.

    Raises:
        Any exception raised by Selenium while loading or waiting for the
        page propagates to the caller; the browser is closed first.
    """
    project = project_url[19:]
    creator = project.split('/')[0]
    owner_url = f"https://github.com/{creator}"
    max_member_reloads = 10  # same retry budget as the other scrapers

    def first_token(tag):
        """Return the first whitespace-separated token of ``tag.text`` or None."""
        if tag is None:
            return None
        tokens = tag.text.split()
        return tokens[0] if tokens else None

    def load_owner_page(driver, settle_seconds=0):
        """Load the owner page and return it parsed; optionally pause before reading the HTML."""
        driver.get(owner_url)
        # Wait until the <body> element is visible.
        WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located((By.TAG_NAME, 'body'))
        )
        if settle_seconds:
            # Pause *before* snapshotting the HTML so late-filled counters
            # have a chance to appear in the snapshot.
            time.sleep(settle_seconds)
        return BeautifulSoup(driver.page_source, "html.parser")

    driver = webdriver.Chrome(options=chrome_options)
    try:
        soup = load_owner_page(driver)

        # Verified Repo Owner
        verified = first_token(soup.find('summary', {'title': 'Label: Verified'}))
        print(f"{project_url} owner status: {verified}")

        # Number of Owner Followers
        followers = first_token(soup.find('a', class_='Link--secondary no-underline no-wrap'))
        print(f"{project_url} followers: {followers}")

        # Number of Owner Members
        members_tag = soup.find('span', class_='Counter js-profile-member-count')
        reloads = 0
        # Bounded retry: stop when the counter has text, when the element is
        # gone after a reload, or when the retry budget is spent.
        while (
            members_tag is not None
            and members_tag.text == ""
            and reloads < max_member_reloads
        ):
            soup = load_owner_page(driver, settle_seconds=5)
            members_tag = soup.find('span', class_='Counter js-profile-member-count')
            reloads += 1
        members = first_token(members_tag)
        print(f"{project_url} members: {members}")

        # Number of Other Repositories by Owner (read from the latest page load)
        repositories = first_token(soup.find('span', class_='Counter js-profile-repository-count'))
        print(f"{project_url} repositories: {repositories}")

        return [project_url, verified, followers, members, repositories]
    finally:
        # Always release the browser, including when a page load or wait raises.
        driver.quit()

In [6]:
def scrape_insight(project_url):
    """Scrape the activity counters from a repository's ``/pulse`` page.

    The page is loaded in Chrome and every ``<div class="mt-2">`` is collected.
    The first such element is reported as the active pull-request counter and
    the second as the active issue counter; the value taken from each is the
    first whitespace-separated token of its text.

    Args:
        project_url: Repository URL; ``/pulse`` is appended to it.

    Returns:
        ``[project_url, active_prs, active_issues]``. A value is ``None`` when
        the corresponding element is absent or has no text.

    Raises:
        Any exception raised by Selenium while loading or waiting for the
        page propagates to the caller; the browser is closed first.
    """
    # Active prs and active issues
    insight_url = f"{project_url}/pulse"

    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(insight_url)
        # Wait until the <body> element is visible before reading the HTML.
        WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located((By.TAG_NAME, 'body'))
        )
        soup = BeautifulSoup(driver.page_source, "html.parser")
    finally:
        # The HTML snapshot is all that is needed from here on.
        driver.quit()

    active = soup.find_all('div', class_='mt-2')

    counts = []
    for section in active[:2]:
        tokens = section.text.split()
        counts.append(tokens[0] if tokens else None)
    # Pad with None when the page had fewer than two matching elements, so a
    # differently laid-out page yields missing values instead of IndexError.
    counts.extend([None] * (2 - len(counts)))
    active_prs, active_issues = counts

    print(f"{project_url}: {active_prs} Active pull requests, {active_issues} Active issues")
    return [project_url, active_prs, active_issues]

In [7]:
def scrape_issues(project_url):
    """Scrape issue, label and milestone counts for one repository.

    ``<project_url>/issues`` is loaded up to 10 times (10-second pause after
    each unsuccessful attempt) until the "open issues" filter link is found.
    When it is found, the open/closed counts and the counters inside the
    labels and milestones links are read from that page.

    If the label or milestone count could not be read from the issues page,
    the dedicated ``/labels`` and ``/milestones`` pages are loaded as a
    fallback. (The earlier check ``type(num_labels) != int`` was always true,
    so the fallback used to run unconditionally and overwrite the values.)

    Args:
        project_url: Repository URL. Everything after its first 19 characters
            (the length of ``https://github.com/``) is used as ``OWNER/REPO``
            when building the link hrefs to search for.

    Returns:
        ``[project_url, open_issues, closed_issues, num_labels, num_milestones]``.
        Every value is a string token taken from the page, or ``None`` when
        the corresponding element is missing or empty.

    Raises:
        Any exception raised by Selenium while loading or waiting for a page
        propagates to the caller; the browser is closed first.
    """
    project = project_url[19:]
    issue_url = project_url + "/issues"

    def first_token(tag):
        """Return the first whitespace-separated token of ``tag.text`` or None."""
        if tag is None:
            return None
        tokens = tag.text.split()
        return tokens[0] if tokens else None

    def counter_in(link):
        """Return the first token of the first <span> inside ``link`` or None."""
        if link is None:
            return None
        return first_token(link.find("span"))

    def load(driver, url):
        """Load ``url``, wait for <body> to be visible, and return the parsed page."""
        driver.get(url)
        WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located((By.TAG_NAME, 'body'))
        )
        return BeautifulSoup(driver.page_source, "html.parser")

    # Start from "nothing found" so a failed loop never leaks raw tag objects
    # into the returned row.
    open_issues = closed_issues = num_labels = num_milestones = None

    driver = webdriver.Chrome(options=chrome_options)
    try:
        # Issues
        for _ in range(10):
            soup = load(driver, issue_url)
            open_link = soup.find(href=f"/{project}/issues?q=is%3Aopen+is%3Aissue")
            if open_link is not None:
                open_issues = first_token(open_link)
                closed_issues = first_token(
                    soup.find(href=f"/{project}/issues?q=is%3Aissue+is%3Aclosed")
                )
                num_labels = counter_in(soup.find(href=f"/{project}/labels"))
                num_milestones = counter_in(soup.find(href=f"/{project}/milestones"))
                break
            time.sleep(10)

        if num_labels is None or num_milestones is None:
            # labels
            soup = load(driver, project_url + '/labels')
            num_labels = first_token(soup.find('span', class_='js-labels-count'))

            # milestones
            soup = load(driver, project_url + '/milestones')
            num_milestones = first_token(soup.find('a', class_='btn-link selected'))

        print(f"{project_url}, Open issues: {open_issues}, Closed issues: {closed_issues}, Labels: {num_labels}, Milestones: {num_milestones}")
        return [project_url, open_issues, closed_issues, num_labels, num_milestones]
    finally:
        # Always release the browser, including when a page load or wait raises.
        driver.quit()

In [8]:
####### IDEA: Merge Two Pandas DataFrames using Outer Join Merge ###########
# Potential Scrapes:
# ../issues => Open/closed issues, # of labels, # of milestones
# ../pulls
# ../actions => # of workflow runs
# ../pulse => Active pr and Active issues
# ../network/dependencies
# Sponsored
# Watches
# 
# Ones in Bash Script:
# Commits
# Tags/Releases
# Branches
# Languages
# Contributors

def scrape_page(project_url):
    """Scrape repository-level features from several pages of one repository.

    Pages visited, in order, with a single Chrome session:

    1. the repository home page: watcher count and whether a link to
       ``/sponsors/<owner>`` is present;
    2. ``https://github.com/sponsors/<owner>`` (only when that link exists):
       current and past sponsor counts;
    3. ``<project_url>/actions``: number of workflow runs;
    4. ``<project_url>/network/dependents``: number of dependents.

    Args:
        project_url: Repository URL. Everything after its first 19 characters
            (the length of ``https://github.com/``) is used as ``OWNER/REPO``;
            the owner is its first path segment.

    Returns:
        ``[project_url, sponsored, current_sponsors, past_sponsors,
        num_watches, workflow, dependents]``. ``sponsored`` is ``"Yes"`` or
        ``"No"``. Sponsor counts are the integer ``0`` when there is nothing
        to report and string tokens when read from the page. Any value whose
        element is missing or empty is ``None``.

    Raises:
        Any exception raised by Selenium while loading or waiting for a page
        propagates to the caller; the browser is closed first.
    """
    # Get the OWNER/REPO and the owner
    project = project_url[19:]
    creator = project.split('/')[0]

    def token_at(tag, index):
        """Return the ``index``-th whitespace-separated token of ``tag.text`` or None."""
        if tag is None:
            return None
        tokens = tag.text.split()
        return tokens[index] if len(tokens) > index else None

    def load(driver, url):
        """Load ``url``, wait for <body> to be visible, and return the parsed page."""
        driver.get(url)
        WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located((By.TAG_NAME, 'body'))
        )
        return BeautifulSoup(driver.page_source, "html.parser")

    # Set up Web Driver
    driver = webdriver.Chrome(options=chrome_options)
    try:
        # Get number of watches and sponsors
        soup = load(driver, project_url)

        watchers_link = soup.find(href=f"/{project}/watchers")
        watchers_count = watchers_link.find("strong") if watchers_link is not None else None
        num_watches = watchers_count.text if watchers_count is not None else None

        sponsored = "Yes" if soup.find(href=f"/sponsors/{creator}") is not None else "No"

        current_sponsors = 0
        past_sponsors = 0
        if sponsored == "Yes":
            soup = load(driver, f"https://github.com/sponsors/{creator}")

            current_heading = soup.find(lambda tag: tag.name == 'h4' and 'Current sponsors' in tag.get_text())
            past_heading = soup.find(lambda tag: tag.name == 'h4' and 'Past sponsors' in tag.get_text())

            if current_heading is None:
                # No "Current sponsors" heading: fall back to the summary
                # paragraph and take its first token.
                summary = soup.find('p', class_='f3-light color-fg-muted mb-3')
                current_sponsors = token_at(summary, 0)
                past_sponsors = 0
            else:
                # The count is the third token of each heading's text.
                current_sponsors = token_at(current_heading, 2)
                # A page can have current sponsors but no "Past sponsors" heading.
                past_sponsors = token_at(past_heading, 2) if past_heading is not None else 0

        print(f"{project_url}: {current_sponsors} Current sponsors, {past_sponsors} Past sponsors")

        # Number of Workflow Runs
        soup = load(driver, project_url + "/actions")
        workflow = token_at(
            soup.find(lambda tag: tag.name == 'strong' and 'workflow runs' in tag.get_text()),
            0,
        )

        # Number of Dependent Repos
        soup = load(driver, project_url + "/network/dependents")
        dependents = token_at(soup.find('a', class_='btn-link selected'), 0)

        print(f"{project_url} {dependents} dependents, {workflow} workflows")

        return [
            project_url,
            sponsored,
            current_sponsors,
            past_sponsors,
            num_watches,
            workflow,
            dependents,
        ]
    finally:
        # Clean close of the Web Session and window(s), also on errors.
        driver.quit()

In [9]:
def _scrape_all(scraper, urls=None, max_workers=10):
    """Apply ``scraper`` to every URL on a thread pool.

    Args:
        scraper: Callable taking one project URL and returning one result row.
        urls: Iterable of project URLs. ``None`` (the default) means the
            module-level ``project_list``, looked up at call time.
        max_workers: Size of the thread pool.

    Returns:
        A list with one result per URL, in the same order as ``urls``.

    Raises:
        The first exception raised by any ``scraper`` call is re-raised while
        the results are collected.
    """
    if urls is None:
        urls = project_list
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        return list(executor.map(scraper, urls))


def prs(urls=None, max_workers=10):
    """Run ``scrape_prs`` over ``urls`` (default: ``project_list``) and return the rows."""
    return _scrape_all(scrape_prs, urls, max_workers)


def owner(urls=None, max_workers=10):
    """Run ``scrape_owner`` over ``urls`` (default: ``project_list``) and return the rows."""
    return _scrape_all(scrape_owner, urls, max_workers)


def insight(urls=None, max_workers=10):
    """Run ``scrape_insight`` over ``urls`` (default: ``project_list``) and return the rows."""
    return _scrape_all(scrape_insight, urls, max_workers)


def issues(urls=None, max_workers=10):
    """Run ``scrape_issues`` over ``urls`` (default: ``project_list``) and return the rows."""
    return _scrape_all(scrape_issues, urls, max_workers)


def page(urls=None, max_workers=10):
    """Run ``scrape_page`` over ``urls`` (default: ``project_list``) and return the rows."""
    return _scrape_all(scrape_page, urls, max_workers)

In [10]:
# Pause for one second before the scraping pool in the next cell is started.
# NOTE(review): the reason for this delay is not evident from the notebook —
# confirm it is still needed.
time.sleep(1)

In [11]:
# Submit the five scraping jobs to a 3-process pool, then block on each
# result in submission order. The results must be collected inside the
# ``with`` block, before the pool is torn down on exit.
with multiprocessing.Pool(processes=3) as pool:
    pending = [pool.apply_async(job) for job in (prs, owner, insight, issues, page)]
    (
        project_prs,
        project_owner,
        project_insight,
        project_issues,
        projects,
    ) = [async_result.get() for async_result in pending]

https://github.com/public-apis/public-apis owner status: None
https://github.com/public-apis/public-apis followers: None
https://github.com/public-apis/public-apis members: None
https://github.com/public-apis/public-apis repositories: 1
https://github.com/sindresorhus/awesome: 2 Active pull requests, 1 Active issues
https://github.com/donnemartin/system-design-primer: 0 Active pull requests, 0 Active issues
https://github.com/996icu/996.ICU: 3 open_prs and 1,970 close_prshttps://github.com/996icu/996.ICU: 0 Active pull requests, 0 Active issueshttps://github.com/codecrafters-io/build-your-own-x: 2 Active pull requests, 1 Active issueshttps://github.com/EbookFoundation/free-programming-books: 5 Active pull requests, 1 Active issues


https://github.com/jwasham/coding-interview-university: 0 Active pull requests, 0 Active issueshttps://github.com/public-apis/public-apis: 6 Active pull requests, 17 Active issues


https://github.com/kamranahmedse/developer-roadmap: 21 Active pull requests

# Re-scrape projects recorded in ``error_projects`` and append successful rows
# to ``projects``. A result whose first element is None is treated as a
# failure, and its second element is queued again for the next round.
# NOTE(review): ``error_projects`` is not defined in any visible cell, and the
# visible ``scrape_page`` never returns a ``[None, url]`` row — confirm this
# cell still matches the scraper before running it.
MAX_RETRY_ROUNDS = 10  # upper bound so a permanently failing URL cannot loop forever
retry_round = 0
while len(error_projects) != 0 and retry_round < MAX_RETRY_ROUNDS:
    retry_round += 1
    print(len(error_projects))
    with ThreadPoolExecutor(max_workers=10) as p:
        # map() takes its inputs from the current list up front, so the name
        # can be rebound to a fresh list for the next round.
        features = p.map(scrape_page, error_projects)
        error_projects = []
        for f in features:
            if f[0] is not None:
                projects.append(f)
            else:
                error_projects.append(f[1])

if error_projects:
    print(f"Giving up on {len(error_projects)} project(s) after {MAX_RETRY_ROUNDS} retry rounds")

In [12]:
# Column names for the rows produced by scrape_page, in the order that
# function appends its values.
project_columns = [
    'Project URL',
    'Sponsored',
    'Current Sponsors',
    'Past Sponsors',
    'Number of Watches',
    'Number of Workflow Runs',
    'Number of Dependents',
]
projects_df = pd.DataFrame(data=projects, columns=project_columns)

In [13]:
# One frame per scraper; every frame carries the 'Project URL' key column.
insight_df = pd.DataFrame(
    project_insight,
    columns=['Project URL', 'Active Pull Requests', 'Active Issues'],
)
prs_df = pd.DataFrame(
    project_prs,
    columns=['Project URL', 'Open Pull Requests', 'Closed Pull Requests'],
)
owner_df = pd.DataFrame(
    project_owner,
    columns=['Project URL', 'Verified Owner', 'Followers of Owner', 'Members of Owner', 'Repos of Owner'],
)
issues_df = pd.DataFrame(
    project_issues,
    columns=['Project URL', 'Open Issues', 'Closed Issues', 'Number of Labels', 'Number of Milestones'],
)

# Outer-join the frames one after another on the URL so that a project
# missing from any single scraper's output is still kept.
merged = insight_df
for feature_frame in (prs_df, owner_df, issues_df, projects_df):
    merged = merged.merge(feature_frame, on='Project URL', how='outer')

In [14]:
# Show the merged feature table as this cell's output.
merged

Unnamed: 0,Project URL,Active Pull Requests,Active Issues,Open Pull Requests,Closed Pull Requests,Verified Owner,Followers of Owner,Members of Owner,Repos of Owner,Open Issues,Closed Issues,Number of Labels,Number of Milestones,Sponsored,Current Sponsors,Past Sponsors,Number of Watches,Number of Workflow Runs,Number of Dependents
0,https://github.com/521xueweihan/HelloGitHub,0,9,4,36,,8.4k,,,105,2497,29,0,Yes,10,0,2.9k,,19
1,https://github.com/996icu/996.ICU,0,0,3,1970,,8.2k,,,,,14,3,No,0,0,4.2k,,25
2,https://github.com/AUTOMATIC1111/stable-diffus...,38,42,14,2820,,12.9k,,,1955,5237,17,0,No,0,0,1k,9533,19
3,https://github.com/Chalarangelo/30-seconds-of-...,0,0,0,1604,,2.6k,,,0,294,29,0,No,0,0,2.6k,1187,108
4,https://github.com/CyC2018/CS-Notes,0,0,63,548,,14.4k,,,130,446,15,0,No,0,0,5.3k,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,https://github.com/vercel/next.js,117,97,412,20374,Verified,13.9k,67,142,2581,17187,76,1,No,0,0,1.4k,150535,2503814
96,https://github.com/vinta/awesome-python,1,0,399,1490,,7.4k,,,,,8,0,No,0,0,6k,,21
97,https://github.com/vuejs/vue,0,2,247,2265,Verified,14.5k,65,123,356,9659,36,0,No,0,0,5.9k,68,0
98,https://github.com/yangshun/tech-interview-han...,0,0,7,526,,10k,,,28,60,10,0,Yes,2,15,2k,26,18


# Append the rows of projects_df below the existing content of "Sheet1" in
# the workbook; when the workbook does not exist yet, create it (with a
# header row) instead.
# NOTE(review): only projects_df is written here, not the merged table —
# confirm that this is intended.
output_path = "project_HTMLfeatures.xlsx"
try:
    with pd.ExcelWriter(
        output_path,
        mode="a",
        engine="openpyxl",
        if_sheet_exists="overlay",
    ) as writer:
        # max_row of the existing sheet is used as the 0-based start row,
        # i.e. the first row after the current content.
        first_free_row = writer.sheets["Sheet1"].max_row
        projects_df.to_excel(
            writer,
            sheet_name="Sheet1",
            startrow=first_free_row,
            index=False,
            header=False,
        )
except FileNotFoundError:
    projects_df.to_excel(output_path, index=False)