In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
import pandas as pd
import time
from bs4 import BeautifulSoup
import threading
from concurrent.futures import ThreadPoolExecutor

In [2]:
# URLs of projects whose scrape produced no data; the scraping cells append to
# this list and the retry cell works through it.
error_projects = list()

In [3]:
# Potential Scrapes:
# ../issues => Open/closed issues, # of labels, # of milestones
# ../pulls
# ../actions => # of workflow runs
# ../pulse => Active pr and Active issues
# ../network/dependencies
# Sponsored
# Watches
# 
# Ones in Bash Script:
# Commits
# Tags/Releases
# Branches
# Languages
# Contributors

# Headless mode for Chrome.
# Each flag is listed exactly once (the original added --disable-gpu and
# --disable-extensions twice); order follows first appearance.
CHROME_ARGUMENTS = (
    "--disable-extensions",
    "--disable-gpu",
    "--headless",
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "--disable-renderer-backgrounding",
    "--disable-background-timer-throttling",
    "--disable-backgrounding-occluded-windows",
    "--disable-client-side-phishing-detection",
    "--disable-crash-reporter",
    "--disable-oopr-debug-crash-dump",
    "--no-crash-upload",
    "--disable-low-res-tiling",
    "--log-level=3",
    "--silent",
    "--blink-settings=imagesEnabled=false",
)

# Shared option set handed to every webdriver.Chrome created by scrape_page.
chrome_options = Options()
for chrome_argument in CHROME_ARGUMENTS:
    chrome_options.add_argument(chrome_argument)

def _load_soup(driver, url, timeout=10):
    """Open ``url`` in ``driver`` and return the page parsed by BeautifulSoup.

    Waits up to ``timeout`` seconds for the ``<body>`` element to become
    visible before reading ``driver.page_source``.
    """
    driver.get(url)
    WebDriverWait(driver, timeout).until(
        EC.visibility_of_element_located((By.TAG_NAME, 'body'))
    )
    return BeautifulSoup(driver.page_source, "html.parser")


def _first_token(tag):
    """Return the first whitespace-separated token of ``tag.text``, or None."""
    if tag is None:
        return None
    tokens = tag.text.split()
    return tokens[0] if tokens else None


def _child_text(tag, child_name):
    """Return the text of the first ``child_name`` element under ``tag``, or None."""
    if tag is None:
        return None
    child = tag.find(child_name)
    return None if child is None else child.text


def scrape_page(project_url):
    """Scrape repository-level features for one GitHub project with headless Chrome.

    Parameters
    ----------
    project_url : str
        Repository URL of the form ``https://github.com/OWNER/REPO``.
        A trailing slash is tolerated.

    Returns
    -------
    list
        On success, ten items in this order: project URL (as passed in),
        number of watches, sponsored ("Yes"/"No"), open issues, closed issues,
        number of labels, number of milestones, number of workflow runs,
        number of dependents, verified-owner label.  Any value whose element
        is not present on the page is ``None``.

        If a page wait times out, ``[None, project_url]`` is returned instead
        so that callers can queue the URL for another attempt.

    Notes
    -----
    The browser is always closed, even when scraping raises.  Pull-request
    counts and owner follower counts are not collected, so the ``/pulls``
    page is not requested.
    """
    print(project_url)

    # Derive OWNER/REPO from the URL without relying on a fixed prefix length.
    base_url = project_url.rstrip("/")
    project = base_url.split("github.com/", 1)[-1]
    print(project)
    creator = project.split('/')[0]

    project_features = [project_url]

    # One browser per call; the finally clause guarantees it is shut down.
    driver = webdriver.Chrome(options=chrome_options)
    try:
        # Repository landing page: watches and sponsor button
        soup = _load_soup(driver, base_url)
        num_watches = _child_text(soup.find(href=f"/{project}/watchers"), "strong")
        sponsored = "Yes" if soup.find(href=f"/sponsors/{creator}") is not None else "No"
        project_features.append(num_watches)
        project_features.append(sponsored)

        # Issues page: open/closed counts, labels, milestones
        soup = _load_soup(driver, base_url + "/issues")
        open_issues = _first_token(soup.find(href=f"/{project}/issues?q=is%3Aopen+is%3Aissue"))
        closed_issues = _first_token(soup.find(href=f"/{project}/issues?q=is%3Aissue+is%3Aclosed"))
        num_labels = _child_text(soup.find(href=f"/{project}/labels"), "span")
        num_milestones = _child_text(soup.find(href=f"/{project}/milestones"), "span")
        project_features.append(open_issues)
        project_features.append(closed_issues)
        project_features.append(num_labels)
        project_features.append(num_milestones)

        print(f"Project:{project_url}, Open issues: {open_issues}, Closed issues: {closed_issues}")

        # Actions page: number of workflow runs
        soup = _load_soup(driver, base_url + "/actions")
        workflow = _first_token(
            soup.find(lambda tag: tag.name == 'strong' and 'workflow runs' in tag.get_text())
        )
        print(f"workflow: {workflow}")
        project_features.append(workflow)

        # Dependents page: number of dependent repositories
        soup = _load_soup(driver, base_url + "/network/dependents")
        dependents = _first_token(soup.find('a', class_='btn-link selected'))
        print(f"dependents: {dependents}")
        project_features.append(dependents)

        # Owner profile page: "Verified" label
        owner_url = f"https://github.com/{creator}"
        print(owner_url)
        soup = _load_soup(driver, owner_url)
        verified = _first_token(soup.find('summary', {'title': 'Label: Verified'}))
        print(verified)
        project_features.append(verified)

        return project_features
    except TimeoutException:
        # Signal failure in the shape the calling cells test for
        # (first item None, second item the URL to retry).
        return [None, project_url]
    finally:
        # Clean close the Web Session and window(s)
        driver.quit()

In [4]:
# Read the project spreadsheet into a DataFrame
project_list = pd.read_excel(io='project_5000Up.xlsx')

In [5]:
# Reduce the loaded sheet to a plain list holding the first 100 project URLs
project_list = project_list['Project URL'].head(100).tolist()
#project_list = ["https://github.com/spring-attic/spring-mvc-showcase","https://github.com/twbs/bootstrap", "https://github.com/freeCodeCamp/freeCodeCamp", "https://github.com/geekyutao/Inpaint-Anything", "https://github.com/raspberrypi/firmware", "https://github.com/negomi/react-burger-menu", "https://github.com/yyhsong/iDataV", "https://github.com/yipianfengye/android-zxingLibrary", "https://github.com/nikic/FastRoute", "https://github.com/vercel/platforms", "https://github.com/thinkingjimmy/Learning-Prompt"]
#project_list = ["https://github.com/spring-attic/spring-mvc-showcase","https://github.com/twbs/bootstrap"]




In [6]:
# Scrape every URL in project_list concurrently, one batch at a time.
BATCH_SIZE = 10           # URLs submitted per batch; also the number of worker threads
BATCH_PAUSE_SECONDS = 2   # pause between batches

projects = []
with ThreadPoolExecutor(max_workers=BATCH_SIZE) as pool:
    # Stepping with range() also covers a final batch shorter than BATCH_SIZE,
    # which a `high <= len(project_list)` loop condition would skip.
    for low in range(0, len(project_list), BATCH_SIZE):
        high = min(low + BATCH_SIZE, len(project_list))
        print(high)
        for features in pool.map(scrape_page, project_list[low:high]):
            if features[0] is not None:
                projects.append(features)
            else:
                # Failure results carry the URL in the second position.
                error_projects.append(features[1])
        time.sleep(BATCH_PAUSE_SECONDS)

https://github.com/freeCodeCamp/freeCodeCamphttps://github.com/EbookFoundation/free-programming-books
EbookFoundation/free-programming-books

freeCodeCamp/freeCodeCamp
https://github.com/sindresorhus/awesome
sindresorhus/awesome
https://github.com/public-apis/public-apis
public-apis/public-apis
https://github.com/jwasham/coding-interview-university
jwasham/coding-interview-university
https://github.com/996icu/996.ICU
996icu/996.ICU
https://github.com/kamranahmedse/developer-roadmap
kamranahmedse/developer-roadmap
https://github.com/donnemartin/system-design-primer
donnemartin/system-design-primer
https://github.com/codecrafters-io/build-your-own-x
codecrafters-io/build-your-own-x
https://github.com/facebook/react
facebook/react
10
Project:https://github.com/996icu/996.ICU, Open issues: None, Closed issues: None
Project:https://github.com/EbookFoundation/free-programming-books, Open issues: 26, Closed issues: 1,039
Project:https://github.com/jwasham/coding-interview-university, Open iss

In [7]:
# Number of feature rows collected in `projects` so far
len(projects)

100

In [8]:
#print(len(repo_url))
#print(len(repo_watches))
#print(len(repo_sponsors))
#print(len(repo_open_issues))
#print(len(repo_closed_issues))
#print(len(repo_labels))
#print(len(repo_milestones))
#print(len(repo_open_prs))
#print(len(repo_closed_prs))

In [9]:
# Re-scrape the URLs recorded in error_projects.  The number of rounds is
# capped so a URL that keeps failing cannot keep this cell running forever;
# anything still failing afterwards stays in error_projects for inspection.
MAX_RETRY_ROUNDS = 5
retry_round = 0
while error_projects and retry_round < MAX_RETRY_ROUNDS:
    retry_round += 1
    print(len(error_projects))
    # Take the current failures and start a fresh list for this round's failures.
    retry_batch, error_projects = error_projects, []
    with ThreadPoolExecutor(max_workers=10) as p:
        for f in p.map(scrape_page, retry_batch):
            if f[0] is not None:
                projects.append(f)
            else:
                error_projects.append(f[1])

In [10]:
# URLs still recorded as failed after the retry cell (empty list means none)
error_projects

[]

In [11]:

# Column labels, one per value in each row of `projects`.
# Pull-request and owner-follower columns are currently not collected.
feature_columns = [
    'Project URL',
    'Number of Watches',
    'Sponsored',
    'Open Issues',
    'Closed Issues',
    'Number of Labels',
    'Number of Milestones',
    'Number of Workflow Runs',
    'Number of Dependents',
    'Verified Owner',
]

# Tabulate the scraped rows
projects_df = pd.DataFrame(data=projects, columns=feature_columns)


In [12]:
# Display the assembled feature table
projects_df

Unnamed: 0,Project URL,Number of Watches,Sponsored,Open Issues,Closed Issues,Number of Labels,Number of Milestones,Number of Workflow Runs,Number of Dependents,Verified Owner
0,https://github.com/freeCodeCamp/freeCodeCamp,8.5k,Yes,269,17431,44,4,132274,25,Verified
1,https://github.com/EbookFoundation/free-progra...,9.7k,Yes,26,1039,73,0,15468,21,
2,https://github.com/sindresorhus/awesome,7.6k,Yes,22,312,8,0,257,24,
3,https://github.com/public-apis/public-apis,4.1k,No,12,548,21,0,504,2,
4,https://github.com/jwasham/coding-interview-un...,8.6k,Yes,45,369,12,0,14,20,
...,...,...,...,...,...,...,...,...,...,...
95,https://github.com/florinpop17/app-ideas,1.8k,No,103,41,7,0,,19,
96,https://github.com/doocs/advanced-java,2.6k,No,3,141,13,0,931,0,
97,https://github.com/FortAwesome/Font-Awesome,1.4k,No,5461,13790,135,8,,716273,Verified
98,https://github.com/spring-projects/spring-boot,3.4k,No,,,,,5223,19,Verified


# Append the scraped rows beneath the existing contents of Sheet1; when the
# workbook does not exist yet, create it with a header row instead.
output_path = "project_HTMLfeatures.xlsx"
try:
    with pd.ExcelWriter(
        output_path,
        mode="a",
        engine="openpyxl",
        if_sheet_exists="overlay",
    ) as writer:
        # max_row is the last used row, so writing starts on the row after it
        next_row = writer.sheets["Sheet1"].max_row
        projects_df.to_excel(
            writer,
            sheet_name="Sheet1",
            startrow=next_row,
            index=False,
            header=False,
        )
except FileNotFoundError:
    projects_df.to_excel(output_path, index=False)