In [None]:
!pip install requests



Method for getting GitHub stats: language, watchers, forks and stars

In [None]:
import json
import os
import time
from pprint import pprint

import requests

# GitHub Personal Access Token (PAT). Authenticated requests are allowed 5000 requests per hour;
# unauthenticated requests are limited to 60 per hour.
# The token is read from the GITHUB_TOKEN environment variable so the secret is never saved
# inside the notebook. It falls back to "" (unauthenticated) when the variable is not set.
github_token = os.environ.get("GITHUB_TOKEN", "")

def get_github_stats(repo_data):
    """Fetch language, watcher, fork and star counts for one GitHub repository.

    Args:
        repo_data: A ``(project_name, repo_url)`` pair. The owner and repository
            names are taken from the last two path segments of ``repo_url``;
            a trailing slash or ``.git`` suffix is ignored.

    Returns:
        A tuple ``(project_name, repo_url, language, watchers, forks, stars)``.
        ``repo_url`` is returned exactly as passed in. The four stats are
        ``None`` when the request fails or the API response does not contain
        them (for example an error payload).
    """
    project_name, repo_url = repo_data

    # Normalise the URL so ".../owner/repo/" and ".../owner/repo.git" still parse.
    cleaned_url = repo_url.strip().rstrip('/')
    if cleaned_url.endswith('.git'):
        cleaned_url = cleaned_url[:-len('.git')]
    url_parts = cleaned_url.split('/')
    owner_name = url_parts[-2]
    repo_name = url_parts[-1]
    api_url = f"https://api.github.com/repos/{owner_name}/{repo_name}"

    # Only authenticate when a token is configured; an empty token would send
    # the meaningless credential "token ".
    headers = {'Authorization': f'token {github_token}'} if github_token else {}

    try:
        # The timeout keeps one stalled connection from blocking a worker forever.
        response = requests.get(api_url, headers=headers, timeout=30)
        response_json = response.json()
    except (requests.RequestException, ValueError):
        # Network failure or non-JSON body: report the repo with empty stats,
        # the same shape an API error payload produces.
        response_json = {}
    if not isinstance(response_json, dict):
        response_json = {}

    # "subscribers_count" is the field this notebook reports as Watchers.
    return (
        project_name,
        repo_url,
        response_json.get("language"),
        response_json.get("subscribers_count"),
        response_json.get("forks_count"),
        response_json.get("stargazers_count"),
    )

In [None]:
# Smoke test: fetch the stats of a single repository before running the full crawl.
print(get_github_stats(("express", "https://github.com/expressjs/express")))

('express', 'https://github.com/expressjs/express', 'JavaScript', 1767, 9277, 54908)


Method for getting releases for projects

In [None]:
def get_github_releases(repo_data):
    """Fetch every release of one GitHub repository.

    Args:
        repo_data: A ``(project_name, repo_url)`` pair. The owner and repository
            names are taken from the last two path segments of ``repo_url``;
            a trailing slash or ``.git`` suffix is ignored.

    Returns:
        A list of ``(project_name, repo_url, tag_name, html_url)`` tuples, one
        per release. The list is empty when the repository has no releases or
        the first request fails; if a later page fails, the releases collected
        so far are returned. Failures are reported with a printed message.
    """
    project_name, repo_url = repo_data

    # Normalise the URL so ".../owner/repo/" and ".../owner/repo.git" still parse.
    cleaned_url = repo_url.strip().rstrip('/')
    if cleaned_url.endswith('.git'):
        cleaned_url = cleaned_url[:-len('.git')]
    url_parts = cleaned_url.split('/')
    owner_name = url_parts[-2]
    repo_name = url_parts[-1]
    api_url = f"https://api.github.com/repos/{owner_name}/{repo_name}/releases"

    # Only authenticate when a token is configured; an empty token would send
    # the meaningless credential "token ".
    headers = {'Authorization': f'token {github_token}'} if github_token else {}

    releases = []
    # The API is paginated: ask for the largest page size and keep following the
    # "next" link, otherwise only the first page of releases would be collected.
    next_url = f"{api_url}?per_page=100"
    while next_url:
        try:
            response = requests.get(next_url, headers=headers, timeout=30)
            page = response.json()
        except (requests.RequestException, ValueError) as error:
            print(f"Skipping {next_url}: {error}")
            break
        if not isinstance(page, list):
            # Error payloads (not found, rate limit, ...) are JSON objects, not
            # lists; iterating one would yield its string keys and crash below.
            print(f"Skipping {next_url}: unexpected response {page!r}")
            break
        for release in page:
            releases.append((project_name, repo_url, release["tag_name"], release["html_url"]))
        next_url = response.links.get("next", {}).get("url")

    return releases

Mounting datasets

In [None]:
# Mount Google Drive at /content/drive; the dataset paths used below live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Reading all NPM projects and extracting top 5000 sorted by dependent repositories count

In [None]:
import pandas as pd

# Path of the extracted NPM projects CSV (libraries.io dump) on the mounted shared drive.
filename = "/content/drive/Shareddrives/ECS260-group5/dataset/libraries.io/(NPM_extracted)projects-1.6.0-2020-01-12.csv"
# Preview: read only the first 30 rows to inspect the columns before loading the whole file.
test = pd.read_csv(filename, nrows=30)
display(test)

In [None]:
# Read every row of `filename` into a dataframe.
# For a quick trial run, limit the read instead: pd.read_csv(filename, nrows=50)
df = pd.read_csv(filename, low_memory=False)

In [None]:
# Keep only the rows whose "Repository URL" is present (not NaN).
projects_with_repo_url = df.loc[df["Repository URL"].notnull()]

# Of those, keep only the projects whose repository is hosted on GitHub.
projects_with_valid_repo_url = projects_with_repo_url.loc[
    projects_with_repo_url["Repository URL"].str.startswith("https://github.com/")
]

# Rank by "Dependent Repositories Count" (highest first) and keep the top 5000.
projects_with_highest_dependents = (
    projects_with_valid_repo_url
    .sort_values(by="Dependent Repositories Count", ascending=False)
    .iloc[:5000]
)
display(projects_with_highest_dependents)

# Project names and repository URLs of the selected projects, in ranking order.
project_name_list = projects_with_highest_dependents["Name"]
repo_list = projects_with_highest_dependents["Repository URL"]
display(repo_list)

In [None]:
import concurrent.futures

dataset = []

# Fetch the GitHub stats of every (name, url) pair on a thread pool.
# executor.map yields the results in the same order as the input pairs.
with concurrent.futures.ThreadPoolExecutor() as executor:
    results = executor.map(get_github_stats, zip(project_name_list, repo_list))
    dataset = list(results)

display(dataset)

In [None]:
import numpy as np

column_names = ['Name', 'Repository URL', 'Language', 'Watchers', 'Forks', 'Stars']
# dtype=object keeps every value's own type. Without it numpy converts the counts to
# strings whenever no None is present, so the column types depended on whether any
# request had failed.
data = np.array(dataset, dtype=object)

# Build the frame from the list of tuples itself: an empty `dataset` then yields an
# empty six-column frame instead of a shape-mismatch error.
dataframe = pd.DataFrame(dataset, columns=column_names, dtype=object)
display(dataframe)

In [None]:
# Keep the rows for which at least one stat came back; rows where Language, Watchers,
# Forks and Stars are all missing are dropped.
# .assign builds a new frame, so the numeric conversion no longer writes into a
# filtered slice of `dataframe` (which triggered SettingWithCopyWarning).
# errors='coerce' turns any value that cannot be parsed as a number into NaN.
dataframe_without_none = dataframe[
    dataframe["Language"].notnull()
    | dataframe["Watchers"].notnull()
    | dataframe["Forks"].notnull()
    | dataframe["Stars"].notnull()
].assign(
    Watchers=lambda frame: pd.to_numeric(frame["Watchers"], errors='coerce'),
    Stars=lambda frame: pd.to_numeric(frame["Stars"], errors='coerce'),
    Forks=lambda frame: pd.to_numeric(frame["Forks"], errors='coerce'),
)
display(dataframe_without_none)

# Persist the cleaned stats; later cells read this file back.
dataframe_without_none.to_csv('/content/drive/Shareddrives/ECS260-group5/dataset/npm/(GitHub_Stats)(NPM_extracted)projects-1.6.0-2020-01-12.csv')

The following sections deal with extracting and plotting data for Watchers, Stars and Forks

In [None]:
# Watchers per repository, ordered from the most to the least watched.
watchers = dataframe_without_none.loc[:, ["Name", "Repository URL", "Watchers"]]
sorted_watchers = watchers.sort_values("Watchers", ascending=False)
display(sorted_watchers)

# Save the ranking to the shared drive.
sorted_watchers.to_csv('/content/drive/Shareddrives/ECS260-group5/dataset/npm/github_watchers_count.csv')

In [None]:
import matplotlib.pyplot as plt

# Rank plot: x is the repository's position in the watcher ranking, y its watcher count.
fig, ax = plt.subplots()
ax.plot(list(range(len(sorted_watchers))), sorted_watchers["Watchers"])
ax.set_xlabel('Repo rank based on number of watchers')
ax.set_ylabel('Number of watchers')
ax.set_title('Watchers for the top ~5000 repos')
plt.show()
plt.close('all')

In [None]:
# Stars per repository, ordered from the most to the least starred.
stars = dataframe_without_none.loc[:, ["Name", "Repository URL", "Stars"]]
sorted_stars = stars.sort_values("Stars", ascending=False)
display(sorted_stars)

# Save the ranking to the shared drive.
sorted_stars.to_csv('/content/drive/Shareddrives/ECS260-group5/dataset/npm/github_stars_count.csv')

In [None]:
import matplotlib.pyplot as plt

# Rank plot: x is the repository's position in the star ranking, y its star count.
fig, ax = plt.subplots()
ax.plot(list(range(len(sorted_stars))), sorted_stars["Stars"])
ax.set_xlabel('Repo rank based on number of stars')
ax.set_ylabel('Number of stars')
ax.set_title('Stars for the top ~5000 repos')
plt.show()
plt.close('all')

In [None]:
# Forks per repository, ordered from the most to the least forked.
forks = dataframe_without_none.loc[:, ["Name", "Repository URL", "Forks"]]
sorted_forks = forks.sort_values("Forks", ascending=False)
display(sorted_forks)
# Save the ranking to the shared drive.
sorted_forks.to_csv('/content/drive/Shareddrives/ECS260-group5/dataset/npm/github_forks_count.csv')

In [None]:
import matplotlib.pyplot as plt

# Rank plot: x is the repository's position in the fork ranking, y its fork count.
fig, ax = plt.subplots()
ax.plot(list(range(len(sorted_forks))), sorted_forks["Forks"])
ax.set_xlabel('Repo rank based on number of forks')
ax.set_ylabel('Number of forks')
ax.set_title('Forks for the top ~5000 repos')
plt.show()
plt.close('all')

The following 2 sections deal with getting release versions for the top 5000 packages

In [None]:
# Load the GitHub stats saved earlier for the top-5k projects.
github_stats_filename = "/content/drive/Shareddrives/ECS260-group5/dataset/npm/(GitHub_Stats)(NPM_extracted)projects-1.6.0-2020-01-12.csv"
top_5k_projects = pd.read_csv(github_stats_filename, low_memory=False)

# Pair every project name with its repository URL.
project_name_list = top_5k_projects["Name"]
repo_list = top_5k_projects["Repository URL"]
zipped_list = zip(project_name_list, repo_list)

# Fetch the releases of all projects on a thread pool and flatten the
# per-project lists into a single list of release tuples.
import concurrent.futures
release_dataset = []
with concurrent.futures.ThreadPoolExecutor() as executor:
    results = executor.map(get_github_releases, zipped_list)
    for result in results:
        release_dataset.extend(result)

In [None]:
# write to csv Name, Repository URL, Release Name, Release URL
import numpy as np

release_column_names = ['Name', 'Repository URL', 'Release Name', 'Release URL']
# Array view of the collected release tuples; dtype=object also accepts an empty list.
release_data = np.array(release_dataset, dtype=object)

# Build the frame from the list of tuples itself: when no release was collected this
# yields an empty four-column frame, whereas an empty numpy array made the
# constructor raise a shape-mismatch ValueError.
release_dataframe = pd.DataFrame(release_dataset, columns=release_column_names)
release_dataframe.to_csv('/content/drive/Shareddrives/ECS260-group5/dataset/npm/github_releases.csv')

Calculating correlation coefficients for Watchers, Stars and Forks

In [None]:
import numpy as np

# Reload the saved GitHub stats of the top-5k projects from the shared drive.
github_stats_filename = "/content/drive/Shareddrives/ECS260-group5/dataset/npm/(GitHub_Stats)(NPM_extracted)projects-1.6.0-2020-01-12.csv"
top_5k_projects_df = pd.read_csv(github_stats_filename, low_memory=False)

# Only the three popularity metrics take part in the correlation analysis.
correlation_data = top_5k_projects_df.loc[:, ['Watchers', 'Forks', 'Stars']]

In [None]:
# Pearson (standard) correlation coefficient between Watchers, Forks and Stars
correlation_data.corr(method='pearson')

Unnamed: 0,Watchers,Forks,Stars
Watchers,1.0,0.764951,0.914642
Forks,0.764951,1.0,0.788725
Stars,0.914642,0.788725,1.0


In [None]:
# Spearman rank correlation between Watchers, Forks and Stars
correlation_data.corr(method='spearman')

Unnamed: 0,Watchers,Forks,Stars
Watchers,1.0,0.920165,0.908433
Forks,0.920165,1.0,0.962085
Stars,0.908433,0.962085,1.0


In [None]:
# Kendall Tau correlation coefficient between Watchers, Forks and Stars
# NOTE(review): the saved output of this cell reports a Watchers/Stars tau of exactly 1.0,
# while the Spearman cell reports 0.908 for the same pair. The output looks stale (possibly
# from a run on a tiny sample); re-run to confirm.
correlation_data.corr(method='kendall')

Unnamed: 0,Watchers,Forks,Stars
Watchers,1.0,0.834567,1.0
Forks,0.834567,1.0,0.834567
Stars,1.0,0.834567,1.0


Getting tags for the GitHub projects that are missing release data

In [None]:
def get_github_tags(repo_data):
    """Fetch every tag of one GitHub repository.

    Args:
        repo_data: A ``(project_name, repo_url)`` pair. The owner and repository
            names are taken from the last two path segments of ``repo_url``;
            a trailing slash or ``.git`` suffix is ignored.

    Returns:
        A list of ``(project_name, repo_url, tag_name, commit_url)`` tuples, one
        per tag. The list is empty when the repository has no tags or the first
        request fails; if a later page fails, the tags collected so far are
        returned. Failures are reported with a printed message.
    """
    project_name, repo_url = repo_data

    # Normalise the URL so ".../owner/repo/" and ".../owner/repo.git" still parse.
    cleaned_url = repo_url.strip().rstrip('/')
    if cleaned_url.endswith('.git'):
        cleaned_url = cleaned_url[:-len('.git')]
    url_parts = cleaned_url.split('/')
    owner_name = url_parts[-2]
    repo_name = url_parts[-1]
    api_url = f"https://api.github.com/repos/{owner_name}/{repo_name}/tags"

    # Only authenticate when a token is configured; an empty token would send
    # the meaningless credential "token ".
    headers = {'Authorization': f'token {github_token}'} if github_token else {}

    tags = []
    # The API is paginated: ask for the largest page size and keep following the
    # "next" link, otherwise only the first page of tags would be collected.
    next_url = f"{api_url}?per_page=100"
    while next_url:
        try:
            response = requests.get(next_url, headers=headers, timeout=30)
            page = response.json()
        except (requests.RequestException, ValueError) as error:
            print(f"Skipping {next_url}: {error}")
            break
        if not isinstance(page, list):
            # Error payloads (not found, rate limit, ...) are JSON objects, not
            # lists; iterating one would yield its string keys and crash below.
            print(f"Skipping {next_url}: unexpected response {page!r}")
            break
        for tag in page:
            tags.append((project_name, repo_url, tag["name"], tag["commit"]["url"]))
        next_url = response.links.get("next", {}).get("url")

    return tags

In [None]:
import pandas as pd

# Load the projects listed in the missing-packages CSV.
missing_release_stats_filename = "/content/drive/Shareddrives/ECS260-group5/dataset/447_missing_package.csv"
missing_release_projects = pd.read_csv(missing_release_stats_filename, low_memory=False)

# Pair every project name with its repository URL
# (this CSV names the column "Repository.URL", with a dot).
missing_project_name_list = missing_release_projects["Name"]
missing_repo_list = missing_release_projects["Repository.URL"]
missing_zipped_list = zip(missing_project_name_list, missing_repo_list)

# Fetch the tags of all these projects with get_github_tags on a thread pool and
# flatten the per-project lists into a single list of tuples.
import concurrent.futures
missing_release_dataset = []
with concurrent.futures.ThreadPoolExecutor() as executor:
    missing_results = executor.map(get_github_tags, missing_zipped_list)
    for result in missing_results:
        missing_release_dataset.extend(result)

In [None]:
# Show every collected tuple.
# NOTE(review): this prints the whole list; consider displaying only a slice if it is large.
display(missing_release_dataset)

In [None]:
# write to csv Name, Repository URL, Release Name, Release URL
import numpy as np

release_column_names = ['Name', 'Repository URL', 'Release Name', 'Release URL']
# Array view of the collected tuples; dtype=object also accepts an empty list.
missing_release_data = np.array(missing_release_dataset, dtype=object)

# Build the frame from the list of tuples itself: when nothing was collected this
# yields an empty four-column frame, whereas an empty numpy array made the
# constructor raise a shape-mismatch ValueError.
missing_release_dataframe = pd.DataFrame(missing_release_dataset, columns=release_column_names)
missing_release_dataframe.to_csv('/content/drive/Shareddrives/ECS260-group5/dataset/missing_github_releases.csv')