In [1]:
#Import dependencies
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import re

In [2]:
"""
A module for obtaining repo readme and language data from the github API.
Before using this module, read through it, and follow the instructions marked
TODO.
After doing so, run it like this:
    python acquire.py
To create the `data.json` file that contains the data.
"""
import os
import json
from typing import Dict, List, Optional, Union, cast
import requests

from env import github_token, github_username

# TODO: Make a github personal access token.
#     1. Go here and generate a personal access token https://github.com/settings/tokens
#        You do _not_ need select any scopes, i.e. leave all the checkboxes unchecked
#     2. Save it in your env.py file under the variable `github_token`
# TODO: Add your github username to your env.py file under the variable `github_username`
# TODO: Add more repositories to the `REPOS` list below.

REPOS = []

# REPOS = [
#     "gocodeup/codeup-setup-script",
#     "gocodeup/movies-application",
#     "torvalds/linux",
#     ....
# ]



headers = {"Authorization": f"token {github_token}", "User-Agent": github_username}

if headers["Authorization"] == "token " or headers["User-Agent"] == "":
    raise Exception(
        "You need to follow the instructions marked TODO in this script before trying to use it"
    )


def github_api_request(url: str) -> Union[List, Dict]:
    response = requests.get(url, headers=headers)
    response_data = response.json()
    if response.status_code != 200:
        raise Exception(
            f"Error response from github api! status code: {response.status_code}, "
            f"response: {json.dumps(response_data)}"
        )
    return response_data

In [3]:
#partial
def get_repo_language(repo: str) -> str:
    url = f"https://api.github.com/repos/{repo}"
    repo_info = github_api_request(url)
    if type(repo_info) is dict:
        repo_info = cast(Dict, repo_info)
        return repo_info.get("language", None)
    raise Exception(
        f"Expecting a dictionary response from {url}, instead got {json.dumps(repo_info)}"
    )

#Partial
def get_repo_contents(repo: str) -> List[Dict[str, str]]:
    url = f"https://api.github.com/repos/{repo}/contents/"
    contents = github_api_request(url)
    if type(contents) is list:
        contents = cast(List, contents)
        return contents
    raise Exception(
        f"Expecting a list response from {url}, instead got {json.dumps(contents)}"
    )

# Partial
def get_readme_download_url(files: List[Dict[str, str]]) -> str:
    """
    Takes in a response from the github api that lists the files in a repo and
    returns the url that can be used to download the repo's README file.
    """
    for file in files:
        if file["name"].lower().startswith("readme"):
            return file["download_url"]
    return ""


def process_repo(repo: str) -> Dict[str, str]:
    """
    Takes a repo name like "gocodeup/codeup-setup-script" and returns a
    dictionary with the language of the repo and the readme contents.
    """
    contents = get_repo_contents(repo)
    readme_contents = requests.get(get_readme_download_url(contents)).text
    return {
        "repo": repo,
        "language": get_repo_language(repo),
        "readme_contents": readme_contents,
    }


def scrape_github_data() -> List[Dict[str, str]]:
    """
    Loop through all of the repos and process them. Returns the processed data.
    """
    return [process_repo(repo) for repo in REPOS]


if __name__ == "__main__":
    data = scrape_github_data()
    json.dump(data, open("data2.json", "w"), indent=1)

In [4]:
repo = 'inflationcoin/inflationcoin'

In [5]:
process_repo('inflationcoin/inflationcoin')

{'repo': 'inflationcoin/inflationcoin',
 'language': 'C++',
 'readme_contents': "InflationCoin - IFLT\r\n\r\nInflationCoin is an X11 PoW/PoS coin. It is a coin that integrated with true random super bonus block features. It uses high PoS as a way of distributing fairly the coins to community.\r\n\r\n- X11 hash algorithm, PoW/PoS mixed\r\n- 5 transaction confirmations\r\n- 70 minted block confirmations\r\n- Total coins will be 10 billions\r\n- The coin will be a pure PoS coin after 3 months of PoW mining.\r\n- A 2.5% premine for bounties and to fund servers and dev expenses\r\n\r\nPoW details:\r\n- 60 sec PoW block target time\r\n- difficulty retarget each block for PoW\r\n- Payout will be 1,000 coins per block\r\n- Every day there will be a random super block with 1000X normal payment (1,000,000 coins), it is true randomness and can't be taken advantage by big hashpowers\r\n- PoW will be terminated after 90 days.\r\n\r\nPoS details:\r\n- 60 sec PoS block time\r\n- diff retarget each bl

In [6]:
response = get('https://github.com/search?o=desc&p=2&q=inflation&s=forks&type=Repositories', headers=headers)

In [7]:
soup = BeautifulSoup(response.content, 'html.parser')

In [8]:
# <a class="v-align-middle" data-hydro-click="{&quot;event_type&quot;:&quot;search_result.click&quot;,&quot;payload&quot;:{&quot;page_number&quot;:2,&quot;per_page&quot;:10,&quot;query&quot;:&quot;inflation&quot;,&quot;result_position&quot;:3,&quot;click_id&quot;:72912101,&quot;result&quot;:{&quot;id&quot;:72912101,&quot;global_relay_id&quot;:&quot;MDEwOlJlcG9zaXRvcnk3MjkxMjEwMQ==&quot;,&quot;model_name&quot;:&quot;Repository&quot;,&quot;url&quot;:&quot;https://github.com/inflationcoin/inflationcoin&quot;},&quot;originating_url&quot;:&quot;https://github.com/search?o=desc&amp;p=2&amp;q=inflation&amp;s=forks&amp;type=Repositories&quot;,&quot;user_id&quot;:105242919}}" data-hydro-click-hmac="2f1f5bb56b209ce58e0f2aaf43b515e80a28028abd3302a8015c69aadbbabf16" href="/inflationcoin/inflationcoin">inflationcoin/<em>inflationcoin</em></a>


In [9]:
repo = soup.find('a', class_ = 'v-align-middle')
repo.text

'gwern/gwern.net'

In [10]:
repos = soup.find_all('a', class_ = 'v-align-middle')
repos[0].text

'gwern/gwern.net'

In [11]:
repo_list = []
for i in range(1,10):
    response = get('https://github.com/search?p={}&q=inflation&type=Repositories'.format(i))
    soup = BeautifulSoup(response.content, 'html.parser')
    for repo in soup.find_all('a', class_ = 'v-align-middle'):
        repo_list.append(repo.text)

In [12]:
len(repo_list) #should be 90, changes all the time

80

In [13]:
# repo_list_teen = []
# for i in range(10, 20):
#     response = get('https://github.com/search?p={}&q=inflation&type=Repositories'.format(i))
#     soup = BeautifulSoup(response.content, 'html.parser')
#     for repo in soup.find_all('a', class_ = 'v-align-middle'):
#         repo_list_teen.append(repo.text)

In [14]:
#len(repo_list_teen)

In [15]:
# repo_list_two = []
# for i in range(20, 30):
#     response = get('https://github.com/search?p={}&q=inflation&type=Repositories'.format(i))
#     soup = BeautifulSoup(response.content, 'html.parser')
#     for repo in soup.find_all('a', class_ = 'v-align-middle'):
#         repo_list_two.append(repo.text)

In [16]:
#len(repo_list_two)

In [17]:
# repo_list_three = []
# for i in range(30, 40):
#     response = get('https://github.com/search?p={}&q=inflation&type=Repositories'.format(i))
#     soup = BeautifulSoup(response.content, 'html.parser')
#     for repo in soup.find_all('a', class_ = 'v-align-middle'):
#         repo_list_three.append(repo.text)

In [18]:
# repo_list4 = []
# for i in range(40, 50):
#     response = get('https://github.com/search?p={}&q=inflation&type=Repositories'.format(i))
#     soup = BeautifulSoup(response.content, 'html.parser')
#     for repo in soup.find_all('a', class_ = 'v-align-middle'):
#         repo_list4.append(repo.text)

In [19]:
# repo_list5 = []
# for i in range(50, 60):
#     response = get('https://github.com/search?p={}&q=inflation&type=Repositories'.format(i))
#     soup = BeautifulSoup(response.content, 'html.parser')
#     for repo in soup.find_all('a', class_ = 'v-align-middle'):
#         repo_list5.append(repo.text)

In [20]:
# repo_list6 = []
# for i in range(60, 70):
#     response = get('https://github.com/search?p={}&q=inflation&type=Repositories'.format(i))
#     soup = BeautifulSoup(response.content, 'html.parser')
#     for repo in soup.find_all('a', class_ = 'v-align-middle'):
#         repo_list6.append(repo.text)

In [21]:
# repo_list7 = []
# for i in range(70, 80):
#     response = get('https://github.com/search?p={}&q=inflation&type=Repositories'.format(i))
#     soup = BeautifulSoup(response.content, 'html.parser')
#     for repo in soup.find_all('a', class_ = 'v-align-middle'):
#         repo_list7.append(repo.text)

In [22]:
# repo_list8 = []
# for i in range(80, 90):
#     response = get('https://github.com/search?p={}&q=inflation&type=Repositories'.format(i))
#     soup = BeautifulSoup(response.content, 'html.parser')
#     for repo in soup.find_all('a', class_ = 'v-align-middle'):
#         repo_list8.append(repo.text)

In [23]:
# repo_list9 = []
# for i in range(90, 100):
#     response = get('https://github.com/search?p={}&q=inflation&type=Repositories'.format(i))
#     soup = BeautifulSoup(response.content, 'html.parser')
#     for repo in soup.find_all('a', class_ = 'v-align-middle'):
#         repo_list9.append(repo.text)

In [24]:
# repo_list10 = []
# for i in range(100,101):
#     response = get('https://github.com/search?p={}&q=inflation&type=Repositories'.format(i))
#     soup = BeautifulSoup(response.content, 'html.parser')
#     for repo in soup.find_all('a', class_ = 'v-align-middle'):
#         repo_list10.append(repo.text)

In [25]:
#len(final)

In [26]:
repo_name = pd.read_csv('repo_name.csv')

In [27]:
repo_name

Unnamed: 0,repo_name
0,cashapp/InflationInject
1,InflationX/ViewPump
2,rdeits/iris-distro
3,uhussain/WebCrawlerForOnlineInflation
4,sandes/zipfly
...,...
879,millerngit/RateOfInflation
880,iamdingkai/what-s-driving-inflation
881,jrbsn/Inflation-Deep-Learning
882,fscheler/Inflation_Tracker_World


In [49]:
repo_name[repo_name.repo_name == ' ']

Unnamed: 0,repo_name


In [28]:
rep_nam_list = repo_name.repo_name.to_list()

In [40]:
short_list = rep_nam_list[:10]

In [41]:
short_list

['cashapp/InflationInject',
 'InflationX/ViewPump',
 'rdeits/iris-distro',
 'uhussain/WebCrawlerForOnlineInflation',
 'sandes/zipfly',
 'B3nedikt/ViewPump',
 'inflationcoin/inflationcoin',
 'anishsingh20/Time-series-analysis-of-Inflation-rates-using-ShinyDashboard',
 'stream-utils/inflation',
 'palewire/cpi']

In [42]:
info = []
for n in short_list:
    l = process_repo(n)
    info.append(l)

In [43]:
info

[{'repo': 'cashapp/InflationInject',
  'language': 'Kotlin',
  'readme_contents': '# Inflation Injection\n\nConstructor-inject views during XML layout inflation.\n\nLooking for Assisted Inject? It\'s [built in to Dagger now](https://dagger.dev/dev-guide/assisted-injection.html)!\n\n\n## Usage\n\nWrite your layout XML like normal.\n\n```xml\n<LinearLayout>\n  <com.example.CustomView/>\n  <TextView/>\n</LinearLayout>\n```\n\nUse `@InflationInject` in `CustomView`:\n\n```java\npublic final class CustomView extends View {\n  private final Picasso picasso;\n  \n  @InflationInject\n  public CustomView(\n    @Inflated Context context,\n    @Inflated AttributeSet attrs,\n    Picasso picasso\n  ) {\n    super(context, attrs);\n    this.picasso = picasso;\n  }\n  \n  // ...\n}\n```\n\nIn order to allow Dagger to create your custom views, add `@InflationModule` to a Dagger module and\nadd the generated module name to its `includes=`.\n\n```java\n@InflationModule\n@Module(includes = InflationInjec

In [44]:
med_list = rep_nam_list[:100]

In [46]:
info_m = []
for n in med_list:
    l = process_repo(n)
    info_m.append(l)

MissingSchema: Invalid URL '': No scheme supplied. Perhaps you meant http://?