In [1]:
import os
import json
from typing import Dict, List, Optional, Union, cast
import requests
from bs4 import BeautifulSoup
from env import github_token, github_username
import re
import pandas as pd
import time
from requests import get

In [2]:
import unicodedata
import re
import json
# nltk, tokenization, stopwords
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
# pandas dataframe manipulation, acquire script, time formatting
import pandas as pd
import acquire
from time import strftime
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [3]:
urls_repo = pd.read_csv('urls.csv', index_col=0)
urls_repo['0']

0                  freeCodeCamp/freeCodeCamp
1                             996icu/996.ICU
2     EbookFoundation/free-programming-books
3        jwasham/coding-interview-university
4                       sindresorhus/awesome
                       ...                  
95                   public-apis/public-apis
96                                 vuejs/vue
97                            facebook/react
98          codecrafters-io/build-your-own-x
99                     tensorflow/tensorflow
Name: 0, Length: 100, dtype: object

In [4]:
"""
A module for obtaining repo readme and language data from the github API.
Before using this module, read through it, and follow the instructions marked
TODO.
After doing so, run it like this:
    python acquire.py
To create the `data.json` file that contains the data.
"""


# TODO: Make a github personal access token.
#     1. Go here and generate a personal access token: https://github.com/settings/tokens
#        You do _not_ need select any scopes, i.e. leave all the checkboxes unchecked
#     2. Save it in your env.py file under the variable `github_token`
# TODO: Add your github username to your env.py file under the variable `github_username`
# TODO: Add more repositories to the `REPOS` list below.

REPOS = urls_repo['0']

headers = {"Authorization": f"token {github_token}", "User-Agent": github_username}

if headers["Authorization"] == "token " or headers["User-Agent"] == "":
    raise Exception(
        "You need to follow the instructions marked TODO in this script before trying to use it"
    )


def github_api_request(url: str) -> Union[List, Dict]:
    response = requests.get(url, headers=headers)
    response_data = response.json()
    if response.status_code != 200:
        raise Exception(
            f"Error response from github api! status code: {response.status_code}, "
            f"response: {json.dumps(response_data)}"
        )
    return response_data

def get_repo_language(repo: str) -> str:
    url = f"https://api.github.com/repos/{repo}"
    repo_info = github_api_request(url)
    if type(repo_info) is dict:
        repo_info = cast(Dict, repo_info)
        if "language" not in repo_info:
            raise Exception(
                "'language' key not round in response\n{}".format(json.dumps(repo_info))
            )
        return repo_info["language"]
    raise Exception(
        f"Expecting a dictionary response from {url}, instead got {json.dumps(repo_info)}"
    )


def get_repo_contents(repo: str) -> List[Dict[str, str]]:
    url = f"https://api.github.com/repos/{repo}/contents/"
    contents = github_api_request(url)
    if type(contents) is list:
        contents = cast(List, contents)
        return contents
    raise Exception(
        f"Expecting a list response from {url}, instead got {json.dumps(contents)}"
    )


def get_readme_download_url(files: List[Dict[str, str]]) -> str:
    """
    Takes in a response from the github api that lists the files in a repo and
    returns the url that can be used to download the repo's README file.
    """
    for file in files:
        if file["name"].lower().startswith("readme"):
            return file["download_url"]
    return ""

def process_repo(repo: str) -> Dict[str, str]:
    """
    Takes a repo name like "gocodeup/codeup-setup-script" and returns a
    dictionary with the language of the repo and the readme contents.
    """
    contents = get_repo_contents(repo)
    readme_download_url = get_readme_download_url(contents)
    if readme_download_url == "":
        readme_contents = ""
    else:
        readme_contents = requests.get(readme_download_url).text
    return {
        "repo": repo,
        "language": get_repo_language(repo),
        "readme_contents": readme_contents,
    }


def scrape_github_data() -> List[Dict[str, str]]:
    """
    Loop through all of the repos and process them. Returns the processed data.
    """
    return [process_repo(repo) for repo in REPOS]


#if __name__ == "__main__":
#    data = scrape_github_data()
#    json.dump(data, open("data.json", "w"), indent=1)

In [5]:
scrape_df = scrape_github_data()

In [6]:
import unicodedata

In [7]:
url = "https://github.com/search?q=stars%3A%3E0&s=stars&type=Repositories"
reqs = requests.get(url)
soup = BeautifulSoup(reqs.text, 'html.parser')
urls = []
for link in soup.find_all('a',class_="v-align-middle"):
    print(link.get('href'))

/go-xorm/xorm
/mantl/mantl
/addyosmani/es6-equivalents-in-es5
/AlanQuatermain/AQGridView
/loopj/jquery-tokeninput
/brianleroux/lawnchair
/androidquery/androidquery
/tobegit3hub/tensorflow_template_application
/tomknig/TOMSMorphingLabel
/CezaryKopacz/CKWaveCollectionViewTransition


In [47]:
df = pd.DataFrame(scrape_df)
df

Unnamed: 0,repo,language,readme_contents
0,freeCodeCamp/freeCodeCamp,TypeScript,[![freeCodeCamp Social Banner](https://s3.amaz...
1,996icu/996.ICU,,[996.ICU](https://996.icu/#/en_US)\n=======\n*...
2,EbookFoundation/free-programming-books,,# List of Free Learning Resources In Many Lang...
3,jwasham/coding-interview-university,,# Coding Interview University\n\n> I originall...
4,sindresorhus/awesome,,"<div align=""center"">\n\t<a href=""https://vshym..."
...,...,...,...
95,public-apis/public-apis,Python,"<div align=""center"">\n <h1>Public APIs</h1>..."
96,vuejs/vue,TypeScript,"<p align=""center""><a href=""https://vuejs.org"" ..."
97,facebook/react,JavaScript,# [React](https://reactjs.org/) &middot; [![Gi...
98,codecrafters-io/build-your-own-x,,[![Banner](https://codecrafters.io/landing/ima...


In [55]:
df.readme_contents[0]

"[![freeCodeCamp Social Banner](https://s3.amazonaws.com/freecodecamp/wide-social-banner.png)](https://www.freecodecamp.org/)\n\n[![Pull Requests Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg?style=flat)](http://makeapullrequest.com)\n[![first-timers-only Friendly](https://img.shields.io/badge/first--timers--only-friendly-blue.svg)](http://www.firsttimersonly.com/)\n[![Open Source Helpers](https://www.codetriage.com/freecodecamp/freecodecamp/badges/users.svg)](https://www.codetriage.com/freecodecamp/freecodecamp)\n[![Setup Automated](https://img.shields.io/badge/setup-automated-blue?logo=gitpod)](https://gitpod.io/from-referrer/)\n[![Discord](https://img.shields.io/discord/692816967895220344)](https://discord.gg/PRyKn3Vbay)\n\n## freeCodeCamp.org's open-source codebase and curriculum\n\n[freeCodeCamp.org](https://www.freecodecamp.org) is a friendly community where you can learn to code for free. It is run by a [donor-supported 501(c)(3) charity](https://www.freecode

In [45]:
df = df['readme_contents'].astype(str)

KeyError: 'readme_contents'

In [27]:
df = df.readme_contents.to_string()
df



In [28]:
def basic_clean(string):
    '''
    This function takes in a string and
    returns the string normalized.
    '''
    # we will normalize our data into standard NFKD unicode, feed it into an ascii encoding
    # decode it back into UTF-8
    string = unicodedata.normalize('NFKD', string)\
             .encode('ascii', 'ignore')\
             .decode('utf-8', 'ignore')
    # utilize our regex substitution to remove our undesirable characters, then lowercase
    string = re.sub(r"[^\w0-9'\s]", '', string).lower()
    return string

In [29]:
df = basic_clean(df)
df

"0     freecodecamp social bannerhttpss3amaz\n1     996icuhttps996icuen_usnn\n2      list of free learning resources in many lang\n3      coding interview universitynn i originall\n4     div aligncenternta hrefhttpsvshym\n5     p aligncentern  img srcpublicimages\n6     englishreadmemd  readmejamd  \n7      reacthttpsreactjsorg middot gi\n8     bannerhttpscodecraftersiolandingima\n9      you don't know js yet book series  2nd ed\n10    freecodecamp social bannerhttpss3amaz\n11     list of free learning resources in many lang\n12    div aligncenternta hrefhttpsvshym\n13    div aligncentern    h1public apish1\n14    englishreadmemd  readmejamd  \n15    p aligncentera hrefhttpsvuejsorg \n16    bannerhttpscodecraftersiolandingima\n17    div aligncentern  img srchttpswww\n18     you don't know js yet book series  2nd ed\n19    p aligncentern  a hrefhttpsgetboot\n20    freecodecamp social bannerhttpss3amaz\n21    996icuhttps996icuen_usnn\n22     coding interview universitynn i originall\n23 

In [31]:
series = pd.Series(df).value_counts()
series

0     freecodecamp social bannerhttpss3amaz\n1     996icuhttps996icuen_usnn\n2      list of free learning resources in many lang\n3      coding interview universitynn i originall\n4     div aligncenternta hrefhttpsvshym\n5     p aligncentern  img srcpublicimages\n6     englishreadmemd  readmejamd  \n7      reacthttpsreactjsorg middot gi\n8     bannerhttpscodecraftersiolandingima\n9      you don't know js yet book series  2nd ed\n10    freecodecamp social bannerhttpss3amaz\n11     list of free learning resources in many lang\n12    div aligncenternta hrefhttpsvshym\n13    div aligncentern    h1public apish1\n14    englishreadmemd  readmejamd  \n15    p aligncentera hrefhttpsvuejsorg \n16    bannerhttpscodecraftersiolandingima\n17    div aligncentern  img srchttpswww\n18     you don't know js yet book series  2nd ed\n19    p aligncentern  a hrefhttpsgetboot\n20    freecodecamp social bannerhttpss3amaz\n21    996icuhttps996icuen_usnn\n22     coding interview universitynn i originall\n23  

In [14]:
def tokenize(string):
    '''
    This function takes in a string and
    returns a tokenized string.
    '''
    # make our tokenizer, taken from nltk's ToktokTokenizer
    tokenizer = nltk.tokenize.ToktokTokenizer()
    # apply our tokenizer's tokenization to the string being input, ensure it returns a string
    string = tokenizer.tokenize(string, return_str = True)
    
    return string

In [15]:
df = tokenize(df)
df

"0 freecodecamp social bannerhttpss3amaz\n1 996icuhttps996icuen_usnn\n2 list of free learning resources in many lang\n3 coding interview universitynn i originall\n4 div aligncenternta hrefhttpsvshym\n5 p aligncentern img srcpublicimages\n6 englishreadmemd readmejamd \n7 reacthttpsreactjsorg middot gi\n8 bannerhttpscodecraftersiolandingima\n9 you don ' t know js yet book series 2nd ed\n10 freecodecamp social bannerhttpss3amaz\n11 list of free learning resources in many lang\n12 div aligncenternta hrefhttpsvshym\n13 div aligncentern h1public apish1\n14 englishreadmemd readmejamd \n15 p aligncentera hrefhttpsvuejsorg \n16 bannerhttpscodecraftersiolandingima\n17 div aligncentern img srchttpswww\n18 you don ' t know js yet book series 2nd ed\n19 p aligncentern a hrefhttpsgetboot\n20 freecodecamp social bannerhttpss3amaz\n21 996icuhttps996icuen_usnn\n22 coding interview universitynn i originall\n23 p aligncentern img srcpublicimages\n24 p aligncentera hrefhttpsvuejsorg \n25 div aligncentern 

In [16]:
def stem(string):
    '''
    This function takes in a string and
    returns a string with words stemmed.
    '''
    # create our stemming object
    ps = nltk.porter.PorterStemmer()
    # use a list comprehension => stem each word for each word inside of the entire document,
    # split by the default, which are single spaces
    stems = [ps.stem(word) for word in string.split()]
    # glue it back together with spaces, as it was before
    string = ' '.join(stems)
    
    return string

In [17]:
df = stem(df)
df

"0 freecodecamp social bannerhttpss3amaz 1 996icuhttps996icuen_usnn 2 list of free learn resourc in mani lang 3 code interview universitynn i original 4 div aligncenternta hrefhttpsvshym 5 p aligncentern img srcpublicimag 6 englishreadmemd readmejamd 7 reacthttpsreactjsorg middot gi 8 bannerhttpscodecraftersiolandingima 9 you don ' t know js yet book seri 2nd ed 10 freecodecamp social bannerhttpss3amaz 11 list of free learn resourc in mani lang 12 div aligncenternta hrefhttpsvshym 13 div aligncentern h1public apish1 14 englishreadmemd readmejamd 15 p aligncentera hrefhttpsvuejsorg 16 bannerhttpscodecraftersiolandingima 17 div aligncentern img srchttpswww 18 you don ' t know js yet book seri 2nd ed 19 p aligncentern a hrefhttpsgetboot 20 freecodecamp social bannerhttpss3amaz 21 996icuhttps996icuen_usnn 22 code interview universitynn i original 23 p aligncentern img srcpublicimag 24 p aligncentera hrefhttpsvuejsorg 25 div aligncentern img srchttpswww 26 you don ' t know js yet book seri 

In [18]:
def lemmatize(string):
    '''
    This function takes in string for and
    returns a string with words lemmatized.
    '''
    # create our lemmatizer object
    wnl = nltk.stem.WordNetLemmatizer()
    # use a list comprehension to lemmatize each word
    # string.split() => output a list of every token inside of the document
    lemmas = [wnl.lemmatize(word) for word in string.split()]
    # glue the lemmas back together by the strings we split on
    string = ' '.join(lemmas)
    #return the altered document
    return string

In [19]:
df = lemmatize(df)
df

"0 freecodecamp social bannerhttpss3amaz 1 996icuhttps996icuen_usnn 2 list of free learn resourc in mani lang 3 code interview universitynn i original 4 div aligncenternta hrefhttpsvshym 5 p aligncentern img srcpublicimag 6 englishreadmemd readmejamd 7 reacthttpsreactjsorg middot gi 8 bannerhttpscodecraftersiolandingima 9 you don ' t know j yet book seri 2nd ed 10 freecodecamp social bannerhttpss3amaz 11 list of free learn resourc in mani lang 12 div aligncenternta hrefhttpsvshym 13 div aligncentern h1public apish1 14 englishreadmemd readmejamd 15 p aligncentera hrefhttpsvuejsorg 16 bannerhttpscodecraftersiolandingima 17 div aligncentern img srchttpswww 18 you don ' t know j yet book seri 2nd ed 19 p aligncentern a hrefhttpsgetboot 20 freecodecamp social bannerhttpss3amaz 21 996icuhttps996icuen_usnn 22 code interview universitynn i original 23 p aligncentern img srcpublicimag 24 p aligncentera hrefhttpsvuejsorg 25 div aligncentern img srchttpswww 26 you don ' t know j yet book seri 2nd

In [20]:
def remove_stopwords(string, extra_words = [], exclude_words = []):
    '''
    This function takes in a string, optional extra_words and exclude_words parameters
    with default empty lists and returns a string.
    '''
    # assign our stopwords from nltk into stopword_list
    stopword_list = stopwords.words('english')
    # utilizing set casting, i will remove any excluded stopwords
    stopword_list = set(stopword_list) - set(exclude_words)
    # add in any extra words to my stopwords set using a union
    stopword_list = stopword_list.union(set(extra_words))
    # split our document by spaces
    words = string.split()
    # every word in our document, as long as that word is not in our stopwords
    filtered_words = [word for word in words if word not in stopword_list]
    # glue it back together with spaces, as it was so it shall be
    string_without_stopwords = ' '.join(filtered_words)
    # return the document back
    return string_without_stopwords

In [21]:
df = remove_stopwords(df)
df

"0 freecodecamp social bannerhttpss3amaz 1 996icuhttps996icuen_usnn 2 list free learn resourc mani lang 3 code interview universitynn original 4 div aligncenternta hrefhttpsvshym 5 p aligncentern img srcpublicimag 6 englishreadmemd readmejamd 7 reacthttpsreactjsorg middot gi 8 bannerhttpscodecraftersiolandingima 9 ' know j yet book seri 2nd ed 10 freecodecamp social bannerhttpss3amaz 11 list free learn resourc mani lang 12 div aligncenternta hrefhttpsvshym 13 div aligncentern h1public apish1 14 englishreadmemd readmejamd 15 p aligncentera hrefhttpsvuejsorg 16 bannerhttpscodecraftersiolandingima 17 div aligncentern img srchttpswww 18 ' know j yet book seri 2nd ed 19 p aligncentern hrefhttpsgetboot 20 freecodecamp social bannerhttpss3amaz 21 996icuhttps996icuen_usnn 22 code interview universitynn original 23 p aligncentern img srcpublicimag 24 p aligncentera hrefhttpsvuejsorg 25 div aligncentern img srchttpswww 26 ' know j yet book seri 2nd ed 27 div aligncenterrn hrefhttpsw 28 nn 29 p a

In [41]:
word_counts = (pd.concat(all, axis=1, sort=True)
                .set_axis('all'), axis==1, inplace==False)
                .fillna(0)
                    .apply(lambda s: s.astype(int)))

word_counts.head()

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 4)