In [1]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import time
import os
import json
from typing import Dict, List, Optional, Union, cast
import requests
from env import github_token, github_username
import unicodedata
import nltk

from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

# CodeUp-DS-NLP-Project
 
### Project Goals 
* The goal of this classification project is to first identify key words of the programming language and create a machine learning model that can effectly predict the programming language used.
### The Plan
* Aquire ReadMe data from GitHub repositories via webscraping.
* Prepare data for exploration by:
    * Convert text to all lower case for normalcy.
    * Remove any accented characters, non-ASCII characters.
    * Remove special characters.
    * Stem or lemmatize the words.
    * Remove stopwords.
    * Store the clean text and the original text for use in future notebooks.
#### Explore data in search of key features with the basic following questions:
* What are the most common words in READMEs?
* Does the length of the README vary by programming language?
* Do different programming languages use a different number of unique words?
* Are there any words that uniquely identify a programming language?
#### Develop a Model to predict happiness score
* Use key words identified to build predictive models of different types
* Evaluate models on train and validate data samples
* Select the best model based on accuracy
* Evaluate the best model on test data samples
#### Draw conclusions

### Steps to Reproduce
* Clone this repo.
* Acquire the data from GitHub
* Put the data in the file containing the cloned repo.
* Run notebook
### Conclusions
* 
#### Key TakeAway:
*
### Recommendations
*

In [2]:

url = "https://github.com/search?3&q=stars%3A%3E0&s=stars&type=Repositories"
reqs = requests.get(url)
soup = BeautifulSoup(reqs.text, 'html.parser')
urls = []
for link in soup.find_all('a',class_="v-align-middle"):
    print(link.get('href'))

/public-apis/public-apis
/CyC2018/CS-Notes
/trekhleb/javascript-algorithms
/ohmyzsh/ohmyzsh
/TheAlgorithms/Python
/flutter/flutter
/torvalds/linux
/github/gitignore
/golang/go
/30-seconds/30-seconds-of-code


In [3]:
# remember the lesson that Rosy showed you
#for i in range(1,50):
#    print(i)
#    time.sleep(10)

In [4]:

url = "https://github.com/search?3&q=stars%3A%3E0&s=stars&type=Repositories"
reqs = requests.get(url)
soup = BeautifulSoup(reqs.text, 'html.parser')
urls = []
for link in soup.find_all('a',class_="v-align-middle"):
    urls.append(link.get('href'))

In [5]:
urls_repo = pd.read_csv('urls.csv', index_col=0)
urls_repo['0']

0                  freeCodeCamp/freeCodeCamp
1                             996icu/996.ICU
2     EbookFoundation/free-programming-books
3        jwasham/coding-interview-university
4                       sindresorhus/awesome
                       ...                  
95                   public-apis/public-apis
96                                 vuejs/vue
97                            facebook/react
98          codecrafters-io/build-your-own-x
99                     tensorflow/tensorflow
Name: 0, Length: 100, dtype: object

In [6]:
REPOS = urls_repo['0']

headers = {"Authorization": f"token {github_token}", "User-Agent": github_username}

if headers["Authorization"] == "token " or headers["User-Agent"] == "":
    raise Exception(
        "You need to follow the instructions marked TODO in this script before trying to use it"
    )


def github_api_request(url: str) -> Union[List, Dict]:
    response = requests.get(url, headers=headers)
    response_data = response.json()
    if response.status_code != 200:
        raise Exception(
            f"Error response from github api! status code: {response.status_code}, "
            f"response: {json.dumps(response_data)}"
        )
    return response_data


def get_repo_language(repo: str) -> str:
    url = f"https://api.github.com/repos/{repo}"
    repo_info = github_api_request(url)
    if type(repo_info) is dict:
        repo_info = cast(Dict, repo_info)
        if "language" not in repo_info:
            raise Exception(
                "'language' key not round in response\n{}".format(json.dumps(repo_info))
            )
        return repo_info["language"]
    raise Exception(
        f"Expecting a dictionary response from {url}, instead got {json.dumps(repo_info)}"
    )


def get_repo_contents(repo: str) -> List[Dict[str, str]]:
    url = f"https://api.github.com/repos/{repo}/contents/"
    contents = github_api_request(url)
    if type(contents) is list:
        contents = cast(List, contents)
        return contents
    raise Exception(
        f"Expecting a list response from {url}, instead got {json.dumps(contents)}"
    )


def get_readme_download_url(files: List[Dict[str, str]]) -> str:
    """
    Takes in a response from the github api that lists the files in a repo and
    returns the url that can be used to download the repo's README file.
    """
    for file in files:
        if file["name"].lower().startswith("readme"):
            return file["download_url"]
    return ""


def process_repo(repo: str) -> Dict[str, str]:
    """
    Takes a repo name like "gocodeup/codeup-setup-script" and returns a
    dictionary with the language of the repo and the readme contents.
    """
    contents = get_repo_contents(repo)
    readme_download_url = get_readme_download_url(contents)
    if readme_download_url == "":
        readme_contents = ""
    else:
        readme_contents = requests.get(readme_download_url).text
    return {
        "repo": repo,
        "language": get_repo_language(repo),
        "readme_contents": readme_contents,
    }


def scrape_github_data() -> List[Dict[str, str]]:
    """
    Loop through all of the repos and process them. Returns the processed data.
    """
    return [process_repo(repo) for repo in REPOS]


if __name__ == "__main__":
    data = scrape_github_data()
    json.dump(data, open("data.json", "w"), indent=1)

In [7]:
urls_df = scrape_github_data()

In [8]:
df=pd.DataFrame(urls_df)

In [9]:
df

Unnamed: 0,repo,language,readme_contents
0,freeCodeCamp/freeCodeCamp,TypeScript,[![freeCodeCamp Social Banner](https://s3.amaz...
1,996icu/996.ICU,,[996.ICU](https://996.icu/#/en_US)\n=======\n*...
2,EbookFoundation/free-programming-books,,# List of Free Learning Resources In Many Lang...
3,jwasham/coding-interview-university,,# Coding Interview University\n\n> I originall...
4,sindresorhus/awesome,,"<div align=""center"">\n\t<a href=""https://vshym..."
...,...,...,...
95,public-apis/public-apis,Python,"<div align=""center"">\n <h1>Public APIs</h1>..."
96,vuejs/vue,TypeScript,"<p align=""center""><a href=""https://vuejs.org"" ..."
97,facebook/react,JavaScript,# [React](https://reactjs.org/) &middot; [![Gi...
98,codecrafters-io/build-your-own-x,,[![Banner](https://codecrafters.io/landing/ima...


In [23]:
df[['readme_contents']].to_csv('rmtext.csv')

In [10]:
df['text'] = df.readme_contents.to_string()
df

Unnamed: 0,repo,language,readme_contents,text
0,freeCodeCamp/freeCodeCamp,TypeScript,[![freeCodeCamp Social Banner](https://s3.amaz...,0 [![freeCodeCamp Social Banner](https://s...
1,996icu/996.ICU,,[996.ICU](https://996.icu/#/en_US)\n=======\n*...,0 [![freeCodeCamp Social Banner](https://s...
2,EbookFoundation/free-programming-books,,# List of Free Learning Resources In Many Lang...,0 [![freeCodeCamp Social Banner](https://s...
3,jwasham/coding-interview-university,,# Coding Interview University\n\n> I originall...,0 [![freeCodeCamp Social Banner](https://s...
4,sindresorhus/awesome,,"<div align=""center"">\n\t<a href=""https://vshym...",0 [![freeCodeCamp Social Banner](https://s...
...,...,...,...,...
95,public-apis/public-apis,Python,"<div align=""center"">\n <h1>Public APIs</h1>...",0 [![freeCodeCamp Social Banner](https://s...
96,vuejs/vue,TypeScript,"<p align=""center""><a href=""https://vuejs.org"" ...",0 [![freeCodeCamp Social Banner](https://s...
97,facebook/react,JavaScript,# [React](https://reactjs.org/) &middot; [![Gi...,0 [![freeCodeCamp Social Banner](https://s...
98,codecrafters-io/build-your-own-x,,[![Banner](https://codecrafters.io/landing/ima...,0 [![freeCodeCamp Social Banner](https://s...


In [35]:
def basic_clean(string):
    '''
    This function takes in a string and
    returns the string normalized.
    '''
    # we will normalize our data into standard NFKD unicode, feed it into an ascii encoding
    # decode it back into UTF-8
    string = unicodedata.normalize('NFKD', string)\
             .encode('ascii', 'ignore')\
             .decode('utf-8', 'ignore')
    # utilize our regex substitution to remove our undesirable characters, then lowercase
    string = re.sub(r"[^\w0-9'\s]", '', string).lower()
    return string

In [36]:
#inshort_df[‘clean_text’] = inshort_df.content.apply(clean).apply(' ’.join)

In [40]:
df['clean_text']= df.readme_contents.apply(basic_clean)

In [41]:
df

Unnamed: 0,repo,language,readme_contents,text,clean_text
0,freeCodeCamp/freeCodeCamp,TypeScript,[![freeCodeCamp Social Banner](https://s3.amaz...,0 [![freeCodeCamp Social Banner](https://s...,freecodecamp social bannerhttpss3amazonawscomf...
1,996icu/996.ICU,,[996.ICU](https://996.icu/#/en_US)\n=======\n*...,0 [![freeCodeCamp Social Banner](https://s...,996icuhttps996icuen_us\n\nplease note that the...
2,EbookFoundation/free-programming-books,,# List of Free Learning Resources In Many Lang...,0 [![freeCodeCamp Social Banner](https://s...,list of free learning resources in many langu...
3,jwasham/coding-interview-university,,# Coding Interview University\n\n> I originall...,0 [![freeCodeCamp Social Banner](https://s...,coding interview university\n\n i originally ...
4,sindresorhus/awesome,,"<div align=""center"">\n\t<a href=""https://vshym...",0 [![freeCodeCamp Social Banner](https://s...,div aligncenter\n\ta hrefhttpsvshymanskyygithu...
...,...,...,...,...,...
95,public-apis/public-apis,Python,"<div align=""center"">\n <h1>Public APIs</h1>...",0 [![freeCodeCamp Social Banner](https://s...,div aligncenter\n h1public apish1\n ia c...
96,vuejs/vue,TypeScript,"<p align=""center""><a href=""https://vuejs.org"" ...",0 [![freeCodeCamp Social Banner](https://s...,p aligncentera hrefhttpsvuejsorg target_blank ...
97,facebook/react,JavaScript,# [React](https://reactjs.org/) &middot; [![Gi...,0 [![freeCodeCamp Social Banner](https://s...,reacthttpsreactjsorg middot github licensehtt...
98,codecrafters-io/build-your-own-x,,[![Banner](https://codecrafters.io/landing/ima...,0 [![freeCodeCamp Social Banner](https://s...,bannerhttpscodecraftersiolandingimagesbyoxbann...


In [42]:
df.clean_text

0     freecodecamp social bannerhttpss3amazonawscomf...
1     996icuhttps996icuen_us\n\nplease note that the...
2      list of free learning resources in many langu...
3      coding interview university\n\n i originally ...
4     div aligncenter\n\ta hrefhttpsvshymanskyygithu...
                            ...                        
95    div aligncenter\n    h1public apish1\n    ia c...
96    p aligncentera hrefhttpsvuejsorg target_blank ...
97     reacthttpsreactjsorg middot github licensehtt...
98    bannerhttpscodecraftersiolandingimagesbyoxbann...
99    div aligncenter\n  img srchttpswwwtensorflowor...
Name: clean_text, Length: 100, dtype: object