In [2]:
import matplotlib.pyplot as plt
from requests import get
from bs4 import BeautifulSoup
import seaborn as sns
import pandas as pd
import nltk
import unicodedata


import env
import acquire

# acquire phase

In [3]:
def to_update_or_not_to_udpate(update_flag=True,repo_count=100,make_csv=True):
    ''' 
    optional inputs of updating, counts, and making csv
    if updating it will pull from most forked repos, at given (default 100 count), then make a csv
    otherwise pulls from csv
    '''
    import acquire
    from os.path import exists
    from datetime import datetime
    
    if update_flag:
        list_repo = []
        pages = (repo_count//10)+1
        for i in range(1,(1+pages)): ##numbers are pages, 10 per page
            response = get('https://github.com/search?o=desc&q=stars:%3E1&s=forks&type=Repositories'.format(i))
            soup = BeautifulSoup(response.content, 'html.parser')
            
            for repo in soup.find_all('a', class_ = 'v-align-middle'):
                list_repo.append(repo.text)

        dictionary_of_repos = acquire.scrape_github_data(list_repo[:repo_count])
        df = pd.DataFrame(dictionary_of_repos)

        # if bored in future try to have csv pull from most recent date time
        # df.to_csv(f'github_forked{datetime.now().strftime("%d/%m/%Y/%H:%M")}.csv', index=False)
        if make_csv:
            df.to_csv(f'github_forked.csv', index=False)
    else:
        if exists('github_forked.csv'):
            df = pd.read_csv('github_forked.csv')
        else:
            print("can not find file, please update instead")
        
    return df

In [4]:
df = to_update_or_not_to_udpate(False,100,False)
df

Unnamed: 0,repo,language,readme_contents
0,jtleek/datasharing,,How to share data with a statistician\n=======...
1,rdpeng/ProgrammingAssignment2,R,### Introduction\n\nThis second programming as...
2,octocat/Spoon-Knife,HTML,### Well hello there!\n\nThis repository is me...
3,SmartThingsCommunity/SmartThingsPublic,Groovy,# Welcome to the SmartThings Public GitHub Rep...
4,tensorflow/tensorflow,C++,"<div align=""center"">\n <img src=""https://www...."
...,...,...,...
95,github/gitignore,,# A collection of `.gitignore` templates\n\nTh...
96,twbs/bootstrap,JavaScript,"<p align=""center"">\n <a href=""https://getboot..."
97,Pierian-Data/Complete-Python-3-Bootcamp,Jupyter Notebook,# Complete-Python-3-Bootcamp\nCourse Files for...
98,nightscout/cgm-remote-monitor,JavaScript,Nightscout Web Monitor (a.k.a. cgm-remote-moni...


# Prepare phase

In [5]:

def remove_stopwords(article_processed,words_to_add=[],words_to_remove=[]):
    ''' 
    takes in string, and two lists
    creates list of words to remove from nltk, modifies as dictated in arguements
    prints result of processing
    returns resulting string
    '''
    from nltk.corpus import stopwords
    #create the stopword list
    stopwords_list = stopwords.words("english")
    #modify stopword list
    [stopwords_list.append(word) for word in words_to_add]
    [stopwords_list.remove(word) for word in words_to_remove]
    #remove using stopword list
    words = article_processed.split()
    filtered_words = [w for w in words if w not in stopwords_list]
    #filtered_words =[word for word in article_processed if word not in stopwords_list]
    print("removed ",len(article_processed)-len(filtered_words), "words")
    #join back
    article_without_stopwords = " ".join(filtered_words)
    return article_without_stopwords

def lemmatize(article):
    ''' 
    input article
    makes object, applies to string, and returns results
    '''
    import nltk
    #create the lemmatizer
    wnl = nltk.stem.WordNetLemmatizer()
    #use lemmatizer
    lemmatized = [wnl.lemmatize(word) for word in article.split()]
    #join words back together
    article_lemmatized = " ".join(lemmatized)
    return article_lemmatized

def stem(article):
    ''' 
    input string
    create object, apply it to the each in string, rejoin and return
    '''
    import nltk
    #create porter stemmer
    ps = nltk.porter.PorterStemmer()
    stems = [ps.stem(word) for word in article.split()]
    #join words back together
    article_stemmed = " ".join(stems)
    return article_stemmed

def tokenize(article0):
    ''' 
    input string
    creates object, returns string after object affect
    '''
    import nltk
    #create the tokenizer
    tokenize = nltk.tokenize.ToktokTokenizer()
    #use the tokenizer
    article = tokenize.tokenize(article0,return_str=True)
    return article

def basic_clean(article0):
    ''' 
    input string
    lowers cases, makes "normal" characters, and removes anything not expected
    returns article
    '''
    import unicodedata
    import re
    #lower cases
    article = article0.lower()
    ## decodes to change to "normal" characters after encoding to ascii from a unicode normalize
    article = unicodedata.normalize("NFKD",article).encode("ascii","ignore").decode("utf-8")
    # removes anything not lowercase, number, single quote, or a space
    article = re.sub(r'[^a-z0-9\'\s]','',article)
    return article

def basic_pipeline(codeup=True,news=True,words_keep=[],words_drop=[]):
    '''
    
    '''
    import acquire
    import pandas as pd

    #acquire
    news_df = pd.DataFrame(acquire.get_news_articles())
    codeup_df = pd.DataFrame(acquire.get_blog_content("https://codeup.com/blog/"))

    if codeup:
        codeup_df.rename(columns={"content":"original"},inplace=True)
        codeup_df["clean"] = [remove_stopwords(tokenize(basic_clean(each)),words_to_add=words_keep,words_to_remove=words_drop) for each in codeup_df.original]
        codeup_df["stemmed"] = codeup_df.clean.apply(stem)
        codeup_df["lemmatized"] = codeup_df.clean.apply(lemmatize)

    if news:
        news_df.rename(columns={"content":"original"},inplace=True),news_df.drop(columns="category",inplace=True)
        news_df["clean"] = [remove_stopwords(tokenize(basic_clean(each)),words_to_add=words_keep,words_to_remove=words_drop) for each in news_df.original]
        news_df["stemmed"] = news_df.clean.apply(stem)
        news_df["lemmatized"] = news_df.clean.apply(lemmatize)

    return codeup_df,news_df
    

## starting the prepare on the dataframe

In [18]:
df = df[df.language.isna()==False]
df["clean"] = [remove_stopwords(tokenize(basic_clean(each))) for each in df.readme_contents]
df["stemmed"] = df.clean.apply(stem)
df["lemmatized"] = df.clean.apply(lemmatize)

removed  3313 words
removed  668 words
removed  1029 words
removed  8827 words
removed  10356 words
removed  204 words
removed  46160 words
removed  3313 words
removed  668 words
removed  1029 words
removed  8827 words
removed  10356 words
removed  204 words
removed  46160 words
removed  3313 words
removed  668 words
removed  1029 words
removed  8827 words
removed  10356 words
removed  204 words
removed  46160 words
removed  3313 words
removed  668 words
removed  1029 words
removed  8827 words
removed  10356 words
removed  204 words
removed  46160 words
removed  3313 words
removed  668 words
removed  1029 words
removed  8827 words
removed  10356 words
removed  204 words
removed  46160 words
removed  3313 words
removed  668 words
removed  1029 words
removed  8827 words
removed  10356 words
removed  204 words
removed  46160 words
removed  3313 words
removed  668 words
removed  1029 words
removed  8827 words
removed  10356 words
removed  204 words
removed  46160 words
removed  3313 words


In [25]:
(" ".join(df[df["language"]=="R"]["lemmatized"])).split()

['introduction',
 'second',
 'programming',
 'assignment',
 'require',
 'write',
 'r',
 'function',
 'able',
 'cache',
 'potentially',
 'timeconsuming',
 'computation',
 'example',
 'taking',
 'mean',
 'numeric',
 'vector',
 'typically',
 'fast',
 'operation',
 'however',
 'long',
 'vector',
 'may',
 'take',
 'long',
 'compute',
 'mean',
 'especially',
 'computed',
 'repeatedly',
 'eg',
 'loop',
 'content',
 'vector',
 'changing',
 'may',
 'make',
 'sense',
 'cache',
 'value',
 'mean',
 'need',
 'looked',
 'cache',
 'rather',
 'recomputed',
 'programming',
 'assignment',
 'take',
 'advantage',
 'scoping',
 'rule',
 'r',
 'language',
 'manipulated',
 'preserve',
 'state',
 'inside',
 'r',
 'object',
 'example',
 'caching',
 'mean',
 'vector',
 'example',
 'introduce',
 'operator',
 'used',
 'assign',
 'value',
 'object',
 'environment',
 'different',
 'current',
 'environment',
 'two',
 'function',
 'used',
 'create',
 'special',
 'object',
 'store',
 'numeric',
 'vector',
 'cache',
 'm

In [26]:
lang_dict={}
for lang in df["language"].unique():
   lang_dict = {"Language":lang,
                "words":(" ".join(df[df["language"]==lang]["lemmatized"])).split()
                }
lang_dict

{'Language': 'Jupyter Notebook',
 'words': ['completepython3bootcamp',
  'course',
  'file',
  'complete',
  'python',
  '3',
  'bootcamp',
  'course',
  'udemy',
  'copyright',
  'pierian',
  'data',
  'inc',
  'get',
  '95',
  'link',
  'httpswwwudemycomcompletepythonbootcampcouponcodecompletegithub',
  'thanks',
  'completepython3bootcamp',
  'course',
  'file',
  'complete',
  'python',
  '3',
  'bootcamp',
  'course',
  'udemy',
  'copyright',
  'pierian',
  'data',
  'inc',
  'get',
  '95',
  'link',
  'httpswwwudemycomcompletepythonbootcampcouponcodecompletegithub',
  'thanks',
  'completepython3bootcamp',
  'course',
  'file',
  'complete',
  'python',
  '3',
  'bootcamp',
  'course',
  'udemy',
  'copyright',
  'pierian',
  'data',
  'inc',
  'get',
  '95',
  'link',
  'httpswwwudemycomcompletepythonbootcampcouponcodecompletegithub',
  'thanks',
  'completepython3bootcamp',
  'course',
  'file',
  'complete',
  'python',
  '3',
  'bootcamp',
  'course',
  'udemy',
  'copyright