In [1]:
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import acquire as a
import prepare as p
import itertools
import os
import json
from typing import Dict, List, Optional, Union, cast
import requests
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
import unicodedata
import re

In [2]:
# def web_scrape_repos():
    
#     search_topics = "https://github.com/search?p="

#     REPOS = []
    
#     for page in range(1, 20):

#         req = requests.get(search_topics + str(page) + "&q=" + 'bitcoin' + "&type=Repositories")
#         soup = BeautifulSoup(req.text, 'html.parser')

#         repos = soup.find_all('a', class_='v-align-middle')
#         for link in repos:
#             REPOS.append(link['href'][1:])
    
#     return REPOS

In [3]:
# REPOS = a.web_scrape_repos()
# REPOS

In [4]:
# Load the dataset from data2.json into a DataFrame
df = pd.read_json('data2.json')

In [5]:
# Check how many rows and columns were loaded
df.shape

(250, 3)

In [6]:
#content = p.clean_text(' '.join(df[df['readme_contents']]))
 
#content

In [7]:
def basic_clean(original):
    '''
    Normalize a raw string so it is ready for tokenization.
    - lowercases the text
    - decomposes unicode characters (NFKD) and drops anything without an ASCII form
    - replaces every character that is not a-z, 0-9 or whitespace with a space
      (punctuation, including apostrophes, is replaced as well)
    - returns the cleaned string
    '''
    lowered = original.lower()

    # NFKD splits accented letters into base letter + combining mark;
    # the ascii encode with 'ignore' then discards the mark
    ascii_bytes = unicodedata.normalize('NFKD', lowered).encode('ascii', 'ignore')
    ascii_text = ascii_bytes.decode('utf-8', 'ignore')

    return re.sub(r'[^a-z0-9\s]', ' ', ascii_text)

In [8]:
def tokenize(article):
    '''
    This function takes in a string and tokenizes all the words in the string.
    - will return the tokenized text as a single string
    '''
    # create the tokenizer object (named so it does not shadow this function)
    tokenizer = nltk.tokenize.ToktokTokenizer()

    # return_str=True asks for one string back rather than a list of tokens;
    # return that result, not the untouched input
    return tokenizer.tokenize(article, return_str=True)

In [9]:
def stem(article):
    '''
    Reduce every whitespace-separated word of a string to its stem
    using an nltk PorterStemmer.
    - will return the stems joined back together with single spaces
    '''
    stemmer = nltk.porter.PorterStemmer()

    # stem each word in order and stitch the results back into one string
    return ' '.join(map(stemmer.stem, article.split()))

In [10]:
def lemmatize(article):
    '''
    Lemmatize every whitespace-separated word of a string
    using an nltk WordNetLemmatizer.
    - will return the lemmatized words joined with single spaces
    '''
    lemmatizer = nltk.stem.WordNetLemmatizer()

    lemmatized_words = []
    for token in article.split():
        # lemmatize one word at a time, preserving the original order
        lemmatized_words.append(lemmatizer.lemmatize(token))

    return ' '.join(lemmatized_words)

In [11]:
def remove_stopwords(article, extra_words=None, exclude_words=None):
    '''
    This function will take in a string in the form of an article and remove standard English stop words
    - extra_words: optional iterable of additional words to treat as stop words
    - exclude_words: optional iterable of standard stop words that should be kept
    - will return a string of remaining words once all desired stop words have been removed
    '''

    # start from the standard English stop words; a set gives constant-time lookups
    stopword_set = set(stopwords.words('english'))

    # drop the words the caller wants to keep, then add the caller's extras
    # (None is treated the same as an empty collection)
    stopword_set -= set(exclude_words or [])
    stopword_set |= set(extra_words or [])

    # split article into individual words and keep only the non stop words
    filtered_words = [w for w in article.split() if w not in stopword_set]

    # recreate article out of remaining words
    return ' '.join(filtered_words)

In [12]:
def prepare_bitcoin(content):
    '''
    Prepare one string from the df so that it can be used in NLP
    models and exploration. The steps run in this order:
    - basic_clean: lowercase and replace accented / special characters
    - p.tokenize: tokenization is delegated to the imported prepare module
      (its exact behavior is not visible from this notebook)
    - lemmatize: lemmatize each word
    - remove_stopwords: drop standard English stop words
    Returns the fully prepared string.
    '''
    cleaned = basic_clean(content)

    # note: this is the prepare module's tokenizer, not the tokenize defined in this notebook
    tokenized = p.tokenize(cleaned)

    return remove_stopwords(lemmatize(tokenized))

In [13]:
#df.readme_contents = df.readme_contents.apply(prepare_bitcoin)

In [14]:
# #chain together clean, tokenize, remove stopwords
# df['clean'] = df['readme_contents'].apply(basic_clean)

In [15]:
# df

In [16]:
# df['tokenize'] = df['readme_contents'].apply(tokenize)
# df['stem'] = df['readme_contents'].apply(stem)

In [17]:
# df

In [18]:
# df['no_stopwords'] = df['readme_contents'].apply(remove_stopwords)

In [19]:
# df

In [20]:
def prep_gh_data(df, column, extra_words=None, exclude_words=None):
    '''
    This function takes in a df and the string name for a text column, with the
    option to pass lists for extra_words and exclude_words.

    - df: DataFrame holding the text; it is modified in place (the columns
      'original', 'clean', 'stemmed' and 'lemmatized' are added to it)
    - column: string name of the text column to prepare
    - extra_words: optional iterable of additional words to treat as stop words
    - exclude_words: optional iterable of standard stop words that should be kept

    Returns a df with the identifying column(s), the original text, the cleaned &
    tokenized text with stopwords removed, and the stemmed and lemmatized
    versions of that cleaned text. The identifying column is 'title' when the df
    has one; otherwise whichever of 'repo' and 'language' are present.
    '''
    # original text from the requested column (not a hard-coded column name)
    df['original'] = df[column]

    # build the stop word set once for the whole frame instead of once per row;
    # None is treated the same as an empty collection
    stopword_set = set(stopwords.words('english'))
    stopword_set -= set(exclude_words or [])
    stopword_set |= set(extra_words or [])

    def _drop_stopwords(text):
        # keep only the words that are not in the stop word set
        return ' '.join(w for w in text.split() if w not in stopword_set)

    # chain together clean, tokenize, remove stopwords
    df['clean'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(_drop_stopwords)

    # stem the cleaned text
    df['stemmed'] = df['clean'].apply(stem)

    # lemmatize the cleaned text
    df['lemmatized'] = df['clean'].apply(lemmatize)

    # selecting a column that does not exist raises KeyError, so only ask for
    # identifying columns the df actually has
    if 'title' in df.columns:
        id_columns = ['title']
    else:
        id_columns = [c for c in ('repo', 'language') if c in df.columns]

    return df[id_columns + ['original', 'clean', 'stemmed', 'lemmatized']]

In [21]:
#prep_gh_data(df, df.readme_contents, extra_words=['r', 'u', '2', 'ltgt'], exclude_words=[])

In [22]:
# p.clean_text('readme_contents', extra_stopwords=['r', 'u', '2', 'ltgt'])

In [23]:

# Collapse every language other than the three of interest into 'Other'.
# A single boolean-mask .loc assignment writes to df itself; the chained
# df.iloc[i]['language'] = ... form can end up assigning to a temporary copy,
# and it also treated the iterrows index label as a row position.
keep_languages = ['Python', 'JavaScript', 'C++']
# astype(str) mirrors the str(...) comparison, so missing values also become 'Other'
df.loc[~df['language'].astype(str).isin(keep_languages), 'language'] = 'Other'

In [24]:
# Display the frame after the language relabeling above
df

Unnamed: 0,repo,language,readme_contents
0,bitcoin/bitcoin,C++,Bitcoin Core integration/staging tree\n=======...
1,bitcoinbook/bitcoinbook,Other,Code Examples: ![travis_ci](https://travis-ci....
2,bitcoinj/bitcoinj,Other,image:https://github.com/bitcoinj/bitcoinj/wor...
3,bitcoin/bips,Other,"People wishing to submit BIPs, first should pr..."
4,bitcoinjs/bitcoinjs-lib,Other,# BitcoinJS (bitcoinjs-lib)\n[![Github CI](htt...
...,...,...,...
245,Bitcoin-com/paperwallet.bitcoin.com,Other,# Bitcoin.com Paper Wallet\n\nThe Bitcoin.com ...
246,DeFiCh/ain,C++,[![Lint](https://github.com/DeFiCh/ain/actions...
247,rustyrussell/bitcoin-iterate,Other,This is some fast code to iterate over bitcoin...
248,gcarq/rusty-blockparser,Other,# rusty-blockparser\n\n[![Build Status](https:...


In [25]:
# Check the frame dimensions again after the relabeling
df.shape

(250, 3)