In [1]:
# importing libraries
import numpy as np
import pandas as pd

import acquire
import prepare

### Acquiring the raw data

In [2]:
# acquiring the raw data from acquire module function
# (only raw_data is bound here; binding df to the same object would let later
# edits to df silently change the raw frame)
raw_data = pd.DataFrame(acquire.scrape_github_data(), columns=['repo', 'language', 'readme_contents'])

# previewing raw data
raw_data.head()

Unnamed: 0,repo,language,readme_contents
0,python-discord/bot,Python,# Python Utility Bot\n\n[![Discord][7]][8]\n[!...
1,discord-tickets/bot,JavaScript,[![GitHub stars](https://img.shields.io/github...
2,go-chat-bot/bot,Go,# go-bot\n\n[![Circle CI](https://circleci.com...
3,SuMaiKaDe/bot,Python,\n\n#### 22年2月7日\n - 修改青龙 _id 字段为 id\n#### ...
4,microsoft/BotBuilder-Samples,JavaScript,\r\n# ![Bot Framework Samples](./docs/media/Bo...


In [3]:
# work on an independent (deep) copy so raw_data keeps the values as acquired
df = raw_data.copy(deep=True)

### Cleaning the raw data

In [4]:
# cleaning function

def clean(df, extra_words=None):
    '''
    Add cleaned, stemmed and lemmatized versions of each README to df.

    Three columns are written onto df. The frame passed in is modified in
    place and is also returned, so both `clean(df)` and `df = clean(df)` work.

    - clean:      prepare.basic_clean followed by prepare.tokenize
    - stemmed:    prepare.stem of clean, then prepare.remove_stopwords
    - lemmatized: prepare.lemmatize of clean, then prepare.remove_stopwords

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a readme_contents column; every value in it is handed
        to prepare.basic_clean.
    extra_words : list of str, optional
        Forwarded to prepare.remove_stopwords as its extra_words argument.
        Defaults to ['r', 'u', '2', 'ltgt', '4', 'ur', 'k'].

    Returns
    -------
    pandas.DataFrame
        The same df object with the three columns added (or overwritten).
    '''
    if extra_words is None:
        extra_words = ['r', 'u', '2', 'ltgt', '4', 'ur', 'k']

    def strip_stopwords(text):
        # pass a fresh list on every call so the helper can never alter the
        # list shared between rows
        return prepare.remove_stopwords(text, extra_words=list(extra_words))

    # normalize, then tokenize, every README
    df['clean'] = [
        prepare.tokenize(prepare.basic_clean(text))
        for text in df['readme_contents']
    ]

    # stemmed and lemmatized variants are both derived from the clean column
    df['stemmed'] = [strip_stopwords(prepare.stem(text)) for text in df['clean']]
    df['lemmatized'] = [strip_stopwords(prepare.lemmatize(text)) for text in df['clean']]

    return df

In [6]:
# add the clean / stemmed / lemmatized columns, then preview the first rows
# instead of rendering the whole frame
df = clean(df)
df.head(10)

Unnamed: 0,repo,language,readme_contents,clean,stemmed,lemmatized
0,python-discord/bot,Python,# Python Utility Bot\n\n[![Discord][7]][8]\n[!...,python utility bot\n\ndiscord78\nlint test12\n...,python util bot discord78 lint test12 build34 ...,python utility bot discord78 lint test12 build...
1,discord-tickets/bot,JavaScript,[![GitHub stars](https://img.shields.io/github...,github starshttpsimgshieldsiogithubstarsdiscor...,github starshttpsimgshieldsiogithubstarsdiscor...,github starshttpsimgshieldsiogithubstarsdiscor...
2,go-chat-bot/bot,Go,# go-bot\n\n[![Circle CI](https://circleci.com...,gobot\n\ncircle cihttpscirclecicomghgochatbotb...,gobot circl cihttpscirclecicomghgochatbotbottr...,gobot circle cihttpscirclecicomghgochatbotbott...
3,SuMaiKaDe/bot,Python,\n\n#### 22年2月7日\n - 修改青龙 _id 字段为 id\n#### ...,2227\n _id id\n 21122\n telethon,2227 _id id 21122 telethon,2227 _id id 21122 telethon
4,microsoft/BotBuilder-Samples,JavaScript,\r\n# ![Bot Framework Samples](./docs/media/Bo...,bot framework samplesdocsmediabotframeworksamp...,bot framework samplesdocsmediabotframeworksamp...,bot framework samplesdocsmediabotframeworksamp...
5,GAwesomeBot/bot,JavaScript,# GAwesomeBot\n[![Travis Status](https://travi...,gawesomebot\ntravis statushttpstravisciorggilb...,gawesomebot travi statushttpstravisciorggilber...,gawesomebot travis statushttpstravisciorggilbe...
6,roughike/BottomBar,Java,# BottomBar (Deprecated)\n\nI don't have time ...,bottombar deprecated\n\ni dont have time to ma...,bottombar deprec dont time maintain thi anymor...,bottombar deprecated dont time maintain anymor...
7,mithun-prasad/Bot,C#,# Developing and Deploying Intelligent Chat Bo...,developing and deploying intelligent chat bots...,develop deploy intellig chat bot train resourc...,developing deploying intelligent chat bot trai...
8,boto/boto3,Python,===============================\nBoto3 - The A...,boto3 the aws sdk for python\n\n\nversion gitt...,boto3 aw sdk python version gitter boto3 amazo...,boto3 aws sdk python version gitter boto3 amaz...
9,thinkpixellab/bot,C#,![BOT!](https://github.com/thinkpixellab/bot/r...,bothttpsgithubcomthinkpixellabbotrawmasternet4...,bothttpsgithubcomthinkpixellabbotrawmasternet4...,bothttpsgithubcomthinkpixellabbotrawmasternet4...


## Removing non-English strings

In [7]:
def isEnglish(s):
    '''
    Return True when s is a string made up only of ASCII characters.

    This is a character-set check used as a cheap stand-in for "is English":
    any non-ASCII character (CJK text, accented letters, emoji, curly quotes)
    makes the result False, even if the surrounding text is English.

    Parameters
    ----------
    s : object
        Value to test. Anything that is not a str (e.g. None or NaN for a
        missing README) is reported as False instead of raising.

    Returns
    -------
    bool
        True if s is a str that encodes cleanly as ASCII, otherwise False.
    '''
    # missing values would otherwise raise AttributeError inside Series.apply
    if not isinstance(s, str):
        return False

    try:
        s.encode('ascii')
    except UnicodeEncodeError:
        # raised for every non-ASCII character, lone surrogates included
        return False
    return True

In [13]:
# spot-check the README stored under index label 3
isEnglish(df.loc[3, 'readme_contents'])

False

In [14]:
# boolean mask: True for rows whose README passes isEnglish; the mask is used
# directly rather than compared with `== True`
readme_passes_check = df['readme_contents'].apply(isEnglish).astype(bool)
df[readme_passes_check]

Unnamed: 0,repo,language,readme_contents,clean,stemmed,lemmatized
0,python-discord/bot,Python,# Python Utility Bot\n\n[![Discord][7]][8]\n[!...,python utility bot\n\ndiscord78\nlint test12\n...,python util bot discord78 lint test12 build34 ...,python utility bot discord78 lint test12 build...
2,go-chat-bot/bot,Go,# go-bot\n\n[![Circle CI](https://circleci.com...,gobot\n\ncircle cihttpscirclecicomghgochatbotb...,gobot circl cihttpscirclecicomghgochatbotbottr...,gobot circle cihttpscirclecicomghgochatbotbott...
4,microsoft/BotBuilder-Samples,JavaScript,\r\n# ![Bot Framework Samples](./docs/media/Bo...,bot framework samplesdocsmediabotframeworksamp...,bot framework samplesdocsmediabotframeworksamp...,bot framework samplesdocsmediabotframeworksamp...
5,GAwesomeBot/bot,JavaScript,# GAwesomeBot\n[![Travis Status](https://travi...,gawesomebot\ntravis statushttpstravisciorggilb...,gawesomebot travi statushttpstravisciorggilber...,gawesomebot travis statushttpstravisciorggilbe...
6,roughike/BottomBar,Java,# BottomBar (Deprecated)\n\nI don't have time ...,bottombar deprecated\n\ni dont have time to ma...,bottombar deprec dont time maintain thi anymor...,bottombar deprecated dont time maintain anymor...
7,mithun-prasad/Bot,C#,# Developing and Deploying Intelligent Chat Bo...,developing and deploying intelligent chat bots...,develop deploy intellig chat bot train resourc...,developing deploying intelligent chat bot trai...
8,boto/boto3,Python,===============================\nBoto3 - The A...,boto3 the aws sdk for python\n\n\nversion gitt...,boto3 aw sdk python version gitter boto3 amazo...,boto3 aws sdk python version gitter boto3 amaz...
9,thinkpixellab/bot,C#,![BOT!](https://github.com/thinkpixellab/bot/r...,bothttpsgithubcomthinkpixellabbotrawmasternet4...,bothttpsgithubcomthinkpixellabbotrawmasternet4...,bothttpsgithubcomthinkpixellabbotrawmasternet4...


### Prepare