In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import requests
from bs4 import BeautifulSoup
import unicodedata
import re

import os
import acquire
import prepare

import time

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

In [2]:
# set default style for charts
plt.rc('figure', figsize=(13, 7))
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and the old
# names were dropped afterwards, so apply whichever spelling this install provides
for style_name in ('seaborn-v0_8-dark-palette', 'seaborn-dark-palette'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break

In [3]:
# lift the row limit so displayed frames and series are never truncated
pd.options.display.max_rows = None

# Scrape

Here I will be writing the code to scrape github repository links

In [None]:
link = 'https://github.com/search?p=1&q=bot&type=Repositories'

# request the first page of search results and save the html response to a variable
response = requests.get(
    link,
    headers={'user-agent': 'codeup data science hopper cohort'},
    timeout=30,  # do not let a stalled connection hang the kernel
)
# fail loudly on a rate-limit or error response instead of parsing it as if it were results
response.raise_for_status()
html = response.text
# preview only the start of the page; the full document is very large
html[:1000]

In [None]:
# create soup object; name the parser explicitly so the parse tree does not depend on
# which optional parser libraries happen to be installed (and to avoid the bs4 warning)
soup = BeautifulSoup(html, 'html.parser')
# preview the start of the formatted document rather than printing the whole page
print(soup.prettify()[:2000])

In [None]:
# collect every <a> tag that carries the 'v-align-middle' class
repo_anchor_selector = 'a.v-align-middle'
repos = soup.select(repo_anchor_selector)
repos

In [None]:
# pull the value of the 'href' attribute out of each selected tag
hrefs = []
for anchor in repos:
    hrefs.append(anchor['href'])
repos = hrefs
repos

In [None]:
# commenting out this cell as it takes a while to run and I now have the csv saved locally

# use a collector list and a for loop to gather github links, pausing for 60 seconds after every 5th page to stay under github's request limits

# collector = []

# for i in range(31,61):
#     link = 'https://github.com/search?p={}&q=bot&type=Repositories'
#     response = requests.get(link.format(i), headers={'user-agent': 'codeup data science hopper cohort'})
#     print(response.status_code)
#     html = response.text
#     soup = BeautifulSoup(html)
#     repos = soup.select('a.v-align-middle')
#     repo_links = [repo['href'] for repo in repos]
#     collector += repo_links
#     print(collector[-1])

#     if i % 5 == 0:
#         time.sleep(60)
        
# collector

In [None]:
# # check length of collector
# len(collector)

In [None]:
# # use another for loop to remove the forward slash at the beginning of the links
# accumulator = []
# for item in collector:
#     accumulator.append(item[1:])
# accumulator

In [None]:
# # check if there are any duplicates by converting to set and checking the length
# len(set(accumulator))

# Acquire

Pull in data that was cleaned, prepped, and saved to csv by a teammate

In [4]:
# pull in data
# NOTE(review): clean_and_filter_data comes from the project-local `prepare` module; the
# rendered output shows repo, language, readme_contents, clean and lemmatized columns
df = prepare.clean_and_filter_data()
df

Unnamed: 0,repo,language,readme_contents,clean,lemmatized
0,python-discord/bot,Python,# Python Utility Bot\n\n[![Discord][7]][8]\n[!...,python utility bot discord78 lint test12 build...,python utility bot discord78 lint test12 build...
1,microsoft/BotBuilder-Samples,JavaScript,\r\n# ![Bot Framework Samples](./docs/media/Bo...,bot framework samplesdocsmediabotframeworksamp...,bot framework samplesdocsmediabotframeworksamp...
2,GAwesomeBot/bot,JavaScript,# GAwesomeBot\n[![Travis Status](https://travi...,gawesomebot travis statushttpstravisciorggilbe...,gawesomebot travis statushttpstravisciorggilbe...
3,roughike/BottomBar,Java,# BottomBar (Deprecated)\n\nI don't have time ...,bottombar deprecated dont time maintain anymor...,bottombar deprecated dont time maintain anymor...
4,mithun-prasad/Bot,C#,# Developing and Deploying Intelligent Chat Bo...,developing deploying intelligent chat bots tra...,developing deploying intelligent chat bot trai...
5,boto/boto3,Python,===============================\nBoto3 - The A...,boto3 aws sdk python version python license bo...,boto3 aws sdk python version python license bo...
6,howdyai/botkit,TypeScript,# ![Botkit](banner.png)\n\n**Botkit is an open...,botkitbannerpng botkit open source developer t...,botkitbannerpng botkit open source developer t...
7,thinkpixellab/bot,C#,![BOT!](https://github.com/thinkpixellab/bot/r...,bothttpsgithubcomthinkpixellabbotrawmasternet4...,bothttpsgithubcomthinkpixellabbotrawmasternet4...
8,gunthercox/ChatterBot,Python,![ChatterBot: Machine learning in Python](http...,chatterbot machine learning pythonhttpsiimgurc...,chatterbot machine learning pythonhttpsiimgurc...
9,boto/boto,Python,####\nDeprecation notice\n####\n\n**This packa...,deprecation notice package longer maintained r...,deprecation notice package longer maintained r...


In [6]:
# create columns with character and word counts of the lemmatized readme text
df = df.assign(
    character_count=df.lemmatized.str.len(),
    # .str.len() on the token lists passes missing values through as NaN,
    # whereas .apply(len) raises a TypeError on them
    word_count=df.lemmatized.str.split().str.len(),
)
df

Unnamed: 0,repo,language,readme_contents,clean,lemmatized,character_count,word_count
0,python-discord/bot,Python,# Python Utility Bot\n\n[![Discord][7]][8]\n[!...,python utility bot discord78 lint test12 build...,python utility bot discord78 lint test12 build...,983,53
1,microsoft/BotBuilder-Samples,JavaScript,\r\n# ![Bot Framework Samples](./docs/media/Bo...,bot framework samplesdocsmediabotframeworksamp...,bot framework samplesdocsmediabotframeworksamp...,16954,1070
2,GAwesomeBot/bot,JavaScript,# GAwesomeBot\n[![Travis Status](https://travi...,gawesomebot travis statushttpstravisciorggilbe...,gawesomebot travis statushttpstravisciorggilbe...,1756,134
3,roughike/BottomBar,Java,# BottomBar (Deprecated)\n\nI don't have time ...,bottombar deprecated dont time maintain anymor...,bottombar deprecated dont time maintain anymor...,11295,904
4,mithun-prasad/Bot,C#,# Developing and Deploying Intelligent Chat Bo...,developing deploying intelligent chat bots tra...,developing deploying intelligent chat bot trai...,1793,240
5,boto/boto3,Python,===============================\nBoto3 - The A...,boto3 aws sdk python version python license bo...,boto3 aws sdk python version python license bo...,3693,385
6,howdyai/botkit,TypeScript,# ![Botkit](banner.png)\n\n**Botkit is an open...,botkitbannerpng botkit open source developer t...,botkitbannerpng botkit open source developer t...,3007,209
7,thinkpixellab/bot,C#,![BOT!](https://github.com/thinkpixellab/bot/r...,bothttpsgithubcomthinkpixellabbotrawmasternet4...,bothttpsgithubcomthinkpixellabbotrawmasternet4...,1595,182
8,gunthercox/ChatterBot,Python,![ChatterBot: Machine learning in Python](http...,chatterbot machine learning pythonhttpsiimgurc...,chatterbot machine learning pythonhttpsiimgurc...,4092,319
9,boto/boto,Python,####\nDeprecation notice\n####\n\n**This packa...,deprecation notice package longer maintained r...,deprecation notice package longer maintained r...,4255,556


# Split

Use a function to split the data into three data sets (train, validate, and test) before exploring

In [None]:
# split the prepared frame into train, validate, and test sets, then report each set's shape
train, validate, test = prepare.split_data(df)
tuple(subset.shape for subset in (train, validate, test))

# Explore

I will be looking into bigrams and trigrams of the lemmatized readme content by programming language to see if there are any differences or commonalities, and to determine whether those can be used to predict the programming language based on readme content.

In [None]:
# check data: preview the first rows of the training split
train.head()

In [None]:
# count how many training repos fall under each language
train['language'].value_counts()

In [None]:
def _join_lemmas(language):
    """Return the lemmatized text of every train row for `language` as one space-separated string."""
    is_language = train.language == language
    return ' '.join(train.lemmatized[is_language].astype(str))


# combine all lemmatized words by language
python_words = _join_lemmas('Python')
javascript_words = _join_lemmas('JavaScript')
c_words = _join_lemmas('C#')
java_words = _join_lemmas('Java')
typescript_words = _join_lemmas('TypeScript')

In [None]:
def _top_bigrams(words, n=20):
    """Count the bigrams in a whitespace-separated string and return the `n` most frequent."""
    bigram_counts = pd.Series(nltk.ngrams(words.split(), 2)).value_counts()
    return bigram_counts.head(n)


# use ngrams to make a series of the 20 most common bigrams for each language
top20_python_bigrams = _top_bigrams(python_words)
top20_javascript_bigrams = _top_bigrams(javascript_words)
top20_c_bigrams = _top_bigrams(c_words)
top20_java_bigrams = _top_bigrams(java_words)
top20_typescript_bigrams = _top_bigrams(typescript_words)

In [None]:
# tabulate the top python bigrams under their own name so the prepared `df` is not clobbered;
# the columns are built directly from the index and values so the labels do not depend on
# what name the installed pandas version gives the value_counts result
python_bigrams_df = pd.DataFrame({
    'python_bigrams': list(top20_python_bigrams.index),
    'p_count': top20_python_bigrams.to_numpy(),
})
python_bigrams_df.head()

In [None]:
# plot out top 5 python bigrams (sorted ascending so the most frequent bar is drawn on top)
ax = top20_python_bigrams.head().sort_values().plot.barh()
# label through the returned Axes; the trailing ';' hides the repr of the last call
ax.set(title='Top 5 Python Bigrams', ylabel='Bigram', xlabel='# Occurrences');

In [None]:
# plot out top 5 javascript bigrams (sorted ascending so the most frequent bar is drawn on top)
ax = top20_javascript_bigrams.head().sort_values().plot.barh()
# label through the returned Axes; the trailing ';' hides the repr of the last call
ax.set(title='Top 5 JavaScript Bigrams', ylabel='Bigram', xlabel='# Occurrences');

In [None]:
# plot out top 5 C# bigrams (sorted ascending so the most frequent bar is drawn on top)
ax = top20_c_bigrams.head().sort_values().plot.barh()
# label through the returned Axes; the trailing ';' hides the repr of the last call
ax.set(title='Top 5 C# Bigrams', ylabel='Bigram', xlabel='# Occurrences');

In [None]:
# plot out top 5 java bigrams (sorted ascending so the most frequent bar is drawn on top)
ax = top20_java_bigrams.head().sort_values().plot.barh()
# label through the returned Axes; the trailing ';' hides the repr of the last call
ax.set(title='Top 5 Java Bigrams', ylabel='Bigram', xlabel='# Occurrences');

In [None]:
# plot out top 5 typescript bigrams (sorted ascending so the most frequent bar is drawn on top)
ax = top20_typescript_bigrams.head().sort_values().plot.barh()
# label through the returned Axes; the trailing ';' hides the repr of the last call
ax.set(title='Top 5 TypeScript Bigrams', ylabel='Bigram', xlabel='# Occurrences');

In [None]:
def _bigram_frame(top_bigrams, language):
    """Turn one language's bigram counts into a frame with bigram, count, and language columns."""
    # build the columns from the index and values directly so the labels do not depend on
    # how the installed pandas version names the value_counts result
    return pd.DataFrame({
        'bigram': list(top_bigrams.index),
        'count': top_bigrams.to_numpy(),
        'language': language,
    })


# create one dataframe that has all the bigrams, their count, and language
# NOTE: this rebinds `df`, which until now held the prepared readme data
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
df = pd.concat(
    [
        _bigram_frame(top20_python_bigrams, 'python'),
        _bigram_frame(top20_javascript_bigrams, 'javascript'),
        _bigram_frame(top20_c_bigrams, 'c#'),
        _bigram_frame(top20_java_bigrams, 'java'),
        _bigram_frame(top20_typescript_bigrams, 'typescript'),
    ],
    ignore_index=True,
)
df

In [None]:
# check that all 20 for all languages are accounted for
# (5 languages x up to 20 bigrams each, with bigram/count/language columns -> (100, 3) when every language has 20)
df.shape

In [None]:
# check for any duplicate values where bigram is in top 20 for 2 or more languages
dupes = df.bigram[df.bigram.duplicated()]
# keep=False flags every occurrence of a repeated bigram, so all matching rows are selected
# in one step; this replaces the row-by-row DataFrame.append loop (append was removed in
# pandas 2.0) and keeps the count column numeric for plotting
dupes_df = (
    df[df.bigram.duplicated(keep=False)]
    .sort_values('bigram', kind='mergesort')  # stable sort: rows sharing a bigram sit together
    .reset_index(drop=True)
)
dupes_df

In [None]:
# bar plot of the shared bigrams: one group per bigram, one bar per language
sns.barplot(
    data=dupes_df,
    x='bigram',
    y='count',
    hue='language',
)

In [None]:
# character length of each lemmatized readme in the training split
train['lemmatized'].str.len()

In [None]:
# work on a copy of the training split: by this point `df` has been rebound to the bigram
# summary table (bigram, count, language), which has no `lemmatized` column to measure
x = train.copy()
x.head()

In [None]:
# discard every row that has a missing value in any column
x = x.dropna(axis='index', how='any')

In [None]:
# assign() returns a new frame, which avoids setting columns on the result of dropna()
# (a common source of SettingWithCopyWarning)
x = x.assign(
    # count of characters in the lemmatized readme text
    character_count=x.lemmatized.str.len(),
    # count of words in the lemmatized readme text; .str.len() tolerates missing values
    # where .apply(len) would raise a TypeError
    word_count=x.lemmatized.str.split().str.len(),
)

In [None]:
# preview the first rows to confirm the character_count and word_count columns are present
x.head()

# Is there a difference in repository mean word count by language?

In [None]:
# average character and word counts per language, with language kept as a regular column
count_columns = ['character_count', 'word_count']
b = x.groupby('language')[count_columns].mean().reset_index()
b

In [None]:
# the grouped means already have flat column names ('language', 'character_count',
# 'word_count'), so no rename is needed; the previous rename(columns={}) was a no-op
b

In [None]:
# agg('mean') with a single function leaves flat column names, so the y column is plain
# 'word_count' rather than a ('word_count', 'mean') tuple
ax = sns.barplot(x='language', y='word_count', data=b)
ax.set(title='Mean README Word Count by Language', xlabel='Language', ylabel='Mean word count');

In [None]:
# list the distinct languages; `b` has a flat 'language' column, and unique must be called
b['language'].unique()

In [None]:
# `b` has single-level columns, so there is no level 1 to read; list the column names directly
b.columns.tolist()