# Goals

 1) Construct a web scraper capable of gathering data from github repositories. Data collected should include the main programing language used in that repository, and the contents of that repository’s readme file.

 2) Use natural language processing to develop a model to predict each repositories programming language based on the contents of that repository’s readme file.

# Imports

In [1]:
import re
import unicodedata
import pandas as pd
import nltk

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

import acquire as a # produces output sometimes
import prepare as p
import explore as e

/xtaci/algorithms
/xtaci/algorithms
/xtaci/algorithms
/zaphoyd/websocketpp
/lballabio/QuantLib
/miao1007/Openwrt-NetKeeper
/chriskohlhoff/asio
/tencent-wechat/libco
/ethereum/aleth
/eranif/codelite
/Project-OSRM/osrm-backend
/open-source-parsers/jsoncpp
/ShivamSarodia/ShivyC
/ShivamSarodia/ShivyC
/ShivamSarodia/ShivyC
/nok/sklearn-porter
/aitjcize/cppman
/pytries/marisa-trie
/HouJP/kaggle-quora-question-pairs
/pseudo-lang/pseudo
/PaulSec/twittor
/mehulj94/BrainDamage
/annoviko/pyclustering
/0xgalz/Virtuailor
/FooBarWidget/boyer-moore-horspool
/FooBarWidget/boyer-moore-horspool
/FooBarWidget/boyer-moore-horspool
/honghuachen/Usdk
/cplusplus/fundamentals-ts
/lucidspriter/SpriterPlusPlus
/ljw1004/csharpspec
/cliftonm/MerkleTree
/danieldickison/CocoaOSC
/liufeihong/Hyper-Upload-Server
/boutell/cgic
/kennethreitz/context


# Acqusition

In [2]:
# Calles functions in acquire.py to scrape git hub repositories and create a dictionary containing the repo url,
# language, and readme contents of each folder.

# Preperation

In [3]:
# Calls functions in prepare.py. Functions convert the dictionaries in the json file into a data frame. 
# Then add two columns to the data frame, applying basic cleaning algorithms to the readme contents. 
# Stemming is then applied to one of those columns. Lemmatize is applied to the other.
# Functions also drop rows that did not produce usable data
df = p.prep_readme()

# Exploration

In [4]:
# First peek at prepared data.
df.head()

Unnamed: 0,language,readme_contents,repo,readme_contents_stemmed,readme_contents_lemmatized
0,CSS,# Welcome\n\n> **Warning**: this book is **not...,/nakov/Practical-Cryptography-for-Developers-Book,welcom warn book finish still work chapter com...,welcome warning book finished still working ch...
1,Java,# Monorepo of Deeplearning4j\n\nWelcome to the...,/eclipse/deeplearning4j,monorepo deeplearn j welcom new monorepo deepl...,monorepo deeplearning j welcome new monorepo d...
2,Java,# H2O\n\n[![Join the chat at https://gitter.im...,/h2oai/h2o-3,h join chat http gitter im h oai h http badg g...,h join chat http gitter im h oai h http badge ...
3,C++,"<div align=""center"">\n <img src=""https://www....",/tensorflow/tensorflow,div align center img src http www tensorflow o...,div align center img src http www tensorflow o...
4,C++,gRPC - An RPC library and framework\n=========...,/grpc/grpc,grpc rpc librari framework grpc modern open so...,grpc rpc library framework grpc modern open so...


In [5]:
# dropping repo column. The goal is to predict language using readme contents so repo it is not useful
df.drop(columns='repo',inplace=True)

In [6]:
# dropping readme_contents. The column was there for comparison to insure prepre functions worked as expected.
# It is no longer needed.
df.drop(columns='readme_contents',inplace=True)

In [7]:
df.head()

Unnamed: 0,language,readme_contents_stemmed,readme_contents_lemmatized
0,CSS,welcom warn book finish still work chapter com...,welcome warning book finished still working ch...
1,Java,monorepo deeplearn j welcom new monorepo deepl...,monorepo deeplearning j welcome new monorepo d...
2,Java,h join chat http gitter im h oai h http badg g...,h join chat http gitter im h oai h http badge ...
3,C++,div align center img src http www tensorflow o...,div align center img src http www tensorflow o...
4,C++,grpc rpc librari framework grpc modern open so...,grpc rpc library framework grpc modern open so...


### Data Dictionary

# 1) language - main programing language used by the repository add how this was determined

2) readme_contents_stemmed - contents of readme file cleaned and stemed

3) readme_contents_lemmatized - contents of readme file cleaned and lemmatized

#### looking at initial data statistics

In [8]:
df.shape

(105, 3)

In [9]:
# shows the number and percent of repositories that represent each language
labels = pd.concat([df.language.value_counts(),
                    df.language.value_counts(normalize=True)], axis=1)
labels.columns = ['number', 'percent']
labels

Unnamed: 0,number,percent
C++,20,0.190476
HTML,20,0.190476
Python,15,0.142857
Java,10,0.095238
JavaScript,10,0.095238
C,10,0.095238
Shell,10,0.095238
CSS,10,0.095238


In [12]:
# gets a list of words tied to each language
CPP_words = e.word_soup(' '.join(df[df.language == 'C++'].readme_contents_lemmatized))
HTML_words = e.word_soup(' '.join(df[df.language == 'HTML'].readme_contents_lemmatized))
Python_words = e.word_soup(' '.join(df[df.language == 'Python'].readme_contents_lemmatized))
CSS_words = e.word_soup(' '.join(df[df.language == 'CSS'].readme_contents_lemmatized))
C_words = e.word_soup(' '.join(df[df.language == 'C'].readme_contents_lemmatized))
Java_words = e.word_soup(' '.join(df[df.language == 'Java'].readme_contents_lemmatized))
JavaScript_words = e.word_soup(' '.join(df[df.language == 'JavaScript'].readme_contents_lemmatized))
Shell_words = e.word_soup(' '.join(df[df.language == 'Shell'].readme_contents_lemmatized))
all_words = e.word_soup(' '.join(df.readme_contents_lemmatized))

In [13]:
# gets frequency of each word tied to each language
CPP_freq = pd.Series(CPP_words).value_counts()
HTML_freq = pd.Series(HTML_words).value_counts()
Python_freq = pd.Series(Python_words).value_counts()
CSS_freq = pd.Series(CSS_words).value_counts()
C_freq = pd.Series(C_words).value_counts()
Java_freq = pd.Series(Java_words).value_counts()
JavaScript_freq = pd.Series(JavaScript_words).value_counts()
Shell_freq = pd.Series(Shell_words).value_counts()
all_freq = pd.Series(all_words).value_counts()

In [14]:
# creates a data frame that shows the frequency of each word across each language
word_counts = (pd.concat([all_freq,CPP_freq,HTML_freq,Python_freq,CSS_freq,C_freq,Java_freq,JavaScript_freq,Shell_freq], axis=1, sort=True)
                .set_axis(['all', 'C++', 'HTML', 'Python', 'CSS', 'C', 'Java', 'JavaScript', 'Shell'], axis=1, inplace=False)
                .fillna(0)
                .apply(lambda s: s.astype(int)))

In [15]:
word_counts.head()

Unnamed: 0,all,C++,HTML,Python,CSS,C,Java,JavaScript,Shell
aa,14,1,3,0,0,0,0,10,0
aaa,5,0,0,0,0,0,0,1,4
aaaaaelftksuqmcc,1,1,0,0,0,0,0,0,0
aaaaaxrstlmaqobyzgaaaafis,1,1,0,0,0,0,0,0,0
aaaaikleqvqi,1,1,0,0,0,0,0,0,0


In [16]:
# shows the 10 most frequently occuring words
word_counts.sort_values(by='all', ascending=False).head(10)

Unnamed: 0,all,C++,HTML,Python,CSS,C,Java,JavaScript,Shell
http,4434,710,421,1385,258,136,357,855,312
com,2926,418,290,955,193,45,220,615,190
github,1538,244,80,613,149,34,177,112,129
python,1189,73,85,550,53,63,48,43,274
org,714,177,79,261,27,42,75,27,26
www,693,63,123,193,34,7,31,188,54
c,657,60,74,128,15,52,77,162,89
href,602,31,402,107,3,0,24,34,1
md,600,51,185,43,12,15,61,179,54
x,513,31,15,294,20,22,50,16,65


In [17]:
# shows words that are unique to each language
Shell_unique = word_counts[(word_counts['C++'] == 0) & (word_counts['HTML'] == 0) &(word_counts['Python'] == 0) & (word_counts['CSS'] == 0) & (word_counts['C'] == 0) & (word_counts['Java'] == 0) & (word_counts['JavaScript'] == 0)].sort_values(by='Shell', ascending=False)
CPP_unique = word_counts[(word_counts['Shell'] == 0) & (word_counts['HTML'] == 0) &(word_counts['Python'] == 0) & (word_counts['CSS'] == 0) & (word_counts['C'] == 0) & (word_counts['Java'] == 0) & (word_counts['JavaScript'] == 0)].sort_values(by='C++', ascending=False)
HTML_unique = word_counts[(word_counts['C++'] == 0) & (word_counts['Shell'] == 0) &(word_counts['Python'] == 0) & (word_counts['CSS'] == 0) & (word_counts['C'] == 0) & (word_counts['Java'] == 0) & (word_counts['JavaScript'] == 0)].sort_values(by='HTML', ascending=False)
Python_unique = word_counts[(word_counts['C++'] == 0) & (word_counts['HTML'] == 0) &(word_counts['Shell'] == 0) & (word_counts['CSS'] == 0) & (word_counts['C'] == 0) & (word_counts['Java'] == 0) & (word_counts['JavaScript'] == 0)].sort_values(by='Python', ascending=False)
CSS_unique = word_counts[(word_counts['C++'] == 0) & (word_counts['HTML'] == 0) &(word_counts['Python'] == 0) & (word_counts['Shell'] == 0) & (word_counts['C'] == 0) & (word_counts['Java'] == 0) & (word_counts['JavaScript'] == 0)].sort_values(by='CSS', ascending=False)
C_unique = word_counts[(word_counts['C++'] == 0) & (word_counts['HTML'] == 0) &(word_counts['Python'] == 0) & (word_counts['CSS'] == 0) & (word_counts['Shell'] == 0) & (word_counts['Java'] == 0) & (word_counts['JavaScript'] == 0)].sort_values(by='C', ascending=False)
Java_unique = word_counts[(word_counts['C++'] == 0) & (word_counts['HTML'] == 0) &(word_counts['Python'] == 0) & (word_counts['CSS'] == 0) & (word_counts['C'] == 0) & (word_counts['Shell'] == 0) & (word_counts['JavaScript'] == 0)].sort_values(by='Java', ascending=False)
JavaScript_unique = word_counts[(word_counts['C++'] == 0) & (word_counts['HTML'] == 0) &(word_counts['Python'] == 0) & (word_counts['CSS'] == 0) & (word_counts['C'] == 0) & (word_counts['Java'] == 0) & (word_counts['Shell'] == 0)].sort_values(by='JavaScript', ascending=False)

In [18]:
# idea create list of stop words that are all values except unique-to-language words from train group then
# try to predict test group

In [19]:
# looking at bigrams
# shows bigrams for data set
Shell_bigrams = pd.Series(nltk.ngrams(Shell_words, 2))
CPP_bigrams = pd.Series(nltk.ngrams(CPP_words, 2))
HTML_bigrams = pd.Series(nltk.ngrams(HTML_words, 2))
Python_bigrams = pd.Series(nltk.ngrams(Python_words, 2))
CSS_bigrams = pd.Series(nltk.ngrams(CSS_words, 2))
C_bigrams = pd.Series(nltk.ngrams(C_words, 2))
Java_bigrams = pd.Series(nltk.ngrams(Java_words, 2))
JavaScript_bigrams = pd.Series(nltk.ngrams(JavaScript_words, 2))

In [20]:
# Idea Vectorize using bigrams...ngrams

In [21]:
# Idea Could try looking at frequency of bigrams

In [22]:
Shell_bigrams_freq = pd.Series(Shell_bigrams).value_counts()

In [23]:
Shell_bigrams_freq.head()

(github, com)          103
(http, github)         102
(http, www)             52
(passenger, docker)     48
(pyenv, virtualenv)     45
dtype: int64

In [24]:
# Decided to drop readme_contents_stemmed. Limmitizing is reputed to the most accurate and because the data set is
# small the trade of in quicker computation time is negligable. There is also insufficient time to explore the 
# stem option any further.
df.drop(columns='readme_contents_stemmed',inplace=True)

In [25]:
df.head()

Unnamed: 0,language,readme_contents_lemmatized
0,CSS,welcome warning book finished still working ch...
1,Java,monorepo deeplearning j welcome new monorepo d...
2,Java,h join chat http gitter im h oai h http badge ...
3,C++,div align center img src http www tensorflow o...
4,C++,grpc rpc library framework grpc modern open so...


# Modeling

In [36]:
# Create baseline model

# vectorize stemmed readme contents and assign X any y veriables
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df.readme_contents_lemmatized)
y = df.language

# Split Data into test and train
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size = .2, random_state = 123)
    
# Create Random Forest object and fit it to the data
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=3,
                            n_estimators=100,
                            max_depth=3, 
                            random_state=123)
        
rf.fit(X_train, y_train)

# print accuracy for train and test data sets
print('Accuracy of random forest classifier on training set: {:.2f}'
      .format(rf.score(X_train, y_train)))
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_test, y_test)))

Accuracy of random forest classifier on training set: 0.69
Accuracy of random forest classifier on training set: 0.29


Because there are 8 total languages we would expect an accuracy rating of .13 (rounded).

The Baseline model performed better than chance at .29

In [39]:
# Try baseline model using bigrams

# vectorize stemmed readme contents and assign X any y veriables
tfidf = TfidfVectorizer(ngram_range=(2, 2))
X = tfidf.fit_transform(df.readme_contents_lemmatized)
y = df.language

# Split Data into test and train
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size = .2, random_state = 123)
    
# Create Random Forest object and fit it to the data
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=3,
                            n_estimators=100,
                            max_depth=3, 
                            random_state=123)
        
rf.fit(X_train, y_train)

# print accuracy for train and test data sets
print('Accuracy of random forest classifier on training set: {:.2f}'
      .format(rf.score(X_train, y_train)))
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_test, y_test)))

Accuracy of random forest classifier on training set: 0.39
Accuracy of random forest classifier on training set: 0.29


Accuracy of model does not improve with the use of bigrams

In [40]:
# Try baseline model using trigrams

# vectorize stemmed readme contents and assign X any y veriables
tfidf = TfidfVectorizer(ngram_range=(3, 3))
X = tfidf.fit_transform(df.readme_contents_lemmatized)
y = df.language

# Split Data into test and train
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size = .2, random_state = 123)
    
# Create Random Forest object and fit it to the data
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=3,
                            n_estimators=100,
                            max_depth=3, 
                            random_state=123)
        
rf.fit(X_train, y_train)

# print accuracy for train and test data sets
print('Accuracy of random forest classifier on training set: {:.2f}'
      .format(rf.score(X_train, y_train)))
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_test, y_test)))

Accuracy of random forest classifier on training set: 0.33
Accuracy of random forest classifier on training set: 0.38


Accuracy of model impoves to .38 using trigrams

In [41]:
# Try baseline model using Tetragrams

# vectorize stemmed readme contents and assign X any y veriables
tfidf = TfidfVectorizer(ngram_range=(4, 4))
X = tfidf.fit_transform(df.readme_contents_lemmatized)
y = df.language

# Split Data into test and train
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size = .2, random_state = 123)
    
# Create Random Forest object and fit it to the data
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=3,
                            n_estimators=100,
                            max_depth=3, 
                            random_state=123)
        
rf.fit(X_train, y_train)

# print accuracy for train and test data sets
print('Accuracy of random forest classifier on training set: {:.2f}'
      .format(rf.score(X_train, y_train)))
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_test, y_test)))

Accuracy of random forest classifier on training set: 0.35
Accuracy of random forest classifier on training set: 0.33


In [42]:
# Try baseline model using the two most accurate grams (trigrams and tetragrams)

# vectorize stemmed readme contents and assign X any y veriables
tfidf = TfidfVectorizer(ngram_range=(4, 4))
X = tfidf.fit_transform(df.readme_contents_lemmatized)
y = df.language

# Split Data into test and train
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size = .2, random_state = 123)
    
# Create Random Forest object and fit it to the data
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=3,
                            n_estimators=100,
                            max_depth=3, 
                            random_state=123)
        
rf.fit(X_train, y_train)

# print accuracy for train and test data sets
print('Accuracy of random forest classifier on training set: {:.2f}'
      .format(rf.score(X_train, y_train)))
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_test, y_test)))

Accuracy of random forest classifier on training set: 0.35
Accuracy of random forest classifier on training set: 0.33


### Model using only tetragrams has shown the best result .38

In [43]:
# Create model using go-words

# Split Data into test and train
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size = .2, random_state = 123)