In [24]:
import re, os
import unicodedata
import json

import pandas as pd
import numpy as np
import seaborn as sns
import seaborn as sns
import matplotlib.pyplot as plt

import nltk
import nltk.sentiment
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from time import strftime

from wordcloud import WordCloud

from pprint import pprint

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer


from requests import get
from bs4 import BeautifulSoup

from wordcloud import WordCloud

import acquire
import acquire_jg
import prepare_jag


plt.rc('figure', figsize=(13, 7))
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# 3.8 removed the old names. Prefer the new name and fall back to the legacy
# one so this cell runs on both old and current Matplotlib installs.
plt.style.use(
    'seaborn-v0_8-darkgrid'
    if 'seaborn-v0_8-darkgrid' in plt.style.available
    else 'seaborn-darkgrid'
)

# Allow up to 200 rows to be shown before pandas truncates displayed output.
pd.set_option('display.max_rows', 200)


---

## Acquire data

In [25]:
""" def get_all_repository_urls(url): # repository url
    # Get max page.
    response = get(url)
    soup = BeautifulSoup(response.text)
    print('Finding max page for repositories...')
    max_page = int(soup.find('div', role='navigation').text[-6])
    print(f'Max page found: {max_page}')
    page = 1
    repository_links = []
    print('Starting loop...')
    for n in range(max_page):
        print(n+1, 'iteration')
        print(f'Pulling data from {url}')
        # Reset soup.
        response = get(url)
        soup = BeautifulSoup(response.text)
        # Get all the repositories from the page.
        repositories = soup.find_all('a', itemprop='name codeRepository') 
        print('Fetching links for repositories...')
        for repo in repositories:
            repository_links.append(repo.get('href'))
        git = 'https://github.com/'
        next_page = soup.find('a', class_='next_page').get('href')[:-1]
        ## Use this line of code to get the url for the next page.
        if page <= 4:
            url = git + next_page + str(page + 1)
            page += 1
        else:
            return repository_links
 """

" def get_all_repository_urls(url): # repository url\n    # Get max page.\n    response = get(url)\n    soup = BeautifulSoup(response.text)\n    print('Finding max page for repositories...')\n    max_page = int(soup.find('div', role='navigation').text[-6])\n    print(f'Max page found: {max_page}')\n    page = 1\n    repository_links = []\n    print('Starting loop...')\n    for n in range(max_page):\n        print(n+1, 'iteration')\n        print(f'Pulling data from {url}')\n        # Reset soup.\n        response = get(url)\n        soup = BeautifulSoup(response.text)\n        # Get all the repositories from the page.\n        repositories = soup.find_all('a', itemprop='name codeRepository') \n        print('Fetching links for repositories...')\n        for repo in repositories:\n            repository_links.append(repo.get('href'))\n        git = 'https://github.com/'\n        next_page = soup.find('a', class_='next_page').get('href')[:-1]\n        ## Use this line of code to get the 

In [26]:
#repository_links = get_all_repository_urls('https://github.com/orgs/apple/repositories')

In [27]:
df = pd.read_json('data.json')

In [28]:
df.language.value_counts()

Swift               65
Python              38
C++                 15
C                   11
JavaScript           2
Java                 2
Jupyter Notebook     2
HTML                 2
Dockerfile           1
Markdown             1
Shell                1
R                    1
Starlark             1
LLVM                 1
Name: language, dtype: int64

Looks like our categories for classification will be Swift, Python, C (combined C & C++), and other. Can make a new column mapping that target.

---

## Prepare Data

- Look for / handle null values
- Add column for target language class
- Add column for clean, lemmatized, etc.
- Split word list by language

In [31]:
df.head()

Unnamed: 0,repo,language,readme_contents
0,apple/llvm-project,,# Apple's fork of llvm-project\n\nThis is Appl...
1,apple/swift-argument-parser,Swift,# Swift Argument Parser\n\n## Usage\n\nBegin b...
2,apple/swift-docc,Swift,# Swift-DocC\n\nSwift-DocC is a documentation ...
3,apple/swift,C++,"<img src=""https://swift.org/assets/images/swif..."
4,apple/sourcekit-lsp,Swift,# SourceKit-LSP\n\nSourceKit-LSP is an impleme...


In [33]:
df.isna().sum()

repo               0
language           8
readme_contents    0
dtype: int64

- [ ] For some reason, my `.isna()` count is missing 4 rows without readmes
  - 114
  - 135
  - 144
  - 150
- Figure out why and drop these

In [34]:
df = prepare_jag.prep_article_data(df, 'original')

In [35]:
df.shape

(151, 7)

In [36]:
df.head(20)

Unnamed: 0,repo,language,target,original,clean,stemmed,lemmatized
0,apple/llvm-project,LLVM,other,# Apple's fork of llvm-project\n\nThis is Appl...,apple ' fork llvmproject apple ' fork llvmproj...,appl ' fork llvmproject thi appl ' fork llvmpr...,apple ' fork llvmproject apple ' fork llvmproj...
1,apple/swift-argument-parser,Swift,swift,# Swift Argument Parser\n\n## Usage\n\nBegin b...,swift argument parser usage begin declaring ty...,swift argument parser usag begin declar type d...,swift argument parser usage begin declaring ty...
2,apple/swift-docc,Swift,swift,# Swift-DocC\n\nSwift-DocC is a documentation ...,swiftdocc swiftdocc documentation compiler swi...,swiftdocc swiftdocc document compil swift fram...,swiftdocc swiftdocc documentation compiler swi...
3,apple/swift,C++,c,"<img src=""https://swift.org/assets/images/swif...",img srchttpsswiftorgassetsimagesswiftsvg altsw...,img srchttpsswiftorgassetsimagesswiftsvg altsw...,img srchttpsswiftorgassetsimagesswiftsvg altsw...
4,apple/sourcekit-lsp,Swift,swift,# SourceKit-LSP\n\nSourceKit-LSP is an impleme...,sourcekitlsp sourcekitlsp implementation langu...,sourcekitlsp sourcekitlsp implement languag se...,sourcekitlsp sourcekitlsp implementation langu...
5,apple/foundationdb,C++,c,"<img alt=""FoundationDB logo"" src=""documentatio...",img altfoundationdb logo srcdocumentationfdblo...,img altfoundationdb logo srcdocumentationfdblo...,img altfoundationdb logo srcdocumentationfdblo...
6,apple/swift-protobuf,Swift,swift,"<img src=""https://swift.org/assets/images/swif...",img srchttpsswiftorgassetsimagesswiftsvg altsw...,img srchttpsswiftorgassetsimagesswiftsvg altsw...,img srchttpsswiftorgassetsimagesswiftsvg altsw...
7,apple/swift-llbuild,C++,c,llbuild\n=======\n\n*A low-level build system....,llbuild lowlevel build system llbuild set libr...,llbuild lowlevel build system llbuild set libr...,llbuild lowlevel build system llbuild set libr...
8,apple/swift-syntax,Swift,swift,# SwiftSyntax\n\nSwiftSyntax is a set of Swift...,swiftsyntax swiftsyntax set swift bindings lib...,swiftsyntax swiftsyntax set swift bind libsynt...,swiftsyntax swiftsyntax set swift binding libs...
9,apple/swift-package-manager,Swift,swift,# Swift Package Manager Project\n\nThe Swift P...,swift package manager project swift package ma...,swift packag manag project swift packag manag ...,swift package manager project swift package ma...


Check that the binned totals for 'other' and for the combined C / C++ class add up...

In [37]:
df.language.value_counts()

Swift               69
Python              38
C++                 15
C                   12
JavaScript           3
LLVM                 2
Java                 2
Jupyter Notebook     2
HTML                 2
Dockerfile           1
Markdown             1
Shell                1
R                    1
Starlark             1
Name: language, dtype: int64

In [38]:
df.target.value_counts()

swift     69
python    38
c         27
other     17
Name: target, dtype: int64

Things add up... 

In [39]:
def categorise(row):
    '''
    Assign a language-family target category based on the language pulled
    from the query (or manually added).

    Parameters
    ----------
    row : mapping-like
        Anything supporting ``row['language']`` (e.g. a DataFrame row passed
        by ``df.apply(..., axis=1)``).

    Returns
    -------
    str
        'swift' for Swift, 'python' for Python, 'c' for either C++ or C,
        and 'other' for every other value (including missing languages).
    '''
    language = row['language']
    if language == 'Swift':
        return 'swift'
    if language == 'Python':
        return 'python'
    # Membership test instead of `== ('C++' or 'C')`: the `or` expression
    # evaluates to just 'C++', which silently sent plain 'C' repos to 'other'.
    if language in ('C++', 'C'):
        return 'c'
    return 'other'

Make a new column with language families for target (this was added to the prepare file, so no need to run again)

In [12]:
# Commented out because added to prepare file
# Apply function above to create new target column
#df['target']= df.apply(lambda row: categorise(row), axis=1)

Any other nulls?

In [40]:
df.isna().sum()

repo          0
language      1
target        0
original      0
clean         0
stemmed       0
lemmatized    0
dtype: int64

In [14]:
df.shape

(151, 7)

In [15]:
# df.dropna(inplace = True)
# df.shape

In [41]:
df[df.original.isnull()]

Unnamed: 0,repo,language,target,original,clean,stemmed,lemmatized


In [42]:
df[df.language.isnull()]

Unnamed: 0,repo,language,target,original,clean,stemmed,lemmatized
123,apple/ml-transcript-translation-consistency-ra...,,other,# Human Ratings of Transcription/Translation C...,human ratings transcriptiontranslation consist...,human rate transcriptiontransl consist thi rep...,human rating transcriptiontranslation consiste...


**FIXED IN PREPARE FILE**

- Look at these 8 repos and see if there is a language that can be manually added
- Otherwise, just classify them as other



In [43]:
# Index labels of the rows whose language is still missing.
nan_languages = list(df.index[df.language.isnull()])

In [44]:
# Render each row that lacks a language as its own single-row frame.
for missing_label in nan_languages:
    label_mask = df.index == missing_label
    display(df.loc[label_mask])

Unnamed: 0,repo,language,target,original,clean,stemmed,lemmatized
123,apple/ml-transcript-translation-consistency-ra...,,other,# Human Ratings of Transcription/Translation C...,human ratings transcriptiontranslation consist...,human rate transcriptiontransl consist thi rep...,human rating transcriptiontranslation consiste...


Manually correct some missing values... Added to prepare.py, so commenting out

In [20]:
# # let's override the languages with the observations noted
# df.language.loc[0] = 'LLVM'
# df.language.loc[13] = 'JavaScript'
# df.language.loc[14] = 'C'
# df.language.loc[83] = 'Swift'
# df.language.loc[139] = 'Swift'
# df.language.loc[145] = 'Swift'
# df.language.loc[149] = 'Swift'

In [21]:
df[df.language.isnull()]

Unnamed: 0,repo,language,target,original,clean,stemmed,lemmatized
123,apple/ml-transcript-translation-consistency-ra...,,other,# Human Ratings of Transcription/Translation C...,human ratings transcriptiontranslation consist...,human rate transcriptiontransl consist thi rep...,human rating transcriptiontranslation consiste...


In [23]:
df[df.target.isnull()]

Unnamed: 0,repo,language,target,original,clean,stemmed,lemmatized


---

## Explore

- Make word lists per language family (swift, python, c, other)
- Look at word frequency by language family
- Check out bigrams and trigrams
- Wordclouds and other visuals