In [1]:
import re, os
import unicodedata
import json

import pandas as pd
import numpy as np
import seaborn as sns
import seaborn as sns
import matplotlib.pyplot as plt

import nltk
import nltk.sentiment
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from time import strftime

from wordcloud import WordCloud

from pprint import pprint

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer


from requests import get
from bs4 import BeautifulSoup

from wordcloud import WordCloud

import acquire
import acquire_jg
import prepare_jag


# Default figure size for every plot drawn in this notebook.
plt.rc('figure', figsize=(13, 7))

# matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*'
# and later releases dropped the old names, so choose whichever spelling this
# installation actually provides. If neither is present, fall back to the
# original name so the failure surfaces exactly as it did before.
_darkgrid_candidates = [
    style_name
    for style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid')
    if style_name in plt.style.available
]
plt.style.use(_darkgrid_candidates[0] if _darkgrid_candidates else 'seaborn-darkgrid')

---

## Acquire data

In [None]:
# NOTE: this scraper is disabled -- the whole definition is wrapped in a string
# literal, so running the cell defines nothing. The notebook loads data.json
# further down instead.
# Things to check before re-enabling it:
#   * `text[-6]` keeps a single character, so max_page can never exceed 9.
#   * `page <= 4` stops paging after the fifth page regardless of max_page.
#   * if the loop finishes before the `else` branch is reached (max_page < 5),
#     the function falls through and returns None instead of the links.
#   * NOTE(review): `soup.find('a', class_='next_page')` presumably returns None
#     on a page without a "next" link, which would make `.get` fail -- confirm.
""" def get_all_repository_urls(url): # repository url
    # Get max page.
    response = get(url)
    soup = BeautifulSoup(response.text)
    print('Finding max page for repositories...')
    max_page = int(soup.find('div', role='navigation').text[-6])
    print(f'Max page found: {max_page}')
    page = 1
    repository_links = []
    print('Starting loop...')
    for n in range(max_page):
        print(n+1, 'iteration')
        print(f'Pulling data from {url}')
        # Reset soup.
        response = get(url)
        soup = BeautifulSoup(response.text)
        # Get all the repositories from the page.
        repositories = soup.find_all('a', itemprop='name codeRepository') 
        print('Fetching links for repositories...')
        for repo in repositories:
            repository_links.append(repo.get('href'))
        git = 'https://github.com/'
        next_page = soup.find('a', class_='next_page').get('href')[:-1]
        ## Use this line of code to get the url for the next page.
        if page <= 4:
            url = git + next_page + str(page + 1)
            page += 1
        else:
            return repository_links
 """

In [None]:
#repository_links = get_all_repository_urls('https://github.com/orgs/apple/repositories')

In [2]:
# Load the repository records from the local data.json file into a DataFrame.
df = pd.read_json('data.json')

In [3]:
# Number of repositories per reported language, most common first.
df['language'].value_counts()

Swift               65
Python              38
C++                 15
C                   11
JavaScript           2
Java                 2
Jupyter Notebook     2
HTML                 2
Dockerfile           1
Markdown             1
Shell                1
R                    1
Starlark             1
LLVM                 1
Name: language, dtype: int64

Looks like our categories for classification will be Swift, Python, C (combined C & C++), and other. We can add a new column that maps each language to that target.

---

## Prepare Data

- Add column for target language class
- Add column for clean, lemmatized, etc.
- Split word list by language

In [4]:
# Run the project's prepare step, pointing it at the 'original' text column.
# NOTE(review): judging by the frame displayed below, this adds the target,
# clean, stemmed and lemmatized columns -- confirm against prepare_jag.
# `df` is rebound here, so re-running this cell feeds already-prepared data
# back into the helper.
df = prepare_jag.prep_article_data(df, 'original')

In [None]:
# (rows, columns) of the frame after the prepare step.
df.shape

In [7]:
# Display the prepared frame; pandas truncates the middle rows in the output.
df

Unnamed: 0,repo,language,target,original,clean,stemmed,lemmatized
0,apple/llvm-project,LLVM,other,# Apple's fork of llvm-project\n\nThis is Appl...,apple ' fork llvmproject apple ' fork llvmproj...,appl ' fork llvmproject thi appl ' fork llvmpr...,apple ' fork llvmproject apple ' fork llvmproj...
1,apple/swift-argument-parser,Swift,swift,# Swift Argument Parser\n\n## Usage\n\nBegin b...,swift argument parser usage begin declaring ty...,swift argument parser usag begin declar type d...,swift argument parser usage begin declaring ty...
2,apple/swift-docc,Swift,swift,# Swift-DocC\n\nSwift-DocC is a documentation ...,swiftdocc swiftdocc documentation compiler swi...,swiftdocc swiftdocc document compil swift fram...,swiftdocc swiftdocc documentation compiler swi...
3,apple/swift,C++,c,"<img src=""https://swift.org/assets/images/swif...",img srchttpsswiftorgassetsimagesswiftsvg altsw...,img srchttpsswiftorgassetsimagesswiftsvg altsw...,img srchttpsswiftorgassetsimagesswiftsvg altsw...
4,apple/sourcekit-lsp,Swift,swift,# SourceKit-LSP\n\nSourceKit-LSP is an impleme...,sourcekitlsp sourcekitlsp implementation langu...,sourcekitlsp sourcekitlsp implement languag se...,sourcekitlsp sourcekitlsp implementation langu...
...,...,...,...,...,...,...,...
146,apple/ccs-caldavclientlibrary,HTML,other,README for CalDAVClientLibrary\n\nINTRODUCTION...,readme caldavclientlibrary introduction caldav...,readm caldavclientlibrari introduct caldavclie...,readme caldavclientlibrary introduction caldav...
147,apple/ccs-pyosxframeworks,Python,python,Getting Started\n===============\n\nThis is a ...,getting started python library wraps number us...,get start thi python librari wrap number use o...,getting started python library wrap number use...
148,apple/ccs-pysecuretransport,Python,python,Getting Started\n===============\n\nOS X Secur...,getting started os x securetransport cffi base...,get start os x securetransport cffi base api l...,getting started x securetransport cffi based a...
149,apple/swift-protobuf-test-conformance,Swift,swift,"<img src=""https://swift.org/assets/images/swif...",img srchttpsswiftorgassetsimagesswiftsvg altsw...,img srchttpsswiftorgassetsimagesswiftsvg altsw...,img srchttpsswiftorgassetsimagesswiftsvg altsw...


In [None]:
def categorise(row):
    """Map a repository's language to one of the four target classes.

    Parameters
    ----------
    row : mapping or pandas.Series
        Anything indexable by ``'language'``.

    Returns
    -------
    str
        ``'swift'`` for Swift, ``'python'`` for Python, ``'c'`` for either
        C or C++, and ``'other'`` for every remaining value (including
        missing ones).
    """
    language = row['language']
    if language == 'Swift':
        return 'swift'
    if language == 'Python':
        return 'python'
    # A membership test is required here: `== ('C++' or 'C')` collapses to
    # `== 'C++'` because `or` returns its first truthy operand, which sent
    # plain C repositories to 'other'.
    if language in ('C++', 'C'):
        return 'c'
    return 'other'

Make a new column with language families for target (this was added to the prepare file, so no need to run again)

In [None]:
#df['target']= df.apply(lambda row: categorise(row), axis=1)

In [None]:
# Preview the first 20 rows.
df.head(20)

---

## Explore

- Make word lists per language family (swift, python, c, other)
- Look at word frequency by language family
- Check out bigrams and trigrams
- Wordclouds and other visuals

In [None]:
# Rows for which no language was reported.
df[df['language'].isnull()]

- Look at these 8 repos and see if there is a language that can be manually added
- Otherwise, just classify them as other

In [None]:
# Index labels of the rows that have no language recorded.
missing_language_rows = df.loc[df['language'].isnull()]
nan_languages = missing_language_rows.index.tolist()

In [None]:
# Render the row(s) for each missing-language index label as a separate table
# so every repo can be inspected on its own.
for i in nan_languages:
    display(df[df.index == i])

Manually correct some missing values... Added to prepare.py, so commenting out

In [None]:
# # let's override the languages with the observations noted
# df.language.loc[0] = 'LLVM'
# df.language.loc[13] = 'JavaScript'
# df.language.loc[14] = 'C'
# df.language.loc[83] = 'Swift'
# df.language.loc[139] = 'Swift'
# df.language.loc[145] = 'Swift'
# df.language.loc[149] = 'Swift'

In [None]:
# Re-check which rows still have no language.
df[df.language.isnull()]

In [None]:
# Preview the first 20 rows again.
df.head(20)

In [None]:
# Rows whose target class is missing.
df[df.target.isnull()]

Reapply the target column