In [1]:
from requests import get
from bs4 import BeautifulSoup
import os
import pandas as pd

import matplotlib.pyplot as plt

import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer


from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import confusion_matrix

import prepare_readme as p

In [2]:
# Load the prepared README dataset through the project-local prepare_readme module.
# The preview output in this notebook shows a stemmed-text column
# (readme_contents_stemmed) and a language label column.
df = p.prep_readme()

In [14]:
# Preview the first rows to sanity-check the loaded columns.
df.head()

Unnamed: 0,readme_contents_stemmed,language
0,welcom warn book finish still work chapter com...,CSS
1,monorepo deeplearn j welcom new monorepo deepl...,Java
2,h join chat http gitter im h oai h http badg g...,Java
3,div align center img src http www tensorflow o...,C++
4,grpc rpc librari framework grpc modern open so...,C++


In [3]:
# Count READMEs per language; dropna=False keeps a NaN bucket so missing labels would be visible.
# NOTE(review): df_v is not referenced again in the visible cells — confirm it is still needed.
df_v = df.language.value_counts(dropna=False)

In [4]:
# Class balance of the target: absolute count and share of READMEs per language.
language_counts = df.language.value_counts()
language_shares = df.language.value_counts(normalize=True)

labels = pd.concat([language_counts, language_shares], axis=1)
labels.columns = ['n', 'percent']
labels

Unnamed: 0,n,percent
HTML,20,0.190476
C++,20,0.190476
Python,15,0.142857
JavaScript,10,0.095238
Java,10,0.095238
Shell,10,0.095238
CSS,10,0.095238
C,10,0.095238


In [None]:
# words that are programming languages only
# vectorize n-grams

In [30]:
# Apply `clean` to every stemmed README; the captured output shows one list of tokens per row.
# NOTE(review): `clean` is neither defined nor imported in the visible cells, so this cell
# looks like it depends on leftover kernel state (a deleted cell, or possibly a helper
# meant to come from prepare_readme) — confirm before relying on Restart & Run All.
df.readme_contents_stemmed.apply(clean)

0      [welcom, warn, book, finish, still, work, chap...
1      [monorepo, deeplearn, j, welcom, new, monorepo...
2      [h, join, chat, http, gitter, im, h, oai, h, h...
3      [div, align, center, img, src, http, www, tens...
4      [grpc, rpc, librari, framework, grpc, modern, ...
5      [trader, trader, p, align, center, img, src, h...
6      [toaruo, toaruo, hobbyist, educ, unix, like, o...
7      [jumpserv, python, http, img, shield, io, badg...
8      [img, src, http, github, com, dianp, cat, raw,...
9      [simpl, python, version, manag, pyenv, join, c...
10     [kjyw, kjyw, shell, nginxmysqlphpredisnagio, l...
11     [div, align, center, href, http, gitstar, rank...
12     [p, align, center, href, http, gdbgui, com, im...
13     [openag, asset, logo, banner, png, http, opena...
14     [japronto, irc, japronto, http, img, shield, i...
15     [p, align, center, img, src, http, raw, github...
16     [js, beautifi, build, statu, http, dev, azur, ...
17     [kodi, logo, doc, resour

In [49]:
def get_words(text):
    """Strip punctuation from ``text`` and return its whitespace-separated words.

    Every character that is neither a word character (letter, digit,
    underscore) nor whitespace is removed before the text is split.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    return without_punctuation.split()

In [52]:
def get_ngrams(n=2, texts=None):
    """Collect the word n-grams of every document into one flat list.

    Parameters
    ----------
    n : int, default 2
        Size of each n-gram.
    texts : iterable of str, optional
        Documents to process. When omitted, the notebook-level
        ``df.readme_contents_stemmed`` column is used, which is what this
        function always read before the parameter existed.

    Returns
    -------
    list of tuple
        The n-grams of all documents, in document order.
    """
    if texts is None:
        # Backward-compatible default: look up the global frame at call time.
        texts = df.readme_contents_stemmed

    ngram_master_list = []

    for text in texts:
        # Split each document into words, then slide a window of size n over them.
        ngram_master_list.extend(nltk.ngrams(get_words(text), n))

    return ngram_master_list

In [55]:
def get_ngram_strings(ngrams):
    """Join the tokens of each n-gram into one space-separated string.

    Parameters
    ----------
    ngrams : iterable of sequences
        N-grams, e.g. the tuples produced by ``get_ngrams``.

    Returns
    -------
    list of str
        One string per n-gram, e.g. ``('join', 'chat')`` -> ``'join chat'``.
    """
    # Join every token instead of only the first two, so trigrams and longer
    # n-grams are no longer silently truncated to a bigram. str() mirrors the
    # formatting the previous f-string applied to each token.
    return [" ".join(str(token) for token in gram) for gram in ngrams]

In [56]:
# Bigram TF-IDF features with ONE row per README, so X lines up with y.
# Fitting on the flat, corpus-wide list of bigram strings produced one row per
# bigram (X.shape[0] != len(y)), and the default analyzer split every
# "w1 w2" string back into single words. A callable analyzer lets the
# vectorizer extract the bigrams of each document itself.
def _bigram_analyzer(doc):
    """Return the space-joined word bigrams of a single README string."""
    return get_ngram_strings(nltk.ngrams(get_words(doc), 2))


tfidf = TfidfVectorizer(analyzer=_bigram_analyzer)
X = tfidf.fit_transform(df.readme_contents_stemmed)
y = df.language

In [13]:
# Flatten the tokens of every README labelled 'HTML' into one list.
tokenized_readmes = df.readme_contents_stemmed.apply(p.tokenize)
html_rows = tokenized_readmes[df.language == 'HTML']

HTML_list = [token for tokens in html_rows for token in tokens]

print(HTML_list)

['note', 'creat', 'releas', 'bumpvers', 'major', 'minor', 'patch', 'git', 'push', 'git', 'push', 'tag', 'python', 'setup', 'py', 'sdist', 'upload', 'convert', 'releas', 'http', 'github', 'com', 'deanmalmgren', 'textract', 'releas', 'textract', 'extract', 'text', 'document', 'muss', 'fuss', 'full', 'document', 'http', 'textract', 'readthedoc', 'org', 'build', 'statu', 'version', 'download', 'test', 'coverag', 'document', 'statu', 'updat', 'star', 'fork', 'build', 'statu', 'imag', 'http', 'travi', 'ci', 'org', 'deanmalmgren', 'textract', 'svg', 'branch', 'master', 'target', 'http', 'travi', 'ci', 'org', 'deanmalmgren', 'textract', 'version', 'imag', 'http', 'img', 'shield', 'io', 'pypi', 'v', 'textract', 'svg', 'target', 'http', 'warehous', 'python', 'org', 'project', 'textract', 'download', 'imag', 'http', 'img', 'shield', 'io', 'pypi', 'dm', 'textract', 'svg', 'target', 'http', 'warehous', 'python', 'org', 'project', 'textract', 'test', 'coverag', 'imag', 'http', 'coveral', 'io', 'repo

In [6]:
def tokenize(article):
    '''
    Run NLTK's ToktokTokenizer over ``article``.

    The tokenizer is called with return_str=True, so the result is whatever
    string form ToktokTokenizer produces rather than a list of tokens.
    '''
    toktok = nltk.tokenize.ToktokTokenizer()
    return toktok.tokenize(article, return_str=True)

# TODO: try a new SVC or decision tree

In [7]:
# Restrict df to the feature column (stemmed README text) and the target column (language).
# Note: this rebinds `df`, so earlier cells that displayed the frame now show a stale view.
df = df[['readme_contents_stemmed','language']]

In [8]:
# Vectorize the stemmed README contents with TF-IDF and assign the X and y variables.
# This rebinds tfidf / X / y from the earlier n-gram cell.
# NOTE(review): the vectorizer is fitted on the full dataset before the train/test
# split below, so test documents influence the learned vocabulary and IDF weights —
# consider splitting the raw text first and fitting on the training portion only.
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df.readme_contents_stemmed)
y = df.language

In [9]:
# Split the data into train (80%) and test (20%) sets, stratified on the language
# label so both sets keep the class proportions.
# random_state pins the shuffle so the split — and every metric computed from it —
# is reproducible across Restart & Run All; 123 matches the classifier's seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=.2, random_state=123
)

In [10]:
# Create a shallow decision tree (entropy criterion, max_depth=3) with a fixed
# random_state, then fit it on the training split.
# The fitted estimator is the cell's last expression, so its repr is displayed.
clf = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=123)
clf.fit(X_train, y_train)


DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=123,
            splitter='best')

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=123, splitter='best')