In [1]:
import os
import json
from typing import Dict, List, Optional, Union, cast
import requests

from acquire import scrape_github_data
from prepare import words

from env import github_token, github_username

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
# Imports for modeling... and stuff

import pandas as pd
from prepare import basic_clean, tokenize, stem, lemmatize, remove_stopwords, prep_article_data, words
from acquire import scrape_github_data

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 250)

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
import re

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split

# Import Decision Tree and Random Forest ;D
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [4]:
#Run scraping function
# data = scrape_github_data()

In [3]:
#Turn scraped data into raw df
# df = pd.DataFrame(data)
df = pd.read_json('data.json')



#Check returned df 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   repo             500 non-null    object
 1   language         470 non-null    object
 2   readme_contents  500 non-null    object
dtypes: object(3)
memory usage: 11.8+ KB


In [4]:
df = words(df)
df.columns

Index(['repo', 'language', 'readme', 'clean', 'stemmed', 'lemmatized',
       'contains_python_keywords', 'contains_cpp_keywords',
       'contains_js_keywords'],
      dtype='object')

In [5]:
df.language.value_counts()

other         277
Python        104
JavaScript     78
C++            41
Name: language, dtype: int64

### Modeling prep

In [7]:
# Make copies of df with prepared columns and target
clean_df = df.copy()[['language', 'clean']]
stem_df = df.copy()[['language', 'stemmed']]
lem_df = df.copy()[['language', 'lemmatized']]


# Get splits for each of the above dfs and isolate target
X_clean = clean_df[['clean']]
y_clean = clean_df.language

X_clean_train, X_clean_test, y_clean_train, y_clean_test = train_test_split(X_clean, y_clean, test_size=.2, random_state=302)
X_clean_train, X_clean_validate, y_clean_train, y_clean_validate  = train_test_split(X_clean_train, y_clean_train, test_size=.3, random_state=302)

print(X_clean_train.shape, X_clean_validate.shape, X_clean_test.shape)

X_stem = stem_df[['stemmed']]
y_stem = stem_df.language

X_stem_train, X_stem_test, y_stem_train, y_stem_test = train_test_split(X_stem, y_stem, test_size=.2, random_state=302)
X_stem_train, X_stem_validate, y_stem_train, y_stem_validate  = train_test_split(X_stem_train, y_stem_train, test_size=.3, random_state=302)

print(X_stem_train.shape, X_stem_validate.shape, X_stem_test.shape)

X_lem = lem_df[['lemmatized']]
y_lem = lem_df.language

X_lem_train, X_lem_test, y_lem_train, y_lem_test = train_test_split(X_lem, y_lem, test_size=.2, random_state=302)
X_lem_train, X_lem_validate, y_lem_train, y_lem_validate  = train_test_split(X_lem_train, y_lem_train, test_size=.3, random_state=302)

print(X_lem_train.shape, X_lem_validate.shape, X_lem_test.shape)




(280, 1) (120, 1) (100, 1)
(280, 1) (120, 1) (100, 1)
(280, 1) (120, 1) (100, 1)


### Modeling

In [8]:
# "Clean" models
cv_clean = CountVectorizer()
tfidf_clean = TfidfVectorizer()

cv_clean_bow = cv_clean.fit_transform(X_clean_train.clean)

tf_clean_bow = tfidf_clean.fit_transform(X_clean_train.clean)


# Check cv bag of words
cv_clean_bow.todense()


matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [9]:
# Check tfidf bag of words
tf_clean_bow.todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [10]:
# Make and fit decision tree object for cv_clean_bow
cv_tree1 = DecisionTreeClassifier(max_depth=5)
cv_tree1.fit(cv_clean_bow, y_clean_train)

#Make and fit decision tree object for tf_clean_bow
tf_tree1 = DecisionTreeClassifier(max_depth=5)
tf_tree1.fit(tf_clean_bow, y_clean_train)

#Output tree scores
print(f'CV tree 1 score: {cv_tree1.score(cv_clean_bow, y_clean_train)}')
print(f'TF IDF tree 1 score: {tf_tree1.score(tf_clean_bow, y_clean_train)}')

CV tree 1 score: 0.825
TF IDF tree 1 score: 0.8464285714285714


In [11]:
# "Stemmed" models
cv_stem = CountVectorizer()
tfidf_stem = TfidfVectorizer()

# Bags
cv_stem_bow = cv_stem.fit_transform(X_stem_train.stemmed)
tf_stem_bow = tfidf_stem.fit_transform(X_stem_train.stemmed)

# Make and fit decision tree object for cv_stem_bow
cv_tree2 = DecisionTreeClassifier(max_depth=5)
cv_tree2.fit(cv_stem_bow, y_stem_train)

#Make and fit decision tree object for tf_stem_bow
tf_tree2 = DecisionTreeClassifier(max_depth=5)
tf_tree2.fit(tf_stem_bow, y_stem_train)


#Get tree score
print(f'CV tree 2 score: {cv_tree2.score(cv_stem_bow, y_stem_train)}')
#Get tree score
print(f'TF IDF tree 2 score: {tf_tree2.score(tf_stem_bow, y_stem_train)}')

CV tree 2 score: 0.8214285714285714
TF IDF tree 2 score: 0.85


In [12]:
# "Lemmatized" models
cv_lem = CountVectorizer()
tfidf_lem = TfidfVectorizer()

cv_lem_bow = cv_lem.fit_transform(X_lem_train.lemmatized)
tf_lem_bow = tfidf_lem.fit_transform(X_lem_train.lemmatized)

# Make and fit decision tree object for cv_lem_bow
cv_tree3 = DecisionTreeClassifier(max_depth=5)
cv_tree3.fit(cv_lem_bow, y_lem_train)

#Make and fit decision tree object for tf_lem_bow
tf_tree3 = DecisionTreeClassifier(max_depth=5)
tf_tree3.fit(tf_lem_bow, y_lem_train)

# Output tree scores
print(f'CV tree score: {cv_tree3.score(cv_lem_bow, y_lem_train)}') 
print(f'TFIDF tree score: {tf_tree3.score(tf_lem_bow, y_lem_train)}')

CV tree score: 0.825
TFIDF tree score: 0.8392857142857143


In [13]:
dec_tree_training_scores= {
    'CV_clean': cv_tree1.score(cv_clean_bow, y_clean_train),
    'CV_stem': cv_tree2.score(cv_stem_bow, y_stem_train),
    'CV_lem': cv_tree3.score(cv_lem_bow, y_lem_train),
    'TF/IDF_clean': tf_tree1.score(tf_clean_bow, y_clean_train),
    'TF/IDF_stem': tf_tree2.score(tf_stem_bow, y_stem_train),
    'TF/IDF_lem': tf_tree3.score(tf_lem_bow, y_lem_train)
}

dec_tree_training_scores
#pd.DataFrame(dec_tree_training_scores, index=)

{'CV_clean': 0.825,
 'CV_stem': 0.8214285714285714,
 'CV_lem': 0.825,
 'TF/IDF_clean': 0.8464285714285714,
 'TF/IDF_stem': 0.85,
 'TF/IDF_lem': 0.8392857142857143}

### Takeaways: Both vectorizers had scored the same on clean, stemmed, and lemmatized preparations of README text in previous evaluations. Current version of text preparations have models using TF/IDF vectorizers scoring higher on train split.

### Update (16-May-2022): Data increased from 100 to 500 records. TF/IDF vectorizers still scoring higher.

In [14]:
#Transform validate split with vectorizer
tf_clean_bow_val = tfidf_clean.transform(X_clean_validate.clean)

#Get tf_tree1 score on validate
tf_tree1.score(tf_clean_bow_val, y_clean_validate)

0.75

In [15]:
#Transform validate split with vectorizer
tf_stem_bow_val = tfidf_stem.transform(X_stem_validate.stemmed)

#Get tf_tree2 score on validate
tf_tree2.score(tf_stem_bow_val, y_stem_validate)

0.75

In [16]:
#Transform validate split with vectorizer
tf_lem_bow_val = tfidf_lem.transform(X_lem_validate.lemmatized)


#Get tf_tree3 score on validate
tf_tree3.score(tf_lem_bow_val, y_lem_validate)

0.7666666666666667

### Random Forest models, 100 estimators, max depth 15

In [18]:
# Make vectorizer objects
cv_clean2 = CountVectorizer()
tfidf_clean2 = TfidfVectorizer()

#Fit vectorizer objects
cv_clean2_bow = cv_clean2.fit_transform(X_clean_train.clean)
tf_clean2_bow = tfidf_clean2.fit_transform(X_clean_train.clean)

# Make and fit random forest object for cv_clean2_bow
cv_rf1 = RandomForestClassifier(n_estimators= 100, max_depth=15)
cv_rf1.fit(cv_clean2_bow, y_clean_train)

#Make and fit random forest object for tf_clean2_bow
tf_rf1 = RandomForestClassifier(n_estimators=100, max_depth=15)
tf_rf1.fit(tf_clean2_bow, y_clean_train)

#Output rf scores
print(f'CV RF 1 score: {cv_rf1.score(cv_clean2_bow, y_clean_train)}')
print(f'TF IDF RF 1 score: {tf_rf1.score(tf_clean2_bow, y_clean_train)}')

# Make vectorizer objects
cv_stem2 = CountVectorizer()
tfidf_stem2 = TfidfVectorizer()

#Fit vectorizer objects
cv_stem2_bow = cv_stem2.fit_transform(X_stem_train.stemmed)
tf_stem2_bow = tfidf_stem2.fit_transform(X_stem_train.stemmed)

# Make and fit random object for cv_stem2_bow
cv_rf2 = RandomForestClassifier(n_estimators= 100, max_depth=15)
cv_rf2.fit(cv_stem2_bow, y_stem_train)

#Make and fit decision rf object for tf_stem2_bow
tf_rf2 = RandomForestClassifier(n_estimators=100, max_depth=15)
tf_rf2.fit(tf_stem2_bow, y_stem_train)

#Output rf scores
print(f'CV RF 2 score: {cv_rf2.score(cv_stem2_bow, y_stem_train)}')
print(f'TF IDF RF 2 score: {tf_rf2.score(tf_stem2_bow, y_stem_train)}')
# Make vectorizer objects
cv_lem2 = CountVectorizer()
tfidf_lem2 = TfidfVectorizer()

#Fit vectorizer objects
cv_lem2_bow = cv_lem2.fit_transform(X_lem_train.lemmatized)
tf_lem2_bow = tfidf_lem2.fit_transform(X_lem_train.lemmatized)

# Make and fit decision tree object for cv_lem2_bow
cv_rf3 = RandomForestClassifier(n_estimators= 100, max_depth=15)
cv_rf3.fit(cv_lem2_bow, y_lem_train)

#Make and fit decision rf object for tf_lem2_bow
tf_rf3 = RandomForestClassifier(n_estimators=100, max_depth=15)
tf_rf3.fit(tf_lem2_bow, y_lem_train)

#Output rf scores
print(f'CV RF 3 score: {cv_rf3.score(cv_lem2_bow, y_lem_train)}')
print(f'TF IDF RF 3 score: {tf_rf3.score(tf_lem2_bow, y_lem_train)}')

CV RF 1 score: 0.8035714285714286
TF IDF RF 1 score: 0.8428571428571429
CV RF 2 score: 0.8285714285714286
TF IDF RF 2 score: 0.8821428571428571
CV RF 3 score: 0.8035714285714286
TF IDF RF 3 score: 0.85


### Both vectorizers scoring the same on training split for all text preparations. All move on to validate.

### Update (16-May-2022 11:15) TF/IDF vectorizers score higher than count vectorizers

In [19]:
# Transform count vectorizers on validate
#cv_clean2_bow_val = cv_clean2.transform(X_clean_validate.clean)
#cv_stem2_bow_val = cv_stem2.transform(X_stem_validate.stemmed)
#cv_lem2_bow_val = cv_lem2.transform(X_lem_validate.lemmatized)

# Output CV RF scores
#print(f'CV RF 1 score on validate: {cv_rf1.score(cv_clean2_bow_val, y_clean_validate)}')
#print(f'CV RF 2 score on validate: {cv_rf2.score(cv_stem2_bow_val, y_stem_validate)}')
#print(f'CV RF 3 score on validate: {cv_rf3.score(cv_lem2_bow_val, y_lem_validate)}')

# Transform TF IDF vectorizers on validate
tf_clean2_bow_val = tfidf_clean2.transform(X_clean_validate.clean)
tf_stem2_bow_val = tfidf_stem2.transform(X_stem_validate.stemmed)
tf_lem2_bow_val = tfidf_lem2.transform(X_lem_validate.lemmatized)

# Output TF IDF RF scores
print(f'TF IDF RF 3 score on validate: {tf_rf1.score(tf_clean2_bow_val, y_clean_validate)}')
print(f'TF IDF RF 3 score on validate: {tf_rf2.score(tf_stem2_bow_val, y_stem_validate)}')
print(f'TF IDF RF 3 score on validate: {tf_rf3.score(tf_lem2_bow_val, y_lem_validate)}')

TF IDF RF 3 score on validate: 0.675
TF IDF RF 3 score on validate: 0.7
TF IDF RF 3 score on validate: 0.675


### Lemmatized text prep for both vectorizers score highest, but TF IDF outscores CV. Significant drop off for both vectorizers on all text preparations, though. Random Forest model using TF IDF vectorizer on lemmatized text moves on to test data.

In [21]:
# Transform TF IDF vectorizer on test 
tf_stem2_bow_test = tfidf_lem2.transform(X_stem_test.stemmed)

# Output model's score on test
print(f'TF IDF RF 3 score on validate: {tf_rf3.score(tf_stem2_bow_test, y_stem_test)}')


TF IDF RF 3 score on validate: 0.52


### Random Forest model (100 estimators, max depth 15) using TD IDF vectorizer on lemmatized text is 45% accurate on unseen data. (10% better than Decision Tree tested on previous version of data; on current version Random Forest and Decision Tree model both score 45%.)

### Update (16-May-2022 11:12) Increased accuracy on larger dataset to 52%.

### Naive Bayes models, alpha 0.5

In [22]:
from sklearn.naive_bayes import CategoricalNB

# Make vectorizer objects
cv_clean3 = CountVectorizer()
tfidf_clean3 = TfidfVectorizer()

#Fit vectorizer objects
cv_clean3_bow = cv_clean3.fit_transform(X_clean_train.clean)
tf_clean3_bow = tfidf_clean3.fit_transform(X_clean_train.clean)

# Make and fit Naive Bayes object for cv_clean3_bow
cv_nb1 = CategoricalNB(alpha=0.5, min_categories=cv_clean3_bow.toarray().shape[1]) # min_categories kwarg to ensure model is fed a consistent number of categories
cv_nb1.fit(cv_clean3_bow.toarray(), y_clean_train) # Naive Bayes requires dense data

#Make and fit Naive Bayes object for tf_clean3_bow
tf_nb1 = CategoricalNB(alpha=0.5, min_categories=cv_clean3_bow.toarray().shape[1])
tf_nb1.fit(tf_clean3_bow.toarray(), y_clean_train) 

#Output NB scores
print(f'CV NB 1 score: {cv_nb1.score(cv_clean3_bow.toarray(), y_clean_train)}')
print(f'TF IDF NB 1 score: {tf_nb1.score(tf_clean3_bow.toarray(), y_clean_train)}')

# Make vectorizer objects
cv_stem3 = CountVectorizer()
tfidf_stem3 = TfidfVectorizer()

#Fit vectorizer objects
cv_stem3_bow = cv_stem3.fit_transform(X_stem_train.stemmed)
tf_stem3_bow = tfidf_stem3.fit_transform(X_stem_train.stemmed)

# Make and fit Naive Bayes object for cv_stem3_bow
cv_nb2 = CategoricalNB(alpha=0.5, min_categories=cv_stem3_bow.toarray().shape[1])
cv_nb2.fit(cv_stem3_bow.toarray(), y_stem_train)

#Make and fit Naive Bayes object for tf_stem3_bow
tf_nb2 = CategoricalNB(alpha=0.5, min_categories=cv_stem3_bow.toarray().shape[1])
tf_nb2.fit(tf_stem3_bow.toarray(), y_stem_train)

#Output NB scores
print(f'CV NB 2 score: {cv_nb2.score(cv_stem3_bow.toarray(), y_stem_train)}')
print(f'TF IDF NB 2 score: {tf_nb2.score(tf_stem3_bow.toarray(), y_stem_train)}')
# Make vectorizer objects
cv_lem3 = CountVectorizer()
tfidf_lem3 = TfidfVectorizer()

#Fit vectorizer objects
cv_lem3_bow = cv_lem3.fit_transform(X_lem_train.lemmatized)
tf_lem3_bow = tfidf_lem3.fit_transform(X_lem_train.lemmatized)

# Make and fit Naive Bayes object for cv_lem3_bow
cv_nb3 = CategoricalNB(alpha=0.5, min_categories=cv_lem3_bow.toarray().shape[1])
cv_nb3.fit(cv_lem3_bow.toarray(), y_lem_train)

#Make and fit Naive Bayes object for tf_lem3_bow
tf_nb3 = CategoricalNB(alpha=0.5, min_categories=cv_lem3_bow.toarray().shape[1])
tf_nb3.fit(tf_lem3_bow.toarray(), y_lem_train)

#Output NB scores
print(f'CV NB 3 score: {cv_nb3.score(cv_lem3_bow.toarray(), y_lem_train)}')
print(f'TF IDF NB 3 score: {tf_nb3.score(tf_lem3_bow.toarray(), y_lem_train)}')


CV NB 1 score: 0.525
TF IDF NB 1 score: 0.525


: 

: 

### Naive Bayes models score significantly lower than DT and RF on training data. Equal scores across text preparations and vectorizers.

In [23]:
from pprint import pprint

# Transform count vectorizers on validate
cv_clean3_bow_val = cv_clean3.transform(X_clean_validate.clean)
cv_stem3_bow_val = cv_stem3.transform(X_stem_validate.stemmed)
cv_lem3_bow_val = cv_lem3.transform(X_lem_validate.lemmatized)

tf_clean3_bow_val = tfidf_clean3.transform(X_clean_validate.clean)
tf_stem3_bow_val = tfidf_stem3.transform(X_stem_validate.stemmed)
tf_lem3_bow_val = tfidf_lem3.transform(X_lem_validate.lemmatized)

# Output CV NB scores
print(f'CV NB 1 score on validate: {cv_nb1.score(cv_clean3_bow_val.toarray(), y_clean_validate)}')
print(f'CV NB 2 score on validate: {cv_nb2.score(cv_stem3_bow_val.toarray(), y_stem_validate)}')
print(f'CV NB 3 score on validate: {cv_nb3.score(cv_lem3_bow_val.toarray(), y_lem_validate)}')

# Output CV NB scores
print(f'TF/IDF NB 1 score on validate: {tf_nb1.score(tf_clean3_bow_val.toarray(), y_clean_validate)}')
print(f'TF/IDF NB 2 score on validate: {tf_nb2.score(tf_stem3_bow_val.toarray(), y_stem_validate)}')
print(f'TF/IDF NB 3 score on validate: {tf_nb3.score(tf_lem3_bow_val.toarray(), y_lem_validate)}')

CV NB 1 score on validate: 0.5833333333333334
CV NB 2 score on validate: 0.5833333333333334
CV NB 3 score on validate: 0.5833333333333334
TF/IDF NB 1 score on validate: 0.5833333333333334
TF/IDF NB 2 score on validate: 0.5833333333333334
TF/IDF NB 3 score on validate: 0.5833333333333334


In [24]:
print(f'CV clean BOW on validate: {cv_clean3_bow_val.toarray().shape}')
print(f'CV stem BOW on validate: {cv_stem3_bow_val.toarray().shape}')
print(f'CV lem BOW on validate: {cv_lem3_bow_val.toarray().shape}')
print(f'TF/IDF clean BOW on validate: {tf_clean3_bow_val.toarray().shape}')
print(f'TF/IDF stem BOW on validate: {tf_stem3_bow_val.toarray().shape}')
print(f'TF/IDF lem BOW on validate: {tf_lem3_bow_val.toarray().shape}')

CV clean BOW on validate: (24, 5892)
CV stem BOW on validate: (24, 4954)
CV lem BOW on validate: (24, 5547)
TF/IDF clean BOW on validate: (24, 5892)
TF/IDF stem BOW on validate: (24, 4954)
TF/IDF lem BOW on validate: (24, 5547)


In [46]:
cv_clean3_bow.toarray().shape[1]

5892

In [39]:
cv_clean3_bow

<56x5892 sparse matrix of type '<class 'numpy.int64'>'
	with 11246 stored elements in Compressed Sparse Row format>

### Make functions to speed up modeling process

In [62]:
#def get_cv_dt_score (text_data, target, max_depth, ngram_range=False, train_split=True):
    #'''
    #Takes in text_data, a target variable, an optional ngram range argument (min_n, max_n) for a Count Vectorizer object, and a max_depth kwarg for a Decision Tree,
    #and returns a bag of words (bow), the Decision Tree object and its accuracy score on the train split.
    #'''
    #cv = CountVectorizer(ngram_range)
    #bow = cv.fit_transform(text_data)
    #tree = DecisionTreeClassifier(max_depth=max_depth)
    #tree.fit(bow, target)
    
    #if train_split:
        #dt_score = tree.score(bow, target)
        #return tree, bow, dt_score
    #else:
        #print('Use tree and bow returned from train split to get score on validate/test split.')
    


In [33]:
#def get_tfidf_dt_score (text_data, target, depth, train_split=True):
    #'''
    #Takes in text_data, a target variable, and a depth(max_depth) kwarg for a Decision Tree model,
    #and returns the Decision Tree object and an accompanying accuracy score for the model.
    #'''
    #tfidf = TfidfVectorizer()
    #bow = tfidf.fit_transform(text_data)
    #tree = DecisionTreeClassifier(max_depth=depth)
    #tree.fit(bow, target)

    #if train_split:
        #dt_score = tree.score(bow, target)
    #else:
        #raise Exception('')

    #return tree, dt_score


In [None]:
#def get_cv_rf_score(text_data, target, depth, n_estimators, train_split=True):
    #'''
    #Takes in text data, a target variable, depth and n_estimators kwargs for a Random Forest model,
    #and returns the Random Forest object and an accompanying accuracy score for the model.
    #'''
    #cv = CountVectorizer()
    #bow = cv.fit_transform(text_data)
    #rf = RandomForestClassifier(n_estimators=n_estimators, max_depth=depth)
    #rf.fit(bow, target)
    
    #if train_split:
        #rf_score = rf.score(bow, target)
    #else:
        #bow = cv.transform(text_data)
        #rf_score = rf.score(bow, target)
    
    #return rf, rf_score