In [1]:
import os
import json
from typing import Dict, List, Optional, Union, cast
import requests

from acquire import scrape_github_data
from prepare import words

from env import github_token, github_username

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [4]:
# Imports for modeling... and stuff

import pandas as pd
from prepare import basic_clean, tokenize, stem, lemmatize, remove_stopwords, prep_article_data, words
from acquire import scrape_github_data

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 250)

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
import re

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split

# Import Decision Tree and Random Forest ;D
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [4]:
#Run scraping function
# data = scrape_github_data()

In [5]:
#Turn scraped data into raw df
# df = pd.DataFrame(data)
df = pd.read_json('data.json')



#Check returned df 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   repo             100 non-null    object
 1   language         96 non-null     object
 2   readme_contents  100 non-null    object
dtypes: object(3)
memory usage: 2.5+ KB


In [6]:
df = words(df)

In [10]:
df.language.value_counts()

other         53
Python        18
C++           17
JavaScript    12
Name: language, dtype: int64

### Modeling prep

In [7]:
# Make copies of df with prepared columns and target
clean_df = df.copy()[['language', 'clean']]
stem_df = df.copy()[['language', 'stemmed']]
lem_df = df.copy()[['language', 'lemmatized']]


# Get splits for each of the above dfs and isolate target
X_clean = clean_df[['clean']]
y_clean = clean_df.language

X_clean_train, X_clean_test, y_clean_train, y_clean_test = train_test_split(X_clean, y_clean, test_size=.2, random_state=302)
X_clean_train, X_clean_validate, y_clean_train, y_clean_validate  = train_test_split(X_clean_train, y_clean_train, test_size=.3, random_state=302)

print(X_clean_train.shape, X_clean_validate.shape, X_clean_test.shape)

X_stem = stem_df[['stemmed']]
y_stem = stem_df.language

X_stem_train, X_stem_test, y_stem_train, y_stem_test = train_test_split(X_stem, y_stem, test_size=.2, random_state=302)
X_stem_train, X_stem_validate, y_stem_train, y_stem_validate  = train_test_split(X_stem_train, y_stem_train, test_size=.3, random_state=302)

print(X_stem_train.shape, X_stem_validate.shape, X_stem_test.shape)

X_lem = lem_df[['lemmatized']]
y_lem = lem_df.language

X_lem_train, X_lem_test, y_lem_train, y_lem_test = train_test_split(X_lem, y_lem, test_size=.2, random_state=302)
X_lem_train, X_lem_validate, y_lem_train, y_lem_validate  = train_test_split(X_lem_train, y_lem_train, test_size=.3, random_state=302)

print(X_lem_train.shape, X_lem_validate.shape, X_lem_test.shape)




(56, 1) (24, 1) (20, 1)
(56, 1) (24, 1) (20, 1)
(56, 1) (24, 1) (20, 1)


### Modeling

In [8]:
# "Clean" models
cv_clean = CountVectorizer()
tfidf_clean = TfidfVectorizer()

cv_clean_bow = cv_clean.fit_transform(X_clean_train.clean)

tf_clean_bow = tfidf_clean.fit_transform(X_clean_train.clean)


# Check cv bag of words
cv_clean_bow.todense()


matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [9]:
# Check tfidf bag of words
tf_clean_bow.todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [10]:
# Make and fit decision tree object for cv_clean_bow
cv_tree1 = DecisionTreeClassifier(max_depth=5)
cv_tree1.fit(cv_clean_bow, y_clean_train)

#Make and fit decision tree object for tf_clean_bow
tf_tree1 = DecisionTreeClassifier(max_depth=5)
tf_tree1.fit(tf_clean_bow, y_clean_train)

#Output tree scores
print(f'CV tree 1 score: {cv_tree1.score(cv_clean_bow, y_clean_train)}')
print(f'TF IDF tree 1 score: {tf_tree1.score(tf_clean_bow, y_clean_train)}')

CV tree 1 score: 0.8571428571428571
TF IDF tree 1 score: 0.875


In [11]:
# "Stemmed" models
cv_stem = CountVectorizer()
tfidf_stem = TfidfVectorizer()

# Bags
cv_stem_bow = cv_stem.fit_transform(X_stem_train.stemmed)
tf_stem_bow = tfidf_stem.fit_transform(X_stem_train.stemmed)

# Make and fit decision tree object for cv_stem_bow
cv_tree2 = DecisionTreeClassifier(max_depth=5)
cv_tree2.fit(cv_stem_bow, y_stem_train)

#Make and fit decision tree object for tf_stem_bow
tf_tree2 = DecisionTreeClassifier(max_depth=5)
tf_tree2.fit(tf_stem_bow, y_stem_train)


#Get tree score
print(f'CV tree 2 score: {cv_tree2.score(cv_stem_bow, y_stem_train)}')
#Get tree score
print(f'TF IDF tree 2 score: {tf_tree2.score(tf_stem_bow, y_stem_train)}')

CV tree 2 score: 0.8571428571428571
TF IDF tree 2 score: 0.8928571428571429


In [12]:
# "Lemmatized" models
cv_lem = CountVectorizer()
tfidf_lem = TfidfVectorizer()

cv_lem_bow = cv_lem.fit_transform(X_lem_train.lemmatized)
tf_lem_bow = tfidf_lem.fit_transform(X_lem_train.lemmatized)

# Make and fit decision tree object for cv_lem_bow
cv_tree3 = DecisionTreeClassifier(max_depth=5)
cv_tree3.fit(cv_lem_bow, y_lem_train)

#Make and fit decision tree object for tf_lem_bow
tf_tree3 = DecisionTreeClassifier(max_depth=5)
tf_tree3.fit(tf_lem_bow, y_lem_train)

# Output tree scores
print(f'CV tree score: {cv_tree3.score(cv_lem_bow, y_lem_train)}') 
print(f'TFIDF tree score: {tf_tree3.score(tf_lem_bow, y_lem_train)}')

CV tree score: 0.8571428571428571
TFIDF tree score: 0.9285714285714286


In [13]:
dec_tree_training_scores= {
    'CV_clean': cv_tree1.score(cv_clean_bow, y_clean_train),
    'CV_stem': cv_tree2.score(cv_stem_bow, y_stem_train),
    'CV_lem': cv_tree3.score(cv_lem_bow, y_lem_train),
    'TFIDF_clean': tf_tree1.score(tf_clean_bow, y_clean_train),
    'TFIDF_stem': tf_tree2.score(tf_stem_bow, y_stem_train),
    'TFIDF_lem': tf_tree3.score(tf_lem_bow, y_lem_train)
}

dec_tree_training_scores
#pd.DataFrame(dec_tree_training_scores, index=)

{'CV_clean': 0.8571428571428571,
 'CV_stem': 0.8571428571428571,
 'CV_lem': 0.8571428571428571,
 'TFIDF_clean': 0.875,
 'TFIDF_stem': 0.8928571428571429,
 'TFIDF_lem': 0.9285714285714286}

### Takeaways: Both vectorizers had scored the same on clean, stemmed, and lemmatized preparations of README text in previous evaluations.

In [14]:
#Transform validate split with vectorizer
tf_clean_bow_val = tfidf_clean.transform(X_clean_validate.clean)

#Get tf_tree1 score on validate
tf_tree1.score(tf_clean_bow_val, y_clean_validate)

0.625

In [15]:
#Transform validate split with vectorizer
tf_stem_bow_val = tfidf_stem.transform(X_stem_validate.stemmed)

#Get tf_tree2 score on validate
tf_tree2.score(tf_stem_bow_val, y_stem_validate)

0.375

In [16]:
#Transform validate split with vectorizer
tf_lem_bow_val = tfidf_lem.transform(X_lem_validate.lemmatized)


#Get tf_tree3 score on validate
tf_tree3.score(tf_lem_bow_val, y_lem_validate)

0.5416666666666666

### Random Forest models, 100 estimators, max depth 15

In [17]:
# Make vectorizer objects
cv_clean2 = CountVectorizer()
tfidf_clean2 = TfidfVectorizer()

#Fit vectorizer objects
cv_clean2_bow = cv_clean2.fit_transform(X_clean_train.clean)
tf_clean2_bow = tfidf_clean2.fit_transform(X_clean_train.clean)

# Make and fit random forest object for cv_clean2_bow
cv_rf1 = RandomForestClassifier(n_estimators= 100, max_depth=15)
cv_rf1.fit(cv_clean2_bow, y_clean_train)

#Make and fit random forest object for tf_clean2_bow
tf_rf1 = RandomForestClassifier(n_estimators=100, max_depth=15)
tf_rf1.fit(tf_clean2_bow, y_clean_train)

#Output rf scores
print(f'CV RF 1 score: {cv_rf1.score(cv_clean2_bow, y_clean_train)}')
print(f'TF IDF RF 1 score: {tf_rf1.score(tf_clean2_bow, y_clean_train)}')

# Make vectorizer objects
cv_stem2 = CountVectorizer()
tfidf_stem2 = TfidfVectorizer()

#Fit vectorizer objects
cv_stem2_bow = cv_stem2.fit_transform(X_stem_train.stemmed)
tf_stem2_bow = tfidf_stem2.fit_transform(X_stem_train.stemmed)

# Make and fit decision tree object for cv_stem2_bow
cv_rf2 = RandomForestClassifier(n_estimators= 100, max_depth=15)
cv_rf2.fit(cv_stem2_bow, y_stem_train)

#Make and fit decision rf object for tf_stem2_bow
tf_rf2 = RandomForestClassifier(n_estimators=100, max_depth=15)
tf_rf2.fit(tf_stem2_bow, y_stem_train)

#Output rf scores
print(f'CV RF 2 score: {cv_rf2.score(cv_stem2_bow, y_stem_train)}')
print(f'TF IDF RF 2 score: {tf_rf2.score(tf_stem2_bow, y_stem_train)}')
# Make vectorizer objects
cv_lem2 = CountVectorizer()
tfidf_lem2 = TfidfVectorizer()

#Fit vectorizer objects
cv_lem2_bow = cv_lem2.fit_transform(X_lem_train.lemmatized)
tf_lem2_bow = tfidf_lem2.fit_transform(X_lem_train.lemmatized)

# Make and fit decision tree object for cv_lem2_bow
cv_rf3 = RandomForestClassifier(n_estimators= 100, max_depth=15)
cv_rf3.fit(cv_lem2_bow, y_lem_train)

#Make and fit decision rf object for tf_lem2_bow
tf_rf3 = RandomForestClassifier(n_estimators=100, max_depth=15)
tf_rf3.fit(tf_lem2_bow, y_lem_train)

#Output rf scores
print(f'CV RF 3 score: {cv_rf3.score(cv_lem2_bow, y_lem_train)}')
print(f'TF IDF RF 3 score: {tf_rf3.score(tf_lem2_bow, y_lem_train)}')

CV RF 1 score: 0.9821428571428571
TF IDF RF 1 score: 0.9821428571428571


### Both vectorizers scoring the same on training split for all text preparations. All move on to validate.

In [22]:
# Transform count vectorizers on validate
cv_clean2_bow_val = cv_clean2.transform(X_clean_validate.clean)
cv_stem2_bow_val = cv_stem2.transform(X_stem_validate.stemmed)
cv_lem2_bow_val = cv_lem2.transform(X_lem_validate.lemmatized)

# Output CV RF scores
print(f'CV RF 1 score on validate: {cv_rf1.score(cv_clean2_bow_val, y_clean_validate)}')
print(f'CV RF 2 score on validate: {cv_rf2.score(cv_stem2_bow_val, y_stem_validate)}')
print(f'CV RF 3 score on validate: {cv_rf3.score(cv_lem2_bow_val, y_lem_validate)}')

# Transform TF IDF vectorizers on validate
tf_clean2_bow_val = tfidf_clean2.transform(X_clean_validate.clean)
tf_stem2_bow_val = tfidf_stem2.transform(X_stem_validate.stemmed)
tf_lem2_bow_val = tfidf_lem2.transform(X_lem_validate.lemmatized)

# Output TF IDF RF scores
print(f'TF IDF RF 3 score on validate: {tf_rf1.score(tf_clean2_bow_val, y_clean_validate)}')
print(f'TF IDF RF 3 score on validate: {tf_rf2.score(tf_stem2_bow_val, y_stem_validate)}')
print(f'TF IDF RF 3 score on validate: {tf_rf3.score(tf_lem2_bow_val, y_lem_validate)}')

CV RF 1 score on validate: 0.5
CV RF 2 score on validate: 0.5
CV RF 3 score on validate: 0.5416666666666666


### Lemmatized text prep for both vectorizers score highest, but TF IDF outscores CV. Significant drop off for both vectorizers on all text preparations, though. Random Forest model using TF IDF vectorizer on lemmatized text moves on to test data.

In [24]:
# Transform TF IDF vectorizer on test 
tf_lem2_bow_test = tfidf_lem2.transform(X_lem_test.lemmatized)

# Output model's score on test
print(f'TF IDF RF 3 score on validate: {tf_rf3.score(tf_lem2_bow_test, y_lem_test)}')


TF IDF RF 3 score on validate: 0.45


### Random Forest model (100 estimators, max depth 15) using TD IDF vectorizer on lemmatized text is 45% accurate on unseen data. (10% better than Decision Tree tested on previous version of data; on current version Random Forest and Decision Tree model both score 45%.)

### Naive Bayes models, alpha 0.5

In [27]:
from sklearn.naive_bayes import CategoricalNB

# Make vectorizer objects
cv_clean3 = CountVectorizer()
tfidf_clean3 = TfidfVectorizer()

#Fit vectorizer objects
cv_clean3_bow = cv_clean3.fit_transform(X_clean_train.clean)
tf_clean3_bow = tfidf_clean3.fit_transform(X_clean_train.clean)

# Make and fit Naive Bayes object for cv_clean3_bow
cv_nb1 = CategoricalNB(alpha=0.5)
cv_nb1.fit(cv_clean3_bow.toarray(), y_clean_train)

#Make and fit Naive Bayes object for tf_clean3_bow
tf_nb1 = CategoricalNB(alpha=0.5)
tf_nb1.fit(tf_clean3_bow.toarray(), y_clean_train) # Naive Bayes requires dense data

#Output NB scores
print(f'CV NB 1 score: {cv_nb1.score(cv_clean3_bow.toarray(), y_clean_train)}')
print(f'TF IDF NB 1 score: {tf_nb1.score(tf_clean3_bow.toarray(), y_clean_train)}')

# Make vectorizer objects
cv_stem3 = CountVectorizer()
tfidf_stem3 = TfidfVectorizer()

#Fit vectorizer objects
cv_stem3_bow = cv_stem3.fit_transform(X_stem_train.stemmed)
tf_stem3_bow = tfidf_stem3.fit_transform(X_stem_train.stemmed)

# Make and fit Naive Bayes object for cv_stem3_bow
cv_nb2 = CategoricalNB(alpha=0.5)
cv_nb2.fit(cv_stem3_bow.toarray(), y_stem_train)

#Make and fit Naive Bayes object for tf_stem3_bow
tf_nb2 = CategoricalNB(alpha=0.5)
tf_nb2.fit(tf_stem3_bow.toarray(), y_stem_train)

#Output NB scores
print(f'CV NB 2 score: {cv_nb2.score(cv_stem3_bow.toarray(), y_stem_train)}')
print(f'TF IDF NB 2 score: {tf_nb2.score(tf_stem3_bow.toarray(), y_stem_train)}')
# Make vectorizer objects
cv_lem3 = CountVectorizer()
tfidf_lem3 = TfidfVectorizer()

#Fit vectorizer objects
cv_lem3_bow = cv_lem3.fit_transform(X_lem_train.lemmatized)
tf_lem3_bow = tfidf_lem3.fit_transform(X_lem_train.lemmatized)

# Make and fit Naive Bayes object for cv_lem3_bow
cv_nb3 = CategoricalNB(alpha=0.5)
cv_nb3.fit(cv_lem3_bow.toarray(), y_lem_train)

#Make and fit Naive Bayes object for tf_lem3_bow
tf_nb3 = CategoricalNB(alpha=0.5)
tf_nb3.fit(tf_lem3_bow.toarray(), y_lem_train)

#Output NB scores
print(f'CV NB 3 score: {cv_nb3.score(cv_lem3_bow.toarray(), y_lem_train)}')
print(f'TF IDF NB 3 score: {tf_nb3.score(tf_lem3_bow.toarray(), y_lem_train)}')


CV NB 1 score: 0.75
TF IDF NB 1 score: 0.5357142857142857
CV NB 2 score: 0.75
TF IDF NB 2 score: 0.5357142857142857
CV NB 3 score: 0.75
TF IDF NB 3 score: 0.5357142857142857


### Naive Bayes models score significantly lower on training data. Equal scores across text preparations. Count Vectorizer beats TF/IDF.

In [34]:
from pprint import pprint

# Transform count vectorizers on validate
cv_clean3_bow_val = cv_clean3.transform(X_clean_validate.clean)
cv_stem3_bow_val = cv_stem3.transform(X_stem_validate.stemmed)
cv_lem3_bow_val = cv_lem3.transform(X_lem_validate.lemmatized)

pprint(cv_clean3_bow_val.todense()[10])

# Output CV NB scores
#print(f'CV NB 1 score on validate: {cv_nb1.score(cv_clean3_bow_val.todense(), y_clean_validate)}')
#print(f'CV NB 2 score on validate: {cv_nb2.score(cv_stem3_bow_val.todense(), y_stem_validate)}')
#print(f'CV NB 3 score on validate: {cv_nb3.score(cv_lem3_bow_val.todense(), y_lem_validate)}')

matrix([[0, 0, 0, ..., 0, 0, 0]])


In [None]:
def get_cv_dec_tree(text_data, target, depth):
    cv = CountVectorizer()
    bow = cv.fit_transform(text_data)

    
    
