In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


import requests
from bs4 import BeautifulSoup

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
import nltk.sentiment

from wordcloud import WordCloud
# None lifts the column-width limit so long text is shown in full; the old -1
# sentinel has been deprecated since pandas 1.0 and is rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)
import re
from time import strftime

import unicodedata
import json
from pprint import pprint

# Turn off pink boxes for demo
import warnings
warnings.filterwarnings("ignore")

from PIL import Image

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


import acquire 
import prepare
import model

  # None lifts the column-width limit so long text is shown in full; the old -1
  # sentinel has been deprecated since pandas 1.0 and is rejected by pandas 2.x.
  pd.set_option('display.max_colwidth', None)


### Modeling Notes:
    - Models were fit on both TF-IDF and Bag of Words features, using Logistic Regression, Decision Tree, and Random Forest classifiers

    - The baseline always predicts the most common language in the training data (Python):
        - 53.17%
    
    - Bag of Words Decision Tree (depth = 6) performed best:
        Train Accuracy : 92.86%
        Validate Accuracy: 63.33%
        
    - Random Forest had the best accuracy on the training data but failed to perform on the validate set. It classified everything as Python, so its predictions were exactly the same as the baseline. For this reason we decided not to use it as our model.
    
    - See model_results.ipynb for the results of every model we ran

In [2]:
### Stephanie: the modeling cells below expect train, validate, and test to already
### exist in the notebook, so this cell creates them first.

# Read the CSV and pass it straight through the project's cleaning step
df = prepare.filter_data(pd.read_csv('NLP.csv'))

# Break the cleaned frame into the three modeling subsets
train, validate, test = prepare.split_data(df)


In [3]:
# Bag-of-words setup: rebuild the splits, then vectorize the lemmatized text.
# Read the CSV and clean it with the project helper
df = prepare.filter_data(pd.read_csv('NLP.csv'))
# Split into train / validate / test
train, validate, test = prepare.split_data(df)

# CountVectorizer produces the bag-of-words matrix (X); language is the target (y)
cv = CountVectorizer()

### To add more features, extend the X line below; the column must exist on train and be passed as a list
### remove this comment for final notebook if we don't use
y = train['language']
X = cv.fit_transform(train['lemmatized'])

# Further split X and y using the helper from model.py
(X_train, y_train,
 X_validate, y_validate,
 X_test, y_test) = model.X_train_split(X, y)

In [4]:
# Report the baseline accuracy using the helper from the project's model module
model.baseline_accuracy()

Baseline Accuracy: 53.17%


In [5]:
# Run the project's bag-of-words decision tree helper on the splits; per the output
# below it prints train accuracy, confusion matrix, report, and validate accuracy
model.BoW_Decision_tree(X_train, y_train, X_validate, y_validate, X_test, y_test)

Accuracy: 94.29%
---
Confusion Matrix
actual          C#  Java  JavaScript  Python  TypeScript
tree_predicted                                          
C#              4   0     0           1       0         
Java            0   4     0           0       0         
JavaScript      0   0     15          0       0         
Python          0   2     0           36      0         
TypeScript      1   0     0           0       7         
---
              precision    recall  f1-score   support

          C#       0.80      0.80      0.80         5
        Java       1.00      0.67      0.80         6
  JavaScript       1.00      1.00      1.00        15
      Python       0.95      0.97      0.96        37
  TypeScript       0.88      1.00      0.93         7

    accuracy                           0.94        70
   macro avg       0.92      0.89      0.90        70
weighted avg       0.95      0.94      0.94        70

----------------------------------------------
Validate Accuracy: 70.0

In [6]:
## TODO: enable this for the final notebook and run it last, since it is our final test


##model.test_model(X_train, y_train, X_validate, y_validate, X_test, y_test)

We can add the test conclusions after we get the results for the final notebook

In [27]:
######### Stephanie add all the above lines for the final notebook

In [7]:
# Read the CSV, rebuild the row index, then apply the project's cleaning step
df = pd.read_csv('NLP.csv').reset_index().drop(columns = 'index')
df = prepare.filter_data(df)

### Optional: drop these columns after cleaning, since we focus on lemmatized
#columns_drop = ['readme_contents', 'clean', 'stemmed']
#df = df.drop(columns_drop, 1)

train, validate, test = prepare.split_data(df)

# Sanity check: show the shape of every split
for label, split in (('train ===>', train), ('validate ===>', validate), ('test===>', test)):
    print(label, split.shape)

train ===> (126, 7)
validate ===> (54, 7)
test===> (45, 7)


In [8]:
### Value counts for our train dataset: number of rows per language, most frequent first

train.language.value_counts()

Python        67
JavaScript    27
TypeScript    12
Java          11
C#            9 
Name: language, dtype: int64

In [9]:
### function for further splitting has stratify

def X_train_split(X_data, y_data, test_size=.2, validate_size=.3, random_state=123):
    '''Split features and target into stratified train, validate, and test sets.

    The data is split twice: a test portion is first held out from the full
    data, then a validate portion is held out from what remains. Both splits
    are stratified on the target, so every subset keeps the class proportions.

    Parameters
    ----------
    X_data : array-like or sparse matrix
        Feature matrix with one row per observation.
    y_data : array-like
        Target labels aligned with the rows of X_data.
    test_size : float or int, default .2
        Size of the test set, taken from the full data.
    validate_size : float or int, default .3
        Size of the validate set, taken from the data left after removing test.
    random_state : int, default 123
        Seed used for both splits so the result is reproducible.

    Returns
    -------
    tuple
        (X_train, y_train, X_validate, y_validate, X_test, y_test)
    '''
    # First cut: hold out the test set from everything
    X_train_validate, X_test, y_train_validate, y_test = train_test_split(
        X_data, y_data,
        stratify=y_data,
        test_size=test_size,
        random_state=random_state,
    )

    # Second cut: hold out the validate set from the remaining rows
    X_train, X_validate, y_train, y_validate = train_test_split(
        X_train_validate, y_train_validate,
        stratify=y_train_validate,
        test_size=validate_size,
        random_state=random_state,
    )

    return X_train, y_train, X_validate, y_validate, X_test, y_test

In [10]:
# Baseline = always predict the most frequent language in train (Python)
most_common_count = train.language.value_counts().max()
Baseline_Accuracy = round(most_common_count / len(train) * 100, 2)

print(f'Baseline Accuracy: {Baseline_Accuracy}%')


Baseline Accuracy: 53.17%


In [11]:
# Build TF-IDF features (X) from the lemmatized text; the language label is the target (y)
y = train['language']
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(train['lemmatized'])

In [12]:
# Carve the TF-IDF matrix and its labels into train / validate / test subsets
(X_train, y_train,
 X_validate, y_validate,
 X_test, y_test) = X_train_split(X, y)

### Logistic Regression TF-IDF

In [13]:
# Fit logistic regression on the training features
lm = LogisticRegression()
lm.fit(X_train, y_train)

# One frame per split: actual labels next to the model's predictions
train = pd.DataFrame({'actual': y_train})
validate = pd.DataFrame({'actual': y_validate})
test = pd.DataFrame({'actual': y_test})
for frame, features in ((train, X_train), (validate, X_validate), (test, X_test)):
    frame['predicted'] = lm.predict(features)

# Print accuracy, confusion matrix, and per-class metrics: train first, then validate
reports = (
    ('Train Accuracy: {:.2%}', train),
    ('Validate Accuracy: {:.2%}', validate),
)
for position, (header, frame) in enumerate(reports):
    if position:
        # Divider between the train section and the validate section
        print('----------------------------------------------')
    print(header.format(accuracy_score(frame.actual, frame.predicted)))
    print('---')
    print('Confusion Matrix')
    print(pd.crosstab(frame.predicted, frame.actual))
    print('---')
    print(classification_report(frame.actual, frame.predicted))

Train Accuracy: 71.43%
---
Confusion Matrix
actual      C#  Java  JavaScript  Python  TypeScript
predicted                                           
JavaScript  0   0     13          0       0         
Python      5   6     2           37      7         
---
              precision    recall  f1-score   support

          C#       0.00      0.00      0.00         5
        Java       0.00      0.00      0.00         6
  JavaScript       1.00      0.87      0.93        15
      Python       0.65      1.00      0.79        37
  TypeScript       0.00      0.00      0.00         7

    accuracy                           0.71        70
   macro avg       0.33      0.37      0.34        70
weighted avg       0.56      0.71      0.62        70

----------------------------------------------
Validate Accuracy: 53.33%
---
Confusion Matrix
actual     C#  Java  JavaScript  Python  TypeScript
predicted                                          
Python     2   3     6           16      3         
-

### Decision Tree TF-IDF (depth = 4)

In [14]:

# Frames that pair each split's actual labels with the tree's predictions
train = pd.DataFrame(dict(actual=y_train))
validate = pd.DataFrame(dict(actual=y_validate))
test = pd.DataFrame(dict(actual=y_test))

# random_state fixes how the tree breaks ties between equally good splits, so
# re-running this cell reproduces the same tree and the same scores. The seed
# matches the one used for the data splits and the random forest.
tree = DecisionTreeClassifier(max_depth=4, random_state=123).fit(X_train, y_train)
train['tree_predicted'] = tree.predict(X_train)
validate['tree_predicted'] = tree.predict(X_validate)
test['tree_predicted'] = tree.predict(X_test)

# Train performance
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.tree_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.tree_predicted, train.actual))
print('---')
print(classification_report(train.actual, train.tree_predicted))
print('----------------------------------------------')
# Validate performance (the test frame is filled in above but not reported here)
print('Validate Accuracy: {:.2%}'.format(accuracy_score(validate.actual, validate.tree_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(validate.tree_predicted, validate.actual))
print('---')
print(classification_report(validate.actual, validate.tree_predicted))

Accuracy: 90.00%
---
Confusion Matrix
actual          C#  Java  JavaScript  Python  TypeScript
tree_predicted                                          
C#              4   1     0           1       0         
Java            0   4     0           0       0         
JavaScript      0   0     14          0       0         
Python          1   1     0           36      2         
TypeScript      0   0     1           0       5         
---
              precision    recall  f1-score   support

          C#       0.67      0.80      0.73         5
        Java       1.00      0.67      0.80         6
  JavaScript       1.00      0.93      0.97        15
      Python       0.90      0.97      0.94        37
  TypeScript       0.83      0.71      0.77         7

    accuracy                           0.90        70
   macro avg       0.88      0.82      0.84        70
weighted avg       0.91      0.90      0.90        70

----------------------------------------------
Validate Accuracy: 56.6

### Random Forest TF-IDF

In [15]:
# Random forest on the training features; the seed keeps the ensemble reproducible
forest = RandomForestClassifier(min_samples_leaf = 1, max_depth = 5, random_state= 123)
forest.fit(X_train, y_train)

# Pair every split's actual labels with the forest's predictions
train = pd.DataFrame({'actual': y_train})
validate = pd.DataFrame({'actual': y_validate})
test = pd.DataFrame({'actual': y_test})
for frame, features in ((train, X_train), (validate, X_validate), (test, X_test)):
    frame['forest_predicted'] = forest.predict(features)

# Print accuracy, confusion matrix, and per-class metrics: train first, then validate
reports = (
    ('Accuracy: {:.2%}', train),
    ('Validate Accuracy: {:.2%}', validate),
)
for position, (header, frame) in enumerate(reports):
    if position:
        # Divider between the train section and the validate section
        print('----------------------------------------------')
    print(header.format(accuracy_score(frame.actual, frame.forest_predicted)))
    print('---')
    print('Confusion Matrix')
    print(pd.crosstab(frame.forest_predicted, frame.actual))
    print('---')
    print(classification_report(frame.actual, frame.forest_predicted))

Accuracy: 77.14%
---
Confusion Matrix
actual            C#  Java  JavaScript  Python  TypeScript
forest_predicted                                          
C#                4   0     0           0       0         
Java              0   2     0           0       0         
JavaScript        0   0     9           0       0         
Python            1   4     6           37      5         
TypeScript        0   0     0           0       2         
---
              precision    recall  f1-score   support

          C#       1.00      0.80      0.89         5
        Java       1.00      0.33      0.50         6
  JavaScript       1.00      0.60      0.75        15
      Python       0.70      1.00      0.82        37
  TypeScript       1.00      0.29      0.44         7

    accuracy                           0.77        70
   macro avg       0.94      0.60      0.68        70
weighted avg       0.84      0.77      0.75        70

----------------------------------------------
Validate 

# Implementing Bag Of Words

In [16]:
# Read the CSV; reset_index(drop=True) gives a clean 0..n-1 index without
# creating a throwaway 'index' column that then has to be dropped again
df = pd.read_csv('NLP.csv').reset_index(drop=True)

# Clean with the project helper
df = prepare.filter_data(df)


### Drop the text columns we do not model on, then split the data.
# The labels are passed by keyword: the positional axis form `df.drop(cols, 1)`
# is deprecated and no longer accepted by pandas 2.x.
columns_drop = ['readme_contents', 'clean', 'stemmed']
df = df.drop(columns=columns_drop)
train, validate, test = prepare.split_data(df)
###

# Count Vectorizer using bag of words - defining X & y variables
# NOTE(review): the vocabulary is fit on all of train before X_train_split runs,
# so it also contains words that occur only in the rows later used as
# validate/test. Confirm this is acceptable for the reported scores.
cv = CountVectorizer()
X = cv.fit_transform(train.lemmatized)
y = train.language

# Stratified train / validate / test split using the X_train_split helper
X_train, y_train, X_validate, y_validate, X_test, y_test = X_train_split(X, y)

In [17]:
# (rows, vocabulary size) of the bag-of-words training matrix
X_train.shape

(70, 8257)

In [18]:
# (rows, vocabulary size) of the bag-of-words validate matrix
X_validate.shape

(30, 8257)

In [19]:
# (rows, vocabulary size) of the bag-of-words test matrix
X_test.shape

(26, 8257)

In [20]:
# Number of validate labels; should equal the row count of X_validate
y_validate.shape

(30,)

In [21]:
#X_bag_of_words.todense()

In [22]:
# Mapping from each token learned by the CountVectorizer to its column index in X.
# NOTE(review): this displays the entire vocabulary (8257 entries per the shapes
# above); consider showing only a sample to keep the notebook output short.
cv.vocabulary_

{'discord': 2398,
 'bot': 1232,
 'github': 3224,
 'version': 7861,
 '100': 36,
 'display': 2436,
 'notif': 5245,
 'channel': 1642,
 'select': 6474,
 'event': 2765,
 'trigger': 7543,
 'utilis': 7808,
 'webhook': 7976,
 'secret': 6462,
 'secur': 6467,
 'updat': 7700,
 'privat': 5759,
 'public': 5834,
 'repositori': 6197,
 'setup': 6575,
 'add': 556,
 'point': 5646,
 'httpsdiscordbotgithubherokuappcom': 3712,
 'would': 8137,
 'like': 4586,
 'listen': 4616,
 'visit': 7915,
 'githubcom': 3228,
 'click': 1754,
 'set': 6545,
 'tab': 7211,
 'content': 1983,
 'type': 7615,
 'applicationjson': 842,
 'authoris': 959,
 'server': 6512,
 'linkhttpsdiscordcomapioauth2authorizeclient_id193000403632128013permissions18432scopebot20applicationscommand': 4600,
 'activ': 542,
 'use': 7742,
 'slash': 6689,
 'command': 1846,
 'list': 4614,
 'specifi': 6796,
 'valu': 7837,
 'subscrib': 7156,
 'receiv': 6074,
 'keep': 4428,
 'mind': 4892,
 'store': 7080,
 'plain': 5619,
 'text': 7316,
 'somewher': 6766,
 'dont

### Logistic Regression Bag of Words

In [23]:
# Fit logistic regression on the training features
lm = LogisticRegression()
lm.fit(X_train, y_train)

# One frame per split: actual labels next to the model's predictions
train = pd.DataFrame({'actual': y_train})
validate = pd.DataFrame({'actual': y_validate})
test = pd.DataFrame({'actual': y_test})
for frame, features in ((train, X_train), (validate, X_validate), (test, X_test)):
    frame['predicted'] = lm.predict(features)

# Print accuracy, confusion matrix, and per-class metrics: train first, then validate
reports = (
    ('Train Accuracy: {:.2%}', train),
    ('Validate Accuracy: {:.2%}', validate),
)
for position, (header, frame) in enumerate(reports):
    if position:
        # Divider between the train section and the validate section
        print('----------------------------------------------')
    print(header.format(accuracy_score(frame.actual, frame.predicted)))
    print('---')
    print('Confusion Matrix')
    print(pd.crosstab(frame.predicted, frame.actual))
    print('---')
    print(classification_report(frame.actual, frame.predicted))

Train Accuracy: 100.00%
---
Confusion Matrix
actual      C#  Java  JavaScript  Python  TypeScript
predicted                                           
C#          5   0     0           0       0         
Java        0   6     0           0       0         
JavaScript  0   0     15          0       0         
Python      0   0     0           37      0         
TypeScript  0   0     0           0       7         
---
              precision    recall  f1-score   support

          C#       1.00      1.00      1.00         5
        Java       1.00      1.00      1.00         6
  JavaScript       1.00      1.00      1.00        15
      Python       1.00      1.00      1.00        37
  TypeScript       1.00      1.00      1.00         7

    accuracy                           1.00        70
   macro avg       1.00      1.00      1.00        70
weighted avg       1.00      1.00      1.00        70

----------------------------------------------
Validate Accuracy: 53.33%
---
Confusion Matr

### Bag of Words Decision Tree (depth = 5)

In [24]:

# Frames that pair each split's actual labels with the tree's predictions
train = pd.DataFrame(dict(actual=y_train))
validate = pd.DataFrame(dict(actual=y_validate))
test = pd.DataFrame(dict(actual=y_test))

# random_state fixes how the tree breaks ties between equally good splits, so
# re-running this cell reproduces the same tree and the same scores. The seed
# matches the one used for the data splits and the random forest.
tree = DecisionTreeClassifier(max_depth=5, random_state=123).fit(X_train, y_train)
train['tree_predicted'] = tree.predict(X_train)
validate['tree_predicted'] = tree.predict(X_validate)
test['tree_predicted'] = tree.predict(X_test)

# Train performance
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.tree_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.tree_predicted, train.actual))
print('---')
print(classification_report(train.actual, train.tree_predicted))
print('----------------------------------------------')
# Validate performance (the test frame is filled in above but not reported here)
print('Validate Accuracy: {:.2%}'.format(accuracy_score(validate.actual, validate.tree_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(validate.tree_predicted, validate.actual))
print('---')
print(classification_report(validate.actual, validate.tree_predicted))

Accuracy: 85.71%
---
Confusion Matrix
actual          C#  Java  JavaScript  Python  TypeScript
tree_predicted                                          
C#              2   0     0           0       0         
Java            0   4     0           0       0         
JavaScript      3   0     15          1       2         
Python          0   2     0           36      2         
TypeScript      0   0     0           0       3         
---
              precision    recall  f1-score   support

          C#       1.00      0.40      0.57         5
        Java       1.00      0.67      0.80         6
  JavaScript       0.71      1.00      0.83        15
      Python       0.90      0.97      0.94        37
  TypeScript       1.00      0.43      0.60         7

    accuracy                           0.86        70
   macro avg       0.92      0.69      0.75        70
weighted avg       0.89      0.86      0.84        70

----------------------------------------------
Validate Accuracy: 63.3

### Random Forest Bag of Words
    - TODO: experiment with different max_depth values

In [25]:
# Random forest on the training features; the seed keeps the ensemble reproducible
forest = RandomForestClassifier(min_samples_leaf = 1, max_depth = 6, random_state= 123)
forest.fit(X_train, y_train)

# Pair every split's actual labels with the forest's predictions
train = pd.DataFrame({'actual': y_train})
validate = pd.DataFrame({'actual': y_validate})
test = pd.DataFrame({'actual': y_test})
for frame, features in ((train, X_train), (validate, X_validate), (test, X_test)):
    frame['forest_predicted'] = forest.predict(features)

# Print accuracy, confusion matrix, and per-class metrics: train first, then validate
reports = (
    ('Accuracy: {:.2%}', train),
    ('Validate Performance: {:.2%}', validate),
)
for position, (header, frame) in enumerate(reports):
    if position:
        # Divider between the train section and the validate section
        print('----------------------------------------------')
    print(header.format(accuracy_score(frame.actual, frame.forest_predicted)))
    print('---')
    print('Confusion Matrix')
    print(pd.crosstab(frame.forest_predicted, frame.actual))
    print('---')
    print(classification_report(frame.actual, frame.forest_predicted))

Accuracy: 78.57%
---
Confusion Matrix
actual            C#  Java  JavaScript  Python  TypeScript
forest_predicted                                          
C#                4   0     0           0       0         
Java              0   2     0           0       0         
JavaScript        0   0     10          0       0         
Python            1   4     5           37      5         
TypeScript        0   0     0           0       2         
---
              precision    recall  f1-score   support

          C#       1.00      0.80      0.89         5
        Java       1.00      0.33      0.50         6
  JavaScript       1.00      0.67      0.80        15
      Python       0.71      1.00      0.83        37
  TypeScript       1.00      0.29      0.44         7

    accuracy                           0.79        70
   macro avg       0.94      0.62      0.69        70
weighted avg       0.85      0.79      0.76        70

----------------------------------------------
Validate 

In [26]:
# Class distribution of the actual labels in the validate split
# (validate here is the prediction frame built in the random forest cell above)
validate.actual.value_counts()

Python        16
JavaScript    6 
Java          3 
TypeScript    3 
C#            2 
Name: actual, dtype: int64