In [1]:
import re, os
import unicodedata
import json

import pandas as pd
import numpy as np
import seaborn as sns
import seaborn as sns
import matplotlib.pyplot as plt

import nltk
import nltk.sentiment
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from time import strftime

from wordcloud import WordCloud

from pprint import pprint

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer


from requests import get
from bs4 import BeautifulSoup

from wordcloud import WordCloud

import acquire
import acquire_jg
import prepare_jag


plt.rc('figure', figsize=(13, 7))
plt.style.use('seaborn-darkgrid')

pd.set_option('display.max_rows', 200)


In [2]:
#Acquire and prep
df = pd.read_json('data.json')
df = prepare_jag.prep_article_data(df, 'original', extra_words=['&#9;', "'", '1', '0', 'use', 'file', 'build', 'test', 'code'])

Make word and bigram lists

In [3]:
#Make word lists by targets from lemmatized words
swift_words = ' '.join(df[df.target == 'swift'].lemmatized).split()
python_words = ' '.join(df[df.target == 'python'].lemmatized).split()
c_words = ' '.join(df[df.target == 'c'].lemmatized).split()
other_words = ' '.join(df[df.target == 'other'].lemmatized).split()
all_words = ' '.join(df.lemmatized).split()

In [4]:
print('Swift words: '+str(len(swift_words)))
print('Python words: '+str(len(python_words)))
print('C words: '+str(len(c_words)))
print('Other words: '+str(len(other_words)))
print('All words: '+str(len(all_words)))

Swift words: 29163
Python words: 15767
C words: 7681
Other words: 4290
All words: 56901


In [5]:
# Look at word frequency per languge
swift_freq = pd.Series(swift_words).value_counts()
python_freq = pd.Series(python_words).value_counts()
c_freq = pd.Series(c_words).value_counts()
other_freq = pd.Series(other_words).value_counts()
all_freq = pd.Series(all_words).value_counts()

In [6]:
# Look at 20 most common words
word_counts = (pd.concat([all_freq, swift_freq, python_freq, c_freq, other_freq], axis = 1, sort = True)
                .set_axis(['all', 'swift', 'python', 'c', 'other'], axis = 1, inplace = False)
                .fillna(0)
                .apply(lambda s: s.astype(int)))
word_counts.sort_values(by = 'all', ascending = False).head(20)

Unnamed: 0,all,swift,python,c,other
swift,981,783,40,122,36
package,383,302,21,47,13
project,371,218,68,49,36
using,324,183,77,45,19
data,273,97,122,30,24
library,272,207,20,40,5
run,271,140,81,26,24
example,228,117,70,27,14
used,219,86,87,33,13
version,218,134,42,28,14


In [7]:
# Define bigrams 
swift_bigrams = (pd.Series(nltk.ngrams(swift_words, 2))
                      .value_counts()
                      .head(20))

python_bigrams = (pd.Series(nltk.ngrams(python_words, 2))
                      .value_counts()
                      .head(20))

c_bigrams = (pd.Series(nltk.ngrams(c_words, 2))
                      .value_counts()
                      .head(20))

other_bigrams = (pd.Series(nltk.ngrams(other_words, 2))
                      .value_counts()
                      .head(20))



---

### Establish baseline

The most common language is swift, so our baseline model would predict that all the repositories are coded in swift

In [8]:
# Establish baseline 
# baseline
df[df['language'] == 'Swift'].language.value_counts()/sum(df.language.value_counts())

Swift    0.46
Name: language, dtype: float64

---

### Split Data

- Need to split words in to train / test / split
- For x - start with vectorzed,  lematized words 
- y = target

In [9]:
# Split X Y
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df.lemmatized)
y = df.target

In [10]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=.2)


In [11]:
#Create evaluation dataframe
train_eval = pd.DataFrame(dict(actual=y_train))

In [13]:
from sklearn.naive_bayes import GaussianNB

In [18]:
X_train = X_train.toarray()

In [24]:
gnb = GaussianNB()
gnb.fit(X_train, y_train)

train_eval['gnb_predicted'] = gnb.predict(X_train)


In [31]:
print('Accuracy: {:.2%}'.format(accuracy_score(train_eval.actual, train_eval.gnb_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train_eval.gnb_predicted, train_eval.actual))
print('---')
print(classification_report(train_eval.actual, train_eval.gnb_predicted))

Accuracy: 97.50%
---
Confusion Matrix
actual          c  other  python  swift
gnb_predicted                          
c              18      0       0      0
other           3     14       0      0
python          0      0      30      0
swift           0      0       0     55
---
              precision    recall  f1-score   support

           c       1.00      0.86      0.92        21
       other       0.82      1.00      0.90        14
      python       1.00      1.00      1.00        30
       swift       1.00      1.00      1.00        55

    accuracy                           0.97       120
   macro avg       0.96      0.96      0.96       120
weighted avg       0.98      0.97      0.98       120



In [36]:
test_eval=pd.DataFrame()

In [42]:
type(y_test)

pandas.core.series.Series

In [43]:
test_eval['actual'] = y_test.copy()

In [44]:
test_eval.head()

Unnamed: 0,actual
64,swift
146,other
137,c
106,swift
89,swift


In [47]:
X_test = X_test.toarray()

In [48]:
test_eval['predicted'] = gnb.predict(X_test)

In [49]:
print('Accuracy: {:.2%}'.format(accuracy_score(test_eval.actual, test_eval.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(test_eval.predicted, test_eval.actual))
print('---')
print(classification_report(test_eval.actual, test_eval.predicted))

Accuracy: 77.42%
---
Confusion Matrix
actual     c  other  python  swift
predicted                         
c          2      1       0      1
other      1      2       0      0
python     0      0       7      0
swift      3      0       1     13
---
              precision    recall  f1-score   support

           c       0.50      0.33      0.40         6
       other       0.67      0.67      0.67         3
      python       1.00      0.88      0.93         8
       swift       0.76      0.93      0.84        14

    accuracy                           0.77        31
   macro avg       0.73      0.70      0.71        31
weighted avg       0.76      0.77      0.76        31



In [32]:
# Multinomial naive bayes
mnb = MultinomialNB()
mnb.fit(X_train, y_train)

train_eval['mnb_predicted'] = mnb.predict(X_train)


In [33]:
train_eval.head()

Unnamed: 0,actual,predicted,gnb_predicted,mnb_predicted
87,c,c,c,swift
93,swift,swift,swift,swift
33,other,other,other,swift
141,python,python,python,swift
123,other,other,other,swift


In [34]:
print('Accuracy: {:.2%}'.format(accuracy_score(train_eval.actual, train_eval.mnb_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train_eval.mnb_predicted, train_eval.actual))
print('---')
print(classification_report(train_eval.actual, train_eval.mnb_predicted))

Accuracy: 67.50%
---
Confusion Matrix
actual          c  other  python  swift
mnb_predicted                          
python          0      0      26      0
swift          21     14       4     55
---
              precision    recall  f1-score   support

           c       0.00      0.00      0.00        21
       other       0.00      0.00      0.00        14
      python       1.00      0.87      0.93        30
       swift       0.59      1.00      0.74        55

    accuracy                           0.68       120
   macro avg       0.40      0.47      0.42       120
weighted avg       0.52      0.68      0.57       120



  _warn_prf(average, modifier, msg_start, len(result))


Want to use lazy classifier, but getting install warnings from XG boost. Had problems installing it before with the M1 chip. I am going to create a new environment and see if I cant get it to work.


In [12]:
#from lazypredict.Supervised import LazyClassifier



XGBoostError: XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed (vcomp140.dll or libgomp-1.dll for Windows, libomp.dylib for Mac OSX, libgomp.so for Linux and other UNIX-like OSes). Mac OSX users: Run `brew install libomp` to install OpenMP runtime.
  * You are running 32-bit Python on a 64-bit OS
Error message(s): ["dlopen(/Users/jaredgodar/.local/lib/python3.8/site-packages/xgboost/lib/libxgboost.dylib, 0x0006): Library not loaded: /usr/local/opt/libomp/lib/libomp.dylib\n  Referenced from: /Users/jaredgodar/.local/lib/python3.8/site-packages/xgboost/lib/libxgboost.dylib\n  Reason: tried: '/usr/local/opt/libomp/lib/libomp.dylib' (no such file), '/usr/local/lib/libomp.dylib' (no such file), '/usr/lib/libomp.dylib' (no such file)"]


Consider naive bayes models - in sklearn good for ; GaussianNB, Multinomial Naive Bayes, LinearSVC