In [1]:
import pandas as pd
import re
import numpy as np
import nltk

from prepare import basic_body_clean, tokenize, remove_stopwords, lemmatize, basic_code_clean
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import warnings
warnings.filterwarnings("ignore")

# ACQUIRE, PREP AND SPLIT DATA

In [2]:
gitMDs = pd.read_json('gitMDsv2.json')

gitMDs['readme'] = gitMDs['body'].apply(basic_body_clean).apply(tokenize).apply(remove_stopwords).apply(lemmatize)
gitMDs['language'] = gitMDs['top_code'].apply(basic_code_clean).apply(tokenize).apply(remove_stopwords)
gitMDs[['language', 'percentage']] = gitMDs['language'].str.split(" ",expand=True)
gitMDs['percentage'] = pd.to_numeric(gitMDs['percentage'])

gitMDs.drop(columns = ['body', 'top_code', 'percentage'], inplace = True)

In [3]:
gitMDs.head()

Unnamed: 0,readme,language
0,freecodecamporg ' opensource codebase curricul...,javascript
1,supporting vuejs vuejs mitlicensed open source...,javascript
2,react react javascript library building user i...,javascript
3,bootstrap sleek intuitive powerful frontend fr...,javascript
4,airbnb javascript style guide mostly reasonabl...,javascript


In [4]:
train_validate, test = train_test_split(gitMDs[['language', 'readme']], 
                                        stratify=gitMDs.language, 
                                        test_size=.2)

train, validate = train_test_split(train_validate, 
                                   stratify=train_validate.language, 
                                   test_size=.25)

In [5]:
train.language.value_counts()

python        54
javascript    54
Name: language, dtype: int64

# SEPARATING LANGUAGES

In [6]:
# function to convert df rows to list of words
def rows_to_lists(text):
    words = re.sub(r'[^\w\s]', '', text).split()
    return words

In [7]:
# separating languages and converting rows to lists (1 list per language)
javascript = rows_to_lists(' '.join(train[train.language == 'javascript'].readme))
python = rows_to_lists(' '.join(train[train.language == 'python'].readme))

# BIGRAM FEATURE

In [8]:
# getting value counts of bigrams
# ie. how often did they appear in data
js_bigrams = pd.Series(nltk.ngrams(javascript, 2)).value_counts()

# creating top 10 series of most common spam bigrams
top_10_js_bigrams = js_bigrams.head(10)

# checking top 10
top_10_js_bigrams

(react, native)       764
(back, top)           117
(io, android)          75
(bad, const)           74
(9, 9)                 69
(good, const)          67
(component, react)     56
(bad, function)        51
(android, io)          50
(good, function)       48
dtype: int64

In [9]:
# getting value counts of bigrams
# ie. how often did they appear in data
py_bigrams = pd.Series(nltk.ngrams(python, 2)).value_counts()

# creating top 10 series of most common spam bigrams
top_10_py_bigrams = py_bigrams.head(5)

# checking top 10
top_10_py_bigrams

(generative, adversarial)    219
(adversarial, network)       204
(pip, install)                64
(1, 2)                        48
(api, gateway)                45
dtype: int64

In [10]:
pd.Series(nltk.ngrams(python, 2))

0                 (mkdocs, project)
1          (project, documentation)
2         (documentation, markdown)
3                  (markdown, view)
4                    (view, mkdocs)
                    ...            
44749                (started, jax)
44750              (jax, developer)
44751              (developer, see)
44752              (see, developer)
44753    (developer, documentation)
Length: 44754, dtype: object

In [11]:
top_10_py_bigrams.index.tolist()

[('generative', 'adversarial'),
 ('adversarial', 'network'),
 ('pip', 'install'),
 ('1', '2'),
 ('api', 'gateway')]

# SINGLE WORD FEATURE (JS)

In [12]:
# how often did each word appear in data?
js_words = pd.Series(nltk.ngrams(javascript, 1)).value_counts()

top_5_js_words = js_words.head(5)

t5jsl = top_5_js_words.index.tolist()

In [13]:
t5jsreduced = []

for x in t5jsl:
    t5jsreduced.append(x[0])
    
t5jsreduced

['react', 'native', 'javascript', 'function', 'use']

In [14]:
contains = [train['readme'].str.contains(w) for w in t5jsreduced]

t5_js_index = train[np.any(contains, axis = 0)].index.tolist()

train['has_top_5_js_word'] = train.index.isin(t5_js_index)

train

Unnamed: 0,language,readme,has_top_5_js_word
32,javascript,lodash site doc fp guide contributing wiki cod...,True
35,javascript,momentjs javascript date library parsing valid...,True
51,javascript,feature blazing fast bundle time multicore com...,True
89,javascript,slick last carousel ' ever need demo httpkenwh...,True
135,python,mkdocs project documentation markdown view mkd...,True
...,...,...,...
90,python,nginxproxy set container running nginx dockerg...,True
177,python,fast style transfer tensorflow add style famou...,True
52,javascript,introduction adminlte fully responsive adminis...,True
154,python,jax autograd xla quickstart transformation ins...,True


# SINGLE WORD FEATURE (PY)

In [15]:
# how often did each word appear in data?
py_words = pd.Series(nltk.ngrams(python, 1)).value_counts()

top_5_py_words = py_words.head(5)

t5pyl = top_5_py_words.index.tolist()

In [16]:
t5pyreduced = []

for x in t5pyl:
    t5pyreduced.append(x[0])
    
t5pyreduced

['python', 'network', 'image', 'adversarial', 'use']

In [17]:
contains = [train['readme'].str.contains(w) for w in t5pyreduced]

t5_py_index = train[np.any(contains, axis = 0)].index.tolist()

train['has_top_5_py_word'] = train.index.isin(t5_py_index)

train

Unnamed: 0,language,readme,has_top_5_js_word,has_top_5_py_word
32,javascript,lodash site doc fp guide contributing wiki cod...,True,False
35,javascript,momentjs javascript date library parsing valid...,True,False
51,javascript,feature blazing fast bundle time multicore com...,True,True
89,javascript,slick last carousel ' ever need demo httpkenwh...,True,True
135,python,mkdocs project documentation markdown view mkd...,True,True
...,...,...,...,...
90,python,nginxproxy set container running nginx dockerg...,True,True
177,python,fast style transfer tensorflow add style famou...,True,True
52,javascript,introduction adminlte fully responsive adminis...,True,True
154,python,jax autograd xla quickstart transformation ins...,True,True


# SINGLE WORD MODEL

In [19]:
y_train = train['language']

X_train = train[['has_top_5_js_word', 'has_top_5_py_word']]

In [20]:
# from sklearn.linear_model import LogisticRegression
logit = LogisticRegression(random_state=123)

logit.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=123, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [21]:
train['predicted'] = logit.predict(X_train)

In [22]:
print(classification_report(train.language, train.predicted))

              precision    recall  f1-score   support

  javascript       0.89      0.15      0.25        54
      python       0.54      0.98      0.69        54

    accuracy                           0.56       108
   macro avg       0.71      0.56      0.47       108
weighted avg       0.71      0.56      0.47       108



# BASELINE

In [23]:
train['baseline_prediction'] = 'python'

print(classification_report(train.language, train.baseline_prediction))

              precision    recall  f1-score   support

  javascript       0.00      0.00      0.00        54
      python       0.50      1.00      0.67        54

    accuracy                           0.50       108
   macro avg       0.25      0.50      0.33       108
weighted avg       0.25      0.50      0.33       108



# BAG OF WORDS
***

## TRAIN

In [24]:
# creating vectorizer
vectorizer = CountVectorizer(stop_words='english', 
                             min_df=20, 
                             ngram_range=(1,2), 
                             binary=True)

# Learn vocabulary in sentences. 
vectorizer.fit(train.readme)

# Get dictionary. 
vectorizer.get_feature_names()

# Transform each sentences in vector space.
bow = vectorizer.transform(train.readme)

X_bow = bow

In [25]:
# create our y dataset
y = train['language']

lm = LogisticRegression().fit(X_bow, y)

train['predicted'] = lm.predict(X_bow)

In [26]:
print(classification_report(train.language, train.predicted))

              precision    recall  f1-score   support

  javascript       1.00      1.00      1.00        54
      python       1.00      1.00      1.00        54

    accuracy                           1.00       108
   macro avg       1.00      1.00      1.00       108
weighted avg       1.00      1.00      1.00       108



## VALIDATE

In [27]:
# Transform each sentences in vector space.
v_bow = vectorizer.transform(validate.readme)

validate['predicted'] = lm.predict(v_bow)

print(classification_report(validate.language, validate.predicted))

              precision    recall  f1-score   support

  javascript       0.94      0.83      0.88        18
      python       0.85      0.94      0.89        18

    accuracy                           0.89        36
   macro avg       0.89      0.89      0.89        36
weighted avg       0.89      0.89      0.89        36



## TEST

In [28]:
# Transform each sentences in vector space.
v_bow = vectorizer.transform(test.readme)

test['predicted'] = lm.predict(v_bow)

print(classification_report(test.language, test.predicted))

              precision    recall  f1-score   support

  javascript       0.94      0.89      0.91        18
      python       0.89      0.94      0.92        18

    accuracy                           0.92        36
   macro avg       0.92      0.92      0.92        36
weighted avg       0.92      0.92      0.92        36



# TFIDF
***

# TRAIN

In [33]:
tfidf = TfidfVectorizer(stop_words='english', min_df = 20,
                             ngram_range=(1,2), 
                             binary=True)

tfidf_sparse_matrix = tfidf.fit_transform(train.readme)

tfidf.vocabulary_

# Transform to document-term matrix
vector_spaces = tfidf.transform(train.readme)

X_tfidf = tfidf_sparse_matrix

In [34]:
lm_tfidf = LogisticRegression().fit(X_tfidf, y)

train['pred_tfidf'] = lm_tfidf.predict(X_tfidf)

print(classification_report(train.language, train.pred_tfidf))

              precision    recall  f1-score   support

  javascript       1.00      0.98      0.99        54
      python       0.98      1.00      0.99        54

    accuracy                           0.99       108
   macro avg       0.99      0.99      0.99       108
weighted avg       0.99      0.99      0.99       108



# VALIDATE

In [31]:
tfidf_sparse_matrix = tfidf.transform(validate.readme)

# Transform to document-term matrix
vector_spaces = tfidf.transform(validate.readme)

X_tfidf = tfidf_sparse_matrix

validate['pred_tfidf'] = lm_tfidf.predict(X_tfidf)

print(classification_report(validate.language, validate.pred_tfidf))

              precision    recall  f1-score   support

  javascript       0.94      0.89      0.91        18
      python       0.89      0.94      0.92        18

    accuracy                           0.92        36
   macro avg       0.92      0.92      0.92        36
weighted avg       0.92      0.92      0.92        36



# TEST

In [32]:
tfidf_sparse_matrix = tfidf.transform(test.readme)

# Transform to document-term matrix
vector_spaces = tfidf.transform(test.readme)

X_tfidf = tfidf_sparse_matrix

test['pred_tfidf'] = lm_tfidf.predict(X_tfidf)

print(classification_report(test.language, test.pred_tfidf))

              precision    recall  f1-score   support

  javascript       0.94      0.89      0.91        18
      python       0.89      0.94      0.92        18

    accuracy                           0.92        36
   macro avg       0.92      0.92      0.92        36
weighted avg       0.92      0.92      0.92        36

