In [27]:
import pandas as pd

from prepare import basic_body_clean, tokenize, remove_stopwords, lemmatize, basic_code_clean
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import warnings
warnings.filterwarnings("ignore")

# ACQUIRE, PREP AND SPLIT DATA

In [16]:
gitMDs = pd.read_json('gitMDsv2.json')

gitMDs['readme'] = gitMDs['body'].apply(basic_body_clean).apply(tokenize).apply(remove_stopwords).apply(lemmatize)
gitMDs['language'] = gitMDs['top_code'].apply(basic_code_clean).apply(tokenize).apply(remove_stopwords)
gitMDs[['language', 'percentage']] = gitMDs['language'].str.split(" ",expand=True)
gitMDs['percentage'] = pd.to_numeric(gitMDs['percentage'])

gitMDs.drop(columns = ['body', 'top_code', 'percentage'], inplace = True)

In [17]:
gitMDs.head()

Unnamed: 0,readme,language
0,freecodecamporg ' opensource codebase curricul...,javascript
1,supporting vuejs vuejs mitlicensed open source...,javascript
2,react react javascript library building user i...,javascript
3,bootstrap sleek intuitive powerful frontend fr...,javascript
4,airbnb javascript style guide mostly reasonabl...,javascript


In [18]:
train_validate, test = train_test_split(gitMDs[['language', 'readme']], 
                                        stratify=gitMDs.language, 
                                        test_size=.2)

train, validate = train_test_split(train_validate, 
                                   stratify=train_validate.language, 
                                   test_size=.25)

In [19]:
train.language.value_counts()

python        54
javascript    54
Name: language, dtype: int64

# BASELINE

In [25]:
train['baseline_prediction'] = 'python'

print(classification_report(train.language, train.baseline_prediction))

              precision    recall  f1-score   support

  javascript       0.00      0.00      0.00        54
      python       0.50      1.00      0.67        54

    accuracy                           0.50       108
   macro avg       0.25      0.50      0.33       108
weighted avg       0.25      0.50      0.33       108



# BAG OF WORDS
***

## TRAIN

In [None]:
# creating vectorizer
vectorizer = CountVectorizer(stop_words='english', 
                             min_df=20, 
                             ngram_range=(1,2), 
                             binary=True)

# Learn vocabulary in sentences. 
vectorizer.fit(train.readme)

# Get dictionary. 
vectorizer.get_feature_names()

# Transform each sentences in vector space.
bow = vectorizer.transform(train.readme)

X_bow = bow

In [30]:
# create our y dataset
y = train['language']

lm = LogisticRegression().fit(X_bow, y)

train['predicted'] = lm.predict(X_bow)

In [31]:
print(classification_report(train.language, train.predicted))

              precision    recall  f1-score   support

  javascript       1.00      1.00      1.00        54
      python       1.00      1.00      1.00        54

    accuracy                           1.00       108
   macro avg       1.00      1.00      1.00       108
weighted avg       1.00      1.00      1.00       108



## VALIDATE

In [34]:
# Transform each sentences in vector space.
v_bow = vectorizer.transform(validate.readme)

validate['predicted'] = lm.predict(v_bow)

print(classification_report(validate.language, validate.predicted))

              precision    recall  f1-score   support

  javascript       0.88      0.83      0.86        18
      python       0.84      0.89      0.86        18

    accuracy                           0.86        36
   macro avg       0.86      0.86      0.86        36
weighted avg       0.86      0.86      0.86        36



## TEST

In [35]:
# Transform each sentences in vector space.
v_bow = vectorizer.transform(test.readme)

test['predicted'] = lm.predict(v_bow)

print(classification_report(test.language, test.predicted))

              precision    recall  f1-score   support

  javascript       0.71      0.67      0.69        18
      python       0.68      0.72      0.70        18

    accuracy                           0.69        36
   macro avg       0.70      0.69      0.69        36
weighted avg       0.70      0.69      0.69        36



# TFIDF
***

# TRAIN

In [38]:
tfidf = TfidfVectorizer(stop_words='english', min_df = 20,
                             ngram_range=(1,2), 
                             binary=True)

tfidf_sparse_matrix = tfidf.fit_transform(train.readme)

tfidf.vocabulary_

# Transform to document-term matrix
vector_spaces = tfidf.transform(train.readme)

X_tfidf = tfidf_sparse_matrix

In [39]:
lm_tfidf = LogisticRegression().fit(X_tfidf, y)

train['pred_tfidf'] = lm_tfidf.predict(X_tfidf)

print(classification_report(train.language, train.pred_tfidf))

              precision    recall  f1-score   support

  javascript       1.00      0.98      0.99        54
      python       0.98      1.00      0.99        54

    accuracy                           0.99       108
   macro avg       0.99      0.99      0.99       108
weighted avg       0.99      0.99      0.99       108



# VALIDATE

In [40]:
tfidf_sparse_matrix = tfidf.transform(validate.readme)

# Transform to document-term matrix
vector_spaces = tfidf.transform(validate.readme)

X_tfidf = tfidf_sparse_matrix

validate['pred_tfidf'] = lm_tfidf.predict(X_tfidf)

print(classification_report(validate.language, validate.pred_tfidf))

              precision    recall  f1-score   support

  javascript       0.94      0.83      0.88        18
      python       0.85      0.94      0.89        18

    accuracy                           0.89        36
   macro avg       0.89      0.89      0.89        36
weighted avg       0.89      0.89      0.89        36



# TEST

In [41]:
tfidf_sparse_matrix = tfidf.transform(test.readme)

# Transform to document-term matrix
vector_spaces = tfidf.transform(test.readme)

X_tfidf = tfidf_sparse_matrix

test['pred_tfidf'] = lm_tfidf.predict(X_tfidf)

print(classification_report(test.language, test.pred_tfidf))

              precision    recall  f1-score   support

  javascript       0.71      0.67      0.69        18
      python       0.68      0.72      0.70        18

    accuracy                           0.69        36
   macro avg       0.70      0.69      0.69        36
weighted avg       0.70      0.69      0.69        36

