In [1]:
import pandas as pd
import numpy as np


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from env import get_db_url
import model as m
from sklearn.feature_extraction.text import TfidfVectorizer


import warnings
# NOTE(review): this hides every warning for the rest of the session, including
# any that libraries raise while the cells below run — confirm that a blanket
# filter (rather than a targeted one) is intended.
warnings.filterwarnings('ignore')


import sys
# allow modules from parent directory to be imported
sys.path.append('..')
import prepare as p



In [2]:
# Build the prepared dataset through the project's prepare module.
# NOTE(review): `df` is not referenced again by any cell shown in this notebook.
df = p.prepare_data()

In [3]:
# Train / validate / test frames from prepare.split_data(), called with no arguments.
# NOTE(review): these three names are not referenced again by the cells shown here;
# the modeling cells work from the X_* / y_* splits instead.
train, validate, test = p.split_data()

In [4]:
# Features (X_*) and target (y_*) for each of the three splits, as returned by
# prepare.X_y_split() called with no arguments.
X_train, y_train, X_validate, y_validate, X_test, y_test = p.X_y_split()

In [5]:
# Preview the training features: repo metadata, the raw README, and the
# clean_text / stem / lemmatize text columns.
X_train.head()

Unnamed: 0,repo,language,readme_contents,clean_text,stem,lemmatize
243,facebookresearch/Detectron,Python,**Detectron is deprecated. Please see [detectr...,detectron deprecated please see detectronhttps...,detectron deprec pleas see detectronhttpsgithu...,detectron deprecated please see detectronhttps...
10,github/docs,JavaScript,# GitHub Docs <!-- omit in toc -->\n\nThis rep...,github docs omit toc repository contains docum...,github doc omit toc repositori contain documen...,github doc omit toc repository contains docume...
245,taizilongxu/interview_python,Shell,<!-- markdown-toc start - Don't edit this sect...,markdowntoc start dont edit section run mx mar...,markdowntoc start dont edit section run mx mar...,markdowntoc start dont edit section run mx mar...
253,tornadoweb/tornado,Python,Tornado Web Server\n==================\n\n.. i...,tornado web server image httpsbadgesgitterimjo...,tornado web server imag httpsbadgesgitterimjoi...,tornado web server image httpsbadgesgitterimjo...
225,docsifyjs/docsify,JavaScript,"<p align=""center"">\n <a href=""https://docsify...",p aligncenter hrefhttpsdocsifyjsorg img altdoc...,p aligncent hrefhttpsdocsifyjsorg img altdocsi...,p aligncenter hrefhttpsdocsifyjsorg img altdoc...


In [6]:
# Baseline: share of each target class in the training split. Always predicting
# the most common class (0) would be correct about 71% of the time.
y_train.value_counts(normalize=True)

0    0.711297
2    0.158996
1    0.129707
Name: target, dtype: float64

In [7]:
# use the stemmed text as the model input for every split
X_train_model = X_train['stem']
X_validate_model = X_validate['stem']
X_test_model = X_test['stem']

In [8]:
# TF-IDF vectorizer created with the library's default settings (no arguments passed)
tfidf = TfidfVectorizer()

In [9]:
# Fit the vectorizer on the training text only, then apply the fitted
# vectorizer to the validate and test text.
# The text is read straight from the X_* frames rather than from the *_model
# names this cell overwrites, so re-running the cell does not feed an
# already-vectorized matrix back into the vectorizer.
# Keep the column used here in sync with the one chosen in the
# "set column for modeling" cell.
X_train_model = tfidf.fit_transform(X_train.stem)
X_validate_model = tfidf.transform(X_validate.stem)
X_test_model = tfidf.transform(X_test.stem)

In [10]:
# one results frame per split, seeded with the true labels
train_lm = pd.DataFrame({'actual': y_train})
validate_lm = pd.DataFrame({'actual': y_validate})
test_lm = pd.DataFrame({'actual': y_test})

In [11]:
# build a logistic regression with default settings, then train it on the
# vectorized training text
lm = LogisticRegression()
lm = lm.fit(X_train_model, y_train)

In [12]:
# Store the fitted model's predictions next to the true labels for each split.
# NOTE(review): test_lm is filled in here, but none of the cells shown in this
# notebook report on it.
train_lm['predicted'] = lm.predict(X_train_model)
validate_lm['predicted'] = lm.predict(X_validate_model)
test_lm['predicted'] = lm.predict(X_test_model)

In [13]:
# training-split performance: overall accuracy, then the per-class report
print('Train')
print('Accuracy: {:.2%}'.format(
    accuracy_score(y_true=train_lm['actual'], y_pred=train_lm['predicted'])
))
print(
    '---',
    classification_report(y_true=train_lm['actual'], y_pred=train_lm['predicted']),
    '---',
    sep='\n',
)

Train
Accuracy: 72.38%
---
              precision    recall  f1-score   support

           0       0.72      1.00      0.84       170
           1       1.00      0.06      0.12        31
           2       1.00      0.03      0.05        38

    accuracy                           0.72       239
   macro avg       0.91      0.36      0.34       239
weighted avg       0.80      0.72      0.62       239

---


In [14]:
# validate-split performance: overall accuracy, then the per-class report
print('Validate')
print('Accuracy: {:.2%}'.format(
    accuracy_score(y_true=validate_lm['actual'], y_pred=validate_lm['predicted'])
))
print(
    '---',
    classification_report(y_true=validate_lm['actual'], y_pred=validate_lm['predicted']),
    '---',
    sep='\n',
)

Validate
Accuracy: 77.42%
---
              precision    recall  f1-score   support

           0       0.77      1.00      0.87        23
           1       1.00      0.25      0.40         4
           2       0.00      0.00      0.00         4

    accuracy                           0.77        31
   macro avg       0.59      0.42      0.42        31
weighted avg       0.70      0.77      0.70        31

---


In [15]:
# Logistic regression helper from model.py, called with its default arguments.
# The train/validate metrics it prints match the manual cells above.
m.run_logistic_model()

Train
Accuracy: 72.38%
---
              precision    recall  f1-score   support

           0       0.72      1.00      0.84       170
           1       1.00      0.06      0.12        31
           2       1.00      0.03      0.05        38

    accuracy                           0.72       239
   macro avg       0.91      0.36      0.34       239
weighted avg       0.80      0.72      0.62       239

---
Validate
Accuracy: 77.42%
---
              precision    recall  f1-score   support

           0       0.77      1.00      0.87        23
           1       1.00      0.25      0.40         4
           2       0.00      0.00      0.00         4

    accuracy                           0.77        31
   macro avg       0.59      0.42      0.42        31
weighted avg       0.70      0.77      0.70        31

---


In [16]:
# Same helper with feature_column='lemmatize'. The printed metrics are identical
# to the default-argument run.
m.run_logistic_model(feature_column = 'lemmatize')

Train
Accuracy: 72.38%
---
              precision    recall  f1-score   support

           0       0.72      1.00      0.84       170
           1       1.00      0.06      0.12        31
           2       1.00      0.03      0.05        38

    accuracy                           0.72       239
   macro avg       0.91      0.36      0.34       239
weighted avg       0.80      0.72      0.62       239

---
Validate
Accuracy: 77.42%
---
              precision    recall  f1-score   support

           0       0.77      1.00      0.87        23
           1       1.00      0.25      0.40         4
           2       0.00      0.00      0.00         4

    accuracy                           0.77        31
   macro avg       0.59      0.42      0.42        31
weighted avg       0.70      0.77      0.70        31

---


In [17]:
# Decision tree helper from model.py, called with its default arguments.
# Its printed train accuracy is noticeably higher than its validate accuracy.
m.run_decisiontree_model()

Train
Accuracy: 83.26%
---
              precision    recall  f1-score   support

           0       0.85      0.94      0.89       170
           1       0.74      0.74      0.74        31
           2       0.80      0.42      0.55        38

    accuracy                           0.83       239
   macro avg       0.80      0.70      0.73       239
weighted avg       0.83      0.83      0.82       239

---
Validate
Accuracy: 70.97%
---
              precision    recall  f1-score   support

           0       0.83      0.83      0.83        23
           1       0.50      0.25      0.33         4
           2       0.33      0.50      0.40         4

    accuracy                           0.71        31
   macro avg       0.55      0.53      0.52        31
weighted avg       0.72      0.71      0.71        31

---


In [18]:
# Decision tree helper with feature_column='lemmatize'. Train accuracy again
# exceeds validate accuracy in the printed report.
m.run_decisiontree_model(feature_column='lemmatize')

Train
Accuracy: 83.68%
---
              precision    recall  f1-score   support

           0       0.84      0.96      0.90       170
           1       0.81      0.68      0.74        31
           2       0.84      0.42      0.56        38

    accuracy                           0.84       239
   macro avg       0.83      0.69      0.73       239
weighted avg       0.84      0.84      0.82       239

---
Validate
Accuracy: 67.74%
---
              precision    recall  f1-score   support

           0       0.77      0.87      0.82        23
           1       0.00      0.00      0.00         4
           2       0.20      0.25      0.22         4

    accuracy                           0.68        31
   macro avg       0.32      0.37      0.35        31
weighted avg       0.60      0.68      0.63        31

---


In [19]:
# Random forest helper from model.py, called with its default arguments.
# The printed report shows 0.00 recall for classes 1 and 2, so its accuracy is
# just the share of class 0 in each split.
m.run_randomforest_model()

Train
Accuracy: 71.13%
---
              precision    recall  f1-score   support

           0       0.71      1.00      0.83       170
           1       0.00      0.00      0.00        31
           2       0.00      0.00      0.00        38

    accuracy                           0.71       239
   macro avg       0.24      0.33      0.28       239
weighted avg       0.51      0.71      0.59       239

---
Validate
Accuracy: 74.19%
---
              precision    recall  f1-score   support

           0       0.74      1.00      0.85        23
           1       0.00      0.00      0.00         4
           2       0.00      0.00      0.00         4

    accuracy                           0.74        31
   macro avg       0.25      0.33      0.28        31
weighted avg       0.55      0.74      0.63        31

---


In [20]:
# Random forest helper with feature_column='lemmatize'. The printed metrics are
# identical to the default-argument run.
m.run_randomforest_model(feature_column='lemmatize')

Train
Accuracy: 71.13%
---
              precision    recall  f1-score   support

           0       0.71      1.00      0.83       170
           1       0.00      0.00      0.00        31
           2       0.00      0.00      0.00        38

    accuracy                           0.71       239
   macro avg       0.24      0.33      0.28       239
weighted avg       0.51      0.71      0.59       239

---
Validate
Accuracy: 74.19%
---
              precision    recall  f1-score   support

           0       0.74      1.00      0.85        23
           1       0.00      0.00      0.00         4
           2       0.00      0.00      0.00         4

    accuracy                           0.74        31
   macro avg       0.25      0.33      0.28        31
weighted avg       0.55      0.74      0.63        31

---


In [21]:
# Naive Bayes helper from model.py, called with its default arguments.
# The printed report shows 0.00 recall for classes 1 and 2, so its accuracy is
# just the share of class 0 in each split.
m.run_naivebayes_model()

Train
Accuracy: 71.13%
---
              precision    recall  f1-score   support

           0       0.71      1.00      0.83       170
           1       0.00      0.00      0.00        31
           2       0.00      0.00      0.00        38

    accuracy                           0.71       239
   macro avg       0.24      0.33      0.28       239
weighted avg       0.51      0.71      0.59       239

---
Validate
Accuracy: 74.19%
---
              precision    recall  f1-score   support

           0       0.74      1.00      0.85        23
           1       0.00      0.00      0.00         4
           2       0.00      0.00      0.00         4

    accuracy                           0.74        31
   macro avg       0.25      0.33      0.28        31
weighted avg       0.55      0.74      0.63        31

---


In [22]:
# Naive Bayes helper with feature_column='lemmatize'. The printed metrics are
# identical to the default-argument run.
m.run_naivebayes_model(feature_column='lemmatize')

Train
Accuracy: 71.13%
---
              precision    recall  f1-score   support

           0       0.71      1.00      0.83       170
           1       0.00      0.00      0.00        31
           2       0.00      0.00      0.00        38

    accuracy                           0.71       239
   macro avg       0.24      0.33      0.28       239
weighted avg       0.51      0.71      0.59       239

---
Validate
Accuracy: 74.19%
---
              precision    recall  f1-score   support

           0       0.74      1.00      0.85        23
           1       0.00      0.00      0.00         4
           2       0.00      0.00      0.00         4

    accuracy                           0.74        31
   macro avg       0.25      0.33      0.28        31
weighted avg       0.55      0.74      0.63        31

---


In [23]:
# Final evaluation helper from model.py, called with no arguments. It prints the
# logistic regression classifier's accuracy on the test set.
m.run_final_test_model()

Accuracy of logistic regression classifier on test set: 0.71
