## CountVectorizer + NaiveBayes

In [7]:
%pip install pandas
%pip install nltk
%pip install sklearn

import random
import pandas as pd
import nltk
nltk.download('treebank')
from nltk.corpus import treebank
from sklearn.model_selection import train_test_split

description_df = pd.read_csv('./training_corpus/description.csv')
installation_df = pd.read_csv('./training_corpus/installation.csv')
invocation_df = pd.read_csv('./training_corpus/invocation.csv')
citation_df = pd.read_csv('./training_corpus/citation.csv')

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\magarcia\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\treebank.zip.


## Data Preview
Make sure that csv data has been successfully imported.

In [8]:
print("Number of description entries: {}".format(len(description_df)))
print("Number of installation entries: {}".format(len(installation_df)))
print("Number of invocation entries: {}".format(len(invocation_df)))
print("Number of citation entries: {}".format(len(citation_df)))
description_df.head()

Number of description entries: 336
Number of installation entries: 929
Number of invocation entries: 1134
Number of citation entries: 316


Unnamed: 0,URL,contributor,excerpt
0,https://github.com/GoogleChrome/puppeteer,Allen Mao,Puppeteer is a Node library which provides a h...
1,https://github.com/JimmySuen/integral-human-pose,Allen Mao,The major contributors of this repository incl...
2,https://github.com/JimmySuen/integral-human-pose,Allen Mao,Integral Regression is initially described in ...
3,https://github.com/JimmySuen/integral-human-pose,Allen Mao,We build a 3D pose estimation system based mai...
4,https://github.com/JimmySuen/integral-human-pose,Allen Mao,The Integral Regression is also known as soft-...


In [9]:
print("Number of installation entries: {}".format(len(installation_df)))
installation_df.head()

Number of installation entries: 929


Unnamed: 0,URL,contributor,excerpt
0,https://github.com/GoogleChrome/puppeteer,Allen Mao,Installation
1,https://github.com/GoogleChrome/puppeteer,Allen Mao,"To use Puppeteer in your project, run:"
2,https://github.com/GoogleChrome/puppeteer,Allen Mao,npm i puppeteer
3,https://github.com/GoogleChrome/puppeteer,Allen Mao,"# or ""yarn add puppeteer"""
4,https://github.com/GoogleChrome/puppeteer,Allen Mao,puppeteer-core


In [10]:
print("Number of invocation entries: {}".format(len(invocation_df)))
invocation_df.head()

Number of invocation entries: 1134


Unnamed: 0,URL,contributor,excerpt
0,https://github.com/JimmySuen/integral-human-pose,Allen Mao,Usage
1,https://github.com/JimmySuen/integral-human-pose,Allen Mao,We have placed some example config files in ex...
2,https://github.com/JimmySuen/integral-human-pose,Allen Mao,Train
3,https://github.com/JimmySuen/integral-human-pose,Allen Mao,"For Integral Human Pose Regression, cd to pyto..."
4,https://github.com/JimmySuen/integral-human-pose,Allen Mao,Integral Regression


In [11]:
print("Number of citation entries: {}".format(len(citation_df)))
citation_df.head()

Number of citation entries: 316


Unnamed: 0,URL,contributor,excerpt
0,https://github.com/JimmySuen/integral-human-pose,Allen Mao,If you find Integral Regression useful in your...
1,https://github.com/JimmySuen/integral-human-pose,Allen Mao,"@article{sun2017integral,"
2,https://github.com/JimmySuen/integral-human-pose,Allen Mao,"title={Integral human pose regression},"
3,https://github.com/JimmySuen/integral-human-pose,Allen Mao,"author={Sun, Xiao and Xiao, Bin and Liang, Shu..."
4,https://github.com/JimmySuen/integral-human-pose,Allen Mao,"journal={arXiv preprint arXiv:1711.08229},"


# Classifier Pipelines

In [12]:
import numpy as np
import pickle
from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split #can add stratified later
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from setup_corpus import build_corpora
corpora = build_corpora()
# print(corpora)

Selected Category: description
description has 336 samples;
installation has 84 samples;
invocation has 84 samples;
citation has 84 samples;


  categories_df[category].drop('URL', 1, inplace=True)


Selected Category: installation
description has 232 samples;
installation has 929 samples;
invocation has 232 samples;
citation has 232 samples;
Selected Category: invocation
description has 283 samples;
installation has 283 samples;
invocation has 1134 samples;
citation has 283 samples;
Selected Category: citation
description has 79 samples;
installation has 79 samples;
invocation has 79 samples;
citation has 316 samples;


In [13]:
%pwd

'C:\\Users\\magarcia\\somef-magarcia\\experiments'

In [39]:
import pickle

scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score)}

def evaluate(corpora,pipeline,name):
    dec = 3
    cv = StratifiedKFold(n_splits = 5, shuffle=True)
    for category in corpora:
        X = corpora[category].excerpt
        Y = corpora[category][category]
        print("\n",category,"X",len(X),"Y",len(Y))
        X_train, X_test, y_train, y_test = train_test_split(X, Y, stratify=Y, test_size=0.2)
        pipeline.fit(X_train, y_train)
        title = "./trained_models/"+category[:3]+name+".p"
        print(title)
        pickle.dump(pipeline, open(title, 'wb+'))
        scores = cross_validate(pipeline, X, Y, cv=cv, scoring = scoring)
        print("Mean test accuracy:",np.around(scores["test_accuracy"].mean(),decimals=dec),"\nPrecision",np.around(scores["test_precision"].mean(),decimals=dec),"\nRecall",np.around(scores["test_recall"].mean(),decimals=dec),"\nF-measure",np.around(scores["test_f1_score"].mean(),decimals=dec))


## CountVectorizer + LogisticRegression

In [40]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score


pipeline = make_pipeline(CountVectorizer(), LogisticRegression(solver='liblinear'))
cv1 = StratifiedKFold(n_splits = 5, shuffle=True)
name = "cvlr"
evaluate(corpora,pipeline,name)


 description X 588 Y 588
./test_models/descvlr.p
Mean test accuracy: 0.808 
Precision 0.865 
Recall 0.792 
F-measure 0.824

 installation X 1625 Y 1625
./test_models/inscvlr.p
Mean test accuracy: 0.877 
Precision 0.881 
Recall 0.909 
F-measure 0.894

 invocation X 1983 Y 1983
./test_models/invcvlr.p
Mean test accuracy: 0.843 
Precision 0.82 
Recall 0.929 
F-measure 0.871

 citation X 553 Y 553
./test_models/citcvlr.p
Mean test accuracy: 0.854 
Precision 0.821 
Recall 0.956 
F-measure 0.882


Description: 81
Installation: 84
Invocation: 83
Citation: 90
[81 86 83 85]
[81 86 84 86]
[82 89 86 90]
[82 86 85 86]
[75 90 86 86]

## TFIDF + LogisticRegression

In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

pipeline = make_pipeline(TfidfVectorizer(), LogisticRegression(solver='liblinear'))
cv = StratifiedKFold(n_splits = 5, shuffle=True)
name = "tflr"
evaluate(corpora,pipeline,name)


 description X 588 Y 588
./test_models/destflr.p
Mean test accuracy: 0.81 
Precision 0.79 
Recall 0.914 
F-measure 0.847

 installation X 1625 Y 1625
./test_models/instflr.p
Mean test accuracy: 0.9 
Precision 0.917 
Recall 0.909 
F-measure 0.912

 invocation X 1983 Y 1983
./test_models/invtflr.p
Mean test accuracy: 0.851 
Precision 0.826 
Recall 0.937 
F-measure 0.878

 citation X 553 Y 553
./test_models/cittflr.p
Mean test accuracy: 0.859 
Precision 0.82 
Recall 0.968 
F-measure 0.887


## TFIDF + NaiveBayes

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB

pipeline = make_pipeline(TfidfVectorizer(), MultinomialNB())
cv = StratifiedKFold(n_splits = 5, shuffle=True)
name = "tfnb"
evaluate(corpora,pipeline,name)


 description X 588 Y 588
./test_models/destfnb.p
Mean test accuracy: 0.769 
Precision 0.716 
Recall 0.988 
F-measure 0.83

 installation X 1625 Y 1625
./test_models/instfnb.p
Mean test accuracy: 0.854 
Precision 0.807 
Recall 0.982 
F-measure 0.885

 invocation X 1983 Y 1983
./test_models/invtfnb.p
Mean test accuracy: 0.868 
Precision 0.845 
Recall 0.942 
F-measure 0.891

 citation X 553 Y 553
./test_models/cittfnb.p
Mean test accuracy: 0.863 
Precision 0.827 
Recall 0.962 
F-measure 0.889


In [43]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
pipeline = make_pipeline(CountVectorizer(), MultinomialNB())
cv = StratifiedKFold(n_splits = 5, shuffle=True)
name = "cvnb"
evaluate(corpora,pipeline,name)


 description X 588 Y 588
./test_models/descvnb.p
Mean test accuracy: 0.796 
Precision 0.756 
Recall 0.95 
F-measure 0.842

 installation X 1625 Y 1625
./test_models/inscvnb.p
Mean test accuracy: 0.891 
Precision 0.865 
Recall 0.96 
F-measure 0.91

 invocation X 1983 Y 1983
./test_models/invcvnb.p
Mean test accuracy: 0.871 
Precision 0.881 
Recall 0.897 
F-measure 0.889

 citation X 553 Y 553
./test_models/citcvnb.p
Mean test accuracy: 0.901 
Precision 0.882 
Recall 0.956 
F-measure 0.917


## CountVectorizer + BernoulliBayes

In [51]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
pipeline = make_pipeline(CountVectorizer(), BernoulliNB())
cv = StratifiedKFold(n_splits = 5, shuffle=True)
name = "cvbb"
evaluate(corpora,pipeline,name)


 description X 588 Y 588
./test_models/descvbb.p
Mean test accuracy: 0.719 
Precision 0.909 
Recall 0.566 
F-measure 0.697

 installation X 1625 Y 1625
./test_models/inscvbb.p
Mean test accuracy: 0.75 
Precision 0.701 
Recall 0.985 
F-measure 0.819

 invocation X 1983 Y 1983
./test_models/invcvbb.p
Mean test accuracy: 0.766 
Precision 0.728 
Recall 0.943 
F-measure 0.822

 citation X 553 Y 553
./test_models/citcvbb.p
Mean test accuracy: 0.743 
Precision 0.694 
Recall 0.987 
F-measure 0.815


## TFIDF + Stochastic Gradient Descent

In [45]:
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

pipeline = make_pipeline(TfidfVectorizer(), SGDClassifier(loss='log'))
cv = StratifiedKFold(n_splits = 5, shuffle=True)
name = "tfsgd"
evaluate(corpora,pipeline,name)


 description X 588 Y 588
./test_models/destfsgd.p
Mean test accuracy: 0.838 
Precision 0.844 
Recall 0.884 
F-measure 0.862

 installation X 1625 Y 1625
./test_models/instfsgd.p
Mean test accuracy: 0.902 
Precision 0.92 
Recall 0.906 
F-measure 0.913

 invocation X 1983 Y 1983
./test_models/invtfsgd.p
Mean test accuracy: 0.868 
Precision 0.857 
Recall 0.923 
F-measure 0.889

 citation X 553 Y 553
./test_models/cittfsgd.p
Mean test accuracy: 0.879 
Precision 0.845 
Recall 0.965 
F-measure 0.901


## TFIDF + XGB

In [46]:
%pip install xgboost

from xgboost.sklearn import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
pipeline = make_pipeline(TfidfVectorizer(), XGBClassifier(use_label_encoder=False,eval_metric="logloss"))
cv = StratifiedKFold(n_splits = 5, shuffle=True)
name = "tfxgb"
evaluate(corpora,pipeline,name)

Note: you may need to restart the kernel to use updated packages.

 description X 588 Y 588
./test_models/destfxgb.p
Mean test accuracy: 0.772 
Precision 0.823 
Recall 0.768 
F-measure 0.794

 installation X 1625 Y 1625
./test_models/instfxgb.p
Mean test accuracy: 0.828 
Precision 0.9 
Recall 0.786 
F-measure 0.839

 invocation X 1983 Y 1983
./test_models/invtfxgb.p
Mean test accuracy: 0.798 
Precision 0.779 
Recall 0.902 
F-measure 0.836

 citation X 553 Y 553
./test_models/cittfxgb.p
Mean test accuracy: 0.807 
Precision 0.785 
Recall 0.911 
F-measure 0.843


## Perceptron + TFIDF

In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron
pipeline = make_pipeline(TfidfVectorizer(), Perceptron(tol=1e-3, random_state=0))
cv = StratifiedKFold(n_splits = 5, shuffle=True)
name = "tfper"
evaluate(corpora,pipeline,name)


 description X 588 Y 588
./test_models/destfper.p
Mean test accuracy: 0.804 
Precision 0.829 
Recall 0.834 
F-measure 0.829

 installation X 1625 Y 1625
./test_models/instfper.p
Mean test accuracy: 0.868 
Precision 0.887 
Recall 0.882 
F-measure 0.884

 invocation X 1983 Y 1983
./test_models/invtfper.p
Mean test accuracy: 0.837 
Precision 0.864 
Recall 0.848 
F-measure 0.856

 citation X 553 Y 553
./test_models/cittfper.p
Mean test accuracy: 0.864 
Precision 0.85 
Recall 0.93 
F-measure 0.888


## Random Forest Classifier +TFIDF

In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
pipeline = make_pipeline(TfidfVectorizer(), RandomForestClassifier())  #(max_depth=3, random_state=0))
cv = StratifiedKFold(n_splits = 5, shuffle=True)
name = "tfrfc"
evaluate(corpora,pipeline,name)


 description X 588 Y 588
./test_models/destfrfc.p
Mean test accuracy: 0.787 
Precision 0.835 
Recall 0.783 
F-measure 0.807

 installation X 1625 Y 1625
./test_models/instfrfc.p
Mean test accuracy: 0.866 
Precision 0.913 
Recall 0.846 
F-measure 0.878

 invocation X 1983 Y 1983
./test_models/invtfrfc.p
Mean test accuracy: 0.83 
Precision 0.868 
Recall 0.828 
F-measure 0.847

 citation X 553 Y 553
./test_models/cittfrfc.p
Mean test accuracy: 0.805 
Precision 0.766 
Recall 0.953 
F-measure 0.848


## Decision Tree Classifier +TFIDF

In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
pipeline = make_pipeline(CountVectorizer(), DecisionTreeClassifier())
cv = StratifiedKFold(n_splits = 5, shuffle=True)
name = "tfdtc"
evaluate(corpora,pipeline,name)


 description X 588 Y 588
./test_models/destfdtc.p
Mean test accuracy: 0.765 
Precision 0.813 
Recall 0.768 
F-measure 0.788

 installation X 1625 Y 1625
./test_models/instfdtc.p
Mean test accuracy: 0.822 
Precision 0.887 
Recall 0.789 
F-measure 0.835

 invocation X 1983 Y 1983
./test_models/invtfdtc.p
Mean test accuracy: 0.754 
Precision 0.837 
Recall 0.709 
F-measure 0.767

 citation X 553 Y 553
./test_models/cittfdtc.p
Mean test accuracy: 0.828 
Precision 0.811 
Recall 0.914 
F-measure 0.859


## TFIDF + AdaBoostClassifier

In [50]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import AdaBoostClassifier
pipeline = make_pipeline(TfidfVectorizer(), AdaBoostClassifier())  #(max_depth=3, random_state=0))
cv = StratifiedKFold(n_splits = 5, shuffle=True)
name = "tfada"
evaluate(corpora,pipeline,name)


 description X 588 Y 588
./test_models/destfada.p
Mean test accuracy: 0.771 
Precision 0.822 
Recall 0.762 
F-measure 0.791

 installation X 1625 Y 1625
./test_models/instfada.p
Mean test accuracy: 0.793 
Precision 0.906 
Recall 0.713 
F-measure 0.797

 invocation X 1983 Y 1983
./test_models/invtfada.p
Mean test accuracy: 0.779 
Precision 0.762 
Recall 0.893 
F-measure 0.822

 citation X 553 Y 553
./test_models/cittfada.p
Mean test accuracy: 0.835 
Precision 0.938 
Recall 0.763 
F-measure 0.841
