## Debug for Scientific data
[PeerRead](https://github.com/allenai/PeerRead) (accept/reject)

**Input** : Document string <br>
**Label** : {0:'reject', 1:'accept'} for PeerRead; the arXiv data loaded below uses {1:'cs.ai', 2:'cs.cl', 3:'cs.lg'}

In [1]:
import glob
import logging
import os
import time

import numpy as np
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from sklearn.linear_model import LogisticRegression

from utils.dataset_helper import load_peer_read, load_arxiv

# Pin the process time zone (presumably so the timestamps on the INFO log lines
# emitted while loading are reported in US Central time).
os.environ['TZ'] = 'America/Chicago'
# Changing TZ alone is not guaranteed to affect an already-running process;
# tzset() makes the C library re-read it. It only exists on Unix, hence the guard.
if hasattr(time, 'tzset'):
    time.tzset()

# Root logger at INFO so that progress messages from imported helpers are shown.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [2]:
# Display the loader's signature and docstring before calling it.
help(load_arxiv)

Help on function load_arxiv in module utils.dataset_helper:

load_arxiv(abs_path, tokenize=True, lower=True, shuffle=True, random_state=42)
    Load data for peerRead
    
    # Arguments
        tokenize: Boolean
        lower: Boolean
        shuffle: Boolean
        random_state: Integer
    # Returns
        X_train, X_test, y_train, y_test



In [3]:
# Root directory of the dataset checkout. The historical location is kept as the
# default so existing runs behave the same; set the PEER_READ_PATH environment
# variable to run the notebook on another machine without editing this cell.
PEER_READ_PATH = os.environ.get(
    'PEER_READ_PATH',
    '/home/anneke/Documents/anneke-precision-github/ann-mitchell-text-classification/dataset/PeerRead/',
)

# Despite the variable name, the arXiv loader is the one called here.
# Documents are shuffled, lower-cased and tokenized by the loader; in the recorded
# run the tokenization step alone took roughly 13 minutes.
x_train, x_test, y_train, y_test = load_arxiv(PEER_READ_PATH,
                                              shuffle=True,
                                              lower=True,
                                              tokenize=True)

2019-06-23 17:34:56,100 INFO Loading start...
2019-06-23 17:35:04,217 INFO Training data loaded.
2019-06-23 17:35:04,706 INFO Testing data loaded.
2019-06-23 17:35:04,712 INFO Shuffled.
2019-06-23 17:35:07,201 INFO Lowered.
2019-06-23 17:48:39,124 INFO Tokenized.


In [6]:
# Report how many documents ended up in each split.
for template, split_docs in (('Total training : {}', x_train),
                             ('Total testing : {}', x_test)):
    print(template.format(len(split_docs)))

Total training : 10599
Total testing : 590


In [7]:
# List the distinct label values present in the training split.
np.unique(y_train)

array([1., 2., 3.], dtype=float32)

In [9]:
from utils import utils
import sklearn
# Vectorize the loaded train/test documents. With return_cv=True the call also
# returns `cv`, which later cells query for the feature (word) names.
# NOTE(review): X_train/X_test differ from the raw x_train/x_test only by letter
# case; the results are indexed with the 'docs' key in the following cells.
X_train, X_test, cv = utils.vectorize_keywords_docs(x_train, x_test, return_cv=True)

In [10]:
# Shapes of the vectorized 'docs' entries for the train and test splits.
X_train['docs'].shape, X_test['docs'].shape

((10599, 9812), (590, 9812))

In [13]:
def _one_vs_rest(labels, positive):
    """Return a list holding `positive` where a label equals it and 0 elsewhere."""
    return [positive if label == positive else 0 for label in labels]


# One target list per category; the positive class keeps its original label value.
y_1_ai = _one_vs_rest(y_train, 1)
y_2_cl = _one_vs_rest(y_train, 2)
y_3_lg = _one_vs_rest(y_train, 3)

## arXiv cs.ai vs. rest

In [17]:
# One-vs-rest model for cs.ai: y_1_ai holds 1 for cs.ai documents and 0 otherwise.
# LogisticRegression is imported explicitly; reaching it through the bare
# `sklearn` package only works if some other import loaded sklearn.linear_model.
clf = LogisticRegression()
clf.fit(X_train['docs'], y_1_ai)

# First (and, for a two-class target, only) row of coefficients: one weight per feature.
weight = clf.coef_[0]
# Newer scikit-learn releases removed get_feature_names() in favour of
# get_feature_names_out(); support both so the cell runs on either.
# NOTE(review): assumes `cv` is a scikit-learn vectorizer — confirm in utils.
if hasattr(cv, 'get_feature_names_out'):
    words = cv.get_feature_names_out()
else:
    words = cv.get_feature_names()

# Feature indices ordered from the most positive weight to the most negative.
indices = np.argsort(weight)[::-1]

In [18]:
# Header, then the 100 features with the largest positive weights for cs.ai.
header_rule = '-' * 10
print(header_rule, '100 KEYWORD ASSOCIATED WITH [cs.ai] CATEGORY')
top_indices = indices[:100]
for feature_idx in top_indices:
    term = words[feature_idx]
    rounded_weight = np.around(weight[feature_idx], 3)
    print('{} \t\t {}'.format(term, rounded_weight))

---------- 100 KEYWORD ASSOCIATED WITH [cs.ai] CATEGORY
agent 		 0.814
planning 		 0.701
wish 		 0.693
driving 		 0.687
individuals 		 0.686
con 		 0.683
maker 		 0.681
72 		 0.656
books 		 0.655
areas 		 0.647
reasoning 		 0.638
spatially 		 0.633
999 		 0.627
intelligence 		 0.624
tion 		 0.617
entails 		 0.615
music 		 0.607
conditioned 		 0.6
constitutes 		 0.598
restrictions 		 0.591
s 		 0.59
scheduled 		 0.586
problematic 		 0.582
feeding 		 0.578
expert 		 0.577
enabled 		 0.573
propositional 		 0.57
pool 		 0.568
1e 		 0.567
realized 		 0.563
cancel 		 0.562
stating 		 0.562
rewards 		 0.556
admissible 		 0.554
analytic 		 0.543
owing 		 0.543
mitigated 		 0.541
tackling 		 0.541
submissions 		 0.541
card 		 0.54
li 		 0.538
loop 		 0.537
triplet 		 0.537
pearl 		 0.537
positioning 		 0.534
search 		 0.533
despite 		 0.532
fulfill 		 0.53
finite 		 0.528
relation 		 0.528
reviews 		 0.527
ontology 		 0.526
displacement 		 0.525
ordinal 		 0.521
logic 		 0.519
embedding 		 0.51

## arXiv cs.cl (computational linguistics) vs. rest

In [19]:
# One-vs-rest model for cs.cl: y_2_cl holds 2 for cs.cl documents and 0 otherwise.
# LogisticRegression is imported explicitly; reaching it through the bare
# `sklearn` package only works if some other import loaded sklearn.linear_model.
clf = LogisticRegression()
clf.fit(X_train['docs'], y_2_cl)

# First (and, for a two-class target, only) row of coefficients: one weight per feature.
weight = clf.coef_[0]
# Newer scikit-learn releases removed get_feature_names() in favour of
# get_feature_names_out(); support both so the cell runs on either.
# NOTE(review): assumes `cv` is a scikit-learn vectorizer — confirm in utils.
if hasattr(cv, 'get_feature_names_out'):
    words = cv.get_feature_names_out()
else:
    words = cv.get_feature_names()

# Feature indices ordered from the most positive weight to the most negative.
indices = np.argsort(weight)[::-1]



In [20]:
# Header, then the 100 features with the largest positive weights for cs.cl.
header_rule = '-' * 10
print(header_rule, '100 KEYWORD ASSOCIATED WITH [cs.cl] CATEGORY')
top_indices = indices[:100]
for feature_idx in top_indices:
    term = words[feature_idx]
    rounded_weight = np.around(weight[feature_idx], 3)
    print('{} \t\t {}'.format(term, rounded_weight))

---------- 100 KEYWORD ASSOCIATED WITH [cs.cl] CATEGORY
word 		 0.99
corpus 		 0.941
text 		 0.909
words 		 0.885
language 		 0.821
sentence 		 0.74
lexical 		 0.703
sentences 		 0.682
news 		 0.624
formulation 		 0.613
talk 		 0.583
hmm 		 0.573
utterance 		 0.572
occurrences 		 0.539
computed 		 0.512
speech 		 0.512
documents 		 0.51
acoustic 		 0.506
creation 		 0.494
4 		 0.482
relevant 		 0.473
is 		 0.472
dong 		 0.468
f1 		 0.468
italian 		 0.466
voice 		 0.464
production 		 0.457
73 		 0.457
linguistics 		 0.456
meet 		 0.45
types 		 0.446
cell 		 0.439
compact 		 0.438
86 		 0.435
while 		 0.431
module 		 0.429
affect 		 0.427
default 		 0.426
english 		 0.425
cnn 		 0.424
rather 		 0.423
improvement 		 0.42
number 		 0.418
centre 		 0.416
interfaces 		 0.414
tense 		 0.414
speed 		 0.414
lang 		 0.409
performs 		 0.408
work 		 0.406
table 		 0.406
simplified 		 0.403
semantic 		 0.402
lexicon 		 0.402
workshop 		 0.402
errors 		 0.401
dev 		 0.4
gold 		 0.4
θ 		 0.399
70 		 

## arXiv cs.lg vs. rest

In [21]:
# One-vs-rest model for cs.lg: y_3_lg holds 3 for cs.lg documents and 0 otherwise.
# LogisticRegression is imported explicitly; reaching it through the bare
# `sklearn` package only works if some other import loaded sklearn.linear_model.
clf = LogisticRegression()
clf.fit(X_train['docs'], y_3_lg)

# First (and, for a two-class target, only) row of coefficients: one weight per feature.
weight = clf.coef_[0]
# Newer scikit-learn releases removed get_feature_names() in favour of
# get_feature_names_out(); support both so the cell runs on either.
# NOTE(review): assumes `cv` is a scikit-learn vectorizer — confirm in utils.
if hasattr(cv, 'get_feature_names_out'):
    words = cv.get_feature_names_out()
else:
    words = cv.get_feature_names()

# Feature indices ordered from the most positive weight to the most negative.
indices = np.argsort(weight)[::-1]



In [22]:
# Header, then the 100 features with the largest positive weights for cs.lg.
header_rule = '-' * 10
print(header_rule, '100 KEYWORD ASSOCIATED WITH [cs.lg] CATEGORY')
top_indices = indices[:100]
for feature_idx in top_indices:
    term = words[feature_idx]
    rounded_weight = np.around(weight[feature_idx], 3)
    print('{} \t\t {}'.format(term, rounded_weight))

---------- 100 KEYWORD ASSOCIATED WITH [cs.lg] CATEGORY
learning 		 1.375
proving 		 0.726
html 		 0.681
58 		 0.68
copyright 		 0.679
unlikely 		 0.672
participated 		 0.672
ie 		 0.648
naively 		 0.645
clean 		 0.644
field 		 0.639
sigmoid 		 0.636
yields 		 0.631
categorization 		 0.63
subsets 		 0.611
coding 		 0.611
propagates 		 0.609
limitation 		 0.607
obtaining 		 0.606
js 		 0.601
audio 		 0.593
sensing 		 0.592
duality 		 0.589
primitives 		 0.587
draws 		 0.581
centers 		 0.577
aiming 		 0.577
deep 		 0.576
missing 		 0.574
training 		 0.573
sdp 		 0.571
claimed 		 0.563
generalisation 		 0.559
herein 		 0.558
f2 		 0.557
images 		 0.552
unchanged 		 0.551
gibbs 		 0.551
professor 		 0.549
rnn 		 0.547
inherited 		 0.547
exploration 		 0.545
cheaper 		 0.544
termed 		 0.544
popularity 		 0.543
convexity 		 0.542
exhibited 		 0.54
categorical 		 0.539
clip 		 0.538
weaknesses 		 0.537
misclassification 		 0.534
went 		 0.531
argue 		 0.53
dx 		 0.53
re 		 0.529
thin 		 0.529

## Logistic Regression


In [7]:
from utils import utils
import sklearn

In [9]:
# Vectorize the train/test documents and keep the vectorizer (`cv`) for feature names.
# NOTE(review): this repeats the earlier vectorization call, and the shapes recorded
# under this section differ from the earlier ones — the outputs here appear to come
# from a different run; confirm before relying on them.
X_train, X_test, cv = utils.vectorize_keywords_docs(x_train, x_test, return_cv=True)

In [10]:
# Shapes of the vectorized 'docs' entries for the train and test splits.
X_train['docs'].shape, X_test['docs'].shape

((11090, 10572), (637, 10572))

In [11]:
# Fit directly on the loaded labels (no one-vs-rest relabelling).
# LogisticRegression is imported explicitly; reaching it through the bare
# `sklearn` package only works if some other import loaded sklearn.linear_model.
# NOTE(review): the reminder below describes binary rejected/accepted labels, while
# the earlier np.unique(y_train) output lists three values — confirm which labels
# this section is meant to be run with.
clf = LogisticRegression()
clf.fit(X_train['docs'], y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [12]:
# First row of the fitted coefficients: one weight per vocabulary feature.
# NOTE(review): if the model was fitted on more than two classes, coef_ holds one
# row per class and [0] covers only the first of them — confirm y_train is binary.
weight = clf.coef_[0]
# Newer scikit-learn releases removed get_feature_names() in favour of
# get_feature_names_out(); support both so the cell runs on either.
# NOTE(review): assumes `cv` is a scikit-learn vectorizer — confirm in utils.
if hasattr(cv, 'get_feature_names_out'):
    words = cv.get_feature_names_out()
else:
    words = cv.get_feature_names()

### Print top 100

### Reminder
``{0: 'rejected', 1:'accepted'}``

In [18]:
# Rank features by the magnitude of their weight, largest first, regardless of sign.
weight_magnitude = np.abs(weight)
ascending_order = np.argsort(weight_magnitude)
indices = ascending_order[::-1]

(10572,)

In [34]:
# The 100 features with the largest absolute weight, shown with their signed weight.
top_indices = indices[:100]
for feature_idx in top_indices:
    term = words[feature_idx]
    rounded_weight = np.around(weight[feature_idx], 3)
    print('{} \t {}'.format(term, rounded_weight))

cs 	 -1.371
copyright 	 1.089
st 	 0.997
2016 	 0.817
figure 	 0.796
translations 	 0.794
61 	 -0.77
17 	 -0.761
moment 	 -0.755
owner 	 0.735
gram 	 -0.73
encountered 	 -0.726
scotland 	 0.725
edinburgh 	 0.725
advancement 	 0.724
experiments 	 0.721
ov 	 -0.71
regime 	 0.708
triples 	 0.705
novel 	 0.688
acknowledgments 	 0.683
addressing 	 -0.669
semantically 	 0.669
performance 	 0.662
reviewers 	 0.662
introduction 	 0.655
remarks 	 -0.651
synthetic 	 0.645
2012 	 0.634
v1 	 0.63
aaai 	 0.626
hyperparameters 	 0.626
phenomena 	 0.625
now 	 0.622
subspaces 	 -0.621
ieee 	 -0.621
subroutine 	 0.615
straight 	 -0.613
surrogate 	 0.613
tractable 	 0.61
255 	 -0.608
annotators 	 0.605
compute 	 0.604
2014 	 0.602
barcelona 	 0.602
assigns 	 -0.602
argmin 	 -0.6
neural 	 0.599
ju 	 0.596
e 	 0.596
supplementary 	 0.591
accordingly 	 0.59
distortion 	 0.588
does 	 0.579
operate 	 0.577
prepositions 	 0.575
describes 	 -0.569
challenging 	 0.569
70 	 -0.563
divergence 	 -0.562
ordering 	 

## More: features ranked 101–500 by absolute weight

In [35]:
# Features ranked 101 through 500 by absolute weight, shown with their signed weight.
next_indices = indices[100:500]
for feature_idx in next_indices:
    term = words[feature_idx]
    rounded_weight = np.around(weight[feature_idx], 3)
    print('{} \t {}'.format(term, rounded_weight))

stream 	 -0.517
robustly 	 0.517
revisiting 	 -0.517
vol 	 -0.516
characterizes 	 -0.516
conversational 	 -0.515
n1 	 0.515
instability 	 0.513
raw 	 -0.511
internet 	 -0.508
encoding 	 0.506
commons 	 -0.505
simplification 	 -0.505
analogously 	 0.504
clipping 	 0.504
removes 	 -0.503
modelled 	 -0.502
passing 	 0.501
normalized 	 -0.5
power 	 0.496
biological 	 -0.496
jia 	 0.495
technology 	 -0.495
notably 	 0.494
sketch 	 0.494
confirm 	 0.493
intractable 	 0.492
author 	 0.49
workshop 	 -0.49
engineered 	 0.488
qi 	 -0.487
ep 	 0.487
2003 	 0.486
remainder 	 -0.485
imposed 	 0.484
recovered 	 0.484
soft 	 -0.484
biased 	 0.484
claim 	 -0.483
descriptor 	 -0.482
describing 	 -0.482
dataset 	 0.482
shifting 	 0.481
aligned 	 -0.48
tighter 	 -0.479
1996 	 0.479
1n 	 0.479
rmsprop 	 0.479
left 	 0.478
parametric 	 0.477
reinforcement 	 0.476
article 	 -0.476
continues 	 -0.475
isn 	 0.475
excluded 	 0.475
recorded 	 -0.473
scenes 	 -0.473
entire 	 0.472
proceedings 	 0.472
2k 	 0.472
