## Debug for Scientific data
[PeerRead](https://github.com/allenai/PeerRead) (accept/reject)

**Input** : Document string <br>
**Label** : {0:'reject', 1:'accept'}

In [1]:
import os
import logging
import numpy as np
import glob
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from utils.dataset_helper import load_peer_read

os.environ['TZ'] = 'America/Chicago'
logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [2]:
help(load_peer_read)

Help on function load_peer_read in module utils.dataset_helper:

load_peer_read(abs_path, tokenize=True, lower=True, shuffle=True, random_state=42)
    Load data for peerRead
    
    # Arguments
        tokenize: Boolean
        lower: Boolean
        shuffle: Boolean
        random_state: Integer
    # Returns
        X_train, X_test, y_train, y_test



In [3]:
PEER_READ_PATH = '/home/anneke/Documents/anneke-precision-github/ann-mitchell-text-classification/dataset/PeerRead/'
x_train, x_test, y_train, y_test = load_peer_read(PEER_READ_PATH,
                                                  shuffle = True, 
                                                  lower = True, 
                                                  tokenize = True)

2019-06-19 13:28:17,506 INFO Loading start...
2019-06-19 13:28:26,512 INFO Training data loaded.
2019-06-19 13:28:27,089 INFO Testing data loaded.
2019-06-19 13:28:27,096 INFO Shuffled.
2019-06-19 13:28:29,593 INFO Lowered.
2019-06-19 13:42:35,671 INFO Tokenized.


In [4]:
print('Total training : {}'.format(len(x_train)))
print('Total testing : {}'.format(len(x_test)))

Total training : 11090
Total testing : 637


## Logistic Regression


In [7]:
from utils import utils
import sklearn

In [9]:
X_train, X_test, cv = utils.vectorize_keywords_docs(x_train, x_test, return_cv=True)

In [10]:
X_train['docs'].shape, X_test['docs'].shape

((11090, 10572), (637, 10572))

In [11]:
clf = sklearn.linear_model.LogisticRegression()
clf.fit(X_train['docs'], y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [12]:
weight = clf.coef_[0]
words = cv.get_feature_names()

### Print top 100

### Reminder
``{0: 'rejected', 1:'accepted'}``

In [18]:
indices = np.argsort(np.absolute(weight))[::-1]

(10572,)

In [34]:
for i in indices[:100]:
    print('{} \t {}'.format(words[i], np.around(weight[i], 3)))

cs 	 -1.371
copyright 	 1.089
st 	 0.997
2016 	 0.817
figure 	 0.796
translations 	 0.794
61 	 -0.77
17 	 -0.761
moment 	 -0.755
owner 	 0.735
gram 	 -0.73
encountered 	 -0.726
scotland 	 0.725
edinburgh 	 0.725
advancement 	 0.724
experiments 	 0.721
ov 	 -0.71
regime 	 0.708
triples 	 0.705
novel 	 0.688
acknowledgments 	 0.683
addressing 	 -0.669
semantically 	 0.669
performance 	 0.662
reviewers 	 0.662
introduction 	 0.655
remarks 	 -0.651
synthetic 	 0.645
2012 	 0.634
v1 	 0.63
aaai 	 0.626
hyperparameters 	 0.626
phenomena 	 0.625
now 	 0.622
subspaces 	 -0.621
ieee 	 -0.621
subroutine 	 0.615
straight 	 -0.613
surrogate 	 0.613
tractable 	 0.61
255 	 -0.608
annotators 	 0.605
compute 	 0.604
2014 	 0.602
barcelona 	 0.602
assigns 	 -0.602
argmin 	 -0.6
neural 	 0.599
ju 	 0.596
e 	 0.596
supplementary 	 0.591
accordingly 	 0.59
distortion 	 0.588
does 	 0.579
operate 	 0.577
prepositions 	 0.575
describes 	 -0.569
challenging 	 0.569
70 	 -0.563
divergence 	 -0.562
ordering 	 

# More

In [35]:
for i in indices[100:500]:
    print('{} \t {}'.format(words[i], np.around(weight[i], 3)))

stream 	 -0.517
robustly 	 0.517
revisiting 	 -0.517
vol 	 -0.516
characterizes 	 -0.516
conversational 	 -0.515
n1 	 0.515
instability 	 0.513
raw 	 -0.511
internet 	 -0.508
encoding 	 0.506
commons 	 -0.505
simplification 	 -0.505
analogously 	 0.504
clipping 	 0.504
removes 	 -0.503
modelled 	 -0.502
passing 	 0.501
normalized 	 -0.5
power 	 0.496
biological 	 -0.496
jia 	 0.495
technology 	 -0.495
notably 	 0.494
sketch 	 0.494
confirm 	 0.493
intractable 	 0.492
author 	 0.49
workshop 	 -0.49
engineered 	 0.488
qi 	 -0.487
ep 	 0.487
2003 	 0.486
remainder 	 -0.485
imposed 	 0.484
recovered 	 0.484
soft 	 -0.484
biased 	 0.484
claim 	 -0.483
descriptor 	 -0.482
describing 	 -0.482
dataset 	 0.482
shifting 	 0.481
aligned 	 -0.48
tighter 	 -0.479
1996 	 0.479
1n 	 0.479
rmsprop 	 0.479
left 	 0.478
parametric 	 0.477
reinforcement 	 0.476
article 	 -0.476
continues 	 -0.475
isn 	 0.475
excluded 	 0.475
recorded 	 -0.473
scenes 	 -0.473
entire 	 0.472
proceedings 	 0.472
2k 	 0.472
