In [7]:
# all the imports to be able to follow along with https://www.kaggle.com/thebrownviking20/topic-modelling-with-spacy-and-scikit-learn

Show our data:  
http://localhost:8889/notebooks/data_analysis.ipynb#show-our-data-frame

Show extracted entities:  
http://localhost:8889/notebooks/data_analysis.ipynb#look-at-entity-extraction-from-the-doc  

Example of predictions:  
http://localhost:8889/notebooks/data_analysis.ipynb#look-at-some-confusion-matrix-and-accuracy-scores 

The core model:  
http://localhost:8889/notebooks/data_analysis.ipynb#show-the-core-model  

Show how bad our predictions are:  
http://localhost:8889/notebooks/data_analysis.ipynb#Show-how-poor-the-model-preforms  

In [6]:
# Usual imports
import numpy as np
import pandas as pd
from tqdm import tqdm
import string
import matplotlib.pyplot as plt
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
import concurrent.futures
import time
import pyLDAvis.sklearn
from pylab import bone, pcolor, colorbar, plot, show, rcParams, savefig
import warnings
warnings.filterwarnings('ignore')

In [7]:
%matplotlib inline
import os

In [8]:
# Plotly based imports for visualization
from plotly import tools
import plotly.plotly as py
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.figure_factory as ff

In [9]:
# spaCy based imports
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

In [10]:
import scispacy
import spacy

In [6]:
!python -m spacy link en_core_sci_sm en


[93m    Link 'en' already exists[0m
    To overwrite an existing link, use the --force flag.



In [11]:
nlp = spacy.load("en")

In [16]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas, xgboost, numpy, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

Using TensorFlow backend.


# OK, we have the machinery, let's start looking at the data

In [12]:
%store  -r

In [13]:
# let's get this into a data frame with classifications

In [14]:
labels = []
abstracts = [] 
for abstract in unique_selected_abstracts:
    abstracts.append(abstract)
    labels.append("selected")
for abstract in rejected_abstracts:
    abstracts.append(abstract)
    labels.append("rejected")

In [17]:
trainDF = pandas.DataFrame()

In [18]:
trainDF[0:10]

In [13]:
trainEntityDF = pandas.DataFrame()

# create a training set based on entties extracted using scispacy

In [19]:
entity_abstracts = [] 
for abstract in abstracts:
    doc = nlp(abstract)
    entity_abstract = doc.ents
    entity_abstracts.append(entity_abstract)

In [20]:
entity_abstract_sentences = []
for entity_abstract in entity_abstracts:
    entity_abstract_sentence =  " ".join([x.lemma_ for x in entity_abstract])
    entity_abstract_sentences.append(entity_abstract_sentence)

In [16]:
entity_abstract_sentences[9]

'transition process western balkan country conflict transition-related change life expectancy country mortality non-communicable disease ncd comparison western northern part europe study treatment ncd western balkan country secondary aim compare policy prescribing-related competency family physician document analysis national positive medicine list strategic document clinical guideline treatment frequent ncd arterial hypertension diabetes hyperlipidemia asthma chronic obstructive pulmonary disease copd phrase medicine prescribe extract domain medicine availability prescribe policy medication prescribing-related competency possibility treatment arterial hypertension diabetes hyperlipidemia asthma copd western balkan country variance register medicine combination restriction family physician prescribe insulin inhale corticosteroid statin angiotensin ii receptor blocker arb recommendation western balkan country essential medicine treatment ncd partial reimbursement exception statin oral a

In [21]:
trainDF['text'] = abstracts
trainDF['label'] = labels

# show our data frame

In [25]:
trainDF[0:10]

Unnamed: 0,text,label
0,Respondent-driven sampling (RDS) is an approac...,selected
1,Purpose: Post-traumatic stress disorder (PTSD)...,selected
2,Background: The impact of intimate partner vio...,selected
3,Retrospectively measuring markers on stored ba...,selected
4,Background: This study investigated the risk f...,selected
5,Road traffic injuries are the leading cause of...,selected
6,A government's response to increasing incidenc...,selected
7,Magnetic resonance imaging (MRI) has evolved r...,selected
8,"We introduce a non-myopic, covariate-adjusted ...",selected
9,"Background: During the transition processes, t...",selected


In [19]:
trainEntityDF['text'] = entity_abstract_sentences
trainEntityDF['label'] = labels

In [20]:
trainEntityDF[9:12]

Unnamed: 0,text,label
9,transition process western balkan country conf...,selected
10,infer dependence structure undirected graph mo...,selected
11,study impact california 's patient sph legisla...,selected


In [21]:
trainDF[9:12]

Unnamed: 0,text,label
9,"Background: During the transition processes, t...",selected
10,Inferring dependence structure through undirec...,selected
11,Background This study evaluated the impact of ...,selected


# Try some named entity recognition

In [22]:
stopwords = list(STOP_WORDS)

In [23]:
import string
punctuations = string.punctuation

In [24]:
from spacy.lang.en import English
parser = English()

In [25]:
def spacy_tokenizer(sentence):
    mytokens = parser(sentence)
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    mytokens = [ word for word in mytokens if word not in stopwords and word not in punctuations ]
    return mytokens

In [26]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import accuracy_score 
from sklearn.base import TransformerMixin 
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

In [27]:
#Custom transformer using spaCy 
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        return [clean_text(text) for text in X]
    def fit(self, X, y=None, **fit_params):
        return self
    def get_params(self, deep=True):
        return {}

# Basic function to clean the text 
def clean_text(text):     
    return text.strip().lower()

### Some notes on tuning SVC  

- our data set is small 
- there is a lot of configuration that we could do with SVC and at the moment I'm pottering around with some hand optimisation, e.g. https://github.com/scikit-learn/scikit-learn/issues/4800  
- note on predict_proba comes from here: https://stackoverflow.com/questions/15015710/how-can-i-know-probability-of-class-predicted-by-predict-function-in-support-v  

Should try to do:
- grid search. 
- feat_importance 


In [28]:
# Vectorization
vectorizer = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1)) 
# classifier = LinearSVC
classifier = SVC(probability=True, class_weight="balanced",C=0.1, gamma= 0.01)
# Using Tfidf
tfvectorizer = TfidfVectorizer(tokenizer = spacy_tokenizer)

In [30]:
# Splitting Data Set
from sklearn.model_selection import train_test_split

##

# create a training model and predictions based on the abstracts from christine

In [31]:
# Features and Labels
X = trainDF['text']
ylabels = trainDF['label']

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.15, random_state=42)

In [94]:
# Create the  pipeline to clean, tokenize, vectorize, and classify 
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', tfvectorizer),
                 ('classifier', classifier)])

In [34]:
# Fit our data
pipe.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('cleaner', <__main__.predictors object at 0x1c63be8828>), ('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngra...',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

### get predictions on the holdout, from pred and pred_proba

In [35]:
# Predicting with a test dataset
y_pred = pipe.predict(X_test)

In [36]:
y_pred_prob = pipe.predict_proba(X_test)

In [37]:
y_pred_prob[0:4]

array([[0.34003851, 0.65996149],
       [0.34999079, 0.65000921],
       [0.34675157, 0.65324843],
       [0.35500318, 0.64499682]])

## prep the probablity scores to look like predictions

In [38]:
y_pred_prob_decision = []
for prob in y_pred_prob: 
    if prob[0] > 0.5:
        y_pred_prob_decision.append("rejected")
    else:
        y_pred_prob_decision.append("selected")

## look at some confusion matrix and accuracy scores

In [39]:
metrics.confusion_matrix(y_pred, y_test)

array([[  4,   0],
       [ 40, 117]])

In [40]:
metrics.confusion_matrix(y_pred_prob_decision, y_test)

array([[  0,   0],
       [ 44, 117]])

In [41]:
metrics.accuracy_score(y_test, y_pred_prob_decision)

0.7267080745341615

In [42]:
metrics.accuracy_score(y_test, y_pred)

0.7515527950310559

In [43]:
# Prediction Results, compare to test actual answere
cutoff_values = []
cutoff_predictions = [] 
for (sample, pred, proba_pred, weight, real) in zip(X_test, y_pred, y_pred_prob_decision, y_pred_prob, y_test):
    print(sample[0:15], ":" , pred, proba_pred, weight, real)

ObjectivesTo pe : selected selected [0.34003851 0.65996149] selected
Inappropriate c : selected selected [0.34999079 0.65000921] selected
Background Nois : selected selected [0.34675157 0.65324843] selected
Background: Und : selected selected [0.35500318 0.64499682] rejected
Introduction: A : selected selected [0.35345541 0.64654459] selected
BackgroundAccur : selected selected [0.3471647 0.6528353] selected
Autonomic nervo : selected selected [0.35372527 0.64627473] selected
A wide breadth  : selected selected [0.34342329 0.65657671] rejected
The early detec : rejected selected [0.33187061 0.66812939] rejected
Background: The : selected selected [0.35062668 0.64937332] rejected
Introduction: O : selected selected [0.35512633 0.64487367] selected
Background: Evi : selected selected [0.34940587 0.65059413] selected
BackgroundIron  : selected selected [0.34598072 0.65401928] selected
The timing of a : selected selected [0.35512325 0.64487675] selected
Introduction: E : selected selected 

In [44]:
# Prediction Results, compare to test actual answere
cutoff_values = []
cutoff_predictions = [] 
for (sample, pred, proba_pred, weight, real) in zip(X_test, y_pred, y_pred_prob_decision, y_pred_prob, y_test):
    if weight[1] > 0.8:
        print(sample[0:15], ":" , pred, proba_pred, weight, real)
        cutoff_values.append(real)
        cutoff_predictions.append(proba_pred)
print(metrics.accuracy_score(cutoff_values, cutoff_predictions))
print(metrics.confusion_matrix(cutoff_values, cutoff_predictions))

nan
[]


##

# create a model based on the abstracts as entities only 

In [45]:
# Features and Labels
Xe = trainEntityDF['text']
yelabels = trainEntityDF['label']

In [46]:
Xe_train, Xe_test, ye_train, ye_test = train_test_split(Xe, yelabels, test_size=0.15, random_state=42)

In [47]:
# Create the  pipeline to clean, tokenize, vectorize, and classify 
pipe_e = Pipeline([("cleaner", predictors()),
                 ('vectorizer', vectorizer),
                 ('classifier', classifier)])

In [48]:
# Fit our data
pipe_e.fit(Xe_train,ye_train)

Pipeline(memory=None,
     steps=[('cleaner', <__main__.predictors object at 0x1c64174eb8>), ('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngra...',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

### get predictions on the holdout, from pred and pred_proba

In [49]:
# Predicting with a test dataset
ye_pred = pipe_e.predict(Xe_test)

In [50]:
ye_pred_prob = pipe_e.predict_proba(Xe_test)

In [51]:
ye_pred_prob[0:4]

array([[0.34255605, 0.65744395],
       [0.34001164, 0.65998836],
       [0.34075909, 0.65924091],
       [0.33772167, 0.66227833]])

## prep the probablity scores to look like predictions

In [52]:
ye_pred_prob_decision = []
for prob in ye_pred_prob: 
    if prob[0] > 0.5:
        ye_pred_prob_decision.append("rejected")
    else:
        ye_pred_prob_decision.append("selected")

## look at some confusion matrix and accuracy scores

In [53]:
metrics.confusion_matrix(ye_pred, ye_test)

array([[  9,   2],
       [ 35, 115]])

In [54]:
metrics.confusion_matrix(ye_pred_prob_decision, ye_test)

array([[  0,   0],
       [ 44, 117]])

In [55]:
metrics.accuracy_score(ye_test, ye_pred_prob_decision)

0.7267080745341615

In [56]:
metrics.accuracy_score(ye_test, ye_pred)

0.7701863354037267

In [57]:
# Prediction Results, compare to test actual answere
cutoff_values = []
cutoff_predictions = [] 
for (sample, pred, proba_pred, weight, real) in zip(Xe_test, ye_pred, ye_pred_prob_decision, ye_pred_prob, ye_test):
    print(sample[0:15], ":" , pred, proba_pred, weight, real)

objectivesto ca : selected selected [0.34255605 0.65744395] selected
inappropriate w : selected selected [0.34001164 0.65998836] selected
noise-induced h : selected selected [0.34075909 0.65924091] selected
undernutrition  : selected selected [0.33772167 0.66227833] rejected
introduction ad : selected selected [0.3382987 0.6617013] selected
backgroundaccur : selected selected [0.34069611 0.65930389] selected
autonomic nervo : selected selected [0.33816684 0.66183316] selected
research seclus : rejected selected [0.34427393 0.65572607] rejected
early detection : rejected selected [0.34493915 0.65506085] rejected
threat meticill : selected selected [0.33979535 0.66020465] rejected
introduction or : selected selected [0.33758722 0.66241278] selected
evidence subacu : selected selected [0.33977655 0.66022345] selected
backgroundiron  : selected selected [0.3397997 0.6602003] selected
timing antiretr : selected selected [0.33766714 0.66233286] selected
introduction ef : selected selected [0

In [58]:
# Prediction Results, compare to test actual answere
cutoff_values = []
cutoff_predictions = [] 
count = 0 
for (sample, pred, proba_pred, weight, real) in zip(Xe_test, ye_pred, ye_pred_prob_decision, ye_pred_prob, ye_test):
    if weight[1] > 0.6:
        print(sample[0:15], ":" , pred, proba_pred, weight, real)
        cutoff_values.append(real)
        cutoff_predictions.append(proba_pred)
        count += 1
print(metrics.accuracy_score(cutoff_values, cutoff_predictions))
print(metrics.confusion_matrix(cutoff_values, cutoff_predictions))
print (float(count/len(sample)))

objectivesto ca : selected selected [0.34255605 0.65744395] selected
inappropriate w : selected selected [0.34001164 0.65998836] selected
noise-induced h : selected selected [0.34075909 0.65924091] selected
undernutrition  : selected selected [0.33772167 0.66227833] rejected
introduction ad : selected selected [0.3382987 0.6617013] selected
backgroundaccur : selected selected [0.34069611 0.65930389] selected
autonomic nervo : selected selected [0.33816684 0.66183316] selected
research seclus : rejected selected [0.34427393 0.65572607] rejected
early detection : rejected selected [0.34493915 0.65506085] rejected
threat meticill : selected selected [0.33979535 0.66020465] rejected
introduction or : selected selected [0.33758722 0.66241278] selected
evidence subacu : selected selected [0.33977655 0.66022345] selected
backgroundiron  : selected selected [0.3397997 0.6602003] selected
timing antiretr : selected selected [0.33766714 0.66233286] selected
introduction ef : selected selected [0

introduction ta : selected selected [0.3398173 0.6601827] selected
scientific engi : selected selected [0.34105422 0.65894578] selected
existing datase : selected selected [0.3394509 0.6605491] selected
article polynom : selected selected [0.33712857 0.66287143] selected
forward stagewi : selected selected [0.33782848 0.66217152] selected
nonparametric r : selected selected [0.33975248 0.66024752] selected
background aim  : selected selected [0.33836106 0.66163894] selected
measure access  : selected selected [0.34062758 0.65937242] rejected
federal local i : rejected selected [0.34309255 0.65690745] selected
objective study : selected selected [0.3379472 0.6620528] selected
intervention be : selected selected [0.33854859 0.66145141] selected
question multif : selected selected [0.33790645 0.66209355] selected
ehealth field c : selected selected [0.33778992 0.66221008] selected
patient-centere : selected selected [0.3395321 0.6604679] rejected
backgroundrecen : selected selected [0.339

# look at entity extraction from the doc

In [59]:
from spacy import displacy

In [60]:
doc

At least four workers at a metal recycling facility were hospitalized and one died after exposure to chlorine gas when it was accidentally released from an intact, closed-valved cylinder being processed for scrap metal. This unintentional chlorine gas release marks at least the third such incident at a metal recycling facility in the United States since 2010. We describe the fatal case of the worker whose clinical course was consistent with acute respiratory distress syndrome (ARDS) following exposure to high concentrations of chlorine gas. This case report emphasizes the potential risk of chlorine gas exposure to metal recycling workers by accepting and processing intact, closed-valved containers. The metal recycling industry should take steps to increase awareness of this established risk to prevent future chlorine gas releases. Additionally, public health practitioners and clinicians should be aware that metal recycling workers are at risk for chlorine gas exposure.

In [61]:
spacy.displacy.render(doc, style='ent',jupyter=True)

In [62]:
from collections import Counter

In [63]:
rejected = trainEntityDF.loc[trainEntityDF['label'] == 'rejected']
selected = trainEntityDF.loc[trainEntityDF['label'] == 'selected']

In [64]:
len(rejected), len(selected)

(356, 715)

In [65]:
all_selected_texts = ""
for text in selected["text"]:
    all_selected_texts = all_selected_texts + text

In [66]:
counts = Counter(all_selected_texts.split())

In [67]:
counts.most_common(20)

[('health', 892),
 ('study', 772),
 ('datum', 675),
 ('model', 551),
 ('method', 401),
 ('risk', 392),
 ('analysis', 337),
 ('patient', 336),
 ('exposure', 323),
 ('child', 323),
 ('group', 316),
 ('intervention', 311),
 ('cancer', 299),
 ('increase', 291),
 ('with', 280),
 ('worker', 272),
 ('associate', 269),
 ('factor', 267),
 ('treatment', 257),
 ('program', 250)]

In [68]:
all_rejected_texts = ""
for text in rejected["text"]:
    all_rejected_texts = all_rejected_texts + text

In [69]:
counts = Counter(all_rejected_texts.split())

In [70]:
counts.most_common(20)

[('health', 618),
 ('study', 356),
 ('care', 234),
 ('program', 232),
 ('patient', 225),
 ('public', 187),
 ('datum', 170),
 ('research', 139),
 ('group', 138),
 ('intervention', 138),
 ('model', 135),
 ('increase', 130),
 ('training', 126),
 ('participant', 125),
 ('education', 124),
 ('disease', 118),
 ('method', 117),
 ('factor', 117),
 ('of', 110),
 ('country', 109)]

In [71]:
selected_words_list = all_selected_texts.split()
rejected_words_list = all_rejected_texts.split()

In [72]:
for word in rejected_words_list:
    if word not in selected_words_list:
        print(word)

barnhart
copula-based
clayton-oake
bootstrapp
mount
sinai
msswow
fertilitysense
conversation
drawback
contentment
participationquantify
imperfect
imperfect
perfect
secretion
non-obese
womanepidemiologic
exposure-failure-time
outcome-dependent
ods
od
infinite-dimensional
communities
aric
illustrationbenefit
need-supportive
need-supportive
person-centred
frontline
aged-care
self-determined
self-pacing
self-determination
sdt
goal-setting
pedometer
activityvirtual
practitioners
antibiotics-a
immersive
videoed
3rd
culmination
philosophy
instigate
antimicrobial
demonstrable
changeus
pre-approval
manufacturer
reluctant
pre-approval
hcei
fda
guardrail
conversation
decision-makertuberculosis
wildlife
ztb
mycobacterium
bovis
ztb
multi-institutional
ztb
ztb
ztb
roadmapstudy
pn
defect
pn
paternalistic
partystudy
php
correlational
phps
php
php
phpphthalate
disruptor
disrupting
metabolome
liquid
chromatography
quadrupole
q-value
nicotinamide
mononucleotide
cysteine
cystine
l-aspartic
amino-acid
meva

trustusa
bill
senate
non-discrimination
preserving
pewpa
benign
internationally
horizonantibiotic
antimicrobial
ams
hospital-based
troubling
sulfamethoxazole/trimethoprim
antibiogram
gaze
ams
ams
infections
ams
ams
ams
ams
committees
deliberation
ams
infections
prescriber
ams
ams
remotemobile
radiofrequency-electromagnetic
rf-emr
rf-emr
cellular
physicobiological
rf-emr
mouse
spermatogonial
gc1
spermatocyte
gc2
cauda
epididymal
spermatozoon
waveguide
rf-emr
ghz
mitochondrial
ros
gc1
gc2
etc
ro
ros
penicillamine
peroxidation
4-hydroxynonenal
ros
overt
cellular
rf-emr
fragmentation
spermatozoon
comet
fragmentation
8-hydroxy-2'-deoxyguanosine
spermatozoon
rf-emr
sperm
motility
rf-emr
mitochondrial
etc
radiationlobar
welder
welder
webinar
welder
polysaccharide
ppv23
welder
welder
welder
welder
globallynepal
fchvs
fchv
commodity
fchvs
fchv
periphery-level
fchv
interventionhealth
profession
accountable
affiliation
thenet
accountable
loop
address
measurementarticle
impose
nonzero
impressive
p

danang
hai
chau
danang
living
suicidal
motheraustralia
mutually-rated
counsellor
australasian
australasia
reformpublic
radically
teamwork
instructional
credit
masters
healthcompare
non-prediabetic
prediabetic
waist-to-hip
triglyceride
prediabetic
non-prediabetic
essentialdiphtheria-tetanus-pertussis
opv
guinea-bissau
opv
suburb
bandim
guinea-bissau
tri-monthly
weigh
weigh
dtp-vaccinated
dtp-unvaccinated
meta-estimate
receive
weight-for-age
waz
waz
dtp-vaccinated
waz
opv
guinea-bissau
dtp-vaccinated
dtp-unvaccinated
dtp-vaccinated
dtp-unvaccinated
mortalitypublic
twentieth
today
flat
changing
transformational
present-day
phim
phim
cross-collaboration
buy-in
human-centered
creativity
prototyping
outcomepsychosocial
ami
ami
ami
ami
post-ami
psychologic
physiologic
interventionpain
hospitalized
attitud
october
kasrp
wilcoxon
signed-rank
baccalaureate
nursegoal
non-nested
settinghematology-related
sickle
scd
hematology
hemato-oncology
hematology
hematology
hemato-oncology
hematology
hematol

enzymatic
monooxygenas
esteras
polymerase
knockdown
gambiae
pyrethroid
ddt
bendiocarb
iguhu
kabula
malathion
monooxygenas
esteras
enzymatic
lambdacyhalothrin
permethrin
deltamethrin
ddt
l1014s
kdr
anopheles
arabiensis
gambiae
pyrethroid
ddt
climatic
agroeconomic
insecticide
vectorintroduction
spreading
program-the
fwb
program-to
fwb
situational
theory-method
poststructural
fwb
arenas
fwb
constraining
interwoven
arena
demand-driven
continually
evidenceunited
overdose
undiagnosed
hcv
mik
computer-based
hiv/hcv
street
mik
bronx
street
syringe
exchange
mik
hcv
naloxone
overdose
hcv
iteration
mik
aud
aud
out-patient
in-patient
aud
departmental
aud
in-patient
aud
aud
medicavsupportive
detoxification
alcoholics
medical/supportive
alcohol-related
pregnancy-related
aud
in-patient
aud
aud
medical/supportive
awarenessevolution
oxytocin
vasopressin
myelinated
nerve
ventral
co-regulate
prosociality
compassion-based
safeness
compassion-based
affiliative
physiologically
vagus
nerve
well-beingoverfat


dearth
post-war
bladder
quit
quit
cancerolder
tai
escalate
implementationmountain
gorillas
gorilla
beringei
beringei
impenetrable
destruction
gorillas
cryptosporidium
giardia
sps
gorilla
human/gorilla/livestock
non-habituated
gorillas
community-owned
herd
cryptosporidium
giardia
sps
immunostat
dfa
giardia
gorillas
cryptosporidium
gorillas
cryptosporidium
giardia
sps
gorillas
gorillas
virungas
giardia
sp
gorillas
gorillas
vhct
giardia
gorilla
cryptosporidium
sp
gorillas
vhct
gorillas
trough
human/gorilla/livestock
interfaceantimicrobial
antimicrobial
antimicrobial
antimicrobial
tender
rfp
pharmacist-led
antimicrobial
antimicrobial
antimicrobial
antimicrobial
resistancewho
bog
ns
non-targeted
ns
opv
opv
ns
guinea-bissau
opv
opv
vas
vas-only
h1n1
before-campaign
after-campaign
opv-only
campaign-opv
bissau
campaign-opv
ns
opv-campaigns
negativelystudy
cdq-29
saliva
cdq-12
cdq-29
cdq
cdq-12
chinese-language
programstudy
chinese-versions
dying
behaviors
patients
behaviors
behaviors
kaiser-me

# look at random forrest

In [73]:
from matplotlib import pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

In [74]:
X = transformed # numeric array
y = list(df['times_cited'].fillna(0)) # set y to 1 or zero depending on response class
np.shape(X), np.shape(y)
model = RandomForestRegressor()

model.fit(X,y)
feat_importances = pd.Series(model.feature_importances_, index=['topic {}'.format(i) for i in range(np.shape(X)[1])])

NameError: name 'transformed' is not defined

In [None]:
feat_importances_40 = feat_importances.nlargest(10)
plt.figure(figsize=(10,7))
feat_importances_40.plot(kind='barh', color = 'orange')
plt.title('Topics most likely to  affect whether or not cited')
plt.xscale('log')
plt.savefig("featureimportance.png", bbox_inches='tight')
plt.show()
%matplotlib inline

# Let's look at creating a new training model from the clean data from Kasia from 2019-04-01

- look at a training set with 500 rejects / 500 acceptes
- look at a training set with 500 rejects / as many acceptes as we can 
- compare the precision on both 
- see if it is strongly correlated to the ratio of accept / reject papers  
- if not, think about proceeding  

In [78]:
import random

In [76]:
len(unique_rejected_clean_training), len(unique_accepted_clean_training_abstracts)

(590, 1179)

In [79]:
random.shuffle(unique_accepted_clean_training_abstracts)

In [138]:
unique_accepted_clean_training_abstracts_short = unique_accepted_clean_training_abstracts[0:590]

In [139]:
len(unique_accepted_clean_training_abstracts_short)

590

In [140]:
# Combine our data into two lists, one with abstracts, and the other with the labels. 
labels_clean_training = []
abstracts_clean_training = [] 
for abstract in unique_accepted_clean_training_abstracts_short:
    abstracts_clean_training.append(abstract)
    labels_clean_training.append("selected")
for abstract in unique_rejected_clean_training:
    abstracts_clean_training.append(abstract)
    labels_clean_training.append("rejected")

In [141]:
# create the data frame
trainCleanBalancedDF = pandas.DataFrame()
trainCleanBalancedDF['text'] = abstracts_clean_training
trainCleanBalancedDF['label'] = labels_clean_training

In [142]:
trainCleanBalancedDF[0:3]

Unnamed: 0,text,label
0,Objective: To identify programmes involving th...,selected
1,Background: Return-to-play protocols describe ...,selected
2,Objectives: We investigated factors associated...,selected


In [143]:
# now we do the model creation

In [144]:
# Features and Labels
X = trainCleanBalancedDF['text']
ylabels = trainCleanBalancedDF['label']
X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.25, random_state=42)

In [145]:
def spacy_tokenizer(sentence):
    mytokens = parser(sentence)
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    mytokens = [ word for word in mytokens if word not in stopwords and word not in punctuations ]
    return mytokens

# show the core model

In [146]:
# Vectorization
vectorizer = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1)) 
classifier = LinearSVC()
# classifier = SVC(probability=True, class_weight="balanced",C=0.1, gamma= 0.01)
# classifier = SVC(probability=True, 
#                  class_weight="balanced",
#                  C=0.01, 
#                  gamma= 0.1
#                 )
# Using Tfidf
tfvectorizer = TfidfVectorizer(tokenizer = spacy_tokenizer)

In [147]:
# for out clearner, vectorize and classifier defined earlier, we are using the full abstract texxt here. 
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', tfvectorizer),
                 ('classifier', classifier)])

In [148]:
# fit the data
# Fit our data
pipe.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('cleaner', <__main__.predictors object at 0x1c66e2e3c8>), ('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ng...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [149]:
# get predictions
y_pred = pipe.predict(X_test)

In [128]:
# get pred_proba as we are using 
# classifier = SVC(probability=True, class_weight="balanced",C=0.1, gamma= 0.01) and not Linear SVC 
ye_pred_prob = pipe.predict_proba(X_test)

# Show how poor the model preforms 

In [150]:
# look at the preformacne of this model
metrics.confusion_matrix(y_pred, y_test), metrics.accuracy_score(y_test, y_pred)

(array([[83, 53],
        [65, 94]]), 0.6)