In [1]:
## Copyright 2020 IBM Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

### Set up the environment

In [18]:
import nltk

# Fetch the NLTK stop-word corpus so it is available locally before the pipeline runs.
nltk.download("stopwords")

Import the packages.

In [3]:
import os,sys
import pandas as pd
from utils.applier_utils import get_metadata
from utils.common_utils import get_docid
from applier import Applier



Download the pre-trained models. This may take a few minutes.

In [6]:
!wget https://wntrac-models-public.s3.us.cloud-object-storage.appdomain.cloud/wntrac_models.zip
!unzip wntrac_models.zip

--2020-09-24 13:51:10--  https://wntrac-models-public.s3.us.cloud-object-storage.appdomain.cloud/wntrac_models.zip
Resolving wntrac-models-public.s3.us.cloud-object-storage.appdomain.cloud (wntrac-models-public.s3.us.cloud-object-storage.appdomain.cloud)... 67.228.254.196
Connecting to wntrac-models-public.s3.us.cloud-object-storage.appdomain.cloud (wntrac-models-public.s3.us.cloud-object-storage.appdomain.cloud)|67.228.254.196|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 406394450 (388M) [application/zip]
Saving to: ‘wntrac_models.zip’


2020-09-24 13:55:25 (1.53 MB/s) - ‘wntrac_models.zip’ saved [406394450/406394450]

Archive:  wntrac_models.zip
   creating: wntrac_models/
  inflating: wntrac_models/.DS_Store  
   creating: __MACOSX/
   creating: __MACOSX/wntrac_models/
  inflating: __MACOSX/wntrac_models/._.DS_Store  
  inflating: wntrac_models/lr.sav    
   creating: wntrac_models/bert_model/
  inflating: wntrac_models/bert_model/training_args.bin.bkp  

### Execute the NLP pipeline
Set the paths to the input directory, models, and resources. If you have a GPU you can enable CUDA by setting `use_cuda = True`; this is optional and not required to run the pipeline.

In [4]:
# Locations consumed by the Applier below.
dir_path = "demo_data"          # input directory
model_path = "wntrac_models"    # models directory
resources_path = "resources"    # resources directory

# CUDA is optional; leave disabled to run without a GPU.
use_cuda = False

Create an instance of the Applier, which initializes the models used in the max-voting ensemble (BERT, Support Vector Machine, and Logistic Regression).

In [5]:
# Build the pipeline object from the configured paths; the log lines printed by this
# cell report the spaCy, BERT, SVM and LR models being loaded/initialized.
applier = Applier(dir_path, model_path, resources_path, use_cuda)

Loaded Spacy models.
Initialized BERT model
Initialized SVM model.
Initialized LR model.




Load an example preprocessed input file. This file is extracted using [MediaWikiAPI](https://www.mediawiki.org/wiki/Wikimedia_REST_API) from the corresponding 'doc_url', and segmented using [SpaCy](https://github.com/explosion/spacy-models/releases/tag/en_core_web_lg-2.2.0).

In [9]:
# Resolve the example document relative to the configured input directory (`dir_path`)
# rather than repeating the 'demo_data' literal, so the two cannot drift apart.
example_file = os.path.join(dir_path, 'preprocessed_doc_COVID-19_pandemic_in_New_York_(state).csv')

In [10]:
# Read the preprocessed, sentence-segmented document into a DataFrame.
example_file_df = pd.read_csv(example_file)
# Preview the first rows (one sentence per row, with offsets, citation, date and country).
example_file_df.head()

Unnamed: 0,sent_id,text,begin_offset,end_offset,citation_url,date,country,doc_url
0,aea72c78-d183-11ea-a597-b9c421b787e1_1,The first case of COVID-19 in the U.S. state o...,0,219,https://www.wsj.com/articles/first-case-of-cor...,2020-03-01,USA-NY,https://en.wikipedia.org/w/index.php?title=COV...
1,aea72c78-d183-11ea-a597-b9c421b787e1_2,"By April 10, New York had more confirmed cases...",220,375,https://www.cnbc.com/2020/04/10/new-york-state...,2020-04-10,USA-NY,https://en.wikipedia.org/w/index.php?title=COV...
2,aea72c78-d183-11ea-a597-b9c421b787e1_3,"As of , there have been 5.7 million tests, 412...",376,475,,2020-04-10,USA-NY,https://en.wikipedia.org/w/index.php?title=COV...
3,aea72c78-d183-11ea-a597-b9c421b787e1_4,New York had the highest number of confirmed c...,475,634,https://www.france24.com/en/20200722-californi...,2020-07-22,USA-NY,https://en.wikipedia.org/w/index.php?title=COV...
4,aea72c78-d183-11ea-a597-b9c421b787e1_5,"In May 2020, nearly one-fourth of known U.S. c...",635,740,,2020-05-29,USA-NY,https://en.wikipedia.org/w/index.php?title=COV...


Applying the model on the example input:

In [11]:
# Run the prediction step over the loaded sentences; the returned frame carries the
# per-model prediction columns plus 'type' and 'value' shown in the next preview.
# NOTE(review): the input name is rebound to the result, so re-running this cell feeds the
# already-annotated frame back into _predict_type — confirm that is safe.
example_file_df = applier._predict_type(example_file_df)

HBox(children=(FloatProgress(value=0.0, max=511.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=64.0), HTML(value='')))




  probs.append((torch.nn.functional.softmax(Variable(torch.from_numpy(raw_outputs[i])))).numpy())


Event Type Prediction complete, starting fine-grained event value prediction


The output below shows the prediction of each individual model in prediction_[MODEL_NAME], as well as the final (ensembled) event type in 'type'. The computed event value is given in the 'value' field.

In [12]:
# Preview the first rows again, now including the prediction and confidence columns.
example_file_df.head()

Unnamed: 0,sent_id,text,begin_offset,end_offset,citation_url,date,country,doc_url,prediction,prediction_BERT,...,prediction_LINEAR_SVM_ESTIMATOR,conf_LINEAR_SVM_ESTIMATOR,prediction_LOGISTIC_REGRESSION,conf_LOGISTIC_REGRESSION,type,level_of_confidence,value,wikified,level_of_enforcement,restriction
0,aea72c78-d183-11ea-a597-b9c421b787e1_1,The first case of COVID-19 in the U.S. state o...,0,219,https://www.wsj.com/articles/first-case-of-cor...,2020-03-01,USA-NY,https://en.wikipedia.org/w/index.php?title=COV...,8,unk,...,unk,0.516,unk,0.984,unk,0.759411,,U.S.|New York,,
1,aea72c78-d183-11ea-a597-b9c421b787e1_2,"By April 10, New York had more confirmed cases...",220,375,https://www.cnbc.com/2020/04/10/new-york-state...,2020-04-10,USA-NY,https://en.wikipedia.org/w/index.php?title=COV...,8,unk,...,unk,0.503,unk,0.997,unk,0.726589,,New York,,
2,aea72c78-d183-11ea-a597-b9c421b787e1_3,"As of , there have been 5.7 million tests, 412...",376,475,,2020-04-10,USA-NY,https://en.wikipedia.org/w/index.php?title=COV...,8,unk,...,unk,0.517,unk,0.746,unk,0.735538,,,,
3,aea72c78-d183-11ea-a597-b9c421b787e1_4,New York had the highest number of confirmed c...,475,634,https://www.france24.com/en/20200722-californi...,2020-07-22,USA-NY,https://en.wikipedia.org/w/index.php?title=COV...,8,unk,...,unk,0.305,misc,0.586,unk,0.564722,,New York|U.S.|California|Florida,,
4,aea72c78-d183-11ea-a597-b9c421b787e1_5,"In May 2020, nearly one-fourth of known U.S. c...",635,740,,2020-05-29,USA-NY,https://en.wikipedia.org/w/index.php?title=COV...,8,unk,...,unk,0.301,unk,0.94,unk,0.691224,,U.S.|New York,,


Produce the WNTRAC data record: set the crawl_id, the crawl_date, and a unique id (evid_id) for each NPI evidence.

In [13]:
# Derive the output file name from the document id of the first row's source URL.
doc_url = example_file_df['doc_url'].iloc[0]
doc_id = get_docid(doc_url)
outpath = os.path.join(applier.dir_path, 'nlp_doc_' + str(doc_id) + ".csv")

# Stamp every row with the crawl metadata returned for the input directory.
crawl_id, crawl_date = get_metadata(applier.dir_path)
example_file_df['crawl_id'] = crawl_id
example_file_df['crawl_date'] = crawl_date

# One generated evidence id per row. The generator takes no arguments, so a plain
# comprehension is used instead of a row-wise DataFrame.apply that would build and
# discard a Series for every row.
example_file_df['evid_id'] = [applier._generate_evid_id() for _ in range(len(example_file_df))]

# NOTE(review): presumably filters out rows whose event type should not be exported —
# confirm against Applier._remove_other_types.
example_file_df = applier._remove_other_types(example_file_df)

# Expose the 'wikified' column under its output name.
example_file_df = example_file_df.rename(columns={'wikified': 'fine_grained_location'})

# Columns (and their order) written to the output CSV.
headers = [
    'evid_id', 'sent_id', 'doc_url', 'crawl_id', 'crawl_date',
    'text', 'begin_offset', 'end_offset', 'citation_url',
    'type', 'country', 'date', 'value',
    'level_of_confidence', 'level_of_enforcement', 'restriction',
    'fine_grained_location',
]

Write the output file.

In [14]:
# Persist only the record columns listed in `headers`, without the DataFrame index.
example_file_df.to_csv(
    outpath,
    columns=headers,
    index=False,
    encoding='utf-8',
)
print("Dumping output in ", outpath)

Dumping output in  demo_data/nlp_doc_COVID-19_pandemic_in_New_York_(state).csv


#### An example snippet of a cleaned-up output file:

In [15]:
# Keep just the exported columns, in output order.
output = example_file_df.loc[:, headers]

In [16]:
# Show the first rows whose predicted type is anything other than 'unk'.
output.loc[output['type'].ne('unk')].head()

Unnamed: 0,evid_id,sent_id,doc_url,crawl_id,crawl_date,text,begin_offset,end_offset,citation_url,type,country,date,value,level_of_confidence,level_of_enforcement,restriction,fine_grained_location
12,ba56fe49-80fe-4cc3-a036-b94d62bb03cc,aea72c78-d183-11ea-a597-b9c421b787e1_13,https://en.wikipedia.org/w/index.php?title=COV...,2020-07-29_eba153f2-d182-11ea-ae45-0b95ea282be6,2020-07-29,Americans visiting Italy in late February and ...,1636,1864,,misc,USA-NY,2020-03-01,,0.701336,,,Americans|Italy|New York|Italy|The State Depar...
17,f3c951b0-020a-48ce-9a97-717738789549,aea72c78-d183-11ea-a597-b9c421b787e1_18,https://en.wikipedia.org/w/index.php?title=COV...,2020-07-29_eba153f2-d182-11ea-ae45-0b95ea282be6,2020-07-29,She went into home isolation with her husband.,2458,2506,https://edition.cnn.com/2020/03/02/us/new-york...,confinement,USA-NY,2020-03-01,,0.827619,,,
19,8cc07ab6-b6c6-4bf7-b84b-a279d1b3fc6f,aea72c78-d183-11ea-a597-b9c421b787e1_20,https://en.wikipedia.org/w/index.php?title=COV...,2020-07-29_eba153f2-d182-11ea-ae45-0b95ea282be6,2020-07-29,"He had traveled to Miami in February, but had ...",2728,2849,,misc,USA-NY,2020-02-29,,0.672912,,,Miami
29,5c69f6a7-a50e-4ed9-8e67-955b107d6329,aea72c78-d183-11ea-a597-b9c421b787e1_30,https://en.wikipedia.org/w/index.php?title=COV...,2020-07-29_eba153f2-d182-11ea-ae45-0b95ea282be6,2020-07-29,"On March 7, Governor Andrew Cuomo declared a s...",4238,4435,https://www.nytimes.com/2020/03/07/nyregion/co...,misc,USA-NY,2020-03-07,,0.633483,,,New York|Westchester County|New York City
31,429181c6-62be-4813-bb6b-a2d0c2886d45,aea72c78-d183-11ea-a597-b9c421b787e1_32,https://en.wikipedia.org/w/index.php?title=COV...,2020-07-29_eba153f2-d182-11ea-ae45-0b95ea282be6,2020-07-29,New York City issued new commuter guidelines a...,4525,4722,https://www.nbcnewyork.com/news/local/nyc-issu...,misc,USA-NY,2020-03-08,,0.671598,,,New York City
