In [1]:
import nodule_parser
import nodule_learner
import nodule_extractor
import imp
imp.reload(nodule_parser)
imp.reload(nodule_learner)
imp.reload(nodule_extractor)
import pandas as pd
import json

import re
import numpy as np

nl = nodule_learner.NoduleLearner(cache_size=500)

## NLP Nodule Learner

### Description
This repository contains a supervised classification model for predicting whether a patient has a pulmonary nodule based on the doctor's comments in a report about the patient in question. For patients predicted to have a pulmonary nodule, the size and location of that nodule will also be extracted.

### Usage

To use, first create an instance of the `NoduleLearner` class, then call the `transform_predict` instance method on a file containing the doctor's report from which to make the prediction, as follows:

```python
nl = NoduleLearner()
prediction = nl.transform_predict(file_name)
```

The `transform_predict()` method by default doesn't actually make predictions, but rather transforms the data and caches it, later making the actual predictions once the `cache_size` is reached (default size is 500, but this can be set when instantiating the `NoduleLearner` class). If the learner finishes looping through the files and there are still some left to predict, simply call `nl.dump_predictions()` (you'll stil need to set the `probability` and `vetting` arguments) to make the predictions on the rest of the records left in the cache.

There are three arguments to `transform_predict()`, the latter two of which default to `False`:

* `file` (`str`): the file on which to perform the predictions
* `probability` (`bool`): whether or not to output predictions for the nodule classifications
* `vetting` (`bool`): whether to output data needed to vet the extractions (more on that below)

When not vetting the extractions (`vetting=False`), the model outputs the following data for each record:

* `directory (str)`: the folder from which the file came (can be useful for traceability)
* `filename (str)`: the filename of the file, minus the extension
* `max_nodule_change (float)`: if there is a previous record for the patient which also contains a nodule size, this is the difference in size of the largest nodule seen in the patient record
* `max_nodule_location (str)`: the location of the largest nodule extracted; one of left upper lobe, lingula, left lower lobe, right upper lobe, right middle lobe, right lower lobe, or '' (indicating no location was found)
* `max_nodule_lung (str)`: the lung in which the largest extracted nodule was located; one of left, right, or ''
* `max_nodule_size (str)`: the size of the largest nodule, in millimeters
* `evidence (str)`: the phrases extracted that contained terms indicating the possible presence of nodules, from which the largest nodule was extracted, if there was one
* `pid (str)`: the patient ID for the record in question
* `prediction (int)`: the prediction for whether or not the report indicates the presence of a pulmonary nodule; either 0 (negative) or 1 (positive)
* `probability (float)`: (if `probability=True`) the model's confidence in the prediction, on 0 (least confident) to 1 (most confident) scale
* `report_date`: the date of the report in question (used for tracking the growth of the largest nodule)
* `prev_max_date`: the date of the previous largest nodule (used for tracking the growth of the largest nodule)

If `vetting` is set to `True`, the following fields will also be present in the output:

* `is_nodule (int)`: whether or not the phrase in question contains evidence of a pulmonary nodule, 0 for negative, 1 for positive
* `truth_marking (int)` : whether or not the record in question was vetted to contain evidence of a pulmonary nodule, 0 for negatively labeled, 1 for positively labeled, all records without truth markings are set to -1
* `nodule_location (str)`: the extracted location of the nodule in the phrase in question (same parameters as for `max_nodule_location` above)
* `nodule_lung (str)`: the extracted lung of the nodule in the phrase in question
* `nodule_size (str)`: the extracted size of the nodule, in string form (i.e. "5 mm", etc.)
* `nodule_size_numeric (float)`: the extracted size of the nodule, in float form (same as for `max_nodule_size` above)
* `phrase_counter (int)`: an integer value to differentiate the phrases pulled out of a record that contain a term for a nodule

Each of the values above, minus the `phrase_counter`, must be vetted in order to measure and improve the performance of the nodule extractor (the part that identifies size, location, and lung of the largest nodule).

The output will be in the form of a JSON array.

Note: these are output in an array since some files contain more than one patient record within the file.

In [2]:
import traceback
import warnings
import sys

def warn_with_traceback(message, category, filename, lineno, file=None, line=None):

    log = file if hasattr(file,'write') else sys.stderr
    traceback.print_stack(file=log)
    log.write(warnings.formatwarning(message, category, filename, lineno, line))

warnings.showwarning = warn_with_traceback

In [3]:
from datetime import datetime
import os

# top = "../data-20180319/"

top = "../data-20180629/"

predictions = []
counter = 0

start_time = datetime.now()
# for folder in os.listdir(top)[2:4]:
for folder in os.listdir(top):
    print("reading in data from %s" % folder)
    for file in sorted(os.listdir(top + folder)):
        predictions += json.loads(
            nl.transform_predict(
                file=top + folder + "/" + file,
                probability=True,
                vetting=True))
        if counter >= 1000:
            break
        counter += 1   
        
predictions_df = pd.DataFrame(predictions)
print(len(predictions_df))
print((datetime.now() - start_time).seconds)
predictions_df.head()

reading in data from phHealth
reading in data from SCL_powerscribe
reading in data from Riaco
reading in data from MUSC
reading in data from Community
reading in data from CCF
reading in data from HCA
reading in data from Haymed
reading in data from SCL
1221
19


Unnamed: 0,directory,evidence,filename,is_nodule,max_nodule_change,max_nodule_location,max_nodule_lung,max_nodule_size,nodule_location,nodule_lung,nodule_phrase,nodule_size,nodule_size_numeric,phrase_counter,pid,prediction,prev_max_date,probability,report_date,truth_marking
0,.._data-20180629_phHealth,,ORU_ex1,0,0.0,,,0.0,,,,,0.0,0,922907,0,,0.988,20171215,
1,.._data-20180629_phHealth,,ORU_ex2,0,0.0,,,0.0,,,,,0.0,0,922907,0,,0.988,20171130,
2,.._data-20180629_phHealth,,Q297438865T300837729_20171220193617056,0,0.0,,,0.0,,,,,0.0,0,922907,0,,0.975,20171130,
3,.._data-20180629_phHealth,,Q297438870T300837734_20171220193617155,0,0.0,,,0.0,,,,,0.0,0,922907,0,,0.988,20171130,
4,.._data-20180629_phHealth,,Q297439723T300838670_20171220193617254,0,0.0,,,0.0,,,,,0.0,0,922907,0,,0.998,20171201,


# count by dir of records and unique patiants with nodule size

In [5]:
predictions_df[predictions_df.nodule_size != ''].groupby(['directory'])\
    .agg({'nodule_size':'count',
         'pid': lambda x: len(list(set(x)) ) }  )

Unnamed: 0_level_0,nodule_size,pid
directory,Unnamed: 1_level_1,Unnamed: 2_level_1
.._data-20180629_phHealth,51,17


# count by dir of records and unique patiants with nodule location

In [6]:
predictions_df[predictions_df.nodule_location != ''].groupby(['directory'])\
    .agg({'nodule_location':'count',
         'pid': lambda x: len(list(set(x)) ) }  )

Unnamed: 0_level_0,nodule_location,pid
directory,Unnamed: 1_level_1,Unnamed: 2_level_1
.._data-20180629_phHealth,34,13


In [7]:
# breakdown of location counts
predictions_df[predictions_df.nodule_location != ''].groupby(['directory','nodule_location'])\
    .agg({'filename':'count',
         'pid': lambda x: len(list(set(x)) ) }  )

Unnamed: 0_level_0,Unnamed: 1_level_0,filename,pid
directory,nodule_location,Unnamed: 2_level_1,Unnamed: 3_level_1
.._data-20180629_phHealth,left lower lobe,10,6
.._data-20180629_phHealth,left upper lobe,14,6
.._data-20180629_phHealth,lingula,1,1
.._data-20180629_phHealth,right lower lobe,2,2
.._data-20180629_phHealth,right middle lobe,3,2
.._data-20180629_phHealth,right upper lobe,4,2


In [8]:
# number of 'Truth' rows
predictions_df[predictions_df.evidence !=""].shape

(317, 20)

In [9]:
# count of patiants where growth was recorded and details
growth_huh = predictions_df[predictions_df.nodule_size_numeric > 0.0].groupby('pid')\
    .agg( {'nodule_size_numeric': 'unique' ,
        'max_nodule_size': 'unique',
          'nodule_size': ['min', 'max'],
          } ).reset_index()
growth_huh['sizehuh'] =growth_huh.nodule_size_numeric.unique.apply(lambda x : len(set(x) ))
growth_huh = growth_huh[growth_huh['sizehuh'] >1]
growth_huh

Unnamed: 0_level_0,pid,nodule_size_numeric,max_nodule_size,nodule_size,nodule_size,sizehuh
Unnamed: 0_level_1,Unnamed: 1_level_1,unique,unique,min,max,Unnamed: 6_level_1
2,320561,"[13.0, 35.0]",[35.0],1.3 cm,3.5 cm,2
5,333507,"[3.0, 5.0]",[5.0],0.3 cm,0.5 cm,2
6,333800,"[40.0, 20.0, 10.0, 9.0, 19.0, 26.0]","[40.0, 0.0]",0.9 cm,4 cm,6
11,423447,"[6.0, 12.0, 5.0, 3.0]",[12.0],12 mm,6 mm,4
13,450397,"[5.0, 6.0]",[6.0],5 mm,6 mm,2
15,514715,"[17.0, 5.0, 4.0, 6.0]",[17.0],17 mm,6 mm,4
16,544895,"[23.0, 53.0]",[53.0],2.3 cm,5.3 cm,2


# number of patiants with growth

In [10]:

growth_huh.shape

(7, 6)

# calculate model accuracy and phrase accuracy
## records positively labeled
### truth_marking of 1 means there was a match on "qb_y" or "[lngnod]" or "afnrzga19b"
## records labeld as false
### truth_marking of 0 means there was a match on "qb_n"
### truth_marking of -1 for all other records with 'is_nodule', but no validation

In [28]:
from IPython.display import display, HTML
# basic counts
records_marked_true = len(predictions_df[predictions_df['truth_marking'] == 1])
records_marked_false = len(predictions_df[predictions_df['truth_marking'] == 0])
records_with_vetting = len(predictions_df[(predictions_df['truth_marking'] == 1) | (predictions_df['truth_marking'] == 0)]) 
records_with_phrases = len(predictions_df[predictions_df['is_nodule'] == 1]) 
perc_records_marked_true = round(records_marked_true/records_with_phrases,5) *100
perc_records_marked_false = round(records_marked_false/records_with_phrases,5) *100

# model eval from vetted
true_pos = len(predictions_df[(predictions_df['prediction']== 1) & ( predictions_df['truth_marking'] ==1)])
true_neg = len(predictions_df[(predictions_df['prediction']== 0) & ( predictions_df['truth_marking'] == 0)])
false_pos = len(predictions_df[(predictions_df['prediction']== 0) & ( predictions_df['truth_marking'] ==1)])
false_neg = len(predictions_df[(predictions_df['prediction']== 1) & ( predictions_df['truth_marking'] ==0)])

# model eval from phrases
true_pos_phrase = len(predictions_df[(predictions_df['prediction']== 1) & ( predictions_df['is_nodule'] ==1)])
false_pos_phrase = len(predictions_df[(predictions_df['prediction']== 0) & ( predictions_df['is_nodule'] == 1)])
# make basic truth table
truthiness_df = pd.DataFrame([
    {"Discriminator": "Vetted True Positives", "count":true_pos, "percent of vetted/phrases":round(true_pos/records_with_vetting,5) *100},
    {"Discriminator": "Vetted True Negative", "count":true_neg, "percent of vetted/phrases":round(true_neg/records_with_vetting,5) *100},
    {"Discriminator": "Vetted False Positives", "count":false_pos, "percent of vetted/phrases":round(false_pos/records_with_vetting,5) *100},
    {"Discriminator": "Vetted False Negative", "count":false_neg, "percent of vetted/phrases":round(false_neg/records_with_vetting,5) *100},
    {"Discriminator": "Phrase based True Positives", "count":true_pos_phrase, "percent of vetted/phrases":round(true_pos_phrase/records_with_phrases,5) *100},
    {"Discriminator": "Phrase based False Positives", "count":false_pos_phrase, "percent of vetted/phrases":round(false_pos_phrase/records_with_phrases,5) *100},
])
print("""! Information below based off of vetted data and extracted phrases ONLY !
%s (%s%% of phrases) records vetted as TRUE
%s (%s%% of phrases)records marked as FALSE
of %s records containing evidence of a pulmonary nodule
""" % (records_marked_true,
                                                                perc_records_marked_true,
                                                                records_marked_false,
                                                                perc_records_marked_false,
                                                                records_with_phrases) ) # is a nodule

display(truthiness_df)

! Information below based off of vetted data and extracted phrases ONLY !
19 (5.994% of phrases) records vetted as TRUE
0 (0.0% of phrases)records marked as FALSE
of 317 records containing evidence of a pulmonary nodule



Unnamed: 0,Discriminator,count,percent of vetted/phrases
0,Vetted True Positives,19,100.0
1,Vetted True Negative,0,0.0
2,Vetted False Positives,0,0.0
3,Vetted False Negative,0,0.0
4,Phrase based True Positives,177,55.836
5,Phrase based False Positives,140,44.164


# Classification Reports

In [52]:
from sklearn.metrics import classification_report
predictions_df.truth_marking = predictions_df[~pd.isnull(predictions_df.truth_marking) ]['truth_marking'].astype(int)
print("Classification Report based off of Vetted data")
original_report = classification_report(predictions_df[~pd.isnull(predictions_df.truth_marking) ]['prediction'], 
                        predictions_df[~pd.isnull(predictions_df.truth_marking) ]['truth_marking'] )
print(original_report)
print()
print("Classification Report based off of phrases")
original_report = classification_report(predictions_df['prediction'], predictions_df['is_nodule'])
print(original_report)

Classification Report based off of Vetted data
             precision    recall  f1-score   support

       -1.0       0.00      0.00      0.00         0
        0.0       0.00      0.00      0.00       140
        1.0       1.00      0.11      0.19       177

avg / total       0.56      0.06      0.11       317


Classification Report based off of phrases
             precision    recall  f1-score   support

          0       0.95      0.86      0.90      1002
          1       0.56      0.81      0.66       219

avg / total       0.88      0.85      0.86      1221



  File "/home/medintel/anaconda3/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/medintel/anaconda3/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/medintel/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/medintel/anaconda3/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/medintel/anaconda3/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 478, in start
    self.io_loop.start()
  File "/home/medintel/anaconda3/lib/python3.6/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/home/medintel/anaconda3/lib/python3.6/site-packages/tornado/ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "/home/medintel/anaconda3/lib/python3.6/site-packages/tornado/stack_context.py", line 27

In [35]:
predictions_df.dtypes

directory               object
evidence                object
filename                object
is_nodule                int64
max_nodule_change      float64
max_nodule_location     object
max_nodule_lung         object
max_nodule_size        float64
nodule_location         object
nodule_lung             object
nodule_phrase           object
nodule_size             object
nodule_size_numeric    float64
phrase_counter           int64
pid                     object
prediction               int64
prev_max_date           object
probability            float64
report_date             object
truth_marking          float64
dtype: object

In [15]:
predictions_df['nodule_phrase'].value_counts()

                                                                                                                                                                                                                                                                                                                                                                                                                              904
ct of the pelvis: no pelvic mass or lymphadenopathy is present                                                                                                                                                                                                                                                                                                                                                                 20
no pancreatic mass or inflammatory process is present                                                                                                               

In [16]:
predictions_df.query("prediction == 1")['max_nodule_size'].value_counts()

0.0     128
5.0      14
13.0     14
1.0      12
3.0      10
40.0      9
12.0      7
35.0      6
17.0      6
6.0       6
53.0      5
60.0      2
Name: max_nodule_size, dtype: int64

In [9]:
predictions_df.query("max_nodule_change != 0")# & prev_max_date != report_date")

Unnamed: 0,directory,evidence,filename,is_nodule,max_nodule_change,max_nodule_location,max_nodule_lung,max_nodule_size,nodule_location,nodule_lung,nodule_phrase,nodule_size,nodule_size_numeric,phrase_counter,pid,prediction,prev_max_date,probability,report_date


In [22]:
patient_data = pd.read_sql("SELECT * FROM patient_data", nl.conn)
patient_data

Unnamed: 0,index,directory,filename,max_nodule_size,pid,report_date
0,0,.._data-20180629_phHealth,Q334115231T337641482_20180214163114599,13.0,000333972,20180214
1,0,.._data-20180629_phHealth,Q334115231T337641482_20180214163114599,13.0,000333972,20180214
2,0,.._data-20180629_phHealth,Q334115231T337641482_20180214163114599,13.0,000333972,20180214
3,0,.._data-20180629_phHealth,Q334115231T337641482_20180214163114599,13.0,000333972,20180214
4,0,.._data-20180629_phHealth,Q334115231T337641482_20180214163114599,13.0,000333972,20180214
5,0,.._data-20180629_phHealth,Q334115231T337641482_20180214163114599,13.0,000333972,20180214
6,0,.._data-20180629_phHealth,Q334115231T337641482_20180214163114599,13.0,000333972,20180214
7,0,.._data-20180629_phHealth,Q334115231T337641482_20180214163114708,13.0,000333972,20180214
8,0,.._data-20180629_phHealth,Q334115231T337641482_20180214163114708,13.0,000333972,20180214
9,0,.._data-20180629_phHealth,Q334115231T337641482_20180214163114708,13.0,000333972,20180214


In [180]:
len(patient_data['pid'].unique())

24

In [24]:
fields = ['directory', 'filename', 'pid', 'max_nodule_size', 'max_nodule_location',
          'max_nodule_lung', 'report_date']

In [15]:
vetted_data = pd.read_csv("../20180326_data_vetted.csv").drop('_id', axis=1)
vetted_data.head()

Unnamed: 0,id,directory,text,label,predicted,probability,vetted_label
0,165484_20180308155433029,musc,hx of ovarian cancer SBO now with fever INDICA...,-1,1,0.960047,1.0
1,CP.ITS.180206.109_20180206152537624,community,Community Hospital Imaging Center 2351 G Road ...,-1,1,0.99,1.0
2,CP.ITS.180308.425_20180308202001780,community,Community Hospital Imaging Center 2351 G Road ...,-1,1,0.68,0.0
3,163855_20180307185325131,musc,Evaluate disease INDICATION Evaluate disease G...,-1,1,1.0,1.0
4,170932_20180312155901258,musc,as above pulm nodules history of nodules INDIC...,-1,1,0.612092,0.0


In [13]:
predictions_df.head()

Unnamed: 0,directory,evidence,filename,is_nodule,max_nodule_change,max_nodule_location,max_nodule_lung,max_nodule_size,nodule_location,nodule_lung,nodule_phrase,nodule_size,nodule_size_numeric,phrase_counter,pid,prediction,prev_max_date,probability,report_date
0,.._data-20180629_phHealth,,ORU_ex1,0,0.0,,,0.0,,,,,0.0,0,922907,0,,0.988,20171215
1,.._data-20180629_phHealth,,ORU_ex2,0,0.0,,,0.0,,,,,0.0,0,922907,0,,0.988,20171130
2,.._data-20180629_phHealth,,Q297438865T300837729_20171220193617056,0,0.0,,,0.0,,,,,0.0,0,922907,0,,0.975,20171130
3,.._data-20180629_phHealth,,Q297438870T300837734_20171220193617155,0,0.0,,,0.0,,,,,0.0,0,922907,0,,0.988,20171130
4,.._data-20180629_phHealth,,Q297439723T300838670_20171220193617254,0,0.0,,,0.0,,,,,0.0,0,922907,0,,0.998,20171201


## Get Previously-Vetted Data

In [17]:
prev_vetted_labels.head()

NameError: name 'prev_vetted_labels' is not defined

In [14]:
prev_vetted_labels = pd.read_csv("../vetted_labels.csv").drop('Unnamed: 0', axis=1)

subdirs = ["ParsedORUFiles", "Haymed", "powerscribe", "SCL"]

prev_vetted_records = []
for directory in [x.replace("(new)","-new") for x in prev_vetted_labels['directory'].unique()]:
    files_to_parse = prev_vetted_labels.query("directory == '{}'".format(directory))['id'].values
    subdirs = {'CCF': 'ParsedORUFiles', 'SCL': 'powerscribe', 'SCL-new': 'SCL',
              'Haymed': 'ParsedORUFiles', 'Haymed-new': 'Haymed'}
    joined_dir = (directory + "/" + subdirs.get(directory, "") + "/").replace("//", "/")
    for file in files_to_parse:
        full_path = "../" + joined_dir + file + ".txt"
        try:
            record = parser.extract_text(full_path)
            for row in record:
                row.update({'directory': directory.replace("-new", "(new)")})
                prev_vetted_records.append(row)
        except:
            continue
            
prev_vetted_records_df = pd.DataFrame(prev_vetted_records)  

prev_vetted_records_df.head()

In [20]:
prev_vetted = prev_vetted_labels.merge(prev_vetted_records_df, left_on=['id', 'directory'],
                                       right_on=['filename', 'directory'])

KeyError: 'filename'

In [21]:
prev_vetted[(prev_vetted.text.str.contains(r"(no pulmonary nodules)|(no suspicious pulmonary nodules)",
                                          flags=re.IGNORECASE))]['joined_label'].value_counts()

NameError: name 'prev_vetted' is not defined