In [1]:
import pandas as pd
data = pd.read_excel("~/Downloads/deidentified_radiology_findings.xlsx")
# clean up text
data.replace(to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"], value=["",""], regex=True, inplace=True)

eval_data = data[pd.notnull(data["cxr_abnormal"])]
# Remove any unlabeled data with no findings text
unlabeled_data = data.query("cxr_abnormal.isnull() & findings.notnull()")

In [2]:
print(len(eval_data))
print(len(unlabeled_data))

124
1760


In [3]:
eval_df = pd.DataFrame({
    "note_num": eval_data["note_num"],
    "label": [int(val =="Yes") for val in eval_data["cxr_abnormal"]],
    "text": eval_data["findings"]
})

In [4]:
eval_df.query('text.str.contains("infiltrate")')

Unnamed: 0,note_num,label,text
77,78,1,frontal portable view reveals a small area of ...


In [5]:
train_df = pd.DataFrame({
    "note_num": unlabeled_data["note_num"],
    "label": -1,
    "text": unlabeled_data["findings"],
    "lung_startpos": unlabeled_data["findings"].str.find("lungs:"),
    "pleural1_startpos": unlabeled_data["findings"].str.find("pleura:"),
    "pleural2_startpos": unlabeled_data["findings"].str.find("pleura/pleural space:"),
    "pleural3_startpos": unlabeled_data["findings"].str.find("pleura and pleural surfaces:"),
    "pleural4_startpos": unlabeled_data["findings"].str.find("lungs/pleural space:"),
    "pleural5_startpos": unlabeled_data["findings"].str.find("lungs/pleura/pleural space:"),
    "pleural6_startpos": unlabeled_data["findings"].str.find("lungs/pleura:"),
    "heart1_startpos": unlabeled_data["findings"].str.find("heart:"),
    "heart2_startpos": unlabeled_data["findings"].str.find("heart and mediastinum:"),
    "airway1_startpos": unlabeled_data["findings"].str.find("central airways:"),
    "thorax1_startpos": unlabeled_data["findings"].str.find("bones and soft tissues of the thorax:"),
    "thorax2_startpos": unlabeled_data["findings"].str.find("bones of the thorax:"),
    "bones1_startpos": unlabeled_data["findings"].str.find("bones and soft tissues:")
})

In [6]:
train_df[train_df["note_num"] == 743]

Unnamed: 0,note_num,label,text,lung_startpos,pleural1_startpos,pleural2_startpos,pleural3_startpos,pleural4_startpos,pleural5_startpos,pleural6_startpos,heart1_startpos,heart2_startpos,airway1_startpos,thorax1_startpos,thorax2_startpos,bones1_startpos
742,743,-1,tubes and catheters: none seen. central airway...,158,-1,-1,284,-1,-1,-1,-1,63,32,334,-1,-1


In [7]:
def parse_lung_text(row):
    row_data = row.to_dict()
    start_pos = row_data["lung_startpos"]
    if start_pos > -1:
        pos = [val for key, val in row_data.items() if key.endswith("_startpos") and val > start_pos]
        #if len(pos) == 0:
        #    print(row_data)
        end_pos = int(min(pos))
        start_pos = int(start_pos)
        return row.text[start_pos:end_pos]
    else:
        return ""

def parse_pleura_text(row):
    row_data = row.to_dict()
    pos = [val for key, val in row_data.items() if key.startswith("pleural") and val > -1]
    if len(pos) > 0:
        start_pos = int(min(pos))
        end_pos = len(row.text)
        pos = [val for key, val in row_data.items() if key.endswith("_startpos") and not key.startswith("pleural") and val > start_pos]
        if len(pos) > 0:
            end_pos = int(min(pos))
        #else:
        #    print(row_data)
        return row.text[start_pos:end_pos]
    else:
        return ""

def parse_lungpleura_text(row):
    if row.lungpleura_startpos > -1:
        return row.text[int(row.lungpleura_startpos):int(row.heart_startpos)]
    else:
        return ""

train_df["lung_text"] = train_df.apply(parse_lung_text, axis = 1)
train_df["pleura_text"] = train_df.apply(parse_pleura_text, axis = 1)
#train_df["lungpleura_text"] = train_df.apply(parse_lungpleura_text, axis = 1)

In [8]:
train_df

Unnamed: 0,note_num,label,text,lung_startpos,pleural1_startpos,pleural2_startpos,pleural3_startpos,pleural4_startpos,pleural5_startpos,pleural6_startpos,heart1_startpos,heart2_startpos,airway1_startpos,thorax1_startpos,thorax2_startpos,bones1_startpos,lung_text,pleura_text
124,125,-1,tubes and catheters: none. central airways: no...,-1,-1,58,-1,-1,52,-1,-1,285,27,-1,-1,-1,,lungs/pleura/pleural space: interstitial promi...
125,126,-1,tubes and catheters: none. central airways: no...,52,-1,159,-1,-1,-1,-1,-1,189,27,-1,-1,-1,lungs: bilateral lower lobe infiltrates on thi...,pleura/pleural space: normal.
126,127,-1,tubes and catheters: none. lungs/pleura/pleura...,-1,-1,33,-1,-1,27,-1,-1,97,-1,-1,-1,-1,,lungs/pleura/pleural space: normal lung volume...
127,128,-1,tubes and catheters: none. central airways: no...,-1,58,-1,-1,-1,-1,52,-1,162,27,-1,-1,-1,,lungs/pleura: bibasilar patchy airspace opacit...
128,129,-1,tubes and catheters: none. central airways: no...,-1,-1,58,-1,-1,52,-1,-1,139,27,-1,-1,-1,,lungs/pleura/pleural space: increasing airspac...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1889,1890,-1,tubes and catheters: none. central airways: no...,52,-1,160,-1,-1,-1,-1,-1,190,27,-1,-1,-1,lungs: hypoaeration. increased interstitial ma...,pleura/pleural space: normal.
1890,1891,-1,tubes and catheters: none. central airways: no...,-1,-1,58,-1,-1,52,-1,-1,190,27,-1,-1,-1,,lungs/pleura/pleural space: low lung volumes w...
1891,1892,-1,tubes and catheters: none. central airways: no...,52,-1,107,-1,-1,-1,-1,-1,137,27,-1,-1,-1,lungs: diffuse patchy opacities within the rig...,pleura/pleural space: normal.
1892,1893,-1,tubes and catheters: none. central airways: no...,-1,-1,58,-1,-1,52,-1,-1,169,27,-1,-1,-1,,lungs/pleura/pleural space: significant increa...


In [9]:
train_df[ [not text.startswith("tube") for text in train_df["text"].astype(str).values.tolist()] ]

Unnamed: 0,note_num,label,text,lung_startpos,pleural1_startpos,pleural2_startpos,pleural3_startpos,pleural4_startpos,pleural5_startpos,pleural6_startpos,heart1_startpos,heart2_startpos,airway1_startpos,thorax1_startpos,thorax2_startpos,bones1_startpos,lung_text,pleura_text
130,131,-1,the lungs are clear. there are calcified granu...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,,
178,179,-1,frontal portable view of the chest reveals no ...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,,
183,184,-1,there are patchy airspace infiltrates within t...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,,
188,189,-1,frontal view of the chest reveals no evidence ...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,,
236,237,-1,"the lungs are hypoventilated. there is mild, p...",-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1795,1796,-1,multifocal airspace consolidations are noted i...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,,
1797,1798,-1,the cardiomediastinal silhouette is within nor...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,,
1862,1863,-1,patient rotation degrades image quality and de...,211,-1,445,-1,-1,-1,-1,-1,506,113,-1,-1,-1,lungs: opacity along the border of the right m...,pleura/pleural space: biapical pleural-parench...
1886,1887,-1,post tavi. post median sternotomy and mediasti...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,,


In [10]:
#train_df.query('lung_text == "" & not pleura_text.str.startswith("lung")')
#train_df.query('lung_text == "" & pleura_text == ""')
#train_df.query('pleura_text == ""')
train_df.query('lung_text == "" & pleura_text.str.startswith("lung") & pleura_text.str.contains("normal\.")')["pleura_text"].astype(str).values.tolist()

['lungs/pleura/pleural space: normal. ',
 'lungs/pleura/pleural space: normal. ',
 'lungs/pleura/pleural space: normal. ',
 'lungs/pleura/pleural space: normal. ',
 'lungs/pleura/pleural space: normal. ',
 'lungs/pleura/pleural space: normal. ',
 'lungs/pleura/pleural space: normal. ',
 'lungs/pleura/pleural space: normal. no focal consolidative changes. ',
 'lungs/pleura/pleural space: normal. ',
 'lungs/pleura/pleural space: normal. ',
 'lungs/pleura/pleural space: normal. ',
 'lungs/pleura/pleural space: normal. ',
 'lungs/pleura/pleural space: normal. ',
 'lungs/pleura/pleural space: normal. ',
 'lungs/pleura/pleural space: normal. skin folds project over the left lung base laterally. ',
 'lungs/pleura/pleural space: normal. ']

In [11]:
[s for s in train_df[train_df["lung_text"].str.len() > 0]["lung_text"].astype(str).values.tolist() if "interstitial thickening" in s]
#interstitial #airspace

['lungs: lungs are hypoexpanded. mild/moderate interstitial thickening involving the bilateral middle and lower lobe middle and lower lung zones. no focal opacities or consolidations. ',
 'lungs: diffuse hazy pulmonary opacities and mild interstitial thickening. left base hypoaeration. ',
 'lungs: interval development of patchy opacities within the left lower lobe. stable coarse interstitial thickening and biapical pleural parenchymal scarring. ']

In [12]:
[s for s in train_df[train_df["pleura_text"].str.len() > 0]["pleura_text"].astype(str).values.tolist() if "interstitial thickening" in s]

['lungs/pleura/pleural space: mild interstitial thickening as can be seen in the setting of edema. ']

In [13]:
[s for s in train_df.query('lung_text == "" & pleura_text == ""')["text"].astype(str).values.tolist() if "interstitial thickening" in s]

[]

In [14]:
def word_count(text):
    split_pos = text.find(":")
    return len(text[split_pos+1:].strip().split())

train_df['pleura_wordCount'] = train_df["pleura_text"].apply(word_count)
train_df['lung_wordCount'] = train_df["lung_text"].apply(word_count)

In [15]:
train_df['pleura_wordCount'].describe()

count    1760.000000
mean        5.831818
std         6.866808
min         0.000000
25%         1.000000
50%         4.000000
75%         8.000000
max        56.000000
Name: pleura_wordCount, dtype: float64

In [16]:
train_df[train_df["pleura_wordCount"] == 4]["pleura_text"].astype(str).values.tolist()

['pleura/pleural space: small left pleural fluid. ',
 'pleura/pleural space: small bilateral pleural effusions ',
 'lungs/pleura/pleural space: bilateral prominent interstitial markings ',
 'pleura/pleural space: small bilateral pleural effusions ',
 'pleura/pleural space: small right pleural effusion. ',
 'pleura/pleural space: trace right pleural effusion. ',
 'lungs/pleura/pleural space: diffuse bilateral patchy opacities ',
 'pleura/pleural space: trace left pleural effusion. ',
 'pleura/pleural space: small left pleural effusion. ',
 'pleura/pleural space: small bilateral pleural effusions. ',
 'pleura/pleural space: small bilateral pleural effusions. ',
 'pleura/pleural space: no significant pleural effusion. ',
 'pleura/pleural space: no effusion or pneumothorax. ',
 'pleura/pleural space: no pneumothoraces or effusions ',
 'pleura/pleural space: trace left pleural effusion ',
 'lungs/pleural space: patchy bibasilar airspace opacities. ',
 'pleura/pleural space: small bilateral 

In [17]:
train_df['lung_wordCount'].describe()

count    1760.000000
mean        6.262500
std         7.063616
min         0.000000
25%         0.000000
50%         5.000000
75%        10.000000
max        47.000000
Name: lung_wordCount, dtype: float64

In [18]:
[ s for s in train_df[train_df["lung_wordCount"] == 5]["lung_text"].astype(str).values.tolist()  if "consolidation" in s]

['lungs: no focal opacities or consolidations. ',
 'lungs: extensive bilateral consolidations predominantly peripheral ']

In [19]:
ABSTAIN = -1
NORMAL = 0
ABNORMAL = 1

In [20]:
from snorkel.labeling import labeling_function
import re

word_delim = [" ", "/", "\\", ",", ";"]
delim_pattern = '|'.join(map(re.escape, word_delim))
delim_prog = re.compile(delim_pattern)

def get_sentences(text):
    # Returns list of sentences.
    # Where each sentence is a list of words
    split_pos = text.find(":")
    sent = text[split_pos+1:]
    sents =  [delim_prog.split(s.strip()) for s in sent.split(".") if len(s.strip()) > 0]
    return sents

def is_normal(text):
    sents = get_sentences(text)
    # This there is only one sentence and one word then
    # test if it is the word "normal"
    if len(sents) == 1:
        if len(sents[0]) == 1 and sents[0][0] in ["normal", "negative", "clear"]:
            return NORMAL
        elif len(sents[0]) == 2 and " ".join(sents[0]) == "no change":
            return NORMAL
        else:
            return ABSTAIN
    else:
        return ABSTAIN

def detect_tokens(text, tokens):
    sents = get_sentences(text)
    res = []
    # For each sentence see if any tokens are present
    # If so, naively detect if it is negated
    for sent in sents:
        present = [sent.index(token) for token in tokens if token in sent]
        if len(present) > 0:
            index = min(present)
            negate = sent.index("no") if "no" in sent else -1
            res.append(NORMAL if -1 < negate < index else ABNORMAL)
    if len(res) > 0:
        # Now return an overall call based any calls being abnormal
        return ABNORMAL if sum(res) > 0 else NORMAL
    else:
        # If no tokens present at all
        return ABSTAIN

def detect_phrases(text, phrases):
    split_pos = text.find(":")
    sent = text[split_pos+1:]
    sents =  [s.strip() for s in sent.split(".") if len(s.strip()) > 0]
    res = []
    # See if phrases are present in text
    # If so, naively detect if it is negated
    for phrase in phrases:
        for sent in sents:
            if phrase in sent:
                index = sent.index(phrase)
                # Need space after no to not pick up no as a substring
                negate = sent.index("no ") if "no " in sent else -1
                res.append(NORMAL if -1 < negate < index else ABNORMAL)
    if len(res) > 0:
        # Now return an overall call based any calls being abnormal
        return ABNORMAL if sum(res) > 0 else NORMAL
    else:
        # If no tokens present at all
        return ABSTAIN

@labeling_function()
def lung_normal(x):
    # if lung_text is empty, check is pleura_text
    # includes lung information
    text = x.lung_text
    if text == "" and x.pleura_text.startswith("lung"):
        text = x.pleura_text
    return is_normal(text)

@labeling_function()
def pleura_normal(x):
    return is_normal(x.pleura_text)

@labeling_function()
def detect_infiltrates(x):
    # if lung_text is empty, check is pleura_text
    # includes lung information
    text = x.lung_text
    if text == "" and x.pleura_text.startswith("lung"):
        text = x.pleura_text
    if x.lung_text == "" and  x.pleura_text == "":
        text = x.text
    return detect_tokens(text, ["infiltrate", "infiltrates"])

@labeling_function()
def detect_emphysema(x):
    # if lung_text is empty, check is pleura_text
    # includes lung information
    text = x.lung_text
    if x.lung_text == "" and x.pleura_text.startswith("lung"):
        text = x.pleura_text
    if x.lung_text == "" and  x.pleura_text == "":
        text = x.text
    return detect_tokens(text, ["emphysema", "emphysematous"])

@labeling_function()
def detect_interstitial_prominence(x):
    # if lung_text is empty, check if pleura_text
    # includes lung information
    text = x.lung_text
    if text == "" and x.pleura_text.startswith("lung"):
        text = x.pleura_text
    if x.lung_text == "" and  x.pleura_text == "":
        text = x.text
    return detect_phrases(text, ["interstitial prominence", "interstitial marking", "interstitial thickening"])

@labeling_function()
def detect_effusions(x):
    # Very few instances do not have parsed pleura_text.
    # In those cases use the full text
    text = x.pleura_text if x.pleura_text != "" else x.text
    return detect_tokens(text, ["effusion", "effusions"])

@labeling_function()
def detect_fluid(x):
    # Very few instances do not have parsed pleura_text.
    # In those cases use the full text
    text = x.pleura_text if x.pleura_text != "" else x.text
    return detect_tokens(text, ["fluid", "fluids"])

@labeling_function()
def detect_opacities(x):
    # Very few instances do not have parsed lung_text or pleura_text.
    # In those cases use the full text
    notes = []
    if x.lung_text == "" and  x.pleura_text == "":
        notes.append(x.text)
    else:
        notes = [x.lung_text, x.pleura_text]
    res = []
    for text in notes:
        call = detect_tokens(text, ["opacity", "opacities", "opacification", "opacifications"])
        if call != ABSTAIN:
            res.append(call)
    if len(res) == 0:
        return ABSTAIN
    elif any(res):
        return ABNORMAL
    else:
        return NORMAL

@labeling_function()
def detect_consolidation(x):
    # Very few instances do not have parsed lung_text or pleura_text.
    # In those cases use the full text
    notes = []
    if x.lung_text == "" and  x.pleura_text == "":
        notes.append(x.text)
    else:
        notes = [x.lung_text, x.pleura_text]
    res = []
    for text in notes:
        call = detect_tokens(text, ["consolidation", "consolidations", "consolidative"])
        if call != ABSTAIN:
            res.append(call)
    if len(res) == 0:
        return ABSTAIN
    elif any(res):
        return ABNORMAL
    else:
        return NORMAL

@labeling_function()
def detect_pneumothorax(x):
    # Very few instances do not have parsed lung_text or pleura_text.
    # In those cases use the full text
    notes = []
    if x.lung_text == "" and  x.pleura_text == "":
        notes.append(x.text)
    else:
        notes = [x.lung_text, x.pleura_text]
    res = []
    for text in notes:
        call = detect_tokens(text, ["pneumothorax"])
        if call != ABSTAIN:
            res.append(call)
    if len(res) == 0:
        return ABSTAIN
    elif any(res):
        return ABNORMAL
    else:
        return NORMAL


@labeling_function()
def detect_disease(x):
    # Very few instances do not have parsed lung_text or pleura_text.
    # In those cases use the full text
    notes = []
    if x.lung_text == "" and  x.pleura_text == "":
        notes.append(x.text)
    else:
        notes = [x.lung_text, x.pleura_text]
    res = []
    for text in notes:
        call = detect_phrases(text, ["lung disease", "airspace disease", "parenchymal disease"])
        if call != ABSTAIN:
            res.append(call)
    if len(res) == 0:
        return ABSTAIN
    elif any(res):
        return ABNORMAL
    else:
        return NORMAL

In [21]:
from snorkel.labeling import PandasLFApplier

lfs = [lung_normal, pleura_normal, detect_disease, detect_infiltrates, detect_emphysema, detect_interstitial_prominence,
       detect_effusions, detect_fluid, detect_opacities, detect_consolidation, detect_pneumothorax]

applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=train_df)


100%|██████████| 1760/1760 [00:00<00:00, 11383.35it/s]


In [22]:
L_train

array([[-1, -1, -1, ...,  1, -1, -1],
       [-1,  0, -1, ..., -1, -1, -1],
       [-1, -1, -1, ...,  1, -1, -1],
       ...,
       [-1,  0, -1, ...,  1, -1, -1],
       [-1, -1,  1, ..., -1, -1, -1],
       [-1, -1, -1, ..., -1, -1,  0]])

In [23]:
x = train_df.iloc[309]
x

note_num                                                           435
label                                                               -1
text                 tubes and catheters: none. central airways: no...
lung_startpos                                                       52
pleural1_startpos                                                   -1
pleural2_startpos                                                   67
pleural3_startpos                                                   -1
pleural4_startpos                                                   -1
pleural5_startpos                                                   -1
pleural6_startpos                                                   -1
heart1_startpos                                                     -1
heart2_startpos                                                    136
airway1_startpos                                                    27
thorax1_startpos                                                    -1
thorax

In [24]:
print("lung_normal: %d" % lung_normal(x))
print("pleura_normal: %d" % pleura_normal(x))
print("detect_disease: %d" % detect_disease(x))
print("detect_infiltrates: %d" % detect_infiltrates(x))
print("detect_emphysema: %d" % detect_emphysema(x))
print("detect_effusions: %d" % detect_effusions(x))
print("detect_fluid: %d" % detect_fluid(x))
print("detect_opacities: %d" % detect_opacities(x))
print("detect_consolidation: %d" % detect_consolidation(x))
print("detect_pneumothorax: %d" % detect_pneumothorax(x))
print("detect_interstitial_prominence: %d" % detect_interstitial_prominence(x))

lung_normal: 0
pleura_normal: -1
detect_disease: 1
detect_infiltrates: -1
detect_emphysema: -1
detect_effusions: -1
detect_fluid: -1
detect_opacities: -1
detect_consolidation: -1
detect_pneumothorax: -1
detect_interstitial_prominence: -1


In [25]:
x.pleura_text

'pleura/pleural space: patchy airspace disease scattered bilaterally. '

In [26]:
x.lung_text

'lungs: normal. '

In [27]:
x.text

'tubes and catheters: none. central airways: normal. lungs: normal. pleura/pleural space: patchy airspace disease scattered bilaterally. heart and mediastinum: unchanged cardiomediastinal silhouette. additional findings: no acute or aggressive osseous changes noted.'

In [28]:
output = L_train.sum(axis=1)

In [29]:
output[output == -11].shape

(31,)

In [30]:
[i for i, x in enumerate(output == -11) if x]

[111,
 203,
 254,
 357,
 463,
 567,
 668,
 680,
 755,
 914,
 971,
 1009,
 1067,
 1141,
 1158,
 1242,
 1282,
 1308,
 1317,
 1319,
 1423,
 1442,
 1451,
 1464,
 1469,
 1472,
 1509,
 1545,
 1555,
 1649,
 1664]

In [31]:
coverage_lung_normal, coverage_pleura_normal, coverage_detect_disease, coverage_detect_infiltrates, coverage_detect_emphysema, coverage_detect_interstitial_prominence, coverage_detect_effusions, coverage_detect_fluid, coverage_detect_opacities, coverage_detect_consolidation, coverage_detect_pneumothorax = (L_train != ABSTAIN).mean(axis=0)
print(f"lung_normal coverage: {coverage_lung_normal * 100:.1f}%")
print(f"pleura_normal coverage: {coverage_pleura_normal * 100:.1f}%")
print(f"detect_disease coverage: {coverage_detect_disease * 100:.1f}%")
print(f"detect_infiltrates coverage: {coverage_detect_infiltrates * 100:.1f}%")
print(f"detect_emphysema coverage: {coverage_detect_emphysema * 100:.1f}%")
print(f"detect_interstitial_prominence coverage: {coverage_detect_interstitial_prominence * 100:.1f}%")
print(f"detect_effusions coverage: {coverage_detect_effusions * 100:.1f}%")
print(f"detect_fluid coverage: {coverage_detect_fluid * 100:.1f}%")
print(f"detect_opacities coverage: {coverage_detect_opacities * 100:.1f}%")
print(f"detect_consolidation coverage: {coverage_detect_consolidation * 100:.1f}%")
print(f"detect_pneumothorax coverage: {coverage_detect_pneumothorax * 100:.1f}%")

lung_normal coverage: 7.8%
pleura_normal coverage: 40.2%
detect_disease coverage: 4.0%
detect_infiltrates coverage: 0.7%
detect_emphysema coverage: 0.9%
detect_interstitial_prominence coverage: 9.9%
detect_effusions coverage: 21.8%
detect_fluid coverage: 10.3%
detect_opacities coverage: 73.6%
detect_consolidation coverage: 6.0%
detect_pneumothorax coverage: 22.4%


In [32]:
from snorkel.labeling import LFAnalysis

LFAnalysis(L=L_train, lfs=lfs).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
lung_normal,0,[0],0.078409,0.076705,0.003409
pleura_normal,1,[0],0.402273,0.369886,0.278409
detect_disease,2,"[0, 1]",0.039773,0.034659,0.028409
detect_infiltrates,3,"[0, 1]",0.007386,0.004545,0.004545
detect_emphysema,4,[1],0.009091,0.007955,0.006818
detect_interstitial_prominence,5,"[0, 1]",0.098864,0.092045,0.057955
detect_effusions,6,"[0, 1]",0.217614,0.213068,0.122727
detect_fluid,7,"[0, 1]",0.102841,0.102273,0.068182
detect_opacities,8,"[0, 1]",0.735795,0.560227,0.414773
detect_consolidation,9,"[0, 1]",0.060227,0.049432,0.027841


In [33]:
from snorkel.labeling.model import LabelModel

label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)

INFO:root:Computing O...
INFO:root:Estimating \mu...
  0%|          | 0/500 [00:00<?, ?epoch/s]INFO:root:[0 epochs]: TRAIN:[loss=0.413]
INFO:root:[100 epochs]: TRAIN:[loss=0.024]
INFO:root:[200 epochs]: TRAIN:[loss=0.012]
INFO:root:[300 epochs]: TRAIN:[loss=0.011]
INFO:root:[400 epochs]: TRAIN:[loss=0.011]
100%|██████████| 500/500 [00:00<00:00, 5759.56epoch/s]
INFO:root:Finished Training


In [34]:
from snorkel.labeling import filter_unlabeled_dataframe
from snorkel.utils import probs_to_preds

train_probs = label_model.predict_proba(L_train)

train_df_filtered, train_probs_filtered = filter_unlabeled_dataframe(
    X=train_df, y=train_probs, L=L_train
)

train_preds = probs_to_preds(train_probs_filtered)

In [35]:
from snorkel.labeling.model import MajorityLabelVoter

# compare to majority vote as baseline
majority_model = MajorityLabelVoter()
train_maj_preds = majority_model.predict(L=L_train)

train_df_maj = train_df.copy()
train_df_maj["label"] = train_maj_preds
#train_df_maj_filtered = train_df_maj[train_df_maj["label"] != -1]

train_df_maj["label"].value_counts()

 1    625
-1    581
 0    554
Name: label, dtype: int64

In [36]:
train_df_maj

Unnamed: 0,note_num,label,text,lung_startpos,pleural1_startpos,pleural2_startpos,pleural3_startpos,pleural4_startpos,pleural5_startpos,pleural6_startpos,heart1_startpos,heart2_startpos,airway1_startpos,thorax1_startpos,thorax2_startpos,bones1_startpos,lung_text,pleura_text,pleura_wordCount,lung_wordCount
124,125,1,tubes and catheters: none. central airways: no...,-1,-1,58,-1,-1,52,-1,-1,285,27,-1,-1,-1,,lungs/pleura/pleural space: interstitial promi...,24,0
125,126,-1,tubes and catheters: none. central airways: no...,52,-1,159,-1,-1,-1,-1,-1,189,27,-1,-1,-1,lungs: bilateral lower lobe infiltrates on thi...,pleura/pleural space: normal.,1,14
126,127,1,tubes and catheters: none. lungs/pleura/pleura...,-1,-1,33,-1,-1,27,-1,-1,97,-1,-1,-1,-1,,lungs/pleura/pleural space: normal lung volume...,5,0
127,128,1,tubes and catheters: none. central airways: no...,-1,58,-1,-1,-1,-1,52,-1,162,27,-1,-1,-1,,lungs/pleura: bibasilar patchy airspace opacit...,11,0
128,129,1,tubes and catheters: none. central airways: no...,-1,-1,58,-1,-1,52,-1,-1,139,27,-1,-1,-1,,lungs/pleura/pleural space: increasing airspac...,8,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1889,1890,-1,tubes and catheters: none. central airways: no...,52,-1,160,-1,-1,-1,-1,-1,190,27,-1,-1,-1,lungs: hypoaeration. increased interstitial ma...,pleura/pleural space: normal.,1,13
1890,1891,1,tubes and catheters: none. central airways: no...,-1,-1,58,-1,-1,52,-1,-1,190,27,-1,-1,-1,,lungs/pleura/pleural space: low lung volumes w...,15,0
1891,1892,-1,tubes and catheters: none. central airways: no...,52,-1,107,-1,-1,-1,-1,-1,137,27,-1,-1,-1,lungs: diffuse patchy opacities within the rig...,pleura/pleural space: normal.,1,7
1892,1893,1,tubes and catheters: none. central airways: no...,-1,-1,58,-1,-1,52,-1,-1,169,27,-1,-1,-1,,lungs/pleura/pleural space: significant increa...,13,0


In [37]:
len(train_preds)

1729

In [38]:
labeled_df = pd.DataFrame({
    "note_num": train_df_filtered["note_num"],
    "label": train_preds,
    "text": train_df_filtered["text"]
})

In [39]:
labeled_df

Unnamed: 0,note_num,label,text
124,125,1,tubes and catheters: none. central airways: no...
125,126,0,tubes and catheters: none. central airways: no...
126,127,1,tubes and catheters: none. lungs/pleura/pleura...
127,128,1,tubes and catheters: none. central airways: no...
128,129,1,tubes and catheters: none. central airways: no...
...,...,...,...
1889,1890,0,tubes and catheters: none. central airways: no...
1890,1891,1,tubes and catheters: none. central airways: no...
1891,1892,0,tubes and catheters: none. central airways: no...
1892,1893,0,tubes and catheters: none. central airways: no...


In [40]:
labeled_df["label"].value_counts()

1    998
0    731
Name: label, dtype: int64

In [41]:
import numpy as np

def compute_label(x):
    # If all abstain, then abstain
    if all([score == -1 for score in x]):
        return -1
    # If any abnormal then output abnormal label
    elif any([score == 1 for score in x if score != -1]):
        return 1
    # If all normal then output normal label
    elif all([score == 0 for score in x if score != -1]):
        return 0
    # ERROR code (this shouldn't happen)
    else:
        return -99

manual_preds = np.apply_along_axis(compute_label, axis=1, arr=L_train)

In [42]:
np.unique(manual_preds, return_counts=True)

(array([-1,  0,  1]), array([  31,  338, 1391]))

In [43]:
manual_labeled_df = pd.DataFrame({
    "note_num": train_df["note_num"],
    "label": manual_preds,
    "text": train_df["text"]
}).query("label != -1")

manual_labeled_df

Unnamed: 0,note_num,label,text
124,125,1,tubes and catheters: none. central airways: no...
125,126,1,tubes and catheters: none. central airways: no...
126,127,1,tubes and catheters: none. lungs/pleura/pleura...
127,128,1,tubes and catheters: none. central airways: no...
128,129,1,tubes and catheters: none. central airways: no...
...,...,...,...
1889,1890,1,tubes and catheters: none. central airways: no...
1890,1891,1,tubes and catheters: none. central airways: no...
1891,1892,1,tubes and catheters: none. central airways: no...
1892,1893,1,tubes and catheters: none. central airways: no...


In [44]:
manual_labeled_df.to_csv("/Users/acorbett1/Library/CloudStorage/Box-Box/URCoursework/DataScience/NLP_CSC447/project/rule_labelled_training.csv",index=False)

In [47]:
eval_df.to_csv("/Users/acorbett1/Library/CloudStorage/Box-Box/URCoursework/DataScience/NLP_CSC447/project/eval.csv",index=False)