# NBME - Score Clinical Patient Notes - Baseline

## Imports and Setup

In [1]:
import numpy as np

import re

In [2]:
import pandas as pd

# Load every competition table from the Kaggle input directory.
train = pd.read_csv("../input/nbme-score-clinical-patient-notes/train.csv")
test = pd.read_csv("../input/nbme-score-clinical-patient-notes/test.csv")
features = pd.read_csv("../input/nbme-score-clinical-patient-notes/features.csv")
patient_notes = pd.read_csv("../input/nbme-score-clinical-patient-notes/patient_notes.csv")
submission = pd.read_csv("../input/nbme-score-clinical-patient-notes/sample_submission.csv")

# Turn each raw location string into a list of (start, end) integer tuples:
# extract every run of digits, verify the runs pair up, then group them two by two.
location_ints = train["location"].str.findall(r'\d+')
assert location_ints.apply(lambda runs: len(runs) % 2 == 0).all()
train["location"] = location_ints.apply(
    lambda runs: [(int(runs[k]), int(runs[k + 1])) for k in range(0, len(runs), 2)]
)

# Build the cleaned feature text: "-OR-" becomes ".-", every remaining "-" becomes
# a space, the text is lowercased, and a " <spn> " token is appended to indicate
# the start of the patient notes.
features["features_clean"] = features["feature_text"].map(
    lambda text: text.replace("-OR-", ".-").replace("-", " ").lower() + " <spn> "
)

## Dummy submissions

### Predicting the whole span

In [3]:
# pn_spans = []
# for pn in test["pn_num"]:
#     pns = patient_notes.loc[patient_notes["pn_num"] == pn, "pn_history"]
#     assert pns.shape == (1,)
#     pn_spans.append(f"0 {len(pns.iloc[0])}")

### Reducing to HPI sections

In [4]:
# pn_spans = []
# for pn in test["pn_num"]:
#     pns = patient_notes.loc[patient_notes["pn_num"] == pn, "pn_history"]
#     assert pns.shape == (1,)
#     matches = [m.start() for m in re.finditer('HPI:', pns.iloc[0])]
#     if len(matches) > 0:
#         start = matches[0]
#         for e in [m.start() for m in re.finditer("\n", pns.iloc[0])]:
#             if e > start:
#                 end = e
#                 break
#         else:
#             end = len(pns.iloc[0])
#         pn_spans.append(f"{start} {end}")
#     else:
#         pn_spans.append(f"0 {len(pns.iloc[0])}")

### Matches for strings in features

In [5]:
def _literal_spans(token, text):
    """Return a "start end" string for every literal occurrence of *token* in *text*.

    The token is escaped before searching, so characters that are regex
    metacharacters (".", "(", "+", ...) are matched as plain text instead of
    being interpreted as a pattern. Matches are non-overlapping and reported
    in left-to-right order.
    """
    pattern = re.escape(token)
    return [f"{m.start()} {m.end()}" for m in re.finditer(pattern, text)]


# One prediction string per test row: the character spans of every occurrence of
# each feature word inside the (lowercased) patient note, joined with ";".
pn_spans = []
for _, row in test.iterrows():
    pns = patient_notes.loc[patient_notes["pn_num"] == row["pn_num"], "pn_history"]
    assert pns.shape == (1,)  # exactly one note per pn_num is expected
    history = pns.iloc[0].lower()

    feature = features.loc[np.logical_and(features["case_num"] == row["case_num"], features["feature_num"] == row["feature_num"]), "features_clean"]
    assert feature.shape == (1,)  # exactly one feature per (case_num, feature_num)

    # The last whitespace-separated token is skipped; in features_clean it is the
    # trailing "<spn>" marker, which is not part of the feature wording.
    feature_tokens = feature.iloc[0].split()[:-1]
    feature_spans = []
    for token in feature_tokens:
        # The "-OR-" cleanup glues a "." onto the preceding word. Left in place and
        # used as a raw regex it acted as a wildcard and made each match one
        # character too long, so strip it and search for the bare word.
        token = token.rstrip(".")
        if not token:
            # An empty pattern would match at every position of the note.
            continue
        spans = _literal_spans(token, history)
        if spans:
            feature_spans.append(";".join(spans))
    pn_spans.append(";".join(feature_spans))

## Submission

In [6]:
# Display the test table (id, case_num, pn_num, feature_num) as a visual check.
test

Unnamed: 0,id,case_num,pn_num,feature_num
0,00016_000,0,16,0
1,00016_001,0,16,1
2,00016_002,0,16,2
3,00016_003,0,16,3
4,00016_004,0,16,4


In [7]:
# Pair each test row id with its predicted span string, write the result to
# submission.csv without the index column, and preview the first rows.
submission_columns = {"id": test["id"], "location": pn_spans}
my_submission = pd.DataFrame(submission_columns)
my_submission.to_csv("submission.csv", index=False)
my_submission.head()

Unnamed: 0,id,location
0,00016_000,67 69;92 94;123 125;288 290;439 441;465 467;75...
1,00016_001,67 69;92 94;123 125;288 290;439 441;465 467;67...
2,00016_002,129 134;203 208;209 217
3,00016_003,70 82
4,00016_004,
