# NBME - Score Clinical Patient Notes - EDA

## Imports and Setup

In [1]:
import numpy as np
import spacy

In [2]:
import pandas as pd

# All competition CSVs live in one input directory; keeping the path in a single
# constant means the notebook can be repointed by editing one line.
DATA_DIR = "../input/nbme-score-clinical-patient-notes"

train = pd.read_csv(f"{DATA_DIR}/train.csv")                  # annotations + locations per (note, feature)
test = pd.read_csv(f"{DATA_DIR}/test.csv")                    # (note, feature) pairs to predict
features = pd.read_csv(f"{DATA_DIR}/features.csv")            # feature text per case
patient_notes = pd.read_csv(f"{DATA_DIR}/patient_notes.csv")  # free-text note per pn_num
submission = pd.read_csv(f"{DATA_DIR}/sample_submission.csv") # example of the expected output format

# Turn each raw location string into a list of (start, end) integer tuples.
# Every run of digits is one offset; consecutive offsets pair up into one span.
location_ints = train["location"].str.findall(r'\d+')
# An odd number of offsets would mean some span is missing its start or end.
assert location_ints.map(lambda offsets: len(offsets) % 2 == 0).all()
train["location"] = location_ints.map(
    lambda offsets: [(int(lo), int(hi)) for lo, hi in zip(offsets[::2], offsets[1::2])]
)

# Clean the feature text: "-OR-" between alternatives becomes ". ", every other
# "-" becomes a space, and the result is lowercased. A trailing " <spn> " token
# is appended to indicate the start of the patient notes.
def _clean_feature_text(text):
    """Return the lowercased, de-hyphenated feature text followed by ' <spn> '."""
    spaced = text.replace("-OR-", ".-").replace("-", " ")
    return spaced.lower() + " <spn> "


features["features_clean"] = features["feature_text"].map(_clean_feature_text)

## EDA

In [3]:
# Preview the first five training rows (location is already parsed into tuples).
train.head()

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location
0,00016_000,0,16,0,['dad with recent heart attcak'],"[(696, 724)]"
1,00016_001,0,16,1,"['mom with ""thyroid disease']","[(668, 693)]"
2,00016_002,0,16,2,['chest pressure'],"[(203, 217)]"
3,00016_003,0,16,3,"['intermittent episodes', 'episode']","[(70, 91), (176, 183)]"
4,00016_004,0,16,4,['felt as if he were going to pass out'],"[(222, 258)]"


In [4]:
# Preview the first five rows of the test set.
test.head()

Unnamed: 0,id,case_num,pn_num,feature_num
0,00016_000,0,16,0
1,00016_001,0,16,1
2,00016_002,0,16,2
3,00016_003,0,16,3
4,00016_004,0,16,4


In [5]:
# Preview the first five feature rows, including the derived features_clean column.
features.head()

Unnamed: 0,feature_num,case_num,feature_text,features_clean
0,0,0,Family-history-of-MI-OR-Family-history-of-myoc...,family history of mi. family history of myocar...
1,1,0,Family-history-of-thyroid-disorder,family history of thyroid disorder <spn>
2,2,0,Chest-pressure,chest pressure <spn>
3,3,0,Intermittent-symptoms,intermittent symptoms <spn>
4,4,0,Lightheaded,lightheaded <spn>


In [6]:
# Preview the first five patient notes.
patient_notes.head()

Unnamed: 0,pn_num,case_num,pn_history
0,0,0,"17-year-old male, has come to the student heal..."
1,1,0,17 yo male with recurrent palpitations for the...
2,2,0,Dillon Cleveland is a 17 y.o. male patient wit...
3,3,0,a 17 yo m c/o palpitation started 3 mos ago; \...
4,4,0,17yo male with no pmh here for evaluation of p...


In [7]:
# Preview the first five rows of the sample submission.
submission.head()

Unnamed: 0,id,location
0,00016_000,0 100
1,00016_001,
2,00016_002,200 250;300 400
3,00016_003,
4,00016_004,75 110


In [8]:
# Pick a single patient note to inspect; PATIENT_IDX is matched against pn_num.
PATIENT_IDX = 16
is_selected_note = train["pn_num"].eq(PATIENT_IDX)
patient_df = train.loc[is_selected_note]

# Keep the annotation text and its locations available as standalone Series.
annotation = patient_df["annotation"]
location = patient_df["location"]
patient_df

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location
0,00016_000,0,16,0,['dad with recent heart attcak'],"[(696, 724)]"
1,00016_001,0,16,1,"['mom with ""thyroid disease']","[(668, 693)]"
2,00016_002,0,16,2,['chest pressure'],"[(203, 217)]"
3,00016_003,0,16,3,"['intermittent episodes', 'episode']","[(70, 91), (176, 183)]"
4,00016_004,0,16,4,['felt as if he were going to pass out'],"[(222, 258)]"
5,00016_005,0,16,5,[],[]
6,00016_006,0,16,6,"['adderall', 'adderrall', 'adderrall']","[(321, 329), (404, 413), (652, 661)]"
7,00016_007,0,16,7,[],[]
8,00016_008,0,16,8,[],[]
9,00016_009,0,16,9,"['palpitations', 'heart beating/pounding']","[(26, 38), (96, 118)]"


In [9]:
# Highlight every annotated span of the selected patient note with displaCy.

# Flatten the per-row span lists into one list of (start, end) offsets.
spans = [span for span_list in patient_df["location"] for span in span_list]
start_pos = [start for start, _ in spans]
end_pos = [end for _, end in spans]

# displaCy's manual "ent" input is one dict per span. The rows above follow
# feature order rather than text order, so sort by offset here instead of
# relying on displaCy to reorder the spans itself.
ents = [
    {"start": start, "end": end, "label": "L"}
    for start, end in sorted(spans)
]

# Text of the selected note; .iloc[0] raises IndexError if no note matches.
note_text = patient_notes.loc[patient_notes["pn_num"] == PATIENT_IDX, "pn_history"].iloc[0]
doc = {"text": note_text, "ents": ents}

# One label ("L") for all annotations, drawn with a gradient background.
colors = {"L": "linear-gradient(90deg, #aa9cfc, #fc9ce7)"}
options = {"colors": colors}
spacy.displacy.render(doc, style="ent", options=options, manual=True, jupyter=True);

In [10]:
# All rows of the selected note are expected to share exactly one case number.
case_num = patient_df["case_num"].unique()
assert len(case_num) == 1

In [11]:
# Print the raw feature names that belong to the selected note's case.
in_selected_case = features["case_num"] == case_num[0]
for s in features.loc[in_selected_case, "feature_text"]:
    print(s)

Family-history-of-MI-OR-Family-history-of-myocardial-infarction
Family-history-of-thyroid-disorder
Chest-pressure
Intermittent-symptoms
Lightheaded
No-hair-changes-OR-no-nail-changes-OR-no-temperature-intolerance
Adderall-use
Shortness-of-breath
Caffeine-use
heart-pounding-OR-heart-racing
Few-months-duration
17-year
Male


In [12]:
# Count how many features are defined for each case.
features_per_case = features.groupby("case_num")[["feature_num"]].count()
features_per_case

Unnamed: 0_level_0,feature_num
case_num,Unnamed: 1_level_1
0,13
1,13
2,17
3,16
4,10
5,18
6,12
7,9
8,18
9,17


In [13]:
# Count how many patient notes exist for each case.
notes_per_case = patient_notes.groupby("case_num")[["pn_num"]].count()
notes_per_case

Unnamed: 0_level_0,pn_num
case_num,Unnamed: 1_level_1
0,2268
1,808
2,1958
3,9753
4,5405
5,6909
6,1597
7,4101
8,4196
9,5151


In [14]:
# Total number of characters across all patient notes of each case.
notes_len_per_case = (
    patient_notes
    .groupby("case_num")[["pn_history"]]
    .agg(lambda notes: notes.str.len().sum())
)
notes_len_per_case

Unnamed: 0_level_0,pn_history
case_num,Unnamed: 1_level_1
0,1902460
1,645938
2,1647816
3,7762865
4,4494825
5,5722454
6,1273274
7,3456688
8,3619150
9,3957410


In [15]:
from functools import reduce

def _join_on_index(left, right):
    """Merge two per-case tables on their index (pandas' default inner join)."""
    return left.merge(right, left_index=True, right_index=True)


# Combine the per-case summaries into one table, smallest note count first.
dfs = [features_per_case, notes_per_case, notes_len_per_case]
stats_per_case = reduce(_join_on_index, dfs)
stats_per_case.sort_values("pn_num")

Unnamed: 0_level_0,feature_num,pn_num,pn_history
case_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,13,808,645938
6,12,1597,1273274
2,17,1958,1647816
0,13,2268,1902460
7,9,4101,3456688
8,18,4196,3619150
9,17,5151,3957410
4,10,5405,4494825
5,18,6909,5722454
3,16,9753,7762865
