## Set Up

In [1]:
# mount drive
from google.colab import drive
drive.mount('/content/drive')

# change path
%cd /content/drive/MyDrive/Medical-Report-Generator/Notebooks

Mounted at /content/drive
/content/drive/MyDrive/Medical-Report-Generator/Notebooks


## Exploratory Data Analysis

In [2]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)

In [3]:
FILE_PATH = "../Data/raw_medical_data.csv"

df = pd.read_csv(FILE_PATH)

df.head()

Unnamed: 0,id,AccessionNumber,StudyInstanceUid,PatientSex,PatientAge,StudyDate,StudyDescription,Indication,Comparison,Findings,Impression
0,pGRDNLZHK1CJMB9DS_aGRDNLD4ATLU63FN8_s1.2.826.0...,GRDNLD4ATLU63FN8,1.2.826.0.1.3680043.8.498.98869902613028373370...,M,007Y,20100722,DG CHEST 2V,"Cough, fever",,No active infiltrate or effusion is seen. Only...,No pneumonia. Mild peribronchial thickening.
1,pGRDN558B6N940827_aGRDNJRM2EFFASEDX_s1.2.826.0...,GRDNJRM2EFFASEDX,1.2.826.0.1.3680043.8.498.82596686537958139226...,M,057Y,20160114,DG CHEST 2V,Productive cough for 1 month. Smoker.,,Heart size is normal. There is no pleural effu...,1. No acute cardiopulmonary abnormalities. 2. ...
2,pGRDNS5TAGQ36EJXU_aGRDNAT079HPWLQGQ_s1.2.826.0...,GRDNAT079HPWLQGQ,1.2.826.0.1.3680043.8.498.45885543378002630863...,F,061Y,20190327,DG CHEST 2V,Midsternal pain 2 hours.,Comparison with prior exam from 04/10/2015.,Lungs are adequately inflated without focal ai...,No active cardiopulmonary disease.
3,pGRDN209D4EU5I2C2_aGRDNM9VVCI1D9171_s1.2.826.0...,GRDNM9VVCI1D9171,1.2.826.0.1.3680043.8.498.66274883271641231571...,M,007M,20160821,DG CHEST 2V,"Tachypnea, wheezing",,Upper normal heart size. Normal mediastinal co...,No acute abnormalities.
4,pGRDN5UYSEA2UG5PD_aGRDN3L6AL6JHT3EF_s1.2.826.0...,GRDN3L6AL6JHT3EF,1.2.826.0.1.3680043.8.498.79638437312984318878...,F,036Y,20140727,DG CHEST 2V,Cough and chest pain for 3 days,None.,The heart and pulmonary vascularity are within...,No acute abnormality noted.


In [4]:
df.shape

(140000, 11)

In [5]:
df.dtypes

Unnamed: 0,0
id,object
AccessionNumber,object
StudyInstanceUid,object
PatientSex,object
PatientAge,object
StudyDate,int64
StudyDescription,object
Indication,object
Comparison,object
Findings,object


In [6]:
df.duplicated().sum()

np.int64(0)

In [7]:
df.isnull().sum()

Unnamed: 0,0
id,0
AccessionNumber,0
StudyInstanceUid,0
PatientSex,3
PatientAge,7012
StudyDate,0
StudyDescription,1
Indication,37
Comparison,9825
Findings,0


Check token length of summaries.

## Data Cleaning

In [8]:
# Column names
COL_INDICATION   = "Indication"
COL_COMPARISON   = "Comparison"
COL_FINDINGS     = "Findings"
COL_STUDY_DESC   = "StudyDescription"
COL_TARGET       = "Impression"

# Fill missing values in Indication, Comparison, Study_Description

df[COL_INDICATION] = df[COL_INDICATION].fillna("None.")
df[COL_COMPARISON] = df[COL_COMPARISON].fillna("None.")
df[COL_STUDY_DESC] = df[COL_STUDY_DESC].fillna("None.")


print("Missing values after cleaning:")
print(df.isnull().sum())

df.head(10)

Missing values after cleaning:
id                     0
AccessionNumber        0
StudyInstanceUid       0
PatientSex             3
PatientAge          7012
StudyDate              0
StudyDescription       0
Indication             0
Comparison             0
Findings               0
Impression             0
dtype: int64


Unnamed: 0,id,AccessionNumber,StudyInstanceUid,PatientSex,PatientAge,StudyDate,StudyDescription,Indication,Comparison,Findings,Impression
0,pGRDNLZHK1CJMB9DS_aGRDNLD4ATLU63FN8_s1.2.826.0...,GRDNLD4ATLU63FN8,1.2.826.0.1.3680043.8.498.98869902613028373370...,M,007Y,20100722,DG CHEST 2V,"Cough, fever",None.,No active infiltrate or effusion is seen. Only...,No pneumonia. Mild peribronchial thickening.
1,pGRDN558B6N940827_aGRDNJRM2EFFASEDX_s1.2.826.0...,GRDNJRM2EFFASEDX,1.2.826.0.1.3680043.8.498.82596686537958139226...,M,057Y,20160114,DG CHEST 2V,Productive cough for 1 month. Smoker.,None.,Heart size is normal. There is no pleural effu...,1. No acute cardiopulmonary abnormalities. 2. ...
2,pGRDNS5TAGQ36EJXU_aGRDNAT079HPWLQGQ_s1.2.826.0...,GRDNAT079HPWLQGQ,1.2.826.0.1.3680043.8.498.45885543378002630863...,F,061Y,20190327,DG CHEST 2V,Midsternal pain 2 hours.,Comparison with prior exam from 04/10/2015.,Lungs are adequately inflated without focal ai...,No active cardiopulmonary disease.
3,pGRDN209D4EU5I2C2_aGRDNM9VVCI1D9171_s1.2.826.0...,GRDNM9VVCI1D9171,1.2.826.0.1.3680043.8.498.66274883271641231571...,M,007M,20160821,DG CHEST 2V,"Tachypnea, wheezing",None.,Upper normal heart size. Normal mediastinal co...,No acute abnormalities.
4,pGRDN5UYSEA2UG5PD_aGRDN3L6AL6JHT3EF_s1.2.826.0...,GRDN3L6AL6JHT3EF,1.2.826.0.1.3680043.8.498.79638437312984318878...,F,036Y,20140727,DG CHEST 2V,Cough and chest pain for 3 days,None.,The heart and pulmonary vascularity are within...,No acute abnormality noted.
5,pGRDNELNQSW483H3Y_aGRDNKS32FO0MWO54_s1.2.826.0...,GRDNKS32FO0MWO54,1.2.826.0.1.3680043.8.498.94602140310111155555...,F,029Y,20080309,DG CHEST 2V,"28 year-old female, lymphoma currently on chem...",Comparison: 03/10/06.,"Low lung volumes without acute consolidation, ...",1. Low volume exam without acute airspace proc...
6,pGRDNJYHWIBF0I8R1_aGRDNJCNS2WXKC51E_s1.2.826.0...,GRDNJCNS2WXKC51E,1.2.826.0.1.3680043.8.498.90360658723187858198...,M,050Y,20200725,DG CHEST 1V PORT,Preoperative exam.,02/05/2018.,The heart size and mediastinal contours are wi...,No active disease.
7,pGRDNS282KQMLYD8W_aGRDNPY687L2R5E4Y_s1.2.826.0...,GRDNPY687L2R5E4Y,1.2.826.0.1.3680043.8.498.45377335160322619016...,M,073Y,20231222,[ID],Fall. Syncope; fall; hx of open heart sx; pt s...,"Comparisons to January 16, 2018.",Worsened multifocal bilateral mid lower lung p...,Worsening mid and lower lung predominant airsp...
8,pGRDNUUKG19JGCSMQ_aGRDN53TO1ISBNW1D_s1.2.826.0...,GRDN53TO1ISBNW1D,1.2.826.0.1.3680043.8.498.75128803173103393749...,M,030Y,20160126,DG CHEST 2V,"Chest pain, short of breath",Comparison: 5/9/2009,Normal mediastinum and cardiac silhouette. Rig...,Chronic atelectasis versus less likely infecti...
9,pGRDNLXEHE3IZRH6C_aGRDNRDHGJ75WD20Y_s1.2.826.0...,GRDNRDHGJ75WD20Y,1.2.826.0.1.3680043.8.498.31561878357566563614...,M,035Y,20141212,DG CHEST 2V,Chest tightness.,None available at time of study interpretation.,The cardiomediastinal silhouette is unremarkab...,No acute cardiopulmonary process.


In [9]:
# Create source and target columns

def build_source_row(row):
    return " ".join([
        f"[INDICATION] {row[COL_INDICATION]}",
        f"[COMPARISON] {row[COL_COMPARISON]}",
        f"[FINDINGS] {row[COL_FINDINGS]}",
    ])

df["source_text"] = df.apply(build_source_row, axis=1)
df["target_text"] = df[COL_TARGET].astype(str)

df

Unnamed: 0,id,AccessionNumber,StudyInstanceUid,PatientSex,PatientAge,StudyDate,StudyDescription,Indication,Comparison,Findings,Impression,source_text,target_text
0,pGRDNLZHK1CJMB9DS_aGRDNLD4ATLU63FN8_s1.2.826.0...,GRDNLD4ATLU63FN8,1.2.826.0.1.3680043.8.498.98869902613028373370...,M,007Y,20100722,DG CHEST 2V,"Cough, fever",None.,No active infiltrate or effusion is seen. Only...,No pneumonia. Mild peribronchial thickening.,"[INDICATION] Cough, fever [COMPARISON] None. [...",No pneumonia. Mild peribronchial thickening.
1,pGRDN558B6N940827_aGRDNJRM2EFFASEDX_s1.2.826.0...,GRDNJRM2EFFASEDX,1.2.826.0.1.3680043.8.498.82596686537958139226...,M,057Y,20160114,DG CHEST 2V,Productive cough for 1 month. Smoker.,None.,Heart size is normal. There is no pleural effu...,1. No acute cardiopulmonary abnormalities. 2. ...,[INDICATION] Productive cough for 1 month. Smo...,1. No acute cardiopulmonary abnormalities. 2. ...
2,pGRDNS5TAGQ36EJXU_aGRDNAT079HPWLQGQ_s1.2.826.0...,GRDNAT079HPWLQGQ,1.2.826.0.1.3680043.8.498.45885543378002630863...,F,061Y,20190327,DG CHEST 2V,Midsternal pain 2 hours.,Comparison with prior exam from 04/10/2015.,Lungs are adequately inflated without focal ai...,No active cardiopulmonary disease.,[INDICATION] Midsternal pain 2 hours. [COMPARI...,No active cardiopulmonary disease.
3,pGRDN209D4EU5I2C2_aGRDNM9VVCI1D9171_s1.2.826.0...,GRDNM9VVCI1D9171,1.2.826.0.1.3680043.8.498.66274883271641231571...,M,007M,20160821,DG CHEST 2V,"Tachypnea, wheezing",None.,Upper normal heart size. Normal mediastinal co...,No acute abnormalities.,"[INDICATION] Tachypnea, wheezing [COMPARISON] ...",No acute abnormalities.
4,pGRDN5UYSEA2UG5PD_aGRDN3L6AL6JHT3EF_s1.2.826.0...,GRDN3L6AL6JHT3EF,1.2.826.0.1.3680043.8.498.79638437312984318878...,F,036Y,20140727,DG CHEST 2V,Cough and chest pain for 3 days,None.,The heart and pulmonary vascularity are within...,No acute abnormality noted.,[INDICATION] Cough and chest pain for 3 days [...,No acute abnormality noted.
...,...,...,...,...,...,...,...,...,...,...,...,...,...
139995,pGRDNXBTSJF476I2N_aGRDNOXAQ28KRW4GZ_s1.2.826.0...,GRDNOXAQ28KRW4GZ,1.2.826.0.1.3680043.8.498.24718307818624585717...,F,025Y,20221015,DG CHEST 2V,"Cough, fever",None.,The heart size and mediastinal contours are wi...,No acute abnormality of the lungs.,"[INDICATION] Cough, fever [COMPARISON] None. [...",No acute abnormality of the lungs.
139996,pGRDN1GWNQ8GR6XX8_aGRDNNUZ50U648VR6_s1.2.826.0...,GRDNNUZ50U648VR6,1.2.826.0.1.3680043.8.498.84987404802016573120...,F,,20111024,Chest 1 View,Pna,None.,EKG leads project over the patient's chest. Th...,Probable vascular congestion and bilateral inf...,[INDICATION] Pna [COMPARISON] None. [FINDINGS]...,Probable vascular congestion and bilateral inf...
139997,pGRDN518BHGVULJRC_aGRDN6QZEQ1QT8YF9_s1.2.826.0...,GRDN6QZEQ1QT8YF9,1.2.826.0.1.3680043.8.498.43036822772610310582...,M,000D,20160729,DG CHEST PORT W/ABD NEONATE,Status post endotracheal intubation,None.,Cardiac shadow is within normal limits. The lu...,Endotracheal tube at the level of the carina d...,[INDICATION] Status post endotracheal intubati...,Endotracheal tube at the level of the carina d...
139998,pGRDNMUS16DV29OCF_aGRDN0S46J1DRAHXG_s1.2.826.0...,GRDN0S46J1DRAHXG,1.2.826.0.1.3680043.8.498.26144389355329300559...,F,017Y,20150817,DG ABDOMEN ACUTE W/ 1V CHEST,Diffuse abdominal pain.,None.,There is no evidence of dilated bowel loops or...,Negative abdominal radiographs. No acute cardi...,[INDICATION] Diffuse abdominal pain. [COMPARIS...,Negative abdominal radiographs. No acute cardi...


Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.


In [10]:
# check token length of source text column
df["source_text"].apply(lambda x: len(x.split())).describe()

Unnamed: 0,source_text
count,140000.0
mean,43.779964
std,18.486596
min,9.0
25%,30.0
50%,40.0
75%,52.0
max,311.0


In [11]:
# check token length of target text column
df["target_text"].apply(lambda x: len(x.split())).describe()

Unnamed: 0,target_text
count,140000.0
mean,11.63715
std,11.417352
min,1.0
25%,4.0
50%,7.0
75%,15.0
max,164.0


Save Clean Data Split

In [12]:
# Split into Train, Val, Test sets and Save CSV
from pathlib import Path

SAVE_DIR = Path("../Data")
SAVE_DIR.mkdir(parents=True, exist_ok=True)

N = len(df)
rng = np.random.default_rng(42)
perm = rng.permutation(N) # shuffle data

p_train, p_val, p_test = 0.80, 0.10, 0.10

n_train = int(round(N * p_train))
n_val   = int(round(N * p_val))
n_test  = N - n_train - n_val

train_idx = perm[:n_train]
val_idx   = perm[n_train:n_train + n_val]
test_idx  = perm[n_train + n_val:]

train_df = df.iloc[train_idx][["source_text", "target_text"]].reset_index(drop=True)
val_df   = df.iloc[val_idx][["source_text", "target_text"]].reset_index(drop=True)
test_df  = df.iloc[test_idx][["source_text", "target_text"]].reset_index(drop=True)

train_path = SAVE_DIR / "train.csv"
val_path   = SAVE_DIR / "val.csv"
test_path  = SAVE_DIR / "test.csv"

train_df.to_csv(train_path, index=False)
val_df.to_csv(val_path, index=False)
test_df.to_csv(test_path, index=False)

print(f"Saved: {train_path} ({len(train_df)})")
print(f"Saved: {val_path}   ({len(val_df)})")
print(f"Saved: {test_path}  ({len(test_df)})")


Saved: ../Data/train.csv (112000)
Saved: ../Data/val.csv   (14000)
Saved: ../Data/test.csv  (14000)


In [13]:
train_df

Unnamed: 0,source_text,target_text
0,[INDICATION] Followup pneumonia. Cough. [COMPA...,1. Bilateral lower lobe airspace disease most ...
1,[INDICATION] Check tub placement [COMPARISON] ...,Nasogastric tube tip in the distal esophagus. ...
2,[INDICATION] Lung lesion [COMPARISON] Comparis...,No change.
3,[INDICATION] Left-sided chest pain and shortne...,No active disease.
4,[INDICATION] Chest pain [COMPARISON] None. [FI...,No radiographic evidence of acute cardiopulmon...
...,...,...
111995,[INDICATION] Fever and shortness of breath. Co...,No edema or consolidation.
111996,[INDICATION] Chest pain and shortness of breat...,Cardiac enlargement with vascular congestion. ...
111997,[INDICATION] Chest pain [COMPARISON] None. [FI...,No acute cardiopulmonary process.
111998,"[INDICATION] Dizziness, syncope, cough common ...",There is no active cardiopulmonary disease.


In [14]:
test_df

Unnamed: 0,source_text,target_text
0,[INDICATION] Preoperative chest x-ray for hyst...,No evidence for active chest disease.
1,[INDICATION] Heart palpitations. [COMPARISON] ...,No acute cardiopulmonary process.
2,[INDICATION] Cough [COMPARISON] Comparison is ...,No acute cardiopulmonary disease.
3,[INDICATION] Prior ultrasound with hydronephro...,No definite vesicoureteral reflux identified.
4,[INDICATION] Chest pain [COMPARISON] Compariso...,Left base linear atelectasis.
...,...,...
13995,[INDICATION] Cough and right-sided rib pain [C...,No active cardiopulmonary disease.
13996,[INDICATION] Chest pain beginning at 5 p.m. to...,Negative chest.
13997,[INDICATION] Fever. Chills. [COMPARISON] Compa...,Normal and stable chest.
13998,[INDICATION] Atrial fibrillation. Decreased ox...,No acute cardiopulmonary disease. Possible COPD.


In [15]:
val_df

Unnamed: 0,source_text,target_text
0,[INDICATION] History of chronic coughing. [COM...,Question minimal hyperinflation. Heart size is...
1,[INDICATION] Status post shortness of breath. ...,Borderline cardiomegaly with diffuse interstit...
2,[INDICATION] Shortness of breath for several d...,Chronic changes without acute abnormality.
3,[INDICATION] Unstable premature newborn; 35 we...,RDS.
4,"[INDICATION] Low BP, patient very lethargic. [...",Cardiomegaly with mild congestive heart failur...
...,...,...
13995,[INDICATION] Hypoxia and lung crackles. [COMPA...,Bronchitic airway thickening. No collapse or c...
13996,[INDICATION] Evaluate for foreign body [COMPAR...,Negative exam.
13997,[INDICATION] Dyspnea and posterior right chest...,No active cardiopulmonary process demonstrated.
13998,[INDICATION] Cough and congestion [COMPARISON]...,Bibasilar infiltrates left considerable greate...
