In [14]:
import pandas as pd 
import matplotlib.pyplot as plt 
import os
import re

In [15]:
column_names = ["Filename", "Scan Type(s)", "Description", "Bottom Line", "Image URL"]
df = pd.read_csv("brain_scan_dataset.csv", names=column_names)

In [16]:
df.head()

Unnamed: 0,Filename,Scan Type(s),Description,Bottom Line,Image URL
0,Image Name,Scan Type(s),Description,Bottom Line,Image URL
1,image0002.png,"CT, Tomography",F2: Postoperative plain computed tomography sc...,Bottom Line:Ventriculo peritoneal (VP) shunt u...,https://openi.nlm.nih.gov/imgs/150/215/4244800...
2,image0003.png,"Tomography, Magnetic resonance",F1: Brain imaging studies. (a) Pre-treatment b...,Bottom Line:The results of his initial physica...,https://openi.nlm.nih.gov/imgs/150/389/3514352...
3,image0005.png,"CT, Tomography",FI1400007cr-3: Postoperative CT brain scan sho...,Bottom Line:A CT brain scan showed a postopera...,https://openi.nlm.nih.gov/imgs/150/193/4212698...
4,image0001.png,CT,"figure3: On an iOS device, an NCCT brain scan ...",Bottom Line:The interpretations made on an iOS...,https://openi.nlm.nih.gov/imgs/150/147/3221380...


In [17]:
# Removing rows with bad metadata

df.columns = df.columns.str.strip()

clean_df = df[
    (df["Description"] != "No description") &
    (df["Bottom Line"] != "No bottom line") &
    (df["Scan Type(s)"] != "Unknown")
]

In [18]:
print(clean_df.shape[0])
print(df.shape[0])

843
1201


In [19]:
clean_df.head()

Unnamed: 0,Filename,Scan Type(s),Description,Bottom Line,Image URL
0,Image Name,Scan Type(s),Description,Bottom Line,Image URL
1,image0002.png,"CT, Tomography",F2: Postoperative plain computed tomography sc...,Bottom Line:Ventriculo peritoneal (VP) shunt u...,https://openi.nlm.nih.gov/imgs/150/215/4244800...
2,image0003.png,"Tomography, Magnetic resonance",F1: Brain imaging studies. (a) Pre-treatment b...,Bottom Line:The results of his initial physica...,https://openi.nlm.nih.gov/imgs/150/389/3514352...
3,image0005.png,"CT, Tomography",FI1400007cr-3: Postoperative CT brain scan sho...,Bottom Line:A CT brain scan showed a postopera...,https://openi.nlm.nih.gov/imgs/150/193/4212698...
4,image0001.png,CT,"figure3: On an iOS device, an NCCT brain scan ...",Bottom Line:The interpretations made on an iOS...,https://openi.nlm.nih.gov/imgs/150/147/3221380...


In [20]:
# Drop rows with collage pattern like (a), (b), ..., (z), (A), ..., (Z)
pattern = r"\([a-zA-Z]\)"
clean_df = clean_df[~clean_df["Description"].str.contains(pattern, regex=True)]

print(f"🧹 After removing collage-type rows: {len(clean_df)} rows remain.")

🧹 After removing collage-type rows: 523 rows remain.


In [21]:
# Save to CSV (overwrite mode)
clean_df.to_csv("cleaned_brain_scan_dataset.csv", index=False)
print("💾 Saved to 'cleaned_brain_scan_dataset.csv'")

💾 Saved to 'cleaned_brain_scan_dataset.csv'
