In [52]:
import pandas as pd
# Load the source CSV into `df`; the relative path is resolved against the
# current working directory.
path = 'Original_csv.csv'
df = pd.read_csv(filepath_or_buffer=path)

In [53]:
def _join_unique_class_names(names):
    """Return a group's distinct class names as one sorted, comma-separated string.

    Names are de-duplicated case-insensitively by keying on ``name.lower()``;
    when several spellings of the same name occur, the last one seen is kept.
    The surviving names are sorted with Python's default (case-sensitive)
    string ordering and joined with ", ".
    """
    unique_by_lowercase = {name.lower(): name for name in names}
    return ', '.join(sorted(unique_by_lowercase.values()))


# 1. Drop the columns that are not needed for the per-image class list.
df_temp = df.drop(columns=['class_id', 'rad_id', 'x_min', 'y_min', 'x_max', 'y_max'])

# 2. Collapse to one row per image_id, combining that image's unique class
#    names (case-insensitive) into a single string. No lower-cased helper
#    column is needed: the aggregation function does the case folding itself.
df_new = (
    df_temp.groupby('image_id')
           .agg({'class_name': _join_unique_class_names})
           .reset_index()
)

In [54]:
def _to_binary_label(class_names):
    """Map a combined class-name string to "Normal" or "Abnormal".

    The string counts as "Normal" only when, after stripping surrounding
    whitespace and lower-casing, it is exactly "no finding".
    """
    if class_names.strip().lower() == "no finding":
        return "Normal"
    return "Abnormal"


# Derive the binary label column from the combined class names, then display the frame.
df_new['label'] = df_new['class_name'].apply(_to_binary_label)
df_new

Unnamed: 0,image_id,class_name,label
0,000434271f63a053c4128a0ba6352c7f,No finding,Normal
1,00053190460d56c53cc3e57321387478,No finding,Normal
2,0005e8e3701dfb1dd93d53e2ff537b6e,"Consolidation, Infiltration, Lung Opacity, Nod...",Abnormal
3,0006e0a85696f6bb578e84fafa9a5607,No finding,Normal
4,0007d316f756b3fa0baea2ff514ce945,"Aortic enlargement, Cardiomegaly, ILD, Pleural...",Abnormal
...,...,...,...
14995,ffe6f9fe648a7ec29a50feb92d6c15a4,"Aortic enlargement, Cardiomegaly, Other lesion",Abnormal
14996,ffea246f04196af602c7dc123e5e48fc,No finding,Normal
14997,ffeffc54594debf3716d6fcd2402a99f,Aortic enlargement,Abnormal
14998,fff0f82159f9083f3dd1f8967fc54f6a,No finding,Normal


In [55]:
import numpy as np
# Display only the rows labelled Abnormal.
df_new.loc[df_new['label'] == 'Abnormal']

Unnamed: 0,image_id,class_name,label
2,0005e8e3701dfb1dd93d53e2ff537b6e,"Consolidation, Infiltration, Lung Opacity, Nod...",Abnormal
4,0007d316f756b3fa0baea2ff514ce945,"Aortic enlargement, Cardiomegaly, ILD, Pleural...",Abnormal
6,000d68e42b71d3eac10ccc077aba07c1,"Aortic enlargement, Lung Opacity, Other lesion...",Abnormal
7,00150343289f317a0ad5629d5b7d9ef9,"Aortic enlargement, Cardiomegaly, Lung Opacity...",Abnormal
9,001d127bad87592efe45a5c7678f8b8d,"Calcification, Pulmonary fibrosis",Abnormal
...,...,...,...
14986,ff924bcbd38f123aec723aa7040d7e43,"Atelectasis, Consolidation, Lung Opacity, Pleu...",Abnormal
14987,ffb5d0b005261ed350f7a08c06613a34,"Aortic enlargement, Cardiomegaly",Abnormal
14991,ffceb71a80efba3b83c88e11f4b9694b,"Cardiomegaly, Pulmonary fibrosis",Abnormal
14995,ffe6f9fe648a7ec29a50feb92d6c15a4,"Aortic enlargement, Cardiomegaly, Other lesion",Abnormal


# Selected dataset: all Abnormal images plus a random sample of 4500 Normal images

In [74]:
# Partition the per-image table by its binary label.
df_normal = df_new.loc[df_new['label'] == "Normal"]
df_abnormal = df_new.loc[df_new['label'] == "Abnormal"]

# Keep a reproducible random subset of 4500 Normal rows (fixed seed).
df_normal_sampled = df_normal.sample(n=4500, random_state=42)

# Stack the sampled Normal rows on top of every Abnormal row, shuffle the
# combined rows with a fixed seed, renumber the index, and finally append the
# ".dicom" extension to each image_id.
df_final = (
    pd.concat([df_normal_sampled, df_abnormal], ignore_index=True)
      .sample(frac=1, random_state=42)
      .reset_index(drop=True)
      .assign(image_id=lambda frame: frame['image_id'].astype(str) + ".dicom")
)

In [78]:
# Display the combined, shuffled dataset (image_id values carry the ".dicom" suffix).
df_final

Unnamed: 0,image_id,class_name,label
0,1245f853816564c0ee3f1b9e93521ff0.dicom,"Aortic enlargement, Infiltration, Other lesion...",Abnormal
1,19d8fca6a7a5bbc35dc669ef9406ba99.dicom,No finding,Normal
2,52fe2f01573413223b1f7edee17de341.dicom,Pulmonary fibrosis,Abnormal
3,59ab79bb1fce73184425a6572169999e.dicom,No finding,Normal
4,75b4afff533ef3bbced8fdae06ac6511.dicom,No finding,Normal
...,...,...,...
8889,47322c3b3510df3e395773eca7e06dc8.dicom,"Cardiomegaly, Other lesion",Abnormal
8890,285cdfec4d725d327266833adf3ba9d5.dicom,"Aortic enlargement, Cardiomegaly",Abnormal
8891,332f505a735ca0961e7128fc0f166a5c.dicom,"Aortic enlargement, Cardiomegaly",Abnormal
8892,b6b5e7a80a8caf01b7721d63ae0551be.dicom,No finding,Normal


In [79]:
# Persist the selected dataset to CSV, omitting the DataFrame index column.
df_final.to_csv(
    path_or_buf='/home/jupyter-nafisha/X-ray/CSVs/Selected_all.csv',
    index=False,
)

# Train, validation and test split

In [80]:
import pandas as pd
# Reload the selected dataset from the CSV written earlier in this notebook.
# Note: this rebinds `df`, which previously held the frame read from
# 'Original_csv.csv'.
df = pd.read_csv(
    filepath_or_buffer="/home/jupyter-nafisha/X-ray/CSVs/Selected_all.csv"
)

In [81]:
# Partition the reloaded dataset by label and report the size of each part.
df_normal = df.loc[df["label"] == "Normal"]
df_abnormal = df.loc[df["label"] == "Abnormal"]

print("Normal:", df_normal.shape[0])
print("Abnormal:", df_abnormal.shape[0])

Normal: 4500
Abnormal: 4394


In [82]:
# -------------------------------
# Normal rows: 3150 train / 675 test / 675 validation
# -------------------------------
# Each draw uses a fixed seed, and rows already drawn are removed by index
# label before the next draw, so the three subsets do not overlap.
normal_train = df_normal.sample(n=3150, random_state=42)
remaining_normal = df_normal.drop(index=normal_train.index)

normal_test = remaining_normal.sample(n=675, random_state=42)

# With the 4500 Normal rows reported earlier, exactly 675 rows are left here,
# so this draw takes all of them in a seeded random order.
normal_val = (
    remaining_normal
    .drop(index=normal_test.index)
    .sample(n=675, random_state=42)
)

In [83]:
# -------------------------------
# Abnormal rows: 3076 train / 659 test / 659 validation
# -------------------------------
# Each draw uses a fixed seed, and rows already drawn are removed by index
# label before the next draw, so the three subsets do not overlap.
ab_train = df_abnormal.sample(n=3076, random_state=42)
remaining_ab = df_abnormal.drop(index=ab_train.index)

ab_test = remaining_ab.sample(n=659, random_state=42)

# With the 4394 Abnormal rows reported earlier, exactly 659 rows are left
# here, so this draw takes all of them in a seeded random order.
ab_val = (
    remaining_ab
    .drop(index=ab_test.index)
    .sample(n=659, random_state=42)
)

In [85]:
# -------------------------------
# Build the final split dataframes
# -------------------------------
def _combine_and_shuffle(normal_part, abnormal_part):
    """Concatenate the two frames and return their rows in a seeded random order."""
    return pd.concat([normal_part, abnormal_part]).sample(frac=1, random_state=42)


train_df = _combine_and_shuffle(normal_train, ab_train)
test_df = _combine_and_shuffle(normal_test, ab_test)
val_df = _combine_and_shuffle(normal_val, ab_val)

# -------------------------------
# Write each split to its CSV file (index column omitted)
# -------------------------------
for split_frame, csv_path in (
    (train_df, "/home/jupyter-nafisha/X-ray/CSVs/train.csv"),
    (test_df, "/home/jupyter-nafisha/X-ray/CSVs/test.csv"),
    (val_df, "/home/jupyter-nafisha/X-ray/CSVs/validation.csv"),
):
    split_frame.to_csv(csv_path, index=False)

print("Saved train.csv, test.csv, validation.csv")

Saved train.csv, test.csv, validation.csv


In [72]:
# Display only the Abnormal rows of the validation split.
# NOTE(review): this cell's recorded execution count (72) is lower than those
# of the cells that build val_df, and the saved output shows image_id values
# without the ".dicom" suffix, so the output appears to be stale; re-run after
# the split cells to refresh it.
val_df.loc[val_df['label'] == "Abnormal"]

Unnamed: 0,image_id,class_name,label
5726,5bb68aea19f6eb00d28d0e6386afecfc,"Aortic enlargement, Consolidation, Lung Opacit...",Abnormal
5663,74292e695d6b5868b89acf26363ee93e,"Aortic enlargement, Cardiomegaly, Lung Opacity...",Abnormal
6570,7d746268923b15615c93334eff3a6baf,"ILD, Nodule/Mass",Abnormal
1789,c50bb66530634bfac9d5487a607e8211,"Lung Opacity, Pleural effusion",Abnormal
1730,18e91ea53a6b6829c70ae2d762605c21,"Aortic enlargement, Cardiomegaly",Abnormal
...,...,...,...
3784,aae8f5574784d4343ab50b4f0cef671d,"Aortic enlargement, Cardiomegaly",Abnormal
5271,7d3e79d32d233140b7e30880739a42c4,"Aortic enlargement, Cardiomegaly, Lung Opacity...",Abnormal
6471,23d1c67775a1f20404642b6086b74cc8,"Aortic enlargement, Calcification, Pleural eff...",Abnormal
3136,13088cbf40717bace59ef0961554c08f,"Other lesion, Pleural effusion, Pleural thicke...",Abnormal
