In [52]:
import pandas as pd
path= 'Original_csv.csv'
df= pd.read_csv(path)
# df

In [53]:
# 1. Drop unnecessary columns

df_temp = df.drop(columns=['class_id', 'rad_id', 'x_min', 'y_min', 'x_max', 'y_max'])

# 2–4. Group by image_id and combine unique class names (case-insensitive)
df_new = (
    df_temp.assign(class_name_lower=df_temp['class_name'].str.lower())
           .groupby(['image_id'])
           .agg({
                'class_name': lambda x: ', '.join(
                    sorted({name.lower(): name for name in x}.values())
                )
           })
           .reset_index()
)

In [54]:
df_new['label'] = df_new['class_name'].apply(
    lambda x: "Normal" if x.strip().lower() == "no finding" else "Abnormal"
)
df_new

Unnamed: 0,image_id,class_name,label
0,000434271f63a053c4128a0ba6352c7f,No finding,Normal
1,00053190460d56c53cc3e57321387478,No finding,Normal
2,0005e8e3701dfb1dd93d53e2ff537b6e,"Consolidation, Infiltration, Lung Opacity, Nod...",Abnormal
3,0006e0a85696f6bb578e84fafa9a5607,No finding,Normal
4,0007d316f756b3fa0baea2ff514ce945,"Aortic enlargement, Cardiomegaly, ILD, Pleural...",Abnormal
...,...,...,...
14995,ffe6f9fe648a7ec29a50feb92d6c15a4,"Aortic enlargement, Cardiomegaly, Other lesion",Abnormal
14996,ffea246f04196af602c7dc123e5e48fc,No finding,Normal
14997,ffeffc54594debf3716d6fcd2402a99f,Aortic enlargement,Abnormal
14998,fff0f82159f9083f3dd1f8967fc54f6a,No finding,Normal


In [55]:
import numpy as np
df_new[df_new['label']=='Abnormal']

Unnamed: 0,image_id,class_name,label
2,0005e8e3701dfb1dd93d53e2ff537b6e,"Consolidation, Infiltration, Lung Opacity, Nod...",Abnormal
4,0007d316f756b3fa0baea2ff514ce945,"Aortic enlargement, Cardiomegaly, ILD, Pleural...",Abnormal
6,000d68e42b71d3eac10ccc077aba07c1,"Aortic enlargement, Lung Opacity, Other lesion...",Abnormal
7,00150343289f317a0ad5629d5b7d9ef9,"Aortic enlargement, Cardiomegaly, Lung Opacity...",Abnormal
9,001d127bad87592efe45a5c7678f8b8d,"Calcification, Pulmonary fibrosis",Abnormal
...,...,...,...
14986,ff924bcbd38f123aec723aa7040d7e43,"Atelectasis, Consolidation, Lung Opacity, Pleu...",Abnormal
14987,ffb5d0b005261ed350f7a08c06613a34,"Aortic enlargement, Cardiomegaly",Abnormal
14991,ffceb71a80efba3b83c88e11f4b9694b,"Cardiomegaly, Pulmonary fibrosis",Abnormal
14995,ffe6f9fe648a7ec29a50feb92d6c15a4,"Aortic enlargement, Cardiomegaly, Other lesion",Abnormal


# selected dataset

In [74]:
# Separate Normal and Abnormal cases
df_normal = df_new[df_new['label'] == "Normal"]
df_abnormal = df_new[df_new['label'] == "Abnormal"]

# Randomly select 4500 Normal rows
df_normal_sampled = df_normal.sample(n=4500, random_state=42)

# Combine all Abnormal + sampled Normal
df_final = pd.concat([df_normal_sampled, df_abnormal], ignore_index=True)

# Shuffle the final dataset
df_final = df_final.sample(frac=1, random_state=42).reset_index(drop=True)

# Add ".dicom" extension to image_id column
df_final['image_id'] = df_final['image_id'].astype(str) + ".dicom"

In [78]:
df_final

Unnamed: 0,image_id,class_name,label
0,1245f853816564c0ee3f1b9e93521ff0.dicom,"Aortic enlargement, Infiltration, Other lesion...",Abnormal
1,19d8fca6a7a5bbc35dc669ef9406ba99.dicom,No finding,Normal
2,52fe2f01573413223b1f7edee17de341.dicom,Pulmonary fibrosis,Abnormal
3,59ab79bb1fce73184425a6572169999e.dicom,No finding,Normal
4,75b4afff533ef3bbced8fdae06ac6511.dicom,No finding,Normal
...,...,...,...
8889,47322c3b3510df3e395773eca7e06dc8.dicom,"Cardiomegaly, Other lesion",Abnormal
8890,285cdfec4d725d327266833adf3ba9d5.dicom,"Aortic enlargement, Cardiomegaly",Abnormal
8891,332f505a735ca0961e7128fc0f166a5c.dicom,"Aortic enlargement, Cardiomegaly",Abnormal
8892,b6b5e7a80a8caf01b7721d63ae0551be.dicom,No finding,Normal


In [79]:
df_final.to_csv('/home/jupyter-nafisha/X-ray/CSVs/Selected_all.csv', index=False)

# train, val and test split

In [80]:
import pandas as pd
df = pd.read_csv("/home/jupyter-nafisha/X-ray/CSVs/Selected_all.csv")

In [81]:
# Split into normal and abnormal
df_normal = df[df["label"] == "Normal"]
df_abnormal = df[df["label"] == "Abnormal"]

print("Normal:", len(df_normal))
print("Abnormal:", len(df_abnormal))

Normal: 4500
Abnormal: 4394


In [82]:
# -------------------------------
# Normal splits
# -------------------------------
normal_train  = df_normal.sample(n=3150, random_state=42)
remaining_normal = df_normal.drop(normal_train.index)

normal_test   = remaining_normal.sample(n=675, random_state=42)
normal_val    = remaining_normal.drop(normal_test.index).sample(n=675, random_state=42)

In [83]:
# -------------------------------
# Abnormal splits
# -------------------------------
ab_train  = df_abnormal.sample(n=3076, random_state=42)
remaining_ab = df_abnormal.drop(ab_train.index)

ab_test   = remaining_ab.sample(n=659, random_state=42)
ab_val    = remaining_ab.drop(ab_test.index).sample(n=659, random_state=42)

In [85]:
# -------------------------------
# Build final dataframes
# -------------------------------
train_df = pd.concat([normal_train, ab_train]).sample(frac=1, random_state=42)
test_df  = pd.concat([normal_test, ab_test]).sample(frac=1, random_state=42)
val_df   = pd.concat([normal_val, ab_val]).sample(frac=1, random_state=42)

# -------------------------------
# Save to CSV
# -------------------------------
train_df.to_csv("/home/jupyter-nafisha/X-ray/CSVs/train.csv", index=False)
test_df.to_csv("/home/jupyter-nafisha/X-ray/CSVs/test.csv", index=False)
val_df.to_csv("/home/jupyter-nafisha/X-ray/CSVs/validation.csv", index=False)

print("Saved train.csv, test.csv, validation.csv")

Saved train.csv, test.csv, validation.csv


In [72]:
# val_df[val_df['label'] == "Normal"]
val_df[val_df['label'] == "Abnormal"]

Unnamed: 0,image_id,class_name,label
5726,5bb68aea19f6eb00d28d0e6386afecfc,"Aortic enlargement, Consolidation, Lung Opacit...",Abnormal
5663,74292e695d6b5868b89acf26363ee93e,"Aortic enlargement, Cardiomegaly, Lung Opacity...",Abnormal
6570,7d746268923b15615c93334eff3a6baf,"ILD, Nodule/Mass",Abnormal
1789,c50bb66530634bfac9d5487a607e8211,"Lung Opacity, Pleural effusion",Abnormal
1730,18e91ea53a6b6829c70ae2d762605c21,"Aortic enlargement, Cardiomegaly",Abnormal
...,...,...,...
3784,aae8f5574784d4343ab50b4f0cef671d,"Aortic enlargement, Cardiomegaly",Abnormal
5271,7d3e79d32d233140b7e30880739a42c4,"Aortic enlargement, Cardiomegaly, Lung Opacity...",Abnormal
6471,23d1c67775a1f20404642b6086b74cc8,"Aortic enlargement, Calcification, Pleural eff...",Abnormal
3136,13088cbf40717bace59ef0961554c08f,"Other lesion, Pleural effusion, Pleural thicke...",Abnormal


# chexpert CSV for AP inference

In [121]:
path= '/home/jupyter-nafisha/X-ray/CSVs/chexpert_orig.csv'

In [122]:
import pandas as pd
data= pd.read_csv(path)

In [123]:
# only frontal x ray
data = data[data['Frontal/Lateral']=='Frontal']

In [124]:
# only AP orientation
data = data[data['AP/PA']=='AP']

In [125]:
# no support device as training data has very few supporting devices
data = data[data['Support Devices']==0]

In [126]:
data= data.drop(columns=['Support Devices'])

In [127]:
import pandas as pd
import numpy as np

columns = [
    'No Finding', 'Enlarged Cardiomediastinum', 'Cardiomegaly',
    'Lung Opacity', 'Lung Lesion', 'Edema', 'Consolidation',
    'Pneumonia', 'Atelectasis', 'Pneumothorax',
    'Pleural Effusion', 'Pleural Other', 'Fracture'
]

data['class_name'] = (
    data[columns]
    .eq(1)
    .apply(lambda row: ', '.join(row.index[row]), axis=1)
)


In [128]:
filtered_data = data[['Path', 'class_name']]

In [129]:
filtered_data['label'] = np.where(
    filtered_data['class_name'] == 'No Finding',
    'Normal',
    'Abnormal'
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['label'] = np.where(


In [130]:
filtered_data = filtered_data[
    filtered_data['class_name'].notna() &
    (filtered_data['class_name'].str.strip() != '')
]

In [133]:
filtered_data

Unnamed: 0,Path,class_name,label
54,patient00019/study4/view1_frontal.jpg,"Cardiomegaly, Lung Opacity, Lung Lesion, Atele...",Abnormal
56,patient00019/study2/view1_frontal.jpg,"Lung Opacity, Lung Lesion",Abnormal
119,patient00039/study3/view1_frontal.jpg,"Atelectasis, Pneumothorax, Fracture",Abnormal
156,patient00048/study1/view1_frontal.jpg,Pneumothorax,Abnormal
194,patient00062/study2/view1_frontal.jpg,"Atelectasis, Pleural Effusion",Abnormal
...,...,...,...
223233,patient64381/study1/view1_frontal.jpg,Pleural Effusion,Abnormal
223269,patient64414/study1/view1_frontal.jpg,"Lung Opacity, Pleural Effusion",Abnormal
223375,patient64507/study1/view1_frontal.jpg,"Lung Opacity, Pneumothorax, Pleural Effusion",Abnormal
223391,patient64522/study1/view1_frontal.jpg,No Finding,Normal


In [132]:
filtered_data['Path'] = filtered_data['Path'].str.replace('CheXpert-v1.0-small/train/', '')

In [135]:
filtered_data[filtered_data['label']=='Abnormal']

Unnamed: 0,Path,class_name,label
54,patient00019/study4/view1_frontal.jpg,"Cardiomegaly, Lung Opacity, Lung Lesion, Atele...",Abnormal
56,patient00019/study2/view1_frontal.jpg,"Lung Opacity, Lung Lesion",Abnormal
119,patient00039/study3/view1_frontal.jpg,"Atelectasis, Pneumothorax, Fracture",Abnormal
156,patient00048/study1/view1_frontal.jpg,Pneumothorax,Abnormal
194,patient00062/study2/view1_frontal.jpg,"Atelectasis, Pleural Effusion",Abnormal
...,...,...,...
223203,patient64353/study1/view1_frontal.jpg,"Edema, Pleural Effusion",Abnormal
223233,patient64381/study1/view1_frontal.jpg,Pleural Effusion,Abnormal
223269,patient64414/study1/view1_frontal.jpg,"Lung Opacity, Pleural Effusion",Abnormal
223375,patient64507/study1/view1_frontal.jpg,"Lung Opacity, Pneumothorax, Pleural Effusion",Abnormal


In [136]:
filtered_data.to_csv('/home/jupyter-nafisha/X-ray/CSVs/chexpert_AP.csv', index= False)

# chexpert training

In [74]:
path= '/home/jupyter-nafisha/X-ray/CSVs/chexpert_orig.csv'

In [75]:
import pandas as pd
data= pd.read_csv(path)

In [76]:
# only frontal x ray
data = data[data['Frontal/Lateral']=='Frontal']

In [77]:
# only AP orientation
data = data[data['AP/PA']=='PA']

In [78]:
import pandas as pd
import numpy as np

columns = [
    'No Finding', 'Enlarged Cardiomediastinum', 'Cardiomegaly',
    'Lung Opacity', 'Lung Lesion', 'Edema', 'Consolidation',
    'Pneumonia', 'Atelectasis', 'Pneumothorax',
    'Pleural Effusion', 'Pleural Other', 'Fracture'
]

data['class_name'] = (
    data[columns]
    .eq(1)
    .apply(lambda row: ', '.join(row.index[row]), axis=1)
)

In [84]:
data[data['class_name'] == '']
# data

Unnamed: 0,Path,Sex,Age,Frontal/Lateral,AP/PA,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices,class_name
20,CheXpert-v1.0-small/train/patient00011/study13...,Female,22,Frontal,PA,,0.0,,,,,0.0,,0.0,0.0,0.0,,,,
36,CheXpert-v1.0-small/train/patient00012/study2/...,Female,55,Frontal,PA,,-1.0,-1.0,0.0,,,0.0,,,,0.0,,,1.0,
38,CheXpert-v1.0-small/train/patient00012/study1/...,Female,55,Frontal,PA,,0.0,,,,,,,,,0.0,,,1.0,
42,CheXpert-v1.0-small/train/patient00014/study1/...,Female,43,Frontal,PA,,-1.0,,0.0,,,,,,0.0,0.0,,,,
93,CheXpert-v1.0-small/train/patient00028/study1/...,Male,70,Frontal,PA,,0.0,,-1.0,,,0.0,,-1.0,0.0,0.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143932,CheXpert-v1.0-small/train/patient34586/study2/...,Male,81,Frontal,PA,,0.0,,,-1.0,,-1.0,,,0.0,,,,,
143944,CheXpert-v1.0-small/train/patient34589/study1/...,Female,77,Frontal,PA,,-1.0,,,,0.0,0.0,,,0.0,0.0,,,,
143948,CheXpert-v1.0-small/train/patient34590/study1/...,Male,79,Frontal,PA,,,,-1.0,,,0.0,,-1.0,,,,,,
143953,CheXpert-v1.0-small/train/patient34592/study1/...,Female,68,Frontal,PA,,0.0,0.0,,,,0.0,,,,0.0,,,,


In [94]:
filtered_data = data[['Path', 'Support Devices', 'class_name']]

In [95]:
filtered_data['label'] = np.where(
    filtered_data['class_name'] == 'No Finding',
    'Normal',
    'Abnormal'
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['label'] = np.where(


In [97]:
filtered_data = filtered_data[
    filtered_data['class_name'].notna() &
    (filtered_data['class_name'].str.strip() != '')
]

In [109]:
normal_df = filtered_data[filtered_data['label'] == 'Normal']
abnormal_df = filtered_data[filtered_data['label'] == 'Abnormal']

In [110]:
abnormal_df = abnormal_df[abnormal_df['Support Devices'] == 1]

In [111]:
balanced_df = pd.concat([normal_df, abnormal_df]) \
                 .sample(frac=1, random_state=42) \
                 .reset_index(drop=True)

In [118]:
balanced_df

Unnamed: 0,Path,Support Devices,class_name,label
0,patient23492/study1/view1_frontal.jpg,,No Finding,Normal
1,patient04539/study1/view1_frontal.jpg,1.0,"Lung Opacity, Lung Lesion",Abnormal
2,patient05896/study1/view1_frontal.jpg,,No Finding,Normal
3,patient00901/study1/view1_frontal.jpg,,No Finding,Normal
4,patient16014/study8/view1_frontal.jpg,1.0,"Lung Opacity, Atelectasis, Pleural Other",Abnormal
...,...,...,...,...
11161,patient01622/study2/view1_frontal.jpg,1.0,Cardiomegaly,Abnormal
11162,patient32632/study2/view1_frontal.jpg,1.0,No Finding,Normal
11163,patient33895/study1/view1_frontal.jpg,,No Finding,Normal
11164,patient05453/study1/view2_frontal.jpg,,No Finding,Normal


In [117]:
balanced_df['Path'] = balanced_df['Path'].str.replace('CheXpert-v1.0-small/train/', '')

In [120]:
balanced_df.to_csv('/home/jupyter-nafisha/X-ray/CSVs/chexpert_PA.csv', index= False)

### train test and validation split

In [158]:
import pandas as pd
df = pd.read_csv('/home/jupyter-nafisha/X-ray/CSVs/chexpert_PA.csv')

In [147]:
df= df.drop(columns=['Support Devices'])

In [160]:
df = df.rename(columns={'Path': 'image_id'})

In [151]:
# Split into normal and abnormal
df_normal = df[df["label"] == "Normal"]
df_abnormal = df[df["label"] == "Abnormal"]

print("Normal:", len(df_normal))
print("Abnormal:", len(df_abnormal))

Normal: 5499
Abnormal: 5667


In [152]:
# -------------------------------
# Normal splits
# -------------------------------
normal_train  = df_normal.sample(n=3849, random_state=42)
remaining_normal = df_normal.drop(normal_train.index)

normal_test   = remaining_normal.sample(n=825, random_state=42)
normal_val    = remaining_normal.drop(normal_test.index).sample(n=825, random_state=42)

In [153]:
# -------------------------------
# Abnormal splits
# -------------------------------
ab_train  = df_abnormal.sample(n=3967, random_state=42)
remaining_ab = df_abnormal.drop(ab_train.index)

ab_test   = remaining_ab.sample(n=850, random_state=42)
ab_val    = remaining_ab.drop(ab_test.index).sample(n=850, random_state=42)

In [157]:
# -------------------------------
# Build final dataframes
# -------------------------------
train_df = pd.concat([normal_train, ab_train]).sample(frac=1, random_state=42)
test_df  = pd.concat([normal_test, ab_test]).sample(frac=1, random_state=42)
val_df   = pd.concat([normal_val, ab_val]).sample(frac=1, random_state=42)

# -------------------------------
# Save to CSV
# -------------------------------
train_df.to_csv("/home/jupyter-nafisha/X-ray/CSVs/train_chex.csv", index=False)
test_df.to_csv("/home/jupyter-nafisha/X-ray/CSVs/test_chex.csv", index=False)
val_df.to_csv("/home/jupyter-nafisha/X-ray/CSVs/validation_chex.csv", index=False)

print("Saved train.csv, test.csv, validation.csv")

Saved train.csv, test.csv, validation.csv


### combining CSVs

In [222]:
# Train CSV

chex_train = '/home/jupyter-nafisha/X-ray/CSVs/train_chex.csv'
vin_train= '/home/jupyter-nafisha/X-ray/CSVs/train.csv'

chex= pd.read_csv(chex_train)
vin= pd.read_csv(vin_train)

chex['image_id'] = 'chexpert_PA/' + chex['image_id']
vin['image_id'] = 'Xray-Data/' + vin['image_id']

chex.to_csv(chex_train, index=False)
vin.to_csv(vin_train, index=False)

combined_df = pd.concat([chex, vin], ignore_index=True)
combined_df.to_csv('/home/jupyter-nafisha/X-ray/CSVs/train_combined.csv', index=False)

In [258]:
# Validation CSV

chex_valid = '/home/jupyter-nafisha/X-ray/CSVs/validation_chex.csv'
vin_valid= '/home/jupyter-nafisha/X-ray/CSVs/validation.csv'

chex= pd.read_csv(chex_valid)
vin= pd.read_csv(vin_valid)

chex['image_id'] = 'chexpert_PA/' + chex['image_id']
vin['image_id'] = 'Xray-Data/' + vin['image_id']

chex.to_csv(chex_valid, index=False)
vin.to_csv(vin_valid, index=False)

combined_df = pd.concat([chex, vin], ignore_index=True)
combined_df.to_csv('/home/jupyter-nafisha/X-ray/CSVs/valid_combined.csv', index=False)

In [247]:
# Test CSV

chex_test = '/home/jupyter-nafisha/X-ray/CSVs/test_chex.csv'
vin_test= '/home/jupyter-nafisha/X-ray/CSVs/test.csv'

chex= pd.read_csv(chex_test)
vin= pd.read_csv(vin_test)

chex['image_id'] = 'chexpert_PA/' + chex['image_id']
vin['image_id'] = 'Xray-Data/' + vin['image_id']

chex.to_csv(chex_test, index=False)
vin.to_csv(vin_test, index=False)

combined_df = pd.concat([chex, vin], ignore_index=True)
combined_df.to_csv('/home/jupyter-nafisha/X-ray/CSVs/test_combined.csv', index=False)

In [268]:
path= '/home/jupyter-nafisha/X-ray/CSVs/train_combined.csv'
path= '/home/jupyter-nafisha/X-ray/CSVs/valid_combined.csv'
path= '/home/jupyter-nafisha/X-ray/CSVs/test_combined.csv'

count=0
data= pd.read_csv(path)
for _, row in data.iterrows():
    new_path= os.path.join('/home/common', row['image_id'])
    if not os.path.exists(new_path):
        count+=1
        # print(new_path)
count

0

In [269]:
len(data[data['label']=='Normal']), len(data[data['label']=='Abnormal'])

(1500, 1509)

# NIH testing data

In [270]:
import pandas as pd
path= '/home/jupyter-nafisha/X-ray/CSVs/NIH_orig.csv'
data = pd.read_csv(path)
data

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143,
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143,
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168,
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171,
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143,
...,...,...,...,...,...,...,...,...,...,...,...,...
112115,00030801_001.png,Mass|Pneumonia,1,30801,39,M,PA,2048,2500,0.168,0.168,
112116,00030802_000.png,No Finding,0,30802,29,M,PA,2048,2500,0.168,0.168,
112117,00030803_000.png,No Finding,0,30803,42,F,PA,2048,2500,0.168,0.168,
112118,00030804_000.png,No Finding,0,30804,30,F,PA,2048,2500,0.168,0.168,


In [272]:
data= data[data['View Position']=='PA']

In [273]:
data

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143,
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143,
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168,
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171,
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143,
...,...,...,...,...,...,...,...,...,...,...,...,...
112115,00030801_001.png,Mass|Pneumonia,1,30801,39,M,PA,2048,2500,0.168,0.168,
112116,00030802_000.png,No Finding,0,30802,29,M,PA,2048,2500,0.168,0.168,
112117,00030803_000.png,No Finding,0,30803,42,F,PA,2048,2500,0.168,0.168,
112118,00030804_000.png,No Finding,0,30804,30,F,PA,2048,2500,0.168,0.168,


In [275]:
filtered_data= data[['Image Index', 'Finding Labels']]

In [292]:
filtered_data = filtered_data.rename(columns={
    'Image Index': 'image_id', 
    'Finding Labels': 'class_name'                     
})

In [294]:
filtered_data['label'] = np.where(
    filtered_data['class_name'] == 'No Finding',
    'Normal',
    'Abnormal'
)

In [303]:
normal_df = filtered_data[filtered_data['label']=='Normal'].iloc[0:500]
abnormal_df = filtered_data[filtered_data['label']=='Abnormal'].iloc[0:500]

In [305]:
normal_df

Unnamed: 0,image_id,class_name,label
3,00000002_000.png,No Finding,Normal
13,00000005_000.png,No Finding,Normal
16,00000005_003.png,No Finding,Normal
17,00000005_004.png,No Finding,Normal
18,00000005_005.png,No Finding,Normal
...,...,...,...
1414,00000370_005.png,No Finding,Normal
1416,00000370_007.png,No Finding,Normal
1418,00000370_009.png,No Finding,Normal
1421,00000371_000.png,No Finding,Normal


In [309]:
combined_testSet = (
    pd.concat([normal_df, abnormal_df], ignore_index=True)
      .sample(frac=1, random_state=42)
      .reset_index(drop=True)
)

In [310]:
combined_testSet

Unnamed: 0,image_id,class_name,label
0,00000013_020.png,Pneumothorax,Abnormal
1,00000174_005.png,Nodule,Abnormal
2,00000178_000.png,Infiltration,Abnormal
3,00000105_004.png,Nodule,Abnormal
4,00000318_001.png,No Finding,Normal
...,...,...,...
995,00000100_001.png,No Finding,Normal
996,00000223_000.png,No Finding,Normal
997,00000277_001.png,Infiltration,Abnormal
998,00000327_001.png,No Finding,Normal


In [311]:
combined_testSet.to_csv('/home/jupyter-nafisha/X-ray/CSVs/NIH_test.csv', index= False)

In [314]:
len(os.listdir('/home/jupyter-nafisha/X-ray/Inference_data/NIH-test-dataset'))

1000