This notebook converts the JPEG files into TFRecords, which are then fed into the TensorFlow model.

In [None]:


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
import matplotlib.pyplot as plt
import cv2
import tensorflow as tf
import shutil
from PIL import Image
import PIL

In [None]:
# Locations of the competition inputs: every CSV lives directly under BASE_PATH
BASE_PATH = '../input/siim-covid19-detection'
TRAIN_CSV_PATH, TRAIN_STUDY_PATH, TEST_CSV_PATH = (
    os.path.join(BASE_PATH, csv_name)
    for csv_name in (
        'train_image_level.csv',
        'train_study_level.csv',
        'sample_submission.csv',
    )
)

In [None]:
# Show the entries of the competition input directory as the cell output
os.listdir(BASE_PATH)

In [None]:
# Load the image-level training annotations from TRAIN_CSV_PATH.
# The bare name on the last line displays the frame as the cell output.
train_meta=pd.read_csv(TRAIN_CSV_PATH)
train_meta

In [None]:
# Load the metadata of the DICOM files that were saved as JPEG.
# Later cells join this table on its `image_id` column and read `dim0`/`dim1` from it.
tr_meta2=pd.read_csv('../input/jpeg-files/meta.csv')
tr_meta2

In [None]:
# Strip the "_image" suffix so the ids can later be joined with the
# `image_id` column of the JPEG metadata
train_meta['id'] = train_meta['id'].str.replace("_image", "", regex=False)
train_meta

In [None]:
# Read the study-level labels. TRAIN_STUDY_PATH already points at
# train_study_level.csv, so reuse it instead of repeating the literal path.
study_level = pd.read_csv(TRAIN_STUDY_PATH)


# Remove the "_study" suffix from all study ids so they match the
# StudyInstanceUID column of the image-level table in the merge below
study_level['id'] = study_level['id'].str.replace("_study", "", regex=False)
study_level

In [None]:
# The labels either have no opacity or have opacity. If the image has opacity, it can be on one lung
# or both lungs. 
def split_label(label):
    """Reduce a raw label string to a coarse integer code.

    The label is split on single spaces. A six-token label containing
    'none' maps to 0, a six-token label containing 'opacity' maps to 1,
    and everything else (any other token count, or neither keyword) maps to 2.
    """
    tokens = label.split(' ')

    # Anything that is not exactly one six-token entry falls into the catch-all class
    if len(tokens) != 6:
        return 2

    # 'none' is checked first, so it wins if both keywords are present
    if 'none' in label:
        return 0
    if 'opacity' in label:
        return 1
    return 2



# Coarse label code (0, 1 or 2) derived from the raw label string of every image
train_meta['only_label'] = train_meta['label'].apply(split_label)

In [None]:
# Attach the study-level columns to every image row. The left join keeps all
# image rows; both tables have an `id` column, so pandas suffixes them _x / _y.
new_df = train_meta.merge(
    study_level,
    how='left',
    left_on="StudyInstanceUID",
    right_on="id",
)
new_df

In [None]:
# `id_y` is the study table's merge key (matched against StudyInstanceUID), so it is redundant
new_df.drop(columns='id_y', inplace=True)

In [None]:
# Store a path per row, built by joining BASE_PATH with the row's StudyInstanceUID
new_df['img_path'] = [
    os.path.join(BASE_PATH, study_uid)
    for study_uid in new_df['StudyInstanceUID']
]
new_df

In [None]:
# Join the JPEG/DICOM metadata onto the combined frame: the image id (`id_x`)
# is matched against `image_id`, after which `id_x` is no longer needed
new_df = new_df.merge(tr_meta2, how='left', left_on="id_x", right_on="image_id")
new_df.drop(columns='id_x', inplace=True)
new_df

In [None]:
# Break the space-separated label string into its tokens once. Token 0 is the
# label name and tokens 2-5 are read as xmin, ymin, xmax, ymax, so only the
# first box of a multi-box label is kept.
label_parts = new_df['label'].apply(lambda text: text.split(' '))
for column, position in (('xmin', 2), ('ymin', 3), ('xmax', 4), ('ymax', 5)):
    new_df[column] = label_parts.apply(
        lambda parts, position=position: float(parts[position])
    )
new_df['label'] = label_parts.apply(lambda parts: parts[0])

# The raw `boxes` column is not used after this point
new_df.drop(columns='boxes', inplace=True)
new_df

In [None]:
# Mapping from the study-level class name to its integer code
name2label = {
    'Typical Appearance': 3,
    'Indeterminate Appearance': 1,
    'Atypical Appearance': 2,
    'Negative for Pneumonia': 0,
}
# Class names in dictionary order (this order decides ties in the argmax below)
class_names = list(name2label)
# Reverse mapping: integer code -> class name
label2name = {code: name for name, code in name2label.items()}


def _dominant_class(row):
    """Return the name of the class column holding the largest value in `row`."""
    flags = row[class_names]
    return flags.index[flags.values.argmax()]


# Add the disease class of every row, both as a name and as its integer code
new_df['class_name'] = new_df.apply(_dominant_class, axis=1)
new_df['class_label'] = new_df['class_name'].map(name2label)
new_df.head()

In [None]:
# Display the column names of the assembled frame as the cell output
new_df.columns

In [None]:
# Write TF Records

def _byte_feature(value):
    """Wrap a single bytes value in a ``tf.train.Feature`` holding a BytesList.

    If `value` has the same type as ``tf.constant(0)`` it is converted with
    ``.numpy()`` first; any other value is stored as given.
    """
    tensor_type = type(tf.constant(0))
    raw = value.numpy() if isinstance(value, tensor_type) else value
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[raw]))
        
    
def _float_feature(value):
    """Wrap a single float in a ``tf.train.Feature`` holding a FloatList."""
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))


def _int64_feature(value):
    """Wrap a single integer in a ``tf.train.Feature`` holding an Int64List."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

In [None]:
# Serialize the data
def serialize_example(values):
    """Serialize one training sample into a ``tf.train.Example`` byte string.

    Args:
        values: Positional list with the layout
            [0] encoded image bytes, [1] class label, [2] StudyInstanceUID (bytes),
            [3] only_label, [4] 'Negative for Pneumonia', [5] 'Typical Appearance',
            [6] 'Indeterminate Appearance', [7] 'Atypical Appearance',
            [8] img_path (bytes), [9] image_id (bytes), [10] dim0, [11] dim1,
            [12] xmin, [13] ymin, [14] xmax, [15] ymax.

    Returns:
        The serialized example as bytes.
    """
    feature = {
        'image': _byte_feature(values[0]),
        # The class label sits at index 1. The caller wraps it in a 0-d NumPy
        # array, so it is converted to a plain int before being stored.
        # Index 3 is `only_label`, which has its own entry below.
        'target': _int64_feature(int(values[1])),
        'StudyInstanceUID': _byte_feature(values[2]),
        'only_label': _int64_feature(values[3]),
        'Negative for Pneumonia': _int64_feature(values[4]),
        'Typical Appearance': _int64_feature(values[5]),
        'Indeterminate Appearance': _int64_feature(values[6]),
        'Atypical Appearance': _int64_feature(values[7]),
        'img_path': _byte_feature(values[8]),
        'image_id': _byte_feature(values[9]),
        'dim0': _int64_feature(values[10]),
        'dim1': _int64_feature(values[11]),
        'xmin': _float_feature(values[12]),
        'ymin': _float_feature(values[13]),
        'xmax': _float_feature(values[14]),
        'ymax': _float_feature(values[15]),
    }
    example_proto = tf.train.Example(features=tf.train.Features(feature=feature))
    return example_proto.SerializeToString()

In [None]:
# Get the names of all the files in the train JPEG folder
# (os.listdir returns bare file names, not full paths)
trainset=os.listdir('../input/jpeg-files/train')


# Create the tfrecord for train set
def create_tfrecords(dataset, df, files_count=422, img_dir='../input/jpeg-files/train'):
    """Write the training images and their metadata into sharded TFRecord files.

    Shards are written to the current working directory and named
    ``train<shard>-<count>.tfrec``, where <count> is the number of images
    stored in that shard.

    Args:
        dataset: Indexable sequence of image file names located in `img_dir`.
            The part of each name before the first '.' is matched against
            ``df.image_id``.
        df: DataFrame with the per-image metadata; the columns read here are
            image_id, class_label, StudyInstanceUID, only_label, the four
            study-level class columns, img_path, dim0, dim1, xmin, ymin,
            xmax and ymax.
        files_count: Maximum number of images per shard.
        img_dir: Directory containing the files named in `dataset`.

    Raises:
        IndexError: If an image has no matching row in `df`.
    """
    # Get the length of the dataset
    ds_size = len(dataset)
    print('train size', ds_size)

    # Number of shards, rounded up so that a trailing partial shard is written too
    rec_size = ds_size // files_count + int(ds_size % files_count != 0)
    for rec in range(rec_size):
        print(f'Printing {rec} of {rec_size-1}')

        # Every shard starts at a multiple of the *full* shard size; only the
        # number of images it holds (ct2) shrinks for the final shard
        start = rec * files_count
        ct2 = min(files_count, ds_size - start)
        with tf.io.TFRecordWriter('train%.2i-%i.tfrec' % (rec, ct2)) as writer:
            for k in range(ct2):
                file_name = dataset[start + k]
                path = os.path.join(img_dir, file_name)

                # Decode the JPEG and re-encode it at quality 94.
                # NOTE(review): cv2.imread presumably yields BGR and imencode
                # expects BGR, so this swap would store R/B exchanged for colour
                # inputs; it is a no-op for grey-valued images - confirm intent.
                img = cv2.imread(path)
                img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
                img = cv2.imencode('.jpg', img, (cv2.IMWRITE_JPEG_QUALITY, 94))[1].tobytes()

                # Look up the metadata row that belongs to this image
                name = file_name.split('.')[0]
                row = df.loc[df.image_id == name]

                # Serialize the image and its metadata into one record
                example = serialize_example([
                    img,
                    np.array(row.class_label.values[0]),
                    str.encode(row.StudyInstanceUID.values[0]),
                    row.only_label.values[0],
                    row['Negative for Pneumonia'].values[0],
                    row['Typical Appearance'].values[0],
                    row['Indeterminate Appearance'].values[0],
                    row['Atypical Appearance'].values[0],
                    str.encode(row.img_path.values[0]),
                    str.encode(row.image_id.values[0]),
                    row.dim0.values[0],
                    row.dim1.values[0],
                    row.xmin.values[0],
                    row.ymin.values[0],
                    row.xmax.values[0],
                    row.ymax.values[0],
                ])
                writer.write(example)
    
    
    
    


# Write the training TFRecord shards for every listed train file, using the assembled metadata frame
create_tfrecords(trainset, new_df)


Write Test TFRecords

In [None]:
# Define the sample for test set
def serialize_example_test(values):
    """Serialize one test sample into a ``tf.train.Example`` byte string.

    Args:
        values: Positional list with the layout
            [0] encoded image bytes, [1] id (bytes), [2] PredictionString (bytes).

    Returns:
        The serialized example as bytes.
    """
    feature = {
        'image': _byte_feature(values[0]),
        # The id is the second element; index 0 holds the image bytes
        'id': _byte_feature(values[1]),
        'PredictionString': _byte_feature(values[2]),
    }
    example_proto = tf.train.Example(features=tf.train.Features(feature=feature))
    return example_proto.SerializeToString()

In [None]:
# List the JPEG files of the test set; they are written to TFRecord files below
testset=os.listdir('../input/jpeg-files/test')
def create_test_tfrecords(dataset, df, files_count=252, img_dir='../input/jpeg-files/test'):
    """Write the test images and their submission rows into sharded TFRecord files.

    Shards are written to the current working directory and named
    ``test<shard>-<count>.tfrec``, where <count> is the number of images
    stored in that shard.

    Args:
        dataset: Indexable sequence of image file names located in `img_dir`.
            The part of each name before the first '.' is matched against ``df.id``.
        df: DataFrame providing the `id` and `PredictionString` columns.
        files_count: Maximum number of images per shard.
        img_dir: Directory containing the files named in `dataset`.

    Raises:
        IndexError: If an image has no matching row in `df`.
    """
    ds_size = len(dataset)
    print('test size', ds_size)

    # Number of shards, rounded up so that a trailing partial shard is included
    rec_size = ds_size // files_count + int(ds_size % files_count != 0)
    for rec in range(rec_size):
        print(f'Printing {rec} of {rec_size}')

        # Shards start at multiples of the full shard size. Using the (smaller)
        # size of the last shard as the stride would re-read earlier files
        # and never reach the final ones.
        start = rec * files_count
        ct2 = min(files_count, ds_size - start)
        with tf.io.TFRecordWriter('test%.2i-%i.tfrec' % (rec, ct2)) as writer:
            for k in range(ct2):
                file_name = dataset[start + k]
                path = os.path.join(img_dir, file_name)

                # Decode the JPEG and re-encode it at quality 94.
                # NOTE(review): the RGB2BGR swap only matters for colour
                # inputs - confirm it is intended.
                img = cv2.imread(path)
                img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
                img = cv2.imencode('.jpg', img, (cv2.IMWRITE_JPEG_QUALITY, 94))[1].tobytes()

                # Look up the submission row that belongs to this image
                name = file_name.split('.')[0]
                row = df.loc[df.id == name]

                example = serialize_example_test([
                    img,
                    str.encode(row.id.values[0]),
                    str.encode(row.PredictionString.values[0]),
                ])
                writer.write(example)
    
# `test_meta` is not defined by any earlier cell, so load it from the sample
# submission file that TEST_CSV_PATH points at.
# NOTE(review): the "_image" suffix is stripped the same way as for the training
# ids; presumably the test JPEG file names carry no suffix - confirm.
test_meta = pd.read_csv(TEST_CSV_PATH)
test_meta['id'] = test_meta['id'].str.replace("_image", "", regex=False)

create_test_tfrecords(testset, test_meta)