In [10]:
import pandas as pd
from io import StringIO
import os
import numpy as np

In [5]:
# Small inline sample of the training table: an image file name and an integer label per row.
sample_train_csv = """
image_id,label
1000015157.jpg,0
1000201771.jpg,3
100042118.jpg,1
1000723321.jpg,1
1000812911.jpg,3
1000837476.jpg,3
1000910826.jpg,2
1001320321.jpg,0
1001723730.jpg,4
1001742395.jpg,3
1001749118.jpg,3
100204014.jpg,3
1002088496.jpg,1
1002255315.jpg,3
1002394761.jpg,3
1003218714.jpg,2
1003298598.jpg,3
1003442061.jpg,4
1003888281.jpg,0
"""

# Parse the text through an in-memory buffer; read_csv skips the leading
# blank line by default, so the first non-empty line becomes the header.
with StringIO(sample_train_csv) as csv_buffer:
    sample_train_df = pd.read_csv(csv_buffer)

# Quick sanity report (shape and column names), followed by a short preview.
print(f"sample_train_df : {sample_train_df.shape} | {sample_train_df.columns.tolist()}")
sample_train_df.head(3)

sample_train_df : (19, 2) | ['image_id', 'label']


Unnamed: 0,image_id,label
0,1000015157.jpg,0
1,1000201771.jpg,3
2,100042118.jpg,1


In [24]:
# Fixed seed so the generated 'source' and 'discrepancy' columns come out the
# same on every Restart & Run All instead of changing with each execution.
METADATA_SEED = 42
rng = np.random.default_rng(METADATA_SEED)

# Work on a copy so the loaded sample frame stays untouched.
metadata = sample_train_df.copy()
n_rows = len(metadata)

# Build the relative image path while 'image_id' still carries the file extension.
metadata['image_path'] = metadata['image_id'].map(lambda file_name: os.path.join("images", file_name))

# Strip only the trailing extension. Splitting on the '.jpg' substring would
# truncate any name that happens to contain '.jpg' before its end.
metadata['image_id'] = metadata['image_id'].map(lambda file_name: os.path.splitext(file_name)[0])

# add 'source' column, generated categorical metadata
metadata['source'] = rng.choice(['A', 'B', 'C'], size=n_rows)

# add 'discrepancy' column, generated scalar metadata (uniform over [0.0, 1.0))
metadata['discrepancy'] = rng.uniform(low=0.0, high=1.0, size=n_rows)

metadata.head(3)

Unnamed: 0,image_id,label,image_path,source,discrepancy
0,1000015157,0,images/1000015157.jpg,A,0.760717
1,1000201771,3,images/1000201771.jpg,C,0.343488
2,100042118,1,images/100042118.jpg,C,0.014685


In [25]:
# Print how many rows fall into each label and each source, one dict per line.
for column_name in ('label', 'source'):
    counts_by_value = metadata[column_name].value_counts()
    print(counts_by_value.to_dict())

{3: 9, 0: 3, 1: 3, 2: 2, 4: 2}
{'C': 10, 'B': 5, 'A': 4}


In [26]:
# Columns to export, in the order they should appear in the CSV.
columns = ['image_id', 'label', 'source', 'discrepancy', 'image_path']

output_path = "../data/metadata.csv"
# to_csv does not create missing parent directories, so make sure the target
# folder exists; otherwise the export fails on a fresh checkout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# index=False keeps the positional row index out of the exported file.
metadata.to_csv(output_path, index=False, header=True, columns=columns)