In [10]:
""" 
Importing necessary libraries
"""
import pandas as pd
import os
import shutil

In [None]:
""" 
Reading the original df
"""
# Load train.csv (resolved relative to the current working directory) into a DataFrame.
train_csv_path = "train.csv"
df = pd.read_csv(train_csv_path)

' \nReading the original df\n'

In [None]:
# Preview the first five rows (the default row count, spelled out explicitly).
df.head(n=5)

In [None]:
""" 
As per the dataset:
 - Label 0 => Human Generated Image
 - Label 1 => AI Generated Image
"""

# Build one boolean mask per class from the 'label' column, then select the
# matching rows of df for each class.
is_label_0 = df['label'].eq(0)
is_label_1 = df['label'].eq(1)
label_0 = df.loc[is_label_0]
label_1 = df.loc[is_label_1]

In [7]:
""" 
Out of the 80,000 images in the original dataset, we use only 10,000 due to GPU constraints.
"""
selected_size = 10_000  # total number of images to keep in the subset

In [None]:
""" 
In order to tackle the data imbalance issue, the subset is sampled with a fixed class split:
 - 60% of images are labelled as 0
 - 40% of images are labelled as 1
"""
# Per-class sample counts. The label-1 count is the remainder, so the two
# counts always add up to selected_size even when selected_size * 0.6 is not
# an exact integer (truncating both products independently could drop a row).
n_label_0 = round(selected_size * 0.6)  # 60% of selected_size
n_label_1 = selected_size - n_label_0   # remaining 40% of selected_size

# Sampling is without replacement, so DataFrame.sample raises ValueError if a
# class has fewer rows than requested. The fixed random_state keeps the draw
# reproducible across runs.
subset_label_0 = label_0.sample(n=n_label_0, random_state=42)
subset_label_1 = label_1.sample(n=n_label_1, random_state=42)

In [None]:
# Stack the two class samples into one frame, then shuffle the rows.
# frac=1 draws every row exactly once (a full permutation), and the fixed
# random_state makes the order reproducible. Original index labels are kept.
combined_subset = pd.concat([subset_label_0, subset_label_1])
subset_df = combined_subset.sample(frac=1, random_state=42)

In [None]:
# Create subset directory
subset_dir = 'Dataset/subset_train_data'
os.makedirs(subset_dir, exist_ok=True)

# Every image is flattened into subset_dir under its basename, so two distinct
# source paths sharing a basename would silently overwrite each other and leave
# rows pointing at the wrong image. Fail loudly instead of corrupting the subset.
unique_paths = subset_df['file_name'].drop_duplicates()
basenames = unique_paths.map(os.path.basename)
clashing = basenames[basenames.duplicated(keep=False)]
if not clashing.empty:
    raise ValueError(
        f"{clashing.nunique()} file name(s) collide once flattened into "
        f"{subset_dir!r}, e.g. {clashing.iloc[0]!r}"
    )

# Copy images to subset directory
for filename in subset_df['file_name']:
    src_path = filename
    dst_path = os.path.join(subset_dir, os.path.basename(filename))
    try:
        shutil.copy(src_path, dst_path)
    except shutil.SameFileError:
        # The path already points into subset_dir (e.g. this cell was re-run
        # after file_name was rewritten), so there is nothing to copy.
        continue

In [None]:
def _to_subset_path(original_path):
    """Return the location of `original_path`'s file inside `subset_dir`."""
    return os.path.join(subset_dir, os.path.basename(original_path))


# Rewrite every file_name entry so it points at the copy inside subset_dir.
subset_df['file_name'] = subset_df['file_name'].apply(_to_subset_path)

In [None]:
# Write the subset table to disk; index=False leaves the row index out of the file.
subset_csv_path = 'subset_train.csv'
subset_df.to_csv(subset_csv_path, index=False)