In [2]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
import shutil

In [20]:
# Root of the untouched dataset. The image folders and the label CSVs all live
# under it, so relocating the data means editing this one line only.
original_data_dir = "/Users/giakhanh/Desktop/Deep Learning/Project/original_data"

# Paths to folders and CSV files
train_folder = os.path.join(original_data_dir, "Train")
train_csv = os.path.join(original_data_dir, "Train.csv")
test_folder = os.path.join(original_data_dir, "Test")
test_csv = os.path.join(original_data_dir, "Test.csv")

# Load train and test CSV files
train_df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv)

# Add folder paths to the dataframes.
# relpath(..., 'Train') drops the leading 'Train' component of each CSV 'Path'
# value so the remainder can be re-rooted under the absolute image folder.
# NOTE(review): assumes every 'Path' starts with 'Train/' (resp. 'Test/');
# a value outside that folder would yield a '../' path - confirm against the CSVs.
train_df['full_path'] = train_df['Path'].apply(lambda x: os.path.join(train_folder, os.path.relpath(x, 'Train')))
test_df['full_path'] = test_df['Path'].apply(lambda x: os.path.join(test_folder, os.path.relpath(x, 'Test')))

In [22]:
# Pool the rows of both source tables into a single frame with a fresh 0..n-1 index.
# NOTE(review): the original train and test rows are merged here and re-split
# afterwards, so the new test set is not the original held-out one - confirm
# this is intended.
combined_df = pd.concat((train_df, test_df), axis=0).reset_index(drop=True)

In [23]:
# Two-stage stratified split on 'ClassId'.
# Stage 1: hold out 20% of all rows as the test set.
train_val_df, test_df = train_test_split(
    combined_df,
    stratify=combined_df['ClassId'],
    test_size=0.2,
    random_state=42,
)

# Stage 2: take 12.5% of the remaining 80% as validation (0.125 x 80% = 10% overall).
train_df, val_df = train_test_split(
    train_val_df,
    stratify=train_val_df['ClassId'],
    test_size=0.125,
    random_state=42,
)

# Report how many rows ended up in each split.
print(
    f"Train set size: {len(train_df)}",
    f"Test set size: {len(test_df)}",
    f"Validation set size: {len(val_df)}",
    sep="\n",
)

Train set size: 36287
Test set size: 10368
Validation set size: 5184


In [24]:
# Save CSVs
base_output_path = "/Users/giakhanh/Desktop/Deep Learning/Project/official_data"

# to_csv does not create missing parent directories, so make sure the output
# root exists before writing; otherwise a run on a fresh machine fails here.
os.makedirs(base_output_path, exist_ok=True)

train_csv_path = os.path.join(base_output_path, "train.csv")
val_csv_path = os.path.join(base_output_path, "validation.csv")
test_csv_path = os.path.join(base_output_path, "test.csv")

# index=False keeps the shuffled row index out of the files.
train_df.to_csv(train_csv_path, index=False)
val_df.to_csv(val_csv_path, index=False)
test_df.to_csv(test_csv_path, index=False)

print(f"CSV files saved:\nTrain: {train_csv_path}\nValidation: {val_csv_path}\nTest: {test_csv_path}")

CSV files saved:
Train: /Users/giakhanh/Desktop/Deep Learning/Project/official_data/train.csv
Validation: /Users/giakhanh/Desktop/Deep Learning/Project/official_data/validation.csv
Test: /Users/giakhanh/Desktop/Deep Learning/Project/official_data/test.csv


In [25]:
# One flat output directory per split, all directly under base_output_path.
train_output_folder, val_output_folder, test_output_folder = [
    os.path.join(base_output_path, split_name)
    for split_name in ("train", "validation", "test")
]

# exist_ok=True lets this cell be re-run without failing on existing folders.
for _split_dir in (train_output_folder, val_output_folder, test_output_folder):
    os.makedirs(_split_dir, exist_ok=True)

# Function to copy files to flat folders
def copy_files_flat(df, output_folder):
    """Copy every file listed in ``df['full_path']`` directly into ``output_folder``.

    The source directory structure is discarded: each file lands at
    ``output_folder/<basename of its source path>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``full_path`` column holding the source file paths.
    output_folder : str
        Existing directory that receives the copies.

    Raises
    ------
    ValueError
        If two different source paths share the same file name. Flattening
        would otherwise let the later copy silently overwrite the earlier one.
        The check runs before anything is copied.
    OSError
        Propagated from ``shutil.copy`` (e.g. a source file is missing).
    """
    # Destination file name -> source path; built first so clashes are
    # detected up front rather than after a partial copy.
    planned = {}
    for src_path in df['full_path']:
        file_name = os.path.basename(src_path)
        earlier_src = planned.setdefault(file_name, src_path)
        if earlier_src != src_path:
            raise ValueError(
                f"Cannot flatten into {output_folder!r}: {earlier_src!r} and "
                f"{src_path!r} would both be written as {file_name!r}"
            )

    for file_name, src_path in planned.items():
        shutil.copy(src_path, os.path.join(output_folder, file_name))

# Copy each split's images into its own flat folder (train, validation, test order).
for _split_df, _split_dir in (
    (train_df, train_output_folder),
    (val_df, val_output_folder),
    (test_df, test_output_folder),
):
    copy_files_flat(_split_df, _split_dir)

print("Data successfully split and copied to flat folders.")

Data successfully split and copied to flat folders.


In [26]:
# Update the Path and full_path in the new CSV files
def update_paths(df, new_folder_name, base_output_path):
    """Return a copy of ``df`` whose path columns point at a flat split folder.

    ``Path`` becomes ``<new_folder_name>/<original file name>`` and
    ``full_path`` becomes ``<base_output_path>/<new Path>``. The input frame
    is left unmodified.
    """
    def _into_split_folder(old_path):
        # Keep only the file name; the original sub-directories are dropped.
        return os.path.join(new_folder_name, os.path.basename(old_path))

    def _under_output_root(relative_path):
        return os.path.join(base_output_path, relative_path)

    relocated_df = df.copy()
    relocated_df['Path'] = relocated_df['Path'].apply(_into_split_folder)
    # Derived from the already-rewritten 'Path' column, not from the old full_path.
    relocated_df['full_path'] = relocated_df['Path'].apply(_under_output_root)
    return relocated_df

# Rewrite the path columns of every split so they reference the flat folders.
train_df_updated, val_df_updated, test_df_updated = [
    update_paths(_split_df, _folder_name, base_output_path)
    for _split_df, _folder_name in (
        (train_df, "train"),
        (val_df, "validation"),
        (test_df, "test"),
    )
]

# Overwrite the CSVs written earlier with the relocated paths.
for _updated_df, _csv_path in (
    (train_df_updated, train_csv_path),
    (val_df_updated, val_csv_path),
    (test_df_updated, test_csv_path),
):
    _updated_df.to_csv(_csv_path, index=False)

print("Paths in CSV files have been updated.")

Paths in CSV files have been updated.


In [27]:
# Remove the full_path column
def remove_full_path(df):
    """Return a new DataFrame equal to ``df`` without the ``full_path`` column.

    The input frame is not modified. If ``full_path`` is already absent the
    frame is returned (as a new object) unchanged instead of raising, so a
    cell that does ``x = remove_full_path(x)`` can safely be run again.
    """
    # errors='ignore' is what makes the call idempotent: a second application
    # finds no 'full_path' column and simply has nothing to drop.
    return df.drop(columns=['full_path'], errors='ignore')

# Strip the machine-specific full_path column from all three splits.
train_df_updated, val_df_updated, test_df_updated = map(
    remove_full_path,
    (train_df_updated, val_df_updated, test_df_updated),
)

# Write the final CSVs (same files as before, now without full_path).
for _final_df, _csv_path in (
    (train_df_updated, train_csv_path),
    (val_df_updated, val_csv_path),
    (test_df_updated, test_csv_path),
):
    _final_df.to_csv(_csv_path, index=False)

print("Full_path column removed from CSV files.")

Full_path column removed from CSV files.
