# Streetcare Dataset - Image Organization by Quarter
This notebook loads the streetcare-drift-dataset-2021-2025.csv file, keeps only the records whose `date` value parses as a valid date-time, and copies the images dated in Q1 and Q3 into separate Q1 and Q3 folders.

## Section 1: Import Required Libraries

In [None]:
import pandas as pd
import os
import shutil
from datetime import datetime
from pathlib import Path

print("Libraries imported successfully!")

## Section 2: Load and Explore the Dataset

In [None]:
# Build the CSV path from its components so the separator matches the host OS
# (a hard-coded backslash path only resolves on Windows)
csv_path = os.path.join("data", "metadata", "streetcare-drift-dataset-2021-2025.csv")

# Load the dataset into a DataFrame
df = pd.read_csv(csv_path)

# Display basic information: dimensions, column names, and a preview
print(f"Dataset shape: {df.shape}")
print(f"\nColumn names:\n{df.columns.tolist()}")
print("\nFirst few rows:")
# Left as the last expression so the notebook renders the preview table
df.head()

## Section 3: Filter Records with Valid Dates and Extract Quarter/Year

In [None]:
# Convert the 'date' column to datetimes; values that cannot be parsed become NaT
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Keep only the rows whose date parsed successfully (work on an independent copy)
has_valid_date = df['date'].notna()
df_valid = df.loc[has_valid_date].copy()

# Report how many records survived the date check
total_records = len(df)
valid_records = len(df_valid)
print(f"Total records: {total_records}")
print(f"Records with valid date-time: {valid_records}")
print(f"Records filtered out: {total_records - valid_records}")

# Derive the calendar quarter and year from the parsed dates
parsed_dates = df_valid['date'].dt
df_valid['quarter'] = parsed_dates.quarter
df_valid['year'] = parsed_dates.year

# Show how the valid records are spread over quarters and years
for title, column in (
    ("\nQuarter distribution:", 'quarter'),
    ("\nYear distribution:", 'year'),
):
    print(title)
    print(df_valid[column].value_counts().sort_index())

## Section 4: Create Quarter-Based Folders

In [None]:
# Root directory that holds the per-quarter image folders
output_base = "data/organized_images"

# One sub-folder for each quarter of interest (Q1 and Q3)
q1_folder, q3_folder = (
    os.path.join(output_base, folder_name) for folder_name in ("Q1", "Q3")
)

# Ensure both folders exist; already-existing folders are accepted as-is
for quarter_dir in (q1_folder, q3_folder):
    os.makedirs(quarter_dir, exist_ok=True)

# Report the absolute locations so they are easy to find on disk
print("Created/verified folders:")
q1_location = os.path.abspath(q1_folder)
q3_location = os.path.abspath(q3_folder)
print(f"Q1 folder: {q1_location}")
print(f"Q3 folder: {q3_location}")

## Section 5: Organize Images by Quarter

In [None]:
# Keep only the records dated in Q1 or Q3
df_q1q3 = df_valid[df_valid['quarter'].isin([1, 3])].copy()

print(f"Total records for Q1 and Q3: {len(df_q1q3)}")
print(f"Q1 records: {len(df_q1q3[df_q1q3['quarter'] == 1])}")
print(f"Q3 records: {len(df_q1q3[df_q1q3['quarter'] == 3])}")

# Destination folder for each quarter handled by this cell
quarter_folders = {1: q1_folder, 3: q3_folder}

# Counters reported in the summary below.
# A copy is counted only after shutil.copy2 succeeds, so the "copied"
# figures reflect files that actually reached the destination folder.
copied_counts = {1: 0, 3: 0}
missing_count = 0
error_count = 0

# Iterate over the two needed columns directly instead of building a
# Series per row with iterrows()
for image_name, quarter in zip(df_q1q3['image_name'], df_q1q3['quarter']):
    # pandas reads an empty CSV cell as NaN (a float); such a value, or an
    # empty string, cannot name an image file, so treat it as not found
    if not isinstance(image_name, str) or not image_name:
        missing_count += 1
        continue

    # Candidate source locations - adjust based on where your images are stored
    candidate_paths = (
        os.path.join('data', image_name),
        image_name,
        os.path.join('data', 'images', image_name),
    )

    # Use the first candidate that is an existing regular file
    # (directories are ignored: they can never be copied as an image)
    source_path = next(
        (path for path in candidate_paths if os.path.isfile(path)),
        None,
    )
    if source_path is None:
        missing_count += 1
        continue

    # int() normalizes the pandas/numpy integer before the dict lookups
    quarter = int(quarter)
    dest_path = os.path.join(quarter_folders[quarter], image_name)

    # Copy the image (with metadata); one failure must not stop the whole run
    try:
        shutil.copy2(source_path, dest_path)
    except OSError as e:
        error_count += 1
        print(f"Error copying {image_name}: {e}")
    else:
        copied_counts[quarter] += 1

# Expose the per-quarter totals under their original names
q1_count = copied_counts[1]
q3_count = copied_counts[3]

print("\n--- Organization Summary ---")
print(f"Q1 images copied: {q1_count}")
print(f"Q3 images copied: {q3_count}")
print(f"Images not found: {missing_count}")
print(f"Copy errors: {error_count}")

## Section 6: Verify Organization Results

In [None]:
# Inspect the output folders and summarize what they contain
print("=== VERIFICATION RESULTS ===\n")

# Q1 folder: list its entries, or fall back to an empty list if it is absent
q1_files = []
if os.path.exists(q1_folder):
    q1_files = os.listdir(q1_folder)
print(f"Q1 Folder - Total files: {len(q1_files)}")
if q1_files:
    print(f"  Sample files: {q1_files[:5]}")

# Q3 folder: same treatment
q3_files = []
if os.path.exists(q3_folder):
    q3_files = os.listdir(q3_folder)
print(f"\nQ3 Folder - Total files: {len(q3_files)}")
if q3_files:
    print(f"  Sample files: {q3_files[:5]}")

# Overall totals and each quarter's share of them
total_organized = len(q1_files) + len(q3_files)
# Clamp the divisor to 1 so empty folders do not cause a division by zero
share_divisor = max(total_organized,1)
q1_share = 100*len(q1_files)/share_divisor
q3_share = 100*len(q3_files)/share_divisor
print(f"\n=== SUMMARY ===")
print(f"Total images organized: {total_organized}")
print(f"Q1 images: {len(q1_files)} ({q1_share:.1f}%)")
print(f"Q3 images: {len(q3_files)} ({q3_share:.1f}%)")

# Record counts per (year, quarter) pair in the Q1/Q3 selection
print(f"\n=== QUARTER-YEAR BREAKDOWN ===")
records_per_group = df_q1q3.groupby(['year', 'quarter']).size()
q1q3_summary = records_per_group.reset_index(name='count')
print(q1q3_summary.to_string(index=False))