# Data EDA - Customized Image Generation

**Nguyễn Khang Hy (2352662)** - Team Lead

Phân tích và khám phá dữ liệu cho dự án Style Transfer với Stable Diffusion + LoRA.


## 1. Setup Environment


In [None]:
# Clone the project repository and make the checkout the working directory
!git clone https://github.com/HyIsNoob/customized-image-generation.git
%cd customized-image-generation

# Install the Python dependencies listed in the repository's requirements.txt
!pip install -r requirements.txt


In [None]:
# Import libraries
import sys
sys.path.append('/kaggle/working/customized-image-generation')

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
from pathlib import Path
from collections import Counter
import cv2

# Global plotting defaults for every figure in this notebook:
# matplotlib's 'seaborn-v0_8' style sheet plus seaborn's "husl" colour palette.
# NOTE(review): the 'seaborn-v0_8' style name presumably needs a recent
# matplotlib release — confirm against the pinned version in requirements.txt.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")


## 2. Dataset Paths


In [None]:
# Kaggle dataset paths (both live under the read-only /kaggle/input tree)
COCO_DIR = Path('/kaggle/input') / 'coco-2017-dataset' / 'coco2017' / 'train2017'
WIKIART_DIR = Path('/kaggle/input') / 'wikiart' / 'wikiart'

# Report up front whether each dataset directory is present
for label, directory in (("COCO", COCO_DIR), ("WikiArt", WIKIART_DIR)):
    print(f"{label} directory exists: {directory.exists()}")


## 3. COCO Dataset Analysis


In [None]:
# Get all COCO images.
# Path.glob yields entries in filesystem order, which is not guaranteed to be
# stable; sorting gives every run the same list in the same order.
coco_images = sorted(COCO_DIR.glob('*.jpg'))
print(f"Total COCO images: {len(coco_images)}")


In [None]:
# Analyze image sizes
def get_image_size(image_path):
    """Return the ``(width, height)`` of an image file, or ``None`` if unreadable.

    Args:
        image_path: Path (or path-like string) of the image to inspect.

    Returns:
        The ``size`` tuple reported by PIL for the opened image, or ``None``
        when the file is missing or cannot be identified as an image.
    """
    try:
        # The context manager closes the underlying file handle as soon as the
        # size has been read, instead of leaving it to the garbage collector.
        with Image.open(image_path) as img:
            return img.size
    except (OSError, ValueError):
        # OSError covers FileNotFoundError and PIL.UnidentifiedImageError.
        # A bare ``except`` would also swallow KeyboardInterrupt/SystemExit,
        # making a long scan impossible to interrupt.
        return None

# Draw at most 1000 images without replacement. A dedicated, seeded generator
# makes the draw repeatable for a given file list without touching NumPy's
# global random state.
rng = np.random.default_rng(42)
sample_size = min(1000, len(coco_images))
sample_images = rng.choice(coco_images, sample_size, replace=False)

# get_image_size returns None for files it cannot read; drop those entries
sizes = [
    size
    for size in (get_image_size(img_path) for img_path in sample_images)
    if size is not None
]

# Each size is a (width, height) tuple
widths = [w for w, _ in sizes]
heights = [h for _, h in sizes]
aspect_ratios = [w / h for w, h in sizes]


In [None]:
# Visualize COCO statistics on a 2x2 grid: three histograms and one scatter
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# The histograms differ only in data, colour and labels, so drive them from a table
histogram_specs = [
    (axes[0, 0], widths, 'blue', 'Width Distribution', 'Width (px)'),
    (axes[0, 1], heights, 'green', 'Height Distribution', 'Height (px)'),
    (axes[1, 0], aspect_ratios, 'red', 'Aspect Ratio Distribution', 'Aspect Ratio'),
]
for panel, values, colour, title, x_label in histogram_specs:
    panel.hist(values, bins=50, alpha=0.7, color=colour)
    panel.set_title(title)
    panel.set_xlabel(x_label)
    panel.set_ylabel('Frequency')

# Bottom-right panel: joint view of width against height
scatter_panel = axes[1, 1]
scatter_panel.scatter(widths, heights, alpha=0.3, s=10)
scatter_panel.set_title('Width vs Height')
scatter_panel.set_xlabel('Width (px)')
scatter_panel.set_ylabel('Height (px)')

plt.tight_layout()
plt.savefig('/kaggle/working/coco_stats.png', dpi=150)
plt.show()

# Headline numbers for the sampled images
mean_width = np.mean(widths)
mean_height = np.mean(heights)
mean_ratio = np.mean(aspect_ratios)
print(f"Mean width: {mean_width:.0f}px")
print(f"Mean height: {mean_height:.0f}px")
print(f"Mean aspect ratio: {mean_ratio:.2f}")


In [None]:
# Display sample COCO images on a 2x3 grid
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# Never ask for more samples than exist: with replace=False, np.random.choice
# raises ValueError when the population is smaller than the requested size.
n_samples = min(axes.size, len(coco_images))
sample_imgs = np.random.choice(coco_images, n_samples, replace=False)

for ax, img_path in zip(axes.flat, sample_imgs):
    # Close the file handle as soon as the image has been handed to matplotlib
    with Image.open(img_path) as img:
        ax.imshow(img)
        ax.set_title(f"{img.size[0]}x{img.size[1]}")

# Hide the axes on every panel, including any left empty when fewer images
# than grid cells are available.
for ax in axes.flat:
    ax.axis('off')

plt.tight_layout()
plt.savefig('/kaggle/working/coco_samples.png', dpi=150)
plt.show()


## 4. WikiArt Dataset Analysis


In [None]:
# Explore WikiArt structure: keep only the sub-directories of the dataset root
# (the messages below treat each one as a style)
wikiart_subdirs = list(filter(Path.is_dir, WIKIART_DIR.iterdir()))
style_preview = [entry.name for entry in wikiart_subdirs[:10]]  # first 10 names only
print(f"Number of style directories: {len(wikiart_subdirs)}")
print(f"Style directories: {style_preview}")


In [None]:
# Count images per style: number of *.jpg plus *.png files directly inside
# each style directory
style_counts = {
    style_dir.name: sum(
        len(list(style_dir.glob(pattern))) for pattern in ('*.jpg', '*.png')
    )
    for style_dir in wikiart_subdirs
}

# Tabulate the counts, largest style first
style_df = pd.DataFrame(
    list(style_counts.items()), columns=['Style', 'Count']
).sort_values('Count', ascending=False)

print(style_df.head(20))


In [None]:
# Visualize style distribution: horizontal bars for the 15 largest styles
top_styles = style_df.head(15)
style_fig, style_ax = plt.subplots(figsize=(12, 6))
style_ax.barh(top_styles['Style'], top_styles['Count'])
style_ax.set_xlabel('Number of Images')
style_ax.set_title('Top 15 Styles by Image Count')
style_ax.invert_yaxis()  # first row of top_styles ends up at the top
plt.tight_layout()
plt.savefig('/kaggle/working/wikiart_style_distribution.png', dpi=150)
plt.show()


In [None]:
# Select styles for training: at most 5 styles holding 50-200 images each
# (between() is inclusive on both ends; rows keep style_df's existing order)
count_in_range = style_df['Count'].between(50, 200)
selected_styles = style_df[count_in_range].head(5)
print("Selected styles for training:")
print(selected_styles)


In [None]:
# Display sample WikiArt images from different styles:
# one row per style (first three selected styles), up to three images per row
fig, axes = plt.subplots(3, 3, figsize=(15, 15))
selected_style_dirs = [WIKIART_DIR / style for style in selected_styles['Style'].head(3).tolist()]

for row, style_dir in enumerate(selected_style_dirs):
    # Match *.png as well as *.jpg so a style whose images are PNG files does
    # not produce an empty row; JPEGs still come first.
    images = [*style_dir.glob('*.jpg'), *style_dir.glob('*.png')][:3]
    for col, img_path in enumerate(images):
        # Close the file handle as soon as the image has been handed to matplotlib
        with Image.open(img_path) as img:
            axes[row, col].imshow(img)
        axes[row, col].set_title(f"{style_dir.name}")

# Hide the axes on every panel, including cells left empty when fewer than
# three styles or three images per style are available.
for ax in axes.flat:
    ax.axis('off')

plt.tight_layout()
plt.savefig('/kaggle/working/wikiart_samples.png', dpi=150)
plt.show()


## 5. Summary Report


In [None]:
# Generate summary report (Markdown) from the statistics computed above
report = f"""
# Data EDA Report

## COCO Dataset
- Total images: {len(coco_images)}
- Mean size: {np.mean(widths):.0f}x{np.mean(heights):.0f}px
- Mean aspect ratio: {np.mean(aspect_ratios):.2f}

## WikiArt Dataset
- Total styles: {len(wikiart_subdirs)}
- Selected styles for training: {len(selected_styles)}
- Styles: {', '.join(selected_styles['Style'].head(5).tolist())}

## Recommendations
- Resize all images to 512x512 for training
- Use selected styles with 50-200 images each
- Consider data augmentation for style images
"""

print(report)

# Save report.
# Style names come from directory names and may contain non-ASCII characters,
# so write with an explicit encoding instead of the platform default.
with open('/kaggle/working/eda_report.md', 'w', encoding='utf-8') as f:
    f.write(report)
