# Laboratory Exercise 2: Data Cleaning and Preprocessing

**Name:** Lorenzo Bela, Robert Callorina, Kean Guzon

**Section:** 58036

**Date:** January 26, 2026

**Dataset:** CIFAR-10

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import os
from PIL import Image
from pathlib import Path

## Part B: Load Raw Dataset

In [None]:
# Folder holding the raw CIFAR-10 python batches, located under the Lab 01 data tree.
# The path is relative, so it is resolved against the current working directory.
raw_data_path = Path('../../lab01_data_value_chain/data/raw/cifar-10-batches-py')

def unpickle(file):
    """Load and return the object stored in a pickle file.

    Args:
        file: Path of the pickle file to read (anything ``open`` accepts).

    Returns:
        The unpickled object. Because the file is loaded with
        ``encoding='bytes'``, 8-bit strings written by Python 2 come back as
        ``bytes`` objects rather than ``str``.
    """
    # NOTE: pickle can execute arbitrary code while loading, so this must only
    # be used on files from a trusted source.
    with open(file, 'rb') as fo:
        data = pickle.load(fo, encoding='bytes')
    return data

# Read the five training batches, the test batch and the label metadata.
train_batches = [
    unpickle(raw_data_path / f'data_batch_{batch_no}')
    for batch_no in range(1, 6)
]
test_batch = unpickle(raw_data_path / 'test_batch')
meta = unpickle(raw_data_path / 'batches.meta')

# Stack the per-batch pixel rows and labels into single arrays.
train_images = np.concatenate([part[b'data'] for part in train_batches])
train_labels = np.concatenate([part[b'labels'] for part in train_batches])
test_images = test_batch[b'data']
test_labels = np.array(test_batch[b'labels'])

# Each flat row is reshaped to (3, 32, 32) and then reordered so that the
# size-3 axis comes last, giving one (32, 32, 3) array per image.
train_images = np.transpose(train_images.reshape(-1, 3, 32, 32), (0, 2, 3, 1))
test_images = np.transpose(test_images.reshape(-1, 3, 32, 32), (0, 2, 3, 1))

# Train and test are merged; the rest of the notebook works on the full set.
all_images = np.concatenate([train_images, test_images])
all_labels = np.concatenate([train_labels, test_labels])

print(f"Total samples: {len(all_images)}")
print(f"Image shape: {all_images[0].shape}")
print(f"Data type: {all_images.dtype}")

In [None]:
# Class names are stored as bytes in the metadata; decode them for display.
label_names = [name.decode('utf-8') for name in meta[b'label_names']]

# savefig does not create missing directories, so make sure the target exists
# (the log and processed-data folders are created the same way).
figures_path = Path('../outputs/figures')
figures_path.mkdir(parents=True, exist_ok=True)

fig, axes = plt.subplots(2, 5, figsize=(12, 5))
# Ten distinct samples; unseeded, so the selection changes on every run.
random_indices = np.random.choice(len(all_images), 10, replace=False)

# Pair each subplot with one of the sampled image indices.
for ax, img_idx in zip(axes.flat, random_indices):
    ax.imshow(all_images[img_idx])
    ax.set_title(label_names[all_labels[img_idx]])
    ax.axis('off')

plt.tight_layout()
plt.savefig(figures_path / 'lab02_raw_samples.png', dpi=150, bbox_inches='tight')
plt.show()

## Part C: Detect Data Quality Issues

In [None]:
# Scan every sample once and record the first failed check, if any.
issues = []
valid_samples = 0
invalid_samples = 0

for i, img in enumerate(all_images):
    label = all_labels[i]

    # Checks run in a fixed order: shape, then label range, then NaN/Inf.
    if img.shape != (32, 32, 3):
        problem = f"Sample {i}: Invalid shape {img.shape}"
    elif label < 0 or label >= 10:
        problem = f"Sample {i}: Invalid label {label}"
    elif np.isnan(img).any() or np.isinf(img).any():
        problem = f"Sample {i}: Contains NaN or Inf values"
    else:
        problem = None

    if problem is None:
        valid_samples += 1
    else:
        issues.append(problem)
        invalid_samples += 1

print(f"Valid samples: {valid_samples}")
print(f"Invalid samples: {invalid_samples}")

# Make sure the log directory exists before writing the report.
log_path = Path('../outputs/logs')
log_path.mkdir(parents=True, exist_ok=True)

# Assemble the whole report first, then write it in a single call.
report_lines = [
    "Data Quality Report\n",
    f"Total samples: {len(all_images)}\n",
    f"Valid samples: {valid_samples}\n",
    f"Invalid samples: {invalid_samples}\n\n",
    "Issues detected:\n",
]
if issues:
    report_lines.extend(f"  {issue}\n" for issue in issues)
else:
    report_lines.append("  No issues detected\n")

with open(log_path / 'lab02_data_issues.txt', 'w') as f:
    f.writelines(report_lines)

## Part D: Data Cleaning

In [None]:
# Keep the indices of samples with the expected shape, a label in [0, 10)
# and no NaN/Inf pixel values.
valid_indices = [
    i
    for i, (img, label) in enumerate(zip(all_images, all_labels))
    if img.shape == (32, 32, 3)
    and 0 <= label < 10
    and not (np.isnan(img).any() or np.isinf(img).any())
]

clean_images = all_images[valid_indices]
clean_labels = all_labels[valid_indices]

print(f"Original dataset size: {len(all_images)}")
print(f"Cleaned dataset size: {len(clean_images)}")

summary_df = pd.DataFrame({
    'Metric': ['Original Size', 'Cleaned Size', 'Removed Samples'],
    'Count': [len(all_images), len(clean_images), len(all_images) - len(clean_images)]
})

# to_csv does not create missing directories, so make sure the target exists
# (the log and processed-data folders are created the same way).
tables_path = Path('../outputs/tables')
tables_path.mkdir(parents=True, exist_ok=True)

summary_df.to_csv(tables_path / 'lab02_cleaning_summary.csv', index=False)
print("\nCleaning Summary:")
print(summary_df)

## Part E: Image Preprocessing

In [None]:
def preprocess_images(images, target_size):
    """Resize images to a square size and scale pixel values by 1/255.

    Each image is cast to ``uint8``, resized to ``(target_size, target_size)``
    with bilinear interpolation, converted to ``float32`` and divided by 255.

    Args:
        images: Sequence (or any iterable) of image arrays accepted by
            ``Image.fromarray`` after a cast to ``uint8``.
        target_size: Width and height, in pixels, of the resized images.

    Returns:
        A ``float32`` array with the processed images stacked along the first
        axis. For empty input the result is ``np.array([])``.
    """
    if not hasattr(images, '__len__'):
        # The output is allocated up front, which needs the sample count;
        # unsized iterables (e.g. generators) are materialised first.
        images = list(images)

    size = (target_size, target_size)
    processed = None
    for idx, img in enumerate(images):
        pil_img = Image.fromarray(img.astype('uint8'))
        resized = pil_img.resize(size, Image.BILINEAR)
        arr = np.array(resized, dtype=np.float32)
        if processed is None:
            # Allocate once, using the first result to learn the per-image
            # shape. Collecting into a list and converting at the end would
            # hold two full copies of the dataset in memory at the peak.
            processed = np.empty((len(images),) + arr.shape, dtype=np.float32)
        # Write the scaled pixels straight into the output buffer.
        np.divide(arr, 255.0, out=processed[idx])

    if processed is None:
        # No images at all: same result as converting an empty list.
        return np.array([])
    return processed

# Produce both resolutions from the cleaned images.
processed_64 = preprocess_images(clean_images, 64)
processed_128 = preprocess_images(clean_images, 128)

# Report the per-image shape and the value range of each result.
for tag, batch in (("64x64", processed_64), ("128x128", processed_128)):
    print(f"{tag} - Final shape: {batch[0].shape}")
    print(f"{tag} - Value range: [{batch.min():.4f}, {batch.max():.4f}]")

In [None]:
# savefig does not create missing directories, so make sure the target exists
# (the log and processed-data folders are created the same way).
figures_path = Path('../outputs/figures')
figures_path.mkdir(parents=True, exist_ok=True)

fig, axes = plt.subplots(3, 5, figsize=(15, 9))
# Five distinct samples; unseeded, so the selection changes on every run.
sample_indices = np.random.choice(len(clean_images), 5, replace=False)

# One column per sample; rows show the raw image and both processed sizes.
for col, img_idx in enumerate(sample_indices):
    panels = [
        (clean_images[img_idx].astype('uint8'), "Raw (32x32)"),
        (processed_64[img_idx], "Processed (64x64)"),
        (processed_128[img_idx], "Processed (128x128)"),
    ]
    for row, (image, title) in enumerate(panels):
        ax = axes[row, col]
        ax.imshow(image)
        ax.set_title(title)
        ax.axis('off')

plt.tight_layout()
plt.savefig(figures_path / 'lab02_processed_samples.png', dpi=150, bbox_inches='tight')
plt.show()

## Part F: Save Processed Dataset

In [None]:
# Destination folder for the processed arrays; created if it is missing.
processed_path = Path('../data/processed')
processed_path.mkdir(parents=True, exist_ok=True)

# File name -> array; the same mapping drives both saving and reporting.
saved_arrays = {
    'images_64x64.npy': processed_64,
    'images_128x128.npy': processed_128,
    'labels.npy': clean_labels,
}
for filename, array in saved_arrays.items():
    np.save(processed_path / filename, array)

print("Saved processed datasets:")
for filename, array in saved_arrays.items():
    print(f"  - {filename}: {array.shape}")

In [None]:
from datetime import datetime

# Human-readable record of how the processed arrays were produced.
# NOTE(review): the shapes, dtype and method names below are hard-coded text;
# keep them in sync with the preprocessing code if it changes.
metadata = f"""Preprocessing Metadata
Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
Dataset: CIFAR-10
Original shape: (32, 32, 3)
Processed shapes: (64, 64, 3) and (128, 128, 3)
Normalization method: Min-Max scaling to [0, 1]
Data type: float32
Total samples: {len(clean_labels)}
Resizing method: Bilinear interpolation
"""

# Create the log folder here as well, so this cell does not depend on an
# earlier cell having created it.
metadata_dir = Path('../outputs/logs')
metadata_dir.mkdir(parents=True, exist_ok=True)

with open(metadata_dir / 'lab02_preprocessing_metadata.txt', 'w', encoding='utf-8') as f:
    f.write(metadata)

print(metadata)

## Results and Analysis

### A. Raw vs Processed Data Comparison

**Original:**
- Shape: (32, 32, 3)
- Data type: uint8
- Value range: [0, 255]

**Processed (64x64):**
- Shape: (64, 64, 3)
- Data type: float32
- Value range: [0.0, 1.0]

**Processed (128x128):**
- Shape: (128, 128, 3)
- Data type: float32
- Value range: [0.0, 1.0]

Images were resized to standard dimensions and normalized to improve training stability.

### B. Data Quality Findings

The quality checks performed in Part C (image shape, label range, and NaN/Inf pixel values) detected no corrupted samples in the CIFAR-10 dataset. All images have consistent dimensions and valid labels. This demonstrates the value of using well-maintained benchmark datasets, though real-world data often requires more extensive cleaning.

### C. Visual Comparison

The processed images maintain visual quality while standardizing dimensions. Normalization does not alter appearance but ensures pixel values are in an optimal range for neural network training.

### D. Impact on Model Readiness

Preprocessing improves training stability by ensuring consistent input dimensions and normalized value ranges. Without preprocessing, models may fail due to dimension mismatches or struggle with convergence due to unstandardized input scales.

## Questions

### 1. Why must corrupted samples be removed before training a model?

[Your answer here]

### 2. What problems can arise if images have inconsistent sizes?

[Your answer here]

### 3. Why is normalization important for gradient-based learning?

[Your answer here]

### 4. Why should raw data never be overwritten?

[Your answer here]

### 5. How does preprocessing affect model convergence and performance?

[Your answer here]

## Conclusion

[Write your conclusion here including:
- Dataset used and acquisition method
- Key observations during data quality inspection
- How preprocessing was structured and why
- Importance of data cleaning before model training]