In [1]:
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
import time
import joblib

# --- Step 1: Data Loading and Preprocessing ---
# Walks '<data_dir>/<class_name>/<image>' and builds:
#   images         - float32 RGB array, one 64x64x3 entry per readable image, values in [0, 1]
#   labels_encoded - integer class ids produced by `label_encoder`
print("--- Step 1: Loading and Preprocessing Data ---")
start_time = time.time()

# Define the path to your data directory relative to the notebook's location
data_dir = '../data'

# Get the list of class names from the folder names
try:
    class_names = sorted([d for d in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, d))])
except FileNotFoundError:
    print(f"Error: The directory '{data_dir}' was not found. Please ensure your data is in the correct folder.")
    # Re-raise instead of calling exit(): inside a Jupyter kernel exit() asks the
    # kernel to shut down rather than cleanly stopping the cell, which discards
    # all state. Re-raising halts execution here and keeps the kernel alive.
    raise

print(f"Found {len(class_names)} classes: {class_names}")

images = []
labels = []
skipped_files = []  # paths that matched an image extension but could not be decoded

# Loop over each class folder
for class_name in class_names:
    class_path = os.path.join(data_dir, class_name)

    # os.listdir() returns entries in arbitrary order; sorting keeps the sample
    # order (and therefore the seeded train/test split below) reproducible
    # across machines and filesystems.
    image_files = sorted(os.listdir(class_path))

    # Loop over each image in the class folder
    for image_file in image_files:
        # Case-insensitive match so files such as 'IMG_001.JPG' are not skipped
        if not image_file.lower().endswith(('.jpg', '.jpeg', '.png')):
            continue

        image_path = os.path.join(class_path, image_file)

        # Read the image using OpenCV (returns None when the file cannot be decoded)
        image = cv2.imread(image_path)
        if image is None:
            skipped_files.append(image_path)
            continue

        # OpenCV reads images in BGR format, convert it to RGB for consistency
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # Resize the image to a standard 64x64
        image = cv2.resize(image, (64, 64))

        # Normalize pixel values to be between 0 and 1
        image = image.astype('float32') / 255.0

        images.append(image)
        labels.append(class_name)

# Surface unreadable files instead of dropping them silently
if skipped_files:
    print(f"Warning: skipped {len(skipped_files)} unreadable image file(s), e.g. '{skipped_files[0]}'")

# Fail early with a clear message rather than with an obscure error further down
if not images:
    raise ValueError(f"No readable .jpg/.jpeg/.png images were found under '{data_dir}'.")

# Convert lists to NumPy arrays for efficient processing
images = np.array(images)

# Encode text labels to numbers
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)

end_time = time.time()
print(f"Data loading complete. Took {end_time - start_time:.2f} seconds.")
print(f"Total images loaded: {len(images)}")
print(f"Shape of image data: {images.shape}")
print(f"Shape of labels: {labels_encoded.shape}")
print("------------------------------\n")


# --- Step 2: Feature Extraction (Color Histograms) ---
# Turns every image into a 96-value feature vector (32 bins for each of the
# R, G and B channels) and splits the result into stratified train/test sets.
print("--- Step 2: Extracting Features and Splitting Data ---")
start_time = time.time()

HIST_BINS = 32
# cv2.calcHist treats the upper range bound as EXCLUSIVE, so a range of [0, 1]
# drops every pixel whose normalised value is exactly 1.0 (original value 255).
# Using the next representable float32 above 1.0 places those pixels in the
# last bin. Pixel values are k/255, which never sit that close to an interior
# bin edge, so no other bin assignment changes.
HIST_RANGE = [0.0, float(np.nextafter(np.float32(1.0), np.float32(2.0)))]

features = []
for img in images:
    # Calculate histograms for each color channel
    hist_r = cv2.calcHist([img], [0], None, [HIST_BINS], HIST_RANGE)
    hist_g = cv2.calcHist([img], [1], None, [HIST_BINS], HIST_RANGE)
    hist_b = cv2.calcHist([img], [2], None, [HIST_BINS], HIST_RANGE)

    # Concatenate the histograms and flatten them to create a single feature vector
    features.append(np.concatenate((hist_r, hist_g, hist_b)).flatten())

features = np.array(features)
print(f"Feature extraction complete. Shape of feature data: {features.shape}")

# Split the dataset into training (80%) and testing (20%) sets.
# stratify keeps the class proportions equal in both splits; random_state
# makes the split repeatable for a given sample order.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels_encoded,
    test_size=0.2,
    random_state=42,
    stratify=labels_encoded
)

end_time = time.time()
print(f"Data splitting complete. Took {end_time - start_time:.2f} seconds.")
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")
print("------------------------------\n")


# --- Step 3: Dimensionality Reduction with PCA ---
# Projects the histogram features onto 50 principal components and persists
# the fitted transformer for later reuse.
print("--- Step 3: Applying PCA for Dimensionality Reduction ---")
start_time = time.time()

# 50-component PCA with a fixed seed
pca = PCA(n_components=50, random_state=42)

# The projection is learned from the training split only; the test split is
# merely transformed with it.
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Output locations are resolved against the current working directory
base_path = os.getcwd()
models_path = os.path.join(base_path, '..', 'models')
pca_save_path = os.path.join(models_path, 'pca.joblib')

# Create the models folder if it is missing, then store the fitted PCA object
# so the app can load it.
os.makedirs(models_path, exist_ok=True)
joblib.dump(pca, pca_save_path)

end_time = time.time()
for report_line in (
    f"PCA application complete. Took {end_time - start_time:.2f} seconds.",
    f"Original feature shape: {X_train.shape}",
    f"Shape after PCA: {X_train_pca.shape}",
    f"PCA object saved to '{pca_save_path}'",
    "------------------------------\n",
):
    print(report_line)


# --- Step 4: Saving Processed Data for the Next Notebook ---
# Writes the PCA-reduced splits, their labels and the class-name lookup as
# .npy files for the model training notebook.
print("--- Step 4: Saving Processed Data ---")

# Destination folder for the processed arrays (created when absent)
results_path = os.path.join(base_path, '..', 'results')
os.makedirs(results_path, exist_ok=True)

# File name -> array; written in this order
arrays_to_save = {
    'X_train_pca.npy': X_train_pca,
    'X_test_pca.npy': X_test_pca,
    'y_train.npy': y_train,
    'y_test.npy': y_test,
    'class_names.npy': label_encoder.classes_,
}
for file_name, array in arrays_to_save.items():
    np.save(os.path.join(results_path, file_name), array)

print("All processed data has been saved to the '../results' folder.")
print("You are now ready to run the 'model_training.ipynb' notebook.")


--- Step 1: Loading and Preprocessing Data ---
Found 10 classes: ['AnnualCrop', 'Forest', 'HerbaceousVegetation', 'Highway', 'Industrial', 'Pasture', 'PermanentCrop', 'Residential', 'River', 'SeaLake']
Data loading complete. Took 535.43 seconds.
Total images loaded: 27000
Shape of image data: (27000, 64, 64, 3)
Shape of labels: (27000,)
------------------------------

--- Step 2: Extracting Features and Splitting Data ---
Feature extraction complete. Shape of feature data: (27000, 96)
Data splitting complete. Took 3.57 seconds.
Shape of X_train: (21600, 96)
Shape of X_test: (5400, 96)
------------------------------

--- Step 3: Applying PCA for Dimensionality Reduction ---
PCA application complete. Took 0.55 seconds.
Original feature shape: (21600, 96)
Shape after PCA: (21600, 50)
PCA object saved to 'e:\GITAM SEMESTERS\5th-sem\MLA - CSEN3261\ML_TERM-PROJECT\ML_TERM-PROJECT\notebooks\..\models\pca.joblib'
------------------------------

--- Step 4: Saving Processed Data ---
All process