# Preprocessing

---
## Imports

In [None]:
import sys

import pandas as pd
from sklearn.model_selection import train_test_split

# The local `utils` package is resolved through the parent directory, so that
# directory must be on the module search path *before* any `utils` import runs.
sys.path.append('../')

from utils.augmentation import augment_image
from utils.duplicates import remove_rows, get_duplicates_to_delete

In [None]:
# Load the two processed CSVs used throughout this notebook:
# - `df`: the main table (its "Path" and "Class" columns are used further down).
# - `duplicates`: the duplicate report (its "Duplicate_Type" and "Group" columns are used below).
df = pd.read_csv('../data/processed/csv/df.csv')
duplicates = pd.read_csv('../data/processed/csv/duplicates.csv')

--- 
## Remove duplicates

### Automatically delete:
- For each duplicate group, delete all but one of the rows marked as "Duplicate" for each "Style".
- Outputs "df_no_dup".

In [None]:
# Ask the project helper which rows of the duplicate report should be dropped,
# then pass them to `remove_rows` together with `df`.
# NOTE(review): the exact selection/removal rules live in `utils.duplicates` —
# presumably `remove_rows` returns `df` without the given rows; confirm there.
duplicates_to_delete = get_duplicates_to_delete(duplicates)
df_no_dup = remove_rows(df, duplicates_to_delete)
df_no_dup  # displayed as the cell output

### Manually delete:
- Any rows marked as "Inspect" that belong to the wrong "Class".
- Overwrite "df", as this DataFrame will continue to be used for further preprocessing.

In [None]:
# Keep only the rows of the duplicate report whose "Duplicate_Type" is "Inspect".
# The boolean filter preserves the original index labels of `duplicates`.
inspects = duplicates[duplicates['Duplicate_Type'] == 'Inspect']
inspects  # displayed as the cell output

In [None]:
# Number of distinct "Group" values among the inspect rows.
# `dropna=False` makes a missing group (NaN) count as one distinct value too.
total_inspect_groups = inspects["Group"].nunique(dropna=False)

In [None]:
# Hand-picked rows to delete after manual inspection.
# Only pairs from very different classes ("tables" and "beds") are deleted;
# pairs from similar classes ("chairs" and "sofas") are kept.
#
# Alternative that deletes one row of each inspected pair:
# inspects_rows_to_delete = [91, 154, 205, 227, 235, 277, 280, 281, 287, 290, 299, 310, 318, 323, 325]
inspects_rows_to_delete = [91, 205]

In [None]:
# Build the review table: every inspect row defaults to "Keep", then the
# hand-picked rows in `inspects_rows_to_delete` are flagged "DELETE" so the
# following cells actually remove them from `df`.
#
# Membership is tested against the index (instead of `.loc[labels]`) so that a
# label missing from `inspects` is simply skipped rather than raising KeyError.
inspect_review = inspects.copy()
inspect_review["Duplicate_Type"] = "Keep"
inspect_review.loc[inspect_review.index.isin(inspects_rows_to_delete), "Duplicate_Type"] = "DELETE"

In [None]:
# visualize_duplicates(inspect_review, total_inspect_groups)

In [None]:
# Collect the reviewed rows whose "Duplicate_Type" is "DELETE"; this selection
# is what gets passed to `remove_rows` in the next cell.
inspects_to_delete = inspect_review[inspect_review["Duplicate_Type"] == "DELETE"]
inspects_to_delete  # displayed as the cell output

In [None]:
# Overwrite `df` with the result of `remove_rows` for the flagged inspect rows;
# all later preprocessing cells continue from this `df`.
df = remove_rows(df, inspects_to_delete)
df  # displayed as the cell output

---
## Prepare DataFrame

In [None]:
import pandas as pd

# Tag every row of `df` as "Duplicate" or "Unique" depending on whether its
# "Path" appears in `duplicates_to_delete`.
#
# The lookup keys are de-duplicated first: with a left merge, a "Path" listed
# more than once on the right side would repeat the matching `df` row once per
# occurrence and silently inflate the dataset.
duplicate_paths = duplicates_to_delete[['Path']].drop_duplicates()

# `indicator=True` adds a "_merge" column: 'both' for rows with a match and
# 'left_only' for rows without one (a left merge never produces 'right_only').
merged_df = pd.merge(df, duplicate_paths, on='Path', how='left', indicator=True)

# Translate the merge indicator into the "Duplicate_Type" label.
merged_df['Duplicate_Type'] = merged_df['_merge'].map({'both': "Duplicate", 'left_only': "Unique"})

# The indicator column has served its purpose; drop it.
merged_df = merged_df.drop('_merge', axis=1)

# Continue with the tagged frame as `df`.
df = merged_df

In [None]:
# Display `df` to check the "Duplicate_Type" column added in the previous cell.
df

---
## Split (train, validation, test)

### Prepare target and training

In [None]:
# Work on a copy so the splitting below operates on an object independent of `df`.
train_data = df.copy()

### Splitting

In [None]:
# 60 / 20 / 20 split: first hold out 20% of all rows for testing, then take
# 25% of the remaining 80% (i.e. 20% of the total) for validation.
# The fixed random_state keeps both splits reproducible.
train_X, test_X = train_test_split(train_data, test_size=0.2, random_state=42)
train_X, val_X = train_test_split(train_X, test_size=0.25, random_state=42)

In [None]:
# Display the training split produced above.
train_X

---
## Rescaling & Normalization
Note: TensorFlow is used for quick normalization and rescaling. The 'utils/tensorflow_preprocessing.py' file contains functions to normalize and rescale each image in the dataset.

In [None]:
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder

# Use the stable `tf.data.AUTOTUNE` name (the same one the dataset pipeline in
# this notebook uses) rather than the deprecated `tf.data.experimental` alias.
AUTOTUNE = tf.data.AUTOTUNE

In [None]:
def process_image_from_path(image_path, img_height, img_width, to_augment):
    """Load one JPEG image from disk and prepare it as model input.

    The file is decoded to 3 channels, resized to ``img_height`` x
    ``img_width``, passed through ``augment_image`` when requested, and
    finally multiplied by 1/255.

    Args:
        image_path: Path of the image file, passed to ``tf.io.read_file``.
        img_height: Target height passed to ``tf.image.resize``.
        img_width: Target width passed to ``tf.image.resize``.
        to_augment: String tensor; the image is augmented only when it
            equals ``"Duplicate"``.

    Returns:
        The processed image tensor, scaled by 1/255.
    """
    # Read the raw file contents
    raw_bytes = tf.io.read_file(image_path)

    # Decode to RGB
    img = tf.io.decode_jpeg(raw_bytes, channels=3)

    # Resize
    img = tf.image.resize(img, [img_height, img_width])

    # Augment only when the row is tagged "Duplicate"; `tf.cond` keeps the
    # branch selection inside the graph because `to_augment` is a tensor.
    is_duplicate = tf.equal(to_augment, "Duplicate")
    img = tf.cond(is_duplicate, lambda: augment_image(img), lambda: img)

    # Rescale with plain tensor arithmetic instead of building a new Keras
    # `Rescaling` layer on every call. Because of this step, multiply by 255
    # again before casting to uint8 when displaying the images.
    img = img * (1. / 255)

    return img

In [None]:
def prepare_image_dataset(df, img_height, img_width, batch_size, base_path='../data/raw/Furniture_Data',
                          label_encoder=None):
    """Build a batched ``tf.data`` pipeline of (image, encoded class) pairs.

    Args:
        df: DataFrame providing the "Path", "Class" and "Duplicate_Type"
            columns. It is not modified; a copy with extra columns is used.
        img_height: Target image height forwarded to ``process_image_from_path``.
        img_width: Target image width forwarded to ``process_image_from_path``.
        batch_size: Number of elements per batch.
        base_path: Directory prefix joined to every "Path" value with "/".
        label_encoder: Already-fitted encoder to reuse. When ``None`` a new
            ``LabelEncoder`` is fitted on ``df["Class"]``.

    Returns:
        A tuple ``(image_ds, label_encoder)``: the batched dataset and the
        encoder that produced its labels (pass it back in for other splits so
        class ids stay consistent).
    """
    prepared_df = df.assign(Path=df['Path'].apply(lambda path: base_path + "/" + path))

    # Debug aid: uncomment to run on a 5% sample for faster iteration.
    # prepared_df = prepared_df.sample(frac=0.05, random_state=42)

    # Perform label encoding on the class labels: fit a fresh encoder only when
    # none was supplied, otherwise reuse the caller's mapping.
    if label_encoder is None:
        label_encoder = LabelEncoder()
        prepared_df['Class_Encoded'] = label_encoder.fit_transform(prepared_df['Class'])
    else:
        prepared_df['Class_Encoded'] = label_encoder.transform(prepared_df['Class'])

    # One dataset element per row: (path, duplicate tag, encoded class).
    dataset = tf.data.Dataset.from_tensor_slices(
        (prepared_df['Path'].values,
         prepared_df["Duplicate_Type"].values,
         prepared_df['Class_Encoded'].values)
    )

    # Load and preprocess each image in parallel; the label passes through unchanged.
    image_ds = dataset.map(lambda path, duplicate_type, class_label:
                           (
                               process_image_from_path(image_path=path,
                                                       img_height=img_height,
                                                       img_width=img_width,
                                                       to_augment=duplicate_type),
                               class_label
                           ),
                           num_parallel_calls=tf.data.AUTOTUNE
                           )

    image_ds = image_ds.batch(batch_size)

    # Prepare upcoming batches while the current one is being consumed, so
    # image decoding overlaps with model execution. Element values, order and
    # dataset length are unchanged.
    image_ds = image_ds.prefetch(tf.data.AUTOTUNE)

    return image_ds, label_encoder

In [None]:
# Fit the label encoder on the training split, then reuse that same encoder
# for the validation and test splits so class ids agree across all three.
train_dataset, label_encoder = prepare_image_dataset(
    train_X, img_height=256, img_width=256, batch_size=32
)
val_dataset, _ = prepare_image_dataset(
    val_X, img_height=256, img_width=256, batch_size=32, label_encoder=label_encoder
)
test_dataset, _ = prepare_image_dataset(
    test_X, img_height=256, img_width=256, batch_size=32, label_encoder=label_encoder
)

In [None]:
import matplotlib.pyplot as plt

# Preview up to nine images from the first training batch in a 3x3 grid.
plt.figure(figsize=(10, 10))
for images, labels in train_dataset.take(1):
    # Slice instead of indexing 0..8 so a batch holding fewer than nine images
    # does not raise an out-of-bounds error.
    for i, image in enumerate(images[:9]):
        plt.subplot(3, 3, i + 1)
        # The pipeline scales pixels by 1/255; multiply back before casting to
        # uint8, otherwise every value truncates to 0 and the image is black.
        plt.imshow((image * 255).numpy().astype("uint8"))
        # plt.title(class_names[labels[i]])
        plt.axis("off")

---
## Test

In [None]:
from tensorflow.keras import layers, models, Input
from tensorflow.keras.callbacks import ProgbarLogger

In [None]:
# Small CNN used to smoke-test the preprocessing pipeline end to end.
# The input shape matches the 256x256 RGB images produced by the dataset cells.
model = models.Sequential([
    Input(shape=(256, 256, 3)),
    layers.Conv2D(4, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(8, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(16, (3, 3), activation='relu'),
    layers.GlobalMaxPooling2D(),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.5, seed=21),
    layers.Dense(128, activation='relu'),
    # A rate of 1 would drop every activation during training, so no signal
    # could reach the output layers. 0.5 mirrors the first dropout layer.
    # NOTE(review): the intended rate is a guess — tune as needed.
    layers.Dropout(0.5, seed=42),
    layers.Dense(128, activation='sigmoid'),
    # NOTE(review): 6 output units — presumably the number of classes known to
    # `label_encoder`; confirm against `len(label_encoder.classes_)`.
    layers.Dense(6, activation='softmax')
])

In [None]:
# Step 3: Compile the Model
# The network ends in a softmax layer, so the loss receives probabilities
# (from_logits=False); labels are the integer ids from the label encoder,
# hence the sparse variant of categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    metrics=['accuracy']
)

In [None]:
# Step 4: Train the Model
# `train_dataset` is already batched by `prepare_image_dataset`, so no
# `batch_size` is passed: Keras documents that argument as not to be specified
# for dataset inputs. The validation split is evaluated after every epoch so
# over-fitting shows up in `history` (as the `val_*` entries).
epochs = 10
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=epochs,
    callbacks=[ProgbarLogger()]
)

In [None]:
# Step 5: Evaluate the Model
# The model was compiled with a single metric ('accuracy'), so evaluate()
# returns two values: the loss followed by the accuracy.
# `steps=len(test_dataset)` runs one step per batch, i.e. one full pass.
test_loss, test_accuracy = model.evaluate(test_dataset, steps=len(test_dataset))
print(f'Test Loss: {test_loss:.4f}')
print(f'Test Accuracy: {test_accuracy:.4f}')

In [None]:
# Print the type specification (shapes and dtypes) of one training-set element.
print(train_dataset.element_spec)

In [None]:
# Print the type specification (shapes and dtypes) of one test-set element,
# for comparison with the training set.
print(test_dataset.element_spec)