# Preprocessing

---
## Imports

In [58]:
import sys

import pandas as pd
from sklearn.model_selection import train_test_split

sys.path.append('../')

from utils.duplicates import remove_rows, get_duplicates_to_delete

In [59]:
# Load the image metadata table and the duplicate report produced earlier.
df, duplicates = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/processed/csv/df.csv',
        '../data/processed/csv/duplicates.csv',
    )
)

--- 
## Remove duplicates

### Automatically delete:
- For each duplicate group, delete all but one of the rows marked as "Duplicate" within each "Style".
- Outputs "df_no_dup".

In [60]:
# Pick the redundant rows from the duplicate report (the helper prints its own progress).
duplicates_to_delete = get_duplicates_to_delete(duplicates)
# Build a de-duplicated copy for reference. Judging by the row counts shown further down,
# `df` itself is not modified by this call -- TODO confirm in utils/duplicates.py.
df_no_dup = remove_rows(df, duplicates_to_delete)
df_no_dup

Identifying rows to delete...


100%|██████████| 5684/5684 [00:03<00:00, 1876.99it/s]

Removing rows...





Unnamed: 0,Path,Type,Width,Height,Ratio,Mode,Class,Style
0,tables/Eclectic/4029eclectic-side-tables-and-e...,jpg,350,350,1.0,RGB,tables,Eclectic
1,tables/Eclectic/4107eclectic-nightstands-and-b...,jpg,350,350,1.0,RGB,tables,Eclectic
2,tables/Eclectic/3885eclectic-side-tables-and-e...,jpg,350,350,1.0,RGB,tables,Eclectic
3,tables/Eclectic/4040eclectic-side-tables-and-e...,jpg,350,350,1.0,RGB,tables,Eclectic
4,tables/Eclectic/4171eclectic-coffee-tables.jpg,jpg,350,350,1.0,RGB,tables,Eclectic
...,...,...,...,...,...,...,...,...
84791,beds/Asian/3166asian-daybeds.jpg,jpg,224,224,1.0,RGB,beds,Asian
84792,beds/Asian/7733asian-panel-beds.jpg,jpg,224,224,1.0,RGB,beds,Asian
84793,beds/Asian/20802asian-platform-beds.jpg,jpg,350,350,1.0,RGB,beds,Asian
84794,beds/Asian/7850asian-platform-beds.jpg,jpg,224,224,1.0,RGB,beds,Asian


### Manually delete:
- Any row marked as "Inspect" that belongs to the wrong "Class".
- Overwrite "df", as this DataFrame will continue to be used for further preprocessing.

In [61]:
# Rows of the duplicate report that were flagged for manual inspection.
inspects = duplicates.loc[duplicates['Duplicate_Type'].eq('Inspect')]
inspects

Unnamed: 0,Group,Duplicate_Type,Path,Type,Width,Height,Ratio,Mode,Class,Style,Hash
90,41,Inspect,tables/Craftsman/28467craftsman-dressers.jpg,jpg,350,350,1.0,RGB,tables,Craftsman,fbea6ebbd105a451cd78b1ccae23cb8492f195ae1d2312...
91,41,Inspect,beds/Craftsman/4866craftsman-bed.jpg,jpg,350,350,1.0,RGB,beds,Craftsman,fbea6ebbd105a451cd78b1ccae23cb8492f195ae1d2312...
153,72,Inspect,tables/Industrial/37555industrial-entertainmen...,jpg,350,350,1.0,RGB,tables,Industrial,d6aa7d5a82ad6a0068540af5ad5596af9055d02b93f41f...
154,72,Inspect,dressers/Industrial/dresser3452345.jpg,jpg,350,350,1.0,RGB,dressers,Industrial,d6aa7d5a82ad6a0068540af5ad5596af9055d02b93f41f...
205,98,Inspect,dressers/Farmhouse/34120farmhouse-buffets-and-...,jpg,350,350,1.0,RGB,dressers,Farmhouse,8bfa4afe3c256aadd100341fc042955a70b5cb41947de6...
206,98,Inspect,tables/Farmhouse/28763farmhouse-console-tables...,jpg,350,350,1.0,RGB,tables,Farmhouse,8bfa4afe3c256aadd100341fc042955a70b5cb41947de6...
227,109,Inspect,tables/Modern/10363modern-nightstands-and-beds...,jpg,350,350,1.0,RGB,tables,Modern,b3980ddb3c66d39bd098f3a70c182c66c39ff068c99734...
228,109,Inspect,lamps/Contemporary/4023contemporary-table-lamp...,jpg,350,350,1.0,RGB,lamps,Contemporary,b3980ddb3c66d39bd098f3a70c182c66c39ff068c99734...
235,113,Inspect,dressers/Modern/2146modern-dressers.jpg,jpg,350,350,1.0,RGB,dressers,Modern,f3bbad52dccc13b9cc659725921158ce2dd594a5931146...
236,113,Inspect,tables/Modern/11484modern-dressers.jpg,jpg,350,350,1.0,RGB,tables,Modern,f3bbad52dccc13b9cc659725921158ce2dd594a5931146...


In [62]:
# Number of distinct duplicate groups under inspection (a missing group id counts as one group).
total_inspect_groups = len(inspects["Group"].unique())

In [63]:
# Index labels (in `inspects`) of the rows to drop after manual review.
#
# Policy: only delete when the two classes are very different ("tables" and "beds").
# Similar classes are kept ("chairs" and "sofas").
#
# Earlier alternative, kept for reference -- delete one of each pair:
#   [91, 154, 205, 227, 235, 277, 280, 281, 287, 290, 299, 310, 318, 323, 325]
inspects_rows_to_delete = [91, 205]

In [64]:
# Start from a copy of the inspected rows with every row marked "Keep",
# then flag the manually chosen index labels for deletion.
inspect_review = inspects.assign(Duplicate_Type="Keep")
inspect_review.loc[inspects_rows_to_delete, "Duplicate_Type"] = "DELETE"

In [65]:
# visualize_duplicates(inspect_review, total_inspect_groups)

In [66]:
# Only the reviewed rows that were flagged "DELETE".
inspects_to_delete = inspect_review.loc[inspect_review["Duplicate_Type"].eq("DELETE")]
inspects_to_delete

Unnamed: 0,Group,Duplicate_Type,Path,Type,Width,Height,Ratio,Mode,Class,Style,Hash
91,41,DELETE,beds/Craftsman/4866craftsman-bed.jpg,jpg,350,350,1.0,RGB,beds,Craftsman,fbea6ebbd105a451cd78b1ccae23cb8492f195ae1d2312...
205,98,DELETE,dressers/Farmhouse/34120farmhouse-buffets-and-...,jpg,350,350,1.0,RGB,dressers,Farmhouse,8bfa4afe3c256aadd100341fc042955a70b5cb41947de6...


In [67]:
# Drop the manually flagged "DELETE" rows and rebind `df`, which every later cell uses.
# NOTE(review): because `df` is overwritten, this cell depends on execution order --
# re-run from the data-loading cell to get back to the unfiltered frame.
df = remove_rows(df, inspects_to_delete)
df

Removing rows...


Unnamed: 0,Path,Type,Width,Height,Ratio,Mode,Class,Style
0,tables/Eclectic/4029eclectic-side-tables-and-e...,jpg,350,350,1.0,RGB,tables,Eclectic
1,tables/Eclectic/4107eclectic-nightstands-and-b...,jpg,350,350,1.0,RGB,tables,Eclectic
2,tables/Eclectic/3885eclectic-side-tables-and-e...,jpg,350,350,1.0,RGB,tables,Eclectic
3,tables/Eclectic/4040eclectic-side-tables-and-e...,jpg,350,350,1.0,RGB,tables,Eclectic
4,tables/Eclectic/4171eclectic-coffee-tables.jpg,jpg,350,350,1.0,RGB,tables,Eclectic
...,...,...,...,...,...,...,...,...
90077,beds/Asian/3166asian-daybeds.jpg,jpg,224,224,1.0,RGB,beds,Asian
90078,beds/Asian/7733asian-panel-beds.jpg,jpg,224,224,1.0,RGB,beds,Asian
90079,beds/Asian/20802asian-platform-beds.jpg,jpg,350,350,1.0,RGB,beds,Asian
90080,beds/Asian/7850asian-platform-beds.jpg,jpg,224,224,1.0,RGB,beds,Asian


---
## Prepare DataFrame

In [68]:
# Tag every row of `df` as "Duplicate" (its Path is listed in `duplicates_to_delete`)
# or "Unique" (it is not). The tag later decides which images get augmented.
#
# A membership test is used instead of a left merge: a merge would emit one output
# row per match, so any Path repeated in `duplicates_to_delete` would silently
# multiply rows of `df`. `isin` can never change the number of rows.
is_listed_duplicate = df['Path'].isin(duplicates_to_delete['Path'])

merged_df = (
    df
    .assign(Duplicate_Type=is_listed_duplicate.map({True: "Duplicate", False: "Unique"}))
    # Fresh 0..n-1 index, matching what the previous merge-based version produced.
    .reset_index(drop=True)
)

# Update the original "df" with the new "Duplicate_Type" column
df = merged_df

In [69]:
# Show the frame with its new "Duplicate_Type" column.
df

Unnamed: 0,Path,Type,Width,Height,Ratio,Mode,Class,Style,Duplicate_Type
0,tables/Eclectic/4029eclectic-side-tables-and-e...,jpg,350,350,1.0,RGB,tables,Eclectic,Unique
1,tables/Eclectic/4107eclectic-nightstands-and-b...,jpg,350,350,1.0,RGB,tables,Eclectic,Unique
2,tables/Eclectic/3885eclectic-side-tables-and-e...,jpg,350,350,1.0,RGB,tables,Eclectic,Unique
3,tables/Eclectic/4040eclectic-side-tables-and-e...,jpg,350,350,1.0,RGB,tables,Eclectic,Unique
4,tables/Eclectic/4171eclectic-coffee-tables.jpg,jpg,350,350,1.0,RGB,tables,Eclectic,Unique
...,...,...,...,...,...,...,...,...,...
90077,beds/Asian/3166asian-daybeds.jpg,jpg,224,224,1.0,RGB,beds,Asian,Unique
90078,beds/Asian/7733asian-panel-beds.jpg,jpg,224,224,1.0,RGB,beds,Asian,Unique
90079,beds/Asian/20802asian-platform-beds.jpg,jpg,350,350,1.0,RGB,beds,Asian,Unique
90080,beds/Asian/7850asian-platform-beds.jpg,jpg,224,224,1.0,RGB,beds,Asian,Unique


---
## Split (train, validation, test)

### Prepare target and training

In [70]:
# Single-column label frames, copied so later edits cannot write back into `df`.
target_class, target_style = (
    df[[label_column]].copy() for label_column in ("Class", "Style")
)

In [71]:
# All columns are kept for now: the dataset builder further down reads "Path",
# "Duplicate_Type" and "Class" from this frame.
# An earlier variant (disabled) dropped every column except the ones listed here,
# since the values of the other columns ("Type", "Width", "Height"...) are the same:
# columns_to_keep = ["Path", "Duplicate_Type"]  # Change if needed
# columns_to_drop = [col for col in df.columns if col not in columns_to_keep]

# train_data = df.drop(columns=columns_to_drop, axis=1)
train_data = df.copy()

### Splitting

In [72]:
# Two-stage 60/20/20 split with a fixed seed:
#   1) hold out 20% of all rows as the test set,
#   2) take 25% of the remaining 80% (= 20% overall) as the validation set.
# NOTE(review): rows tagged "Duplicate" stay in the data and the split is purely random,
# so copies of the same image may end up in different subsets -- confirm this is acceptable.
(
    train_X, test_X,
    train_Y_class, test_Y_class,
    train_Y_style, test_Y_style,
) = train_test_split(train_data, target_class, target_style, test_size=0.2, random_state=42)

(
    train_X, val_X,
    train_Y_class, val_Y_class,
    train_Y_style, val_Y_style,
) = train_test_split(train_X, train_Y_class, train_Y_style, test_size=0.25, random_state=42)

In [73]:
# Inspect the training partition; the shuffled index labels confirm the random split.
train_X

Unnamed: 0,Path,Type,Width,Height,Ratio,Mode,Class,Style,Duplicate_Type
86444,beds/Traditional/7201traditional-panel-beds.jpg,jpg,224,224,1.0,RGB,beds,Traditional,Unique
84797,beds/Transitional/7754transitional-panel-beds.jpg,jpg,224,224,1.0,RGB,beds,Transitional,Unique
54483,chairs/Midcentury/17154midcentury-dining-chair...,jpg,350,350,1.0,RGB,chairs,Midcentury,Unique
58208,chairs/Transitional/23150transitional-dining-c...,jpg,350,350,1.0,RGB,chairs,Transitional,Unique
19937,sofas/Contemporary/1307contemporary-sofas.jpg,jpg,350,350,1.0,RGB,sofas,Contemporary,Unique
...,...,...,...,...,...,...,...,...,...
42959,lamps/Beach/16149beach-style-table-lamps.jpg,jpg,350,350,1.0,RGB,lamps,Beach,Unique
26202,lamps/Victorian/14916victorian-floor-lamps.jpg,jpg,350,350,1.0,RGB,lamps,Victorian,Unique
76462,dressers/Transitional/6673transitional-dresser...,jpg,350,350,1.0,RGB,dressers,Transitional,Unique
10575,tables/Beach/22391beach-style-console-tables.jpg,jpg,350,350,1.0,RGB,tables,Beach,Unique


---
## Rescaling & Normalization
Note: TensorFlow is used for quick normalization and rescaling. The file 'utils/tensorflow_preprocessing.py' contains a function that normalizes and rescales each image in the dataset.

In [74]:
import tensorflow as tf

# Let tf.data choose parallelism and buffer sizes at runtime. The stable alias is used
# here because the dataset builder in this notebook already calls tf.data.AUTOTUNE.
AUTOTUNE = tf.data.AUTOTUNE

In [75]:
import tensorflow as tf


def process_image_from_path(image_path, img_height, img_width, to_augment):
    """Load one JPEG file and return it as a resized RGB float image scaled to [0, 1].

    Args:
        image_path: Scalar string (or string tensor) with the path of the JPEG file.
        img_height: Target height in pixels.
        img_width: Target width in pixels.
        to_augment: Scalar string tensor. Random augmentation is applied only when
            it equals "Duplicate"; any other value leaves the image untouched.

    Returns:
        Float tensor of shape (img_height, img_width, 3) with values in [0, 1].
    """
    # Read image
    img = tf.io.read_file(image_path)

    # Decode to RGB
    img = tf.io.decode_jpeg(img, channels=3)

    # Resize (the result is a float image still in the 0-255 range)
    img = tf.image.resize(img, [img_height, img_width])

    # Rescale to [0, 1] BEFORE augmenting. tf.image.random_brightness adds its delta
    # directly to float images, so on 0-255 data a max_delta of 0.2 is invisible;
    # on [0, 1] data it is the intended shift of up to 20% of the value range.
    # A plain multiplication also avoids building a Keras layer on every call.
    img = img * (1. / 255)

    # Augment
    is_duplicate = tf.equal(to_augment, "Duplicate")

    def augment_image_tf(img):
        # Flip horizontally
        img = tf.image.random_flip_left_right(img)
        # Adjust brightness
        img = tf.image.random_brightness(img, max_delta=0.2)
        # Adjust contrast
        img = tf.image.random_contrast(img, lower=0.8, upper=1.2)

        # Brightness/contrast changes can leave the valid range; clamp back to [0, 1].
        return tf.clip_by_value(img, 0.0, 1.0)

    img = tf.cond(is_duplicate, lambda: augment_image_tf(img), lambda: img)

    return img

In [76]:
def prepare_image_dataset_2(dataframe, img_height, img_width, batch_size, base_path='../data/raw/Furniture_Data'):
    """Build a batched tf.data pipeline of (image, class label) pairs.

    Args:
        dataframe: Frame with string columns "Path" (relative to `base_path`),
            "Duplicate_Type" and "Class".
        img_height: Height each image is resized to.
        img_width: Width each image is resized to.
        batch_size: Number of samples per batch.
        base_path: Directory that the "Path" values are relative to.

    Returns:
        A tf.data.Dataset yielding (images, labels) batches in the row order of
        `dataframe`. Labels are the raw "Class" strings.
    """
    # One string tensor per column. The explicit dtype keeps the pipeline valid for an
    # empty frame, where an untyped empty list would be inferred as float32 and break
    # the file read inside the map function.
    image_paths = tf.constant(
        [base_path + '/' + path for path in dataframe['Path'].tolist()], dtype=tf.string
    )
    duplicate_flags = tf.constant(dataframe['Duplicate_Type'].tolist(), dtype=tf.string)
    class_labels = tf.constant(dataframe['Class'].tolist(), dtype=tf.string)

    df_ds = tf.data.Dataset.from_tensor_slices((image_paths, duplicate_flags, class_labels))

    image_ds = df_ds.map(
        lambda image_path, duplicate_flag, class_label: (
            process_image_from_path(
                image_path=image_path,
                img_height=img_height,
                img_width=img_width,
                to_augment=duplicate_flag,
            ),
            class_label,
        ),
        num_parallel_calls=tf.data.AUTOTUNE,
    )

    image_ds = image_ds.batch(batch_size)

    # Prepare upcoming batches while the consumer is busy with the current one.
    return image_ds.prefetch(tf.data.AUTOTUNE)

In [77]:
# Training pipeline: 256x256 images in batches of 32; rows tagged "Duplicate" are augmented.
train_dataset = prepare_image_dataset_2(train_X, img_height=256, img_width=256, batch_size=32)

In [78]:
# Validation data must be deterministic. Every row is marked "Unique" so the random
# augmentation that process_image_from_path applies to "Duplicate" rows never runs here.
val_dataset = prepare_image_dataset_2(
    val_X.assign(Duplicate_Type="Unique"), img_height=256, img_width=256, batch_size=32
)

In [79]:
# Test data must be deterministic. Every row is marked "Unique" so the random
# augmentation that process_image_from_path applies to "Duplicate" rows never runs here.
test_dataset = prepare_image_dataset_2(
    test_X.assign(Duplicate_Type="Unique"), img_height=256, img_width=256, batch_size=32
)

---
---
## Test

In [80]:
# Smoke test: pull a single batch and print the image tensor shape and the raw class labels.
for x, y in train_dataset.take(1):
    print(x.shape, y)

(32, 256, 256, 3) tf.Tensor(
[b'beds' b'beds' b'chairs' b'chairs' b'sofas' b'chairs' b'lamps' b'lamps'
 b'chairs' b'lamps' b'tables' b'tables' b'tables' b'lamps' b'lamps'
 b'dressers' b'tables' b'chairs' b'lamps' b'dressers' b'tables'
 b'dressers' b'lamps' b'lamps' b'sofas' b'tables' b'lamps' b'dressers'
 b'lamps' b'lamps' b'tables' b'beds'], shape=(32,), dtype=string)


2024-05-12 16:56:54.238347: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


---
## Encoding

In [81]:
# # One-Hot Encoding
# train_Y_style = one_hot_encode(train_Y_style, 'Style')
# test_Y_style = one_hot_encode(test_Y_style, 'Style')
# val_Y_style = one_hot_encode(val_Y_style, 'Style')
# 
# # Label Encoding
# train_Y_class = label_encode(train_Y_class, 'Class')
# test_Y_class = label_encode(test_Y_class, 'Class')
# val_Y_class = label_encode(val_Y_class, 'Class')

In [82]:
# train_Y_style.head()

In [83]:
# train_Y_class.head()

---
## Saving the images