# Preprocessing

---
## Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import sys
sys.path.append('../')

from utils.duplicates import remove_rows, get_duplicates_to_delete, visualize_duplicates
from utils.rescale_image import resize_images_in_dataframe, display_random_images
from utils.normalization import process_images, insert_normalized_images
from utils.tensorflow_preprocessing import create_image_data_generator, prepare_image_dataset, show_batch
from utils.encoding import one_hot_encode, label_encode

In [2]:
# Load the per-image metadata table (`df`) and the table of duplicate groups (`duplicates`).
# Both paths are relative to this notebook's folder.
df = pd.read_csv('../data/processed/csv/df.csv')
duplicates = pd.read_csv('../data/processed/csv/duplicates.csv')

--- 
## Remove duplicates

This section produces two DataFrames:

| Variables   | Description                                                       |
|-------------|-------------------------------------------------------------------|
| `df_no_dup` | DataFrame with all but one duplicate of each style removed        |
| `df_no_ins` | DataFrame with misclassified images manually reviewed and removed |

### Automatically delete:
- For each duplicate group, delete all but one of the rows marked as "Duplicate" for each "Style".
- Outputs "df_no_dup".

In [3]:
# Step 1: pick the rows of `duplicates` that should be dropped
# (presumably following the "all but one per Style" rule described above — helper not shown here).
duplicates_to_delete = get_duplicates_to_delete(duplicates)
# Step 2: drop those rows from the full table; `df` itself is not reassigned.
df_no_dup = remove_rows(df, duplicates_to_delete)
# Displayed as the cell output.
df_no_dup

Identifying rows to delete...


100%|██████████| 5684/5684 [00:06<00:00, 858.38it/s] 

Removing rows...





Unnamed: 0,Path,Type,Width,Height,Ratio,Mode,Class,Style
0,beds/Asian/19726asian-daybeds.jpg,jpg,350,350,1.0,RGB,beds,Asian
1,beds/Asian/20109asian-panel-beds.jpg,jpg,350,350,1.0,RGB,beds,Asian
2,beds/Asian/20508asian-platform-beds.jpg,jpg,350,350,1.0,RGB,beds,Asian
3,beds/Asian/20750asian-comforters-and-comforter...,jpg,350,350,1.0,RGB,beds,Asian
4,beds/Asian/20802asian-platform-beds.jpg,jpg,350,350,1.0,RGB,beds,Asian
...,...,...,...,...,...,...,...,...
84790,tables/Victorian/5victorian-side-tables-and-en...,jpg,350,350,1.0,RGB,tables,Victorian
84791,tables/Victorian/6victorian-side-tables-and-en...,jpg,350,350,1.0,RGB,tables,Victorian
84792,tables/Victorian/7victorian-side-tables-and-en...,jpg,350,350,1.0,RGB,tables,Victorian
84793,tables/Victorian/8victorian-dining-tables.jpg,jpg,350,350,1.0,RGB,tables,Victorian


### Manually delete:
- Any rows marked as "Inspect" that belong to the wrong "Class".
- Outputs "df_no_ins"

In [4]:
# Rows flagged "Inspect" are the ones that need a manual look.
is_inspect = duplicates['Duplicate_Type'].eq('Inspect')
inspects = duplicates.loc[is_inspect]
inspects

Unnamed: 0,Group,Duplicate_Type,Path,Type,Width,Height,Ratio,Mode,Class,Style,Hash
117,59,Inspect,tables/Craftsman/28467craftsman-dressers.jpg,jpg,350,350,1.0,RGB,tables,Craftsman,fbea6ebbd105a451cd78b1ccae23cb8492f195ae1d2312...
118,59,Inspect,beds/Craftsman/4866craftsman-bed.jpg,jpg,350,350,1.0,RGB,beds,Craftsman,fbea6ebbd105a451cd78b1ccae23cb8492f195ae1d2312...
289,145,Inspect,chairs/Contemporary/1181contemporary-indoor-ch...,jpg,350,350,1.0,RGB,chairs,Contemporary,eaef9014e0503bfd81510ba385fa3f419faae0429ea4c7...
290,145,Inspect,sofas/Contemporary/2852contemporary-sofas.jpg,jpg,350,350,1.0,RGB,sofas,Contemporary,eaef9014e0503bfd81510ba385fa3f419faae0429ea4c7...
297,149,Inspect,sofas/Contemporary/409contemporary-sofas.jpg,jpg,350,350,1.0,RGB,sofas,Contemporary,aabf7f10b544686ec442c1bb390b906f42fe6a7e913d90...
298,149,Inspect,chairs/Contemporary/1446contemporary-indoor-ch...,jpg,350,350,1.0,RGB,chairs,Contemporary,aabf7f10b544686ec442c1bb390b906f42fe6a7e913d90...
415,205,Inspect,chairs/Contemporary/870contemporary-indoor-cha...,jpg,350,350,1.0,RGB,chairs,Contemporary,eafeb034e0503af5815123e195eb27439faac54ade84c7...
416,205,Inspect,sofas/Contemporary/1293contemporary-sofas.jpg,jpg,350,350,1.0,RGB,sofas,Contemporary,eafeb034e0503af5815123e195eb27439faac54ade84c7...
445,220,Inspect,chairs/Eclectic/5037eclectic-armchairs-and-acc...,jpg,350,350,1.0,RGB,chairs,Eclectic,ebfa3fb59050e02fc047c84a64adcdc8e48c92ff8f20c3...
446,220,Inspect,sofas/Transitional/4509transitional-sofas.jpg,jpg,350,350,1.0,RGB,sofas,Transitional,ebfa3fb59050e02fc047c84a64adcdc8e48c92ff8f20c3...


In [5]:
# Number of distinct "Group" values among the "Inspect" rows
# (dropna=False: a missing Group value would be counted as one extra group).
total_inspect_groups = inspects["Group"].nunique(dropna=False)

In [6]:
# Hand-picked rows to drop after manual review; used further down as index labels of `inspect_review`.
# NOTE(review): of these values only 290 appears among the index labels shown in the saved outputs
# (117, 118, 289, 290, ..., 5250, 5251), so this list looks stale relative to the current
# duplicates.csv — TODO confirm and refresh the labels.
inspects_rows_to_delete = [91, 154, 205, 227, 235, 277, 280, 281, 287, 290, 299, 310, 318, 323, 325]

In [28]:
# Work on a separate copy of the "Inspect" rows in which every row starts out as "Keep";
# rows selected for removal are re-labelled in a later cell.
inspect_review = inspects.assign(Duplicate_Type="Keep")

117     Keep
118     Keep
289     Keep
290     Keep
297     Keep
298     Keep
415     Keep
416     Keep
445     Keep
446     Keep
714     Keep
715     Keep
718     Keep
719     Keep
782     Keep
783     Keep
784     Keep
785     Keep
833     Keep
834     Keep
903     Keep
904     Keep
1071    Keep
1072    Keep
1095    Keep
1096    Keep
1111    Keep
1112    Keep
5250    Keep
5251    Keep
Name: Duplicate_Type, dtype: object

In [8]:
# Flag the hand-picked rows for deletion. `.loc` selects by index *label*, not by position.
# NOTE(review): the `inspects_to_delete` table displayed further down is empty, so this assignment
# does not appear to have taken effect in the saved run — verify that every label in
# `inspects_rows_to_delete` exists in `inspect_review.index`.
inspect_review.loc[inspects_rows_to_delete, "Duplicate_Type"] = "DELETE"

Unnamed: 0,Group,Duplicate_Type,Path,Type,Width,Height,Ratio,Mode,Class,Style,Hash
117,59,Keep,tables/Craftsman/28467craftsman-dressers.jpg,jpg,350,350,1.0,RGB,tables,Craftsman,fbea6ebbd105a451cd78b1ccae23cb8492f195ae1d2312...
118,59,Keep,beds/Craftsman/4866craftsman-bed.jpg,jpg,350,350,1.0,RGB,beds,Craftsman,fbea6ebbd105a451cd78b1ccae23cb8492f195ae1d2312...
289,145,Keep,chairs/Contemporary/1181contemporary-indoor-ch...,jpg,350,350,1.0,RGB,chairs,Contemporary,eaef9014e0503bfd81510ba385fa3f419faae0429ea4c7...
290,145,Keep,sofas/Contemporary/2852contemporary-sofas.jpg,jpg,350,350,1.0,RGB,sofas,Contemporary,eaef9014e0503bfd81510ba385fa3f419faae0429ea4c7...
297,149,Keep,sofas/Contemporary/409contemporary-sofas.jpg,jpg,350,350,1.0,RGB,sofas,Contemporary,aabf7f10b544686ec442c1bb390b906f42fe6a7e913d90...
298,149,Keep,chairs/Contemporary/1446contemporary-indoor-ch...,jpg,350,350,1.0,RGB,chairs,Contemporary,aabf7f10b544686ec442c1bb390b906f42fe6a7e913d90...
415,205,Keep,chairs/Contemporary/870contemporary-indoor-cha...,jpg,350,350,1.0,RGB,chairs,Contemporary,eafeb034e0503af5815123e195eb27439faac54ade84c7...
416,205,Keep,sofas/Contemporary/1293contemporary-sofas.jpg,jpg,350,350,1.0,RGB,sofas,Contemporary,eafeb034e0503af5815123e195eb27439faac54ade84c7...
445,220,Keep,chairs/Eclectic/5037eclectic-armchairs-and-acc...,jpg,350,350,1.0,RGB,chairs,Eclectic,ebfa3fb59050e02fc047c84a64adcdc8e48c92ff8f20c3...
446,220,Keep,sofas/Transitional/4509transitional-sofas.jpg,jpg,350,350,1.0,RGB,sofas,Transitional,ebfa3fb59050e02fc047c84a64adcdc8e48c92ff8f20c3...


In [None]:
# Visual check of the reviewed groups (presumably shows the images of each group — helper not shown here).
# NOTE(review): this cell has no execution count or output in the saved notebook.
visualize_duplicates(inspect_review, total_inspect_groups)

In [9]:
# Keep only the rows that were flagged "DELETE" during the manual review.
flagged_for_deletion = inspect_review["Duplicate_Type"].eq("DELETE")
inspects_to_delete = inspect_review.loc[flagged_for_deletion]
inspects_to_delete

Unnamed: 0,Group,Duplicate_Type,Path,Type,Width,Height,Ratio,Mode,Class,Style,Hash


In [10]:
# Drop the manually flagged rows to obtain df_no_ins.
# NOTE(review): rows are removed from the original `df`, not from `df_no_dup`, so the duplicates
# removed in the automatic step are still present here — confirm this is intended.
df_no_ins = remove_rows(df, inspects_to_delete)
# Displayed as the cell output.
df_no_ins

Removing rows...


Unnamed: 0,Path,Type,Width,Height,Ratio,Mode,Class,Style
0,beds/Asian/19726asian-daybeds.jpg,jpg,350,350,1.0,RGB,beds,Asian
1,beds/Asian/20027asian-canopy-beds.jpg,jpg,350,350,1.0,RGB,beds,Asian
2,beds/Asian/20109asian-panel-beds.jpg,jpg,350,350,1.0,RGB,beds,Asian
3,beds/Asian/20508asian-platform-beds.jpg,jpg,350,350,1.0,RGB,beds,Asian
4,beds/Asian/20750asian-comforters-and-comforter...,jpg,350,350,1.0,RGB,beds,Asian
...,...,...,...,...,...,...,...,...
90078,tables/Victorian/5victorian-side-tables-and-en...,jpg,350,350,1.0,RGB,tables,Victorian
90079,tables/Victorian/6victorian-side-tables-and-en...,jpg,350,350,1.0,RGB,tables,Victorian
90080,tables/Victorian/7victorian-side-tables-and-en...,jpg,350,350,1.0,RGB,tables,Victorian
90081,tables/Victorian/8victorian-dining-tables.jpg,jpg,350,350,1.0,RGB,tables,Victorian


---
## Split (train, validation, test)

This part is complicated since there are 2 starting DataFrames (duplicates cleaned / inspects cleaned) and 2 targets (Class / Style).  
Each DataFrame is split 60% / 20% / 20% into train / validation / test. Please refer to the tables below for the variable names after splitting.

#### Duplicates cleaned
| Train                  | Validation           | Test                  | Description   |
|------------------------|----------------------|-----------------------|---------------|
| `no_dup_train_X`       | `no_dup_val_X`       | `no_dup_test_X`       | Data features |
| `no_dup_train_Y_class` | `no_dup_val_Y_class` | `no_dup_test_Y_class` | Target class  |
| `no_dup_train_Y_style` | `no_dup_val_Y_style` | `no_dup_test_Y_style` | Target style  |

#### Manual inspection cleaned
| Train                  | Validation           | Test                  | Description   |
|------------------------|----------------------|-----------------------|---------------|
| `no_ins_train_X`       | `no_ins_val_X`       | `no_ins_test_X`       | Data features |
| `no_ins_train_Y_class` | `no_ins_val_Y_class` | `no_ins_test_Y_class` | Target class  |
| `no_ins_train_Y_style` | `no_ins_val_Y_style` | `no_ins_test_Y_style` | Target style  |


### Prepare target and training

In [20]:
# Each target is kept as a single-column DataFrame (double brackets) and copied,
# so later changes to the targets cannot touch the source frames.

# df_no_dup
df_no_dup_target_class, df_no_dup_target_style = (
    df_no_dup[[target]].copy() for target in ("Class", "Style")
)

# df_no_ins
df_no_ins_target_class, df_no_ins_target_style = (
    df_no_ins[[target]].copy() for target in ("Class", "Style")
)

In [21]:
# Only "Path" is kept as a feature for now: the other columns ("Type", "Width", "Height"...)
# hold the same value for every image.
columns_to_keep = ["Path"]  # Change later if needed
columns_to_drop = df.columns[~df.columns.isin(columns_to_keep)].tolist()

# Despite the "_train" suffix these frames hold the features of ALL rows; they are split later.
# `columns=` already targets the column axis, so no separate `axis` argument is passed.
df_no_dup_train = df_no_dup.drop(columns=columns_to_drop)
df_no_ins_train = df_no_ins.drop(columns=columns_to_drop)

### Splitting

In [22]:
def _train_val_test_split(features, target_class, target_style):
    """Split three row-aligned frames into train / validation / test parts.

    Two chained ``train_test_split`` calls with the same seed (42): 20% of the
    rows are held out as the test set, then 25% of the remainder (20% of the
    total) becomes the validation set, leaving 60% for training.
    No ``stratify`` argument is passed, so the splits are purely random.

    Returns:
        A 9-tuple ordered as
        (train_X, val_X, test_X,
         train_class, val_class, test_class,
         train_style, val_style, test_style).
    """
    rest_X, test_X, rest_class, test_class, rest_style, test_style = train_test_split(
        features,
        target_class,
        target_style,
        test_size=0.2,
        random_state=42
    )
    train_X, val_X, train_class, val_class, train_style, val_style = train_test_split(
        rest_X,
        rest_class,
        rest_style,
        test_size=0.25,
        random_state=42
    )
    return (
        train_X, val_X, test_X,
        train_class, val_class, test_class,
        train_style, val_style, test_style,
    )


# df_no_dup
(
    no_dup_train_X, no_dup_val_X, no_dup_test_X,
    no_dup_train_Y_class, no_dup_val_Y_class, no_dup_test_Y_class,
    no_dup_train_Y_style, no_dup_val_Y_style, no_dup_test_Y_style,
) = _train_val_test_split(df_no_dup_train, df_no_dup_target_class, df_no_dup_target_style)

# df_no_ins
(
    no_ins_train_X, no_ins_val_X, no_ins_test_X,
    no_ins_train_Y_class, no_ins_val_Y_class, no_ins_test_Y_class,
    no_ins_train_Y_style, no_ins_val_Y_style, no_ins_test_Y_style,
) = _train_val_test_split(df_no_ins_train, df_no_ins_target_class, df_no_ins_target_style)

Note: please refer to the tables under "Split (train, validation, test)" for easier understanding

---
## Augmentation

In [31]:
# Relative image paths of the reviewed rows that are still labelled "Keep".
dup_paths = inspect_review.loc[inspect_review["Duplicate_Type"] == "Keep", "Path"]

In [44]:
import os
from utils.augmentation import augment_image
from PIL import Image

base_path = '../data/raw/Furniture_Data'
output_dir = '../data/augment/'

# Every output is written directly into output_dir (only the file name of each source path
# is kept), so the folder only needs to be created once, before the loop.
os.makedirs(output_dir, exist_ok=True)

for path in dup_paths:
    input_path = os.path.join(base_path, path)
    # NOTE(review): the Class/Style sub-folders are dropped here, so two source images that share
    # a file name would overwrite each other — confirm file names are unique across folders.
    filename = os.path.basename(path)
    output_path = os.path.join(output_dir, filename)

    # The context manager closes the source file handle as soon as this image is done,
    # instead of leaving one open handle per iteration. Saving happens inside the block so the
    # source is still open if the augmented image shares data with it.
    with Image.open(input_path) as img:
        augment_img = augment_image(img)
        augment_img.save(output_path, "JPEG")

---
## Rescaling & Normalization
Note: TensorFlow is used for quick normalization and rescaling. The 'utils/tensorflow_preprocessing.py' file contains functions that normalize and rescale each image in the dataset.

In [14]:
import tensorflow as tf
# Sentinel that lets tf.data choose tuning values (e.g. prefetch buffer size) at runtime.
# NOTE(review): AUTOTUNE is not referenced by any other cell visible in this notebook;
# newer TensorFlow releases also expose it as tf.data.AUTOTUNE.
AUTOTUNE = tf.data.experimental.AUTOTUNE

# Displayed as the cell output to record the TensorFlow version in use.
tf.__version__

In [15]:
# Folder passed to prepare_image_dataset together with the relative "Path" values
# (presumably the root the paths are resolved against — helper not shown here).
base_path = '../data/raw/Furniture_Data'

# Training split of the duplicates-cleaned data.
dup_train_datagen = create_image_data_generator()
image_paths = no_dup_train_X['Path'].tolist()
# NOTE(review): 256, 256, 32 look like target image size and batch size — confirm against
# prepare_image_dataset's signature.
dup_train_generator = prepare_image_dataset(image_paths, 256, 256, 32, dup_train_datagen, base_path)

In [18]:
# Validation split of the duplicates-cleaned data: paths must come from no_dup_val_X
# (not from the training split), otherwise validation would run on the training images.
dup_val_datagen = create_image_data_generator()
image_paths = no_dup_val_X['Path'].tolist()
dup_val_generator = prepare_image_dataset(image_paths, 256, 256, 32, dup_val_datagen, base_path)

In [19]:
# Test split of the duplicates-cleaned data: paths must come from no_dup_test_X
# (not from the training split), otherwise the test set would be the training images.
dup_test_datagen = create_image_data_generator()
image_paths = no_dup_test_X['Path'].tolist()
dup_test_generator = prepare_image_dataset(image_paths, 256, 256, 32, dup_test_datagen, base_path)

In [20]:
# Training split of the manually-inspected data: paths must come from no_ins_train_X,
# not from the duplicates-cleaned split.
ins_train_datagen = create_image_data_generator()
image_paths = no_ins_train_X['Path'].tolist()
ins_train_generator = prepare_image_dataset(image_paths, 256, 256, 32, ins_train_datagen, base_path)

In [21]:
# Validation split of the manually-inspected data: paths must come from no_ins_val_X,
# not from the duplicates-cleaned training split.
ins_val_datagen = create_image_data_generator()
image_paths = no_ins_val_X['Path'].tolist()
ins_val_generator = prepare_image_dataset(image_paths, 256, 256, 32, ins_val_datagen, base_path)

In [22]:
# Test split of the manually-inspected data: paths must come from no_ins_test_X,
# not from the duplicates-cleaned training split.
ins_test_datagen = create_image_data_generator()
image_paths = no_ins_test_X['Path'].tolist()
ins_test_generator = prepare_image_dataset(image_paths, 256, 256, 32, ins_test_datagen, base_path)

---
### Testing code to check that the images are correctly normalized (can be deleted after checking)

In [16]:
# Visualize the first batch in dup_train_generator
# `.take(1)` limits the iteration to a single (images, paths) pair.
for image_batch, path_batch in dup_train_generator.take(1):
    # Convert both tensors to NumPy; the paths are cast to str before being passed on.
    show_batch(image_batch.numpy(), path_batch.numpy().astype(str))

---
## Encoding

In [23]:
# Each target frame is replaced by its encoded version, so this cell is not idempotent:
# re-running it would feed already-encoded frames back into the encoders.
# NOTE(review): every split is encoded by its own call. If the helpers derive the categories from
# the frame they receive, a category missing from one split would give that split a different
# column layout / label mapping — confirm against utils/encoding.py.

# One-Hot Encoding
no_dup_train_Y_style, no_dup_test_Y_style, no_dup_val_Y_style = [
    one_hot_encode(split, 'Style')
    for split in (no_dup_train_Y_style, no_dup_test_Y_style, no_dup_val_Y_style)
]

no_ins_train_Y_style, no_ins_test_Y_style, no_ins_val_Y_style = [
    one_hot_encode(split, 'Style')
    for split in (no_ins_train_Y_style, no_ins_test_Y_style, no_ins_val_Y_style)
]

# Label Encoding
no_dup_train_Y_class, no_dup_test_Y_class, no_dup_val_Y_class = [
    label_encode(split, 'Class')
    for split in (no_dup_train_Y_class, no_dup_test_Y_class, no_dup_val_Y_class)
]

no_ins_train_Y_class, no_ins_test_Y_class, no_ins_val_Y_class = [
    label_encode(split, 'Class')
    for split in (no_ins_train_Y_class, no_ins_test_Y_class, no_ins_val_Y_class)
]

In [24]:
# Preview the first rows of the one-hot encoded "Style" target (training split, duplicates cleaned).
no_dup_train_Y_style.head()

In [25]:
# Preview the first rows of the label-encoded "Class" target (training split, duplicates cleaned).
no_dup_train_Y_class.head()