# Preprocessing

---
## Imports

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import sys
sys.path.append('../')

from utils.duplicates import remove_rows, get_duplicates_to_delete, visualize_duplicates
from utils.rescale_image import resize_images_in_dataframe, display_random_images
from utils.normalization import process_images, insert_normalized_images
from utils.encoding import one_hot_encode, label_encode
import pandas as pd
import numpy as np

In [None]:
# Load the processed image metadata table and the duplicate report.
df, duplicates = (
    pd.read_csv('../data/processed/csv/df.csv'),
    pd.read_csv('../data/processed/csv/duplicates.csv'),
)

--- 
## Remove duplicates

Two DataFrames are produced:

| Variables   | Description                                                       |
|-------------|-------------------------------------------------------------------|
| `df_no_dup` | DataFrame with all but one duplicate of each style removed        |
| `df_no_ins` | DataFrame with misclassified images manually reviewed and removed |

### Automatically delete:
- For each duplicate group, delete all but one of the rows marked as "Duplicate" for each "Style".
- Outputs "df_no_dup".

In [None]:
# Ask the helper which rows of the duplicate report should be deleted, then
# remove those rows from the full table `df`.
duplicates_to_delete = get_duplicates_to_delete(duplicates)
df_no_dup = remove_rows(df, duplicates_to_delete)
# Bare expression so the notebook renders the resulting frame.
df_no_dup

### Manually delete:
- Any rows marked as "Inspect" that belong to the wrong "Class".
- Outputs "df_no_ins"

In [None]:
# Keep only the rows of the duplicate report whose type is 'Inspect'.
inspects = duplicates.loc[duplicates['Duplicate_Type'].eq('Inspect')]
# Bare expression so the notebook renders the filtered rows.
inspects

In [None]:
# Number of distinct "Group" values among the inspect rows; a missing
# value (NaN) is counted as its own group rather than being dropped.
total_inspect_groups = len(inspects["Group"].unique())

In [None]:
# Hand-picked row labels to delete after manual review. They are used as
# `.loc` index labels on the copy of `inspects`, so they must match the
# index of the duplicate report.
inspects_rows_to_delete = [
    91, 154, 205, 227, 235,
    277, 280, 281, 287, 290,
    299, 310, 318, 323, 325,
]

In [None]:
# Work on a fresh frame so `inspects` itself is left untouched: relabel
# every inspect row as "Keep", then flag the hand-picked labels "DELETE".
inspect_review = inspects.assign(Duplicate_Type="Keep")
inspect_review.loc[inspects_rows_to_delete, "Duplicate_Type"] = "DELETE"

In [None]:
# Plot the reviewed inspect rows, passing the number of inspect groups
# computed earlier so the Keep/DELETE labelling can be checked visually.
visualize_duplicates(inspect_review, total_inspect_groups)

In [None]:
# Select the reviewed rows that were flagged "DELETE".
inspects_to_delete = inspect_review.loc[inspect_review["Duplicate_Type"].eq("DELETE")]
# Bare expression so the notebook renders the rows about to be removed.
inspects_to_delete

In [None]:
# Remove the manually flagged rows from the full table.
# NOTE(review): this starts from `df`, not `df_no_dup`, so the rows removed
# by the automatic duplicate step are still present in `df_no_ins` -
# confirm that this is intended.
df_no_ins = remove_rows(df, inspects_to_delete)
# Bare expression so the notebook renders the resulting frame.
df_no_ins

---
## Split (train, validation, test)

This part is complicated since there are 2 starting DataFrames (duplicates cleaned / inspects cleaned), and 2 target classes (Class / Style). 
Please refer to this table for the variables after splitting.

#### Duplicates cleaned
| Train                  | Validation           | Test                  | Description   |
|------------------------|----------------------|-----------------------|---------------|
| `no_dup_train_X`       | `no_dup_val_X`       | `no_dup_test_X`       | Data features |
| `no_dup_train_Y_class` | `no_dup_val_Y_class` | `no_dup_test_Y_class` | Target class  |
| `no_dup_train_Y_style` | `no_dup_val_Y_style` | `no_dup_test_Y_style` | Target style  |

#### Manual inspection cleaned
| Train                  | Validation           | Test                  | Description   |
|------------------------|----------------------|-----------------------|---------------|
| `no_ins_train_X`       | `no_ins_val_X`       | `no_ins_test_X`       | Data features |
| `no_ins_train_Y_class` | `no_ins_val_Y_class` | `no_ins_test_Y_class` | Target class  |
| `no_ins_train_Y_style` | `no_ins_val_Y_style` | `no_ins_test_Y_style` | Target style  |


### Prepare target and training

In [None]:
# Build single-column target frames ("Class" and "Style") for each cleaned
# table. Copies are taken so later edits do not touch the source frames.

# df_no_dup
df_no_dup_target_class, df_no_dup_target_style = (
    df_no_dup[[target_column]].copy() for target_column in ("Class", "Style")
)

# df_no_ins
df_no_ins_target_class, df_no_ins_target_style = (
    df_no_ins[[target_column]].copy() for target_column in ("Class", "Style")
)

In [None]:
# Only "Path" is kept as a feature for now: the values of all the other
# columns ("Type", "Width", "Height"...) are the same for every row.
columns_to_keep = ["Path"]  # Change later if needed
columns_to_drop = [name for name in df.columns if name not in columns_to_keep]

# `columns=` already identifies the axis, so no separate `axis` argument is needed.
df_no_dup_train = df_no_dup.drop(columns=columns_to_drop)
df_no_ins_train = df_no_ins.drop(columns=columns_to_drop)

### Splitting

In [None]:
# df_no_dup
no_dup_train_X, no_dup_test_X, no_dup_train_Y_class, no_dup_test_Y_class, no_dup_train_Y_style, no_dup_test_Y_style = train_test_split(
    df_no_dup_train,
    df_no_dup_target_class,
    df_no_dup_target_style,
    test_size=0.2,
    random_state=42
)
no_dup_train_X, no_dup_val_X, no_dup_train_Y_class, no_dup_val_Y_class, no_dup_train_Y_style, no_dup_val_Y_style = train_test_split(
    no_dup_train_X,
    no_dup_train_Y_class,
    no_dup_train_Y_style,
    test_size=0.25,
    random_state=42
)

# df_no_ins
no_ins_train_X, no_ins_test_X, no_ins_train_Y_class, no_ins_test_Y_class, no_ins_train_Y_style, no_ins_test_Y_style = train_test_split(
    df_no_ins_train,
    df_no_ins_target_class,
    df_no_ins_target_style,
    test_size=0.2,
    random_state=42
)
no_ins_train_X, no_ins_val_X, no_ins_train_Y_class, no_ins_val_Y_class, no_ins_train_Y_style, no_ins_val_Y_style = train_test_split(
    no_ins_train_X,
    no_ins_train_Y_class,
    no_ins_train_Y_style,
    test_size=0.25,
    random_state=42
)

Note: please refer to the tables under "Split (train, validation, test)" for easier understanding

---
## Rescaling

Note: Rescaling is applied only to the feature DataFrames, since the target DataFrames don't have a 'Path' column.

In [None]:
# Preview the first rows of the training features before resizing.
no_dup_train_X.head()

In [None]:
# Directory passed to the resize helper as the base path of the raw images.
base_path = '../data/raw/Furniture_Data'

# Run every feature split through `resize_images_in_dataframe` and rebind
# each name to the frame the helper returns (order: train, test, val).
no_dup_train_X, no_dup_test_X, no_dup_val_X = (
    resize_images_in_dataframe(feature_frame, base_path)
    for feature_frame in (no_dup_train_X, no_dup_test_X, no_dup_val_X)
)

no_ins_train_X, no_ins_test_X, no_ins_val_X = (
    resize_images_in_dataframe(feature_frame, base_path)
    for feature_frame in (no_ins_train_X, no_ins_test_X, no_ins_val_X)
)

In [None]:
# Preview the training features again, now as returned by the resize helper.
no_dup_train_X.head()

In [None]:
# Visual spot check on the duplicates-cleaned training split.
display_random_images(no_dup_train_X)

In [None]:
# Visual spot check on the inspection-cleaned training split.
display_random_images(no_ins_train_X)

---
## Normalization (pixel)

In [None]:
# Add an all-NaN 'NormalizedImage' placeholder column to every feature split.
# A scalar is broadcast to every row regardless of the frame's index. The
# previous form assigned a `pd.Series` with a fresh 0..n-1 index, which pandas
# aligns on index labels; on these shuffled splits that only produced the
# intended column because every value was NaN anyway.
for feature_frame in (
    no_dup_train_X,
    no_dup_test_X,
    no_dup_val_X,
    no_ins_train_X,
    no_ins_test_X,
    no_ins_val_X,
):
    feature_frame['NormalizedImage'] = np.nan

In [None]:
# Settings handed to `process_images` for every split.
chunk_size = 10000
save_path = '../data/processed/csv'

# Call the helper once per duplicates-cleaned split, passing the split's
# variable name as its label. The return value is not used here; presumably
# the helper writes its output under `save_path` - confirm in utils.normalization.
for split_label, split_frame in (
    ('no_dup_train_X', no_dup_train_X),
    ('no_dup_test_X', no_dup_test_X),
    ('no_dup_val_X', no_dup_val_X),
):
    process_images(split_label, split_frame, chunk_size, save_path)

In [None]:
# Same processing for the inspection-cleaned splits, reusing the
# `chunk_size` and `save_path` set in the previous cell.
for split_label, split_frame in (
    ('no_ins_train_X', no_ins_train_X),
    ('no_ins_test_X', no_ins_test_X),
    ('no_ins_val_X', no_ins_val_X),
):
    process_images(split_label, split_frame, chunk_size, save_path)

In [None]:
# Rebind each duplicates-cleaned split to the frame returned by
# `insert_normalized_images`, looked up by the same label used when processing.
no_dup_train_X, no_dup_test_X, no_dup_val_X = (
    insert_normalized_images(split_label, split_frame, save_path)
    for split_label, split_frame in (
        ('no_dup_train_X', no_dup_train_X),
        ('no_dup_test_X', no_dup_test_X),
        ('no_dup_val_X', no_dup_val_X),
    )
)

In [None]:
# Rebind each inspection-cleaned split to the frame returned by
# `insert_normalized_images`, looked up by the same label used when processing.
no_ins_train_X, no_ins_test_X, no_ins_val_X = (
    insert_normalized_images(split_label, split_frame, save_path)
    for split_label, split_frame in (
        ('no_ins_train_X', no_ins_train_X),
        ('no_ins_test_X', no_ins_test_X),
        ('no_ins_val_X', no_ins_val_X),
    )
)

In [None]:
# Preview the duplicates-cleaned training features after normalization.
no_dup_train_X.head()

In [None]:
# Preview the duplicates-cleaned test features after normalization.
no_dup_test_X.head()

In [None]:
# Preview the duplicates-cleaned validation features after normalization.
no_dup_val_X.head()

In [None]:
# Preview the inspection-cleaned training features after normalization.
no_ins_train_X.head()

In [None]:
# Preview the inspection-cleaned test features after normalization.
no_ins_test_X.head()

In [None]:
# Preview the inspection-cleaned validation features after normalization.
no_ins_val_X.head()

---
## Augmentation

---
## Encoding

In [None]:
# One-Hot Encoding
no_dup_train_Y_style = one_hot_encode(no_dup_train_Y_style, 'Style')
no_dup_test_Y_style = one_hot_encode(no_dup_test_Y_style, 'Style')
no_dup_val_Y_style = one_hot_encode(no_dup_val_Y_style, 'Style')

no_ins_train_Y_style = one_hot_encode(no_ins_train_Y_style, 'Style')
no_ins_test_Y_style = one_hot_encode(no_ins_test_Y_style, 'Style')
no_ins_val_Y_style = one_hot_encode(no_ins_val_Y_style, 'Style')

# Label Encoding
no_dup_train_Y_class = label_encode(no_dup_train_Y_class, 'Class')
no_dup_test_Y_class = label_encode(no_dup_test_Y_class, 'Class')
no_dup_val_Y_class = label_encode(no_dup_val_Y_class, 'Class')

no_ins_train_Y_class = label_encode(no_ins_train_Y_class, 'Class')
no_ins_test_Y_class = label_encode(no_ins_test_Y_class, 'Class')
no_ins_val_Y_class = label_encode(no_ins_val_Y_class, 'Class')

In [None]:
# Preview the encoded "Style" target of the duplicates-cleaned training split.
no_dup_train_Y_style.head()

In [None]:
# Preview the encoded "Class" target of the duplicates-cleaned training split.
no_dup_train_Y_class.head()