In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
# Load the recompiled feature table; the first CSV column becomes the row index.
df_ref = pd.read_csv("recompiled_features_dataset.csv", index_col=0)
# Preview the first rows.
df_ref.head()

### Drop the `scaled_` columns

In [None]:
# Keep every column whose label does not start with "scaled_".
is_scaled_col = df_ref.columns.str.startswith("scaled_")
df_ref = df_ref.loc[:, ~is_scaled_col]

### Surface Level Info
The full DataFrame contains 27318 rows (one per face) and 1049 columns (attributes).

In [None]:
# (rows, columns) of the frame after the scaled_ columns were removed.
df_ref.shape

In [None]:
# Concise summary of the frame (index range, dtypes, memory usage).
df_ref.info()

In [None]:
# Descriptive statistics (count, mean, std, quartiles, ...) per column.
df_ref.describe()

In [None]:
# Column labels that remain after dropping the scaled_ columns.
df_ref.columns

### Extracting of Necessary Attributes (Pixels)
---

In [None]:
# cols = ["updated_source_h", "updated_source_w", "face_index", "x1", "y1", "x2", "y2", #"used_mask",
#         "pixels", "scaled_pixels", "bg_pixels", "scaled_bg_pixels", "scaled_x1", "scaled_y1",
#         "scaled_x2", "scaled_y2", "bbox_area", "scaled_bbox_area", "lbbox_area", "scaled_lbbox_area",
#         "e_face_yn", "e_face_mp", "e_face_yf", "e_bg_yn", "e_bg_mp", "e_bg_yf", "e_bbox_yn",
#         "e_bbox_mp", "e_bbox_yf", "e_lbbox_yn", "e_lbbox_mp", "e_lbbox_yf"]

# df = df_ref[cols]
# df.info()

**Check for null values - NO NULL VALUES**

In [None]:
# Work on an independent copy: the cleaning cells below assign columns on `df`
# in place, and a plain alias (`df = df_ref`) would silently modify `df_ref`
# as well, so re-running from this cell would not restore the loaded data.
df = df_ref.copy()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Dtype of every column.
df.dtypes

In [None]:
# Descriptive statistics of the working frame before any cleaning.
df.describe()

# Data Cleaning
1. Pixels Replacement
2. Clamping of Epsilon Values
---

### Replace Pixels and BG Pixels
---

In [None]:
# Display the working frame before the pixel replacement.
df

In [None]:
# zero_save_indices = df["scaled_pixels"] == 0
# zero_save_indices

In [None]:
# no_pixels = df[df["pixels"] != 0]

In [None]:
# Rows whose "pixels" value is exactly 0 take the row's "bg_pixels" value instead;
# every other row keeps its own "pixels" value.
has_zero_pixels = df["pixels"] == 0
df["pixels"] = np.where(has_zero_pixels, df["bg_pixels"], df["pixels"])
# The equivalent replacement for "scaled_pixels" is intentionally not applied:
# the scaled_ columns were dropped right after loading.

### Check for outliers
---

In [None]:
# Display the working frame after the pixel replacement.
df

In [None]:
### All with outliers in at least one column with respect to pixels, source, epsilon, etc.
# Need to have no NaN values
# outliers = no_pixels[~(np.abs(stats.zscore(no_pixels)) < 3).all(axis=1)]
# outliers

### Clamping of Epsilon Values

In [None]:
# Epsilon columns are named e_<region>_<detector>; build the twelve labels in
# region-major order (face, bg, bbox, lbbox) x (yn, mp, yf).
eps_regions = ("face", "bg", "bbox", "lbbox")
eps_detectors = ("yn", "mp", "yf")
eps_cols = [f"e_{region}_{detector}" for region in eps_regions for detector in eps_detectors]

# Summary statistics of the epsilon columns before clamping.
df[eps_cols].describe()

In [None]:
# Cap every epsilon column at 3: values above 3 become 3, lower values are unchanged.
df[eps_cols] = df[eps_cols].clip(upper=3)

### Dropping rows whose epsilon equals 3 (the clamp value) instead of keeping them

In [None]:
# Epsilon columns that decide whether a row is kept.
drop_eps_cols = ["e_face_yn", "e_face_mp", "e_face_yf",
                 "e_bbox_yn", "e_bbox_mp", "e_bbox_yf",
                 "e_lbbox_yf"]

# Keep a row only if every listed epsilon is strictly below 3.
# A single vectorised mask replaces the per-column re-filtering loop (same rows
# survive, NaN still fails the comparison), and the explicit .copy() makes
# test_df an independent frame so later column assignments do not write into a
# slice of `df` (which would raise SettingWithCopyWarning).
keep_rows = (df[drop_eps_cols] < 3).all(axis=1)
test_df = df.loc[keep_rows].copy()

In [None]:
# Descriptive statistics after the epsilon row filter.
test_df.describe()

In [None]:
# Continue with the filtered rows as the working frame.
df = test_df

## CREATE NEW COLUMNS FOR EACH ROW
---
Each row's corner coordinates are converted to a centre point and a size:

`x, y, w, h = (row['x1'] + row['x2']) / 2, (row['y1'] + row['y2']) / 2, row['x2'] - row['x1'], row['y2'] - row['y1']`

In [None]:
# Derive midpoint (x, y) and extent (w, h) from the x1/y1/x2/y2 columns
# (presumably bounding-box corners).
# Column arithmetic replaces the original iterrows() loop, which built a full
# Series for every row of a very wide frame (slow) and could upcast values.
# assign() returns a new frame with x, y, w, h appended in that order, so the
# result never writes into a filtered slice of an earlier frame.
df = df.assign(
    x=(df["x1"] + df["x2"]) / 2,
    y=(df["y1"] + df["y2"]) / 2,
    w=df["x2"] - df["x1"],
    h=df["y2"] - df["y1"],
)

In [None]:
# Descriptive statistics including the new x, y, w, h columns.
df.describe()

## Normalization of non E values
---

In [None]:
# Every column whose label contains "area".
area_mask = df.columns.str.contains("area")
area_cols = df.columns[area_mask]

# Print the statistics before rescaling.
print(df[area_cols].describe())

# Divide by 416 * 416 (presumably the pixel area of a 416 x 416 input — confirm).
# NOTE: the columns are overwritten, so re-running this cell divides them again.
df[area_cols] = df[area_cols].div(416 * 416)

In [None]:
# Every column whose label contains "pixels".
pixels_mask = df.columns.str.contains("pixels")
pixels_cols = df.columns[pixels_mask]

# Print the statistics before rescaling.
print(df[pixels_cols].describe())

# Divide by 416 * 416 (presumably the pixel area of a 416 x 416 input — confirm).
# NOTE: the columns are overwritten, so re-running this cell divides them again.
df[pixels_cols] = df[pixels_cols].div(416 * 416)

In [None]:
# Centre/size columns created earlier; the corner columns are no longer needed.
dim_cols = ["x", "y", "w", "h"]
df = df.drop(columns=["x1", "y1", "x2", "y2"])

# Print the statistics before rescaling.
print(df[dim_cols].describe())

# NOTE(review): areas and pixel counts are divided by 416 * 416, but the
# dimensions are divided by 415 — confirm that the different base is intended.
# NOTE: re-running this cell fails on the drop above (columns already removed).
df[dim_cols] = df[dim_cols].div(415)

In [None]:
# Re-print the statistics of the three rescaled column groups to check their ranges.
print(df[area_cols].describe())
print(df[pixels_cols].describe())
print(df[dim_cols].describe())

In [None]:
# Column labels that do not contain "BIN".
df.columns[~df.columns.str.contains("BIN")]

In [None]:
# Independent snapshot of the cleaned, rescaled frame.
save_df = df.copy()
# Display the snapshot.
save_df

In [None]:
# normalize_cols = ["updated_source_h", "updated_source_w", "x1", "y1", "x2", "y2",
#                  "pixels", "scaled_pixels", "bg_pixels", "scaled_bg_pixels", "scaled_x1",
#                  "scaled_y1", "scaled_x2", "scaled_y2", "bbox_area", "scaled_bbox_area",
#                  "lbbox_area", "scaled_lbbox_area", "x", "y", "w", "h"]

# for col in normalize_cols:
#     df[col] = (df[col] - df[col].min()) / (df[col].max() - df[col].min())   

In [None]:
# Display the frame that is about to be written to disk.
df

In [None]:
# Write the cleaned, rescaled frame to CSV (the row index is written as the first column).
df.to_csv('cleaned_eda_features.csv')

# EDA TO-DO
---
1. Check surface level boxplots for all columns
2. Get the Correlation Mapping
3. Identify the outliers from the boxplots for each appropriate column and then investigate
4. Get the seaborn regplot of pixels, scaled_pixels, bg_pixels, scaled_bg_pixels, bbox_area, scaled_bbox_area, lbbox_area, scaled_lbbox_area corresponding to each appropriate epsilon values
5. Standardization of Data if Bell Curve or Evenly Distributed
6. Else, Normalization
---
7. Specific Stuff

## Get the subset of columns with int or float values only so you can visualize
---

In [None]:
# Snapshot the current df again; the plotting subset is taken from this copy.
save_df = df.copy()

In [None]:
# Candidate numeric columns for the plots below.
candidate_cols = ["updated_source_h", "updated_source_w", "face_index", "x1", "y1", "x2", "y2", "x", "y", "w", "h",#"used_mask",
                  "pixels", "scaled_pixels", "bg_pixels", "scaled_bg_pixels", "scaled_x1", "scaled_y1",
                  "scaled_x2", "scaled_y2", "bbox_area", "scaled_bbox_area", "lbbox_area", "scaled_lbbox_area",
                  "e_face_yn", "e_face_mp", "e_face_yf", "e_bg_yn", "e_bg_mp", "e_bg_yf", "e_bbox_yn",
                  "e_bbox_mp", "e_bbox_yf", "e_lbbox_yn", "e_lbbox_mp", "e_lbbox_yf"]

# Skip the scaled_ columns (dropped after loading) and any other candidate that
# is no longer in save_df. x1/y1/x2/y2 were dropped during normalisation, so
# selecting the unfiltered list raised KeyError.
cols = [name for name in candidate_cols
        if not name.startswith("scaled_") and name in save_df.columns]

# Make the skipped non-scaled candidates visible instead of dropping them silently.
missing_cols = [name for name in candidate_cols
                if not name.startswith("scaled_") and name not in save_df.columns]
print("Candidate columns not present in save_df:", missing_cols)

# Select first, then copy: only the chosen columns are duplicated.
df = save_df[cols].copy()
df.info()

## 1 - Check Surface Level Box Plots For All Columns

In [None]:
# Number of columns that will be plotted.
len(df.columns)

In [None]:
# 7 x 5 grid (room for 35 subplots), one box plot per column, filled row by row.
fig = make_subplots(rows=7, cols=5, subplot_titles=df.columns.tolist())

for slot, column in enumerate(df.columns):
    # slot 0..4 -> row 1, slot 5..9 -> row 2, ...; plotly rows/cols are 1-based.
    grid_row, grid_col = divmod(slot, 5)
    box_trace = go.Box(y=df[column], name=column)
    fig.add_trace(box_trace, row=grid_row + 1, col=grid_col + 1)

fig.update_layout(height=3000, width=1200, title_text="BoxPlot")
fig.show()

### Confirm the distribution skewness

In [None]:
# 7 x 5 grid (room for 35 subplots), one histogram per column, filled row by row.
fig = make_subplots(rows=7, cols=5, subplot_titles=df.columns.tolist())

for slot, column in enumerate(df.columns):
    # slot 0..4 -> row 1, slot 5..9 -> row 2, ...; plotly rows/cols are 1-based.
    grid_row, grid_col = divmod(slot, 5)
    hist_trace = go.Histogram(x=df[column], name=column)
    fig.add_trace(hist_trace, row=grid_row + 1, col=grid_col + 1)

fig.update_layout(height=3000, width=1200, title_text="Histogram Plot")
fig.show()

## 2 - Get the Correlation Heatmap

In [None]:
# Pairwise correlation matrix of the selected columns.
df_corr = df.corr()
corr_values = df_corr.to_numpy()

# One heatmap cell per column pair, annotated with the coefficient to two decimals.
heatmap = go.Heatmap(
    x=df_corr.columns,
    y=df_corr.index,
    z=corr_values,
    text=corr_values,
    texttemplate='%{text:.2f}',
)

fig = go.Figure(data=[heatmap])
fig.update_layout(height=1080, width=1200, title_text="Heatmap")
fig.show()

The heatmap can be used to identify candidate correlations first, before moving on to regression plots.

## 7 - Specific Stuff

### Face Segment YuNet

### Face Segment MediaPipe

### Face Segment YoloFace

### BBOX YuNet

### BBOX MediaPipe

### BBOX YoloFace

### BG YuNet

### BG MediaPipe

### BG YoloFace

### 2x BG YuNet

### 2x BG MediaPipe

### 2x BG YoloFace