In [ ]:
import os, sys
sys.path.append(os.path.abspath(os.path.join('..')))
    
from PIL import Image
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import imagehash


from utils.visualization import data_count_plot

from utils import data

### Loading Dataframe

In [ ]:
# Directory holding the raw dataset, given relative to the working directory.
base_path = '../data/raw/Furniture_Data'
# Build the dataset table with the project's data.load helper.
# NOTE(review): presumably one row per image file under base_path — confirm in utils/data.py.
df = data.load(base_path)

In [ ]:
# Summary statistics of the table, displayed as the cell output.
# NOTE(review): assumes df is a pandas DataFrame, where describe() covers numeric columns by default.
df.describe()

In [ ]:
# Preview the first rows of the table as the cell output.
df.head()

### Histogram Plot

In [ ]:
def plot_histogram(df, column_name, bins=20, title=None):
    """Draw and show a histogram of a single column of ``df``.

    Args:
        df: Table that can be indexed by column name.
        column_name: Column whose values are binned; also used as the
            x-axis label.
        bins: Bin specification forwarded to matplotlib's ``hist``.
        title: Plot title. Any falsy value falls back to
            ``'Distribution of <column_name>'``.
    """
    heading = title if title else f'Distribution of {column_name}'

    # One 10x5 inch figure with a single axes; draw on the axes directly.
    _, axes = plt.subplots(figsize=(10, 5))
    axes.hist(df[column_name], bins=bins, color='skyblue', edgecolor='black')
    axes.set_title(heading)
    axes.set_xlabel(column_name)
    axes.set_ylabel('Frequency')
    plt.show()

In [ ]:
# Plot one histogram per image-geometry column (width, height, ratio),
# each with the default 20 bins of plot_histogram.
plot_histogram(df, 'Width', title='Histogram of Image Widths')
plot_histogram(df, 'Height', title='Histogram of Image Heights')
plot_histogram(df, 'Ratio', title='Histogram of Image Ratios')

In [ ]:
# 2x2 grid of count plots, one panel per column.
fig, ax = plt.subplots(2, 2, figsize=(10, 10))

# Top row: file type and image mode.
data_count_plot(df, 'Type', ax=ax[0][0], title='File Types')
data_count_plot(df, 'Mode', ax=ax[0][1], title='Image Modes')
# Bottom row: class and style, drawn with horizontal=True.
# NOTE(review): presumably horizontal bars — confirm in utils/visualization.py.
data_count_plot(df, 'Class', ax=ax[1][0], horizontal=True, title='Classes')
data_count_plot(df, 'Style', ax=ax[1][1], horizontal=True, title='Styles')

# Shared title for the whole figure, then let matplotlib adjust panel spacing.
fig.suptitle('Raw Dataset Statistics', fontsize=20, fontweight='bold', y=1.0)
fig.tight_layout()

In [ ]:
# Alternative count plots

# plt.figure(figsize=(10, 10))
# data_count_plot(df, 'Type', title='File Types')
# plt.show()
#
# plt.figure(figsize=(10, 10))
# data_count_plot(df, 'Mode', title='Image Modes')
# plt.show()
#
# plt.figure(figsize=(10, 10))
# data_count_plot(df, 'Class', horizontal=True, title='Classes')
# plt.show()
#
# plt.figure(figsize=(10, 10))
# data_count_plot(df, 'Style', horizontal=True, title='Styles')
# plt.show()

In [ ]:
def find_near_duplicates(df, threshold=5):
    """Group images whose perceptual hashes differ by at most ``threshold``.

    Every entry of ``df['Path']`` is opened relative to the module-level
    ``base_path`` and hashed with ``imagehash.phash``. The hash is compared
    against one representative hash per group found so far; the image joins
    the first group whose representative is within ``threshold`` (Hamming
    distance), otherwise it starts a new group. Grouping is greedy, so the
    result can depend on row order.

    Args:
        df: Table with a ``'Path'`` column of image paths relative to
            ``base_path``.
        threshold: Largest hash distance still treated as a near-duplicate.

    Returns:
        A list of lists of paths, one inner list per group that contains at
        least two images. Empty when no near-duplicates are found.

    Raises:
        OSError: If an image file cannot be opened or decoded.
    """
    # (representative hash, paths in that group). A list of pairs is used
    # instead of a dict so grouping never relies on hash-key equality.
    groups = []

    # Iterating a DataFrame directly yields its column labels, not its rows,
    # so walk the 'Path' column explicitly.
    for file_path in df['Path']:
        # Close the file handle as soon as the hash has been computed.
        with Image.open(base_path + '/' + file_path) as image:
            image_hash = imagehash.phash(image)

        for existing_hash, paths in groups:
            # Subtracting two ImageHash objects gives their Hamming distance.
            if image_hash - existing_hash <= threshold:
                paths.append(file_path)
                break
        else:
            # No existing group is close enough: this image seeds a new one.
            groups.append((image_hash, [file_path]))

    return [paths for _, paths in groups if len(paths) > 1]

# Run the near-duplicate scan over the whole table with threshold 1.
duplicates = find_near_duplicates(df, 1)