<a href="https://colab.research.google.com/github/Nick-Yazdani/ml-notebooks-portfolio/blob/main/dnn_cnn_diamond_val.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import files

In [2]:
# Open the Colab upload widget. The result is indexed by file name in the next
# cell; the recorded output shows kaggle.json being saved.
uploaded = files.upload()

Saving kaggle.json to kaggle.json


In [3]:
# Report the name and byte length of every file received from the upload widget.
for uploaded_name, uploaded_bytes in uploaded.items():
  message = 'User uploaded file "{name}" with length {length} bytes'.format(
      name=uploaded_name,
      length=len(uploaded_bytes),
  )
  print(message)

User uploaded file "kaggle.json" with length 70 bytes


In [4]:
# Move the uploaded kaggle.json into ~/.kaggle/ and restrict it to owner
# read/write (chmod 600).
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

In [5]:
# Download the dataset archive with the Kaggle CLI; the recorded output shows
# natural-diamonds-prices-images.zip landing in /content.
!kaggle datasets download "harshitlakhani/natural-diamonds-prices-images"

Downloading natural-diamonds-prices-images.zip to /content
 93% 143M/154M [00:01<00:00, 139MB/s]
100% 154M/154M [00:01<00:00, 140MB/s]


In [6]:
# Extract the downloaded archive into the working directory (-qq keeps unzip quiet).
!unzip -qq natural-diamonds-prices-images.zip

In [7]:
# Remove the Diamonds2 directory; the notebook only reads from Diamonds/Diamonds
# (see DATA_DIRECTORY). Presumably Diamonds2 comes from the archive - verify.
!rm -rf Diamonds2

In [8]:
# Delete the zip archive now that it has been extracted.
!rm natural-diamonds-prices-images.zip

In [70]:
import os
import re

import pandas as pd
import numpy as np
import tensorflow as tf

In [71]:
# Folder that holds the per-shape CSV files.
DATA_DIRECTORY = "Diamonds/Diamonds"

# Root folder of the diamond photographs (one sub-folder level is walked below).
IMAGE_DIRECTORY = f"{DATA_DIRECTORY}/images"

# Tabular model inputs; "length", "width" and "depth" are derived from the
# raw measurements column during cleaning.
FEATURES = [
    "shape",
    "weight",
    "clarity",
    "colour",
    "cut",
    "polish",
    "symmetry",
    "fluorescence",
    "length",
    "width",
    "depth",
]

# Regression target column(s).
TARGET = ["price"]

# (height, width, channels) every image is resized to.
IMAGE_SHAPE = (64, 64, 3)

In [72]:
def load_and_combine_diamond_csv_files(data_directory: str) -> pd.DataFrame:
  """Read every ``.csv`` file in ``data_directory`` and stack them into one frame.

  Files are read in sorted name order. ``os.listdir`` returns entries in
  arbitrary order, so without sorting the row order of the result (and every
  seeded shuffle/split applied to it afterwards) could differ between
  machines or runs.

  Args:
    data_directory: Directory that directly contains the CSV files.

  Returns:
    The row-wise concatenation of all CSV files. Each file keeps its own
    index labels, so the combined index may contain duplicates.

  Raises:
    ValueError: Raised by ``pd.concat`` when the directory has no CSV files.
  """
  csv_paths: list[str] = [
      os.path.join(data_directory, file_name)
      for file_name in sorted(os.listdir(data_directory))
      if file_name.endswith(".csv")
  ]
  diamond_dataframes: list[pd.DataFrame] = [pd.read_csv(csv_path) for csv_path in csv_paths]

  return pd.concat(diamond_dataframes)

In [73]:
def simplify_and_fix_colnames(df: pd.DataFrame) -> pd.DataFrame:
  """Return a copy of ``df`` with normalised column names and a fresh index.

  Column labels are lower-cased and spaces become underscores, the
  misspelled ``messurements`` column is renamed to ``measurements``, and the
  index is replaced by a 0..n-1 range.

  The input frame is left untouched; assigning to ``df.columns`` directly
  would rename the caller's columns as a hidden side effect.

  Args:
    df: Frame whose column labels should be normalised. Non-string labels
      are converted with ``str`` first.

  Returns:
    A new frame with the cleaned column names and a reset index.
  """
  simplified_column_names = [str(col).lower().replace(" ", "_") for col in df.columns]

  renamed = df.copy()
  renamed.columns = simplified_column_names
  renamed = renamed.rename(columns={"messurements": "measurements"})

  return renamed.reset_index(drop=True)

In [74]:
def clean_and_seperate_measurement_entries(df: pd.DataFrame) -> pd.DataFrame:
  """Split the ``measurements`` column into float32 length/width/depth columns.

  Each entry is split on every run of characters that is neither a digit nor
  a dot; the first three non-empty pieces become ``length``, ``width`` and
  ``depth``. Empty pieces are discarded because a leading separator (for
  example a leading space) would otherwise shift every value by one position
  and break the float conversion.

  Missing entries, entries with fewer than three numbers, and pieces that are
  not valid numbers produce ``NaN`` instead of raising.

  Args:
    df: Frame with a ``measurements`` column. The three new columns are added
      to this frame in place.

  Returns:
    The same ``df`` object, with ``length``, ``width`` and ``depth`` added.
  """
  column_names = ["length", "width", "depth"]
  separator = re.compile(r"[^\d.]+")

  def to_float(token: str) -> float:
    # Pieces such as "." or "1.2.3" survive the split but are not numbers.
    try:
      return float(token)
    except ValueError:
      return float("nan")

  def parse_entry(entry) -> list:
    # str() lets NaN/None entries pass through: they contain no digits and
    # therefore yield no tokens.
    tokens = [token for token in separator.split(str(entry)) if token]
    values = [to_float(token) for token in tokens[:len(column_names)]]
    return values + [float("nan")] * (len(column_names) - len(values))

  parsed = np.asarray(
      [parse_entry(entry) for entry in df["measurements"].to_numpy()],
      dtype=np.float32,
  ).reshape(-1, len(column_names))  # reshape keeps an empty frame 2-D

  for position, name in enumerate(column_names):
    df[name] = parsed[:, position]

  return df

In [75]:
def clean_prices(price: np.ndarray) -> np.ndarray:
  """Strip currency symbols/separators from prices and return them as float32.

  Every character that is not a digit or a dot is removed from the string
  form of each entry, and the remainder is parsed as a number. Returning
  numbers rather than the cleaned strings keeps the regression target
  numeric, and lets missing or unparseable prices surface as ``NaN`` (which
  ``DataFrame.dropna`` can remove) instead of as empty strings.

  Note that a minus sign is stripped along with other symbols, exactly as
  the regular expression dictates.

  Args:
    price: One-dimensional array of raw price values (strings or numbers).

  Returns:
    A float32 array of the same length; entries with no parseable number
    are ``NaN``.
  """
  non_decimal = re.compile(r"[^\d.]+")

  def to_float(text: str) -> float:
    # "" (nothing left after stripping) and malformed leftovers such as
    # "1.2.3" are treated as missing.
    try:
      return float(text)
    except ValueError:
      return float("nan")

  cleaned = [non_decimal.sub("", raw) for raw in price.astype(str).tolist()]

  return np.asarray([to_float(text) for text in cleaned], dtype=np.float32)

In [76]:
# Load the raw CSVs, normalise the column names, then derive the numeric
# length/width/depth columns from the measurements text.
df = (
    load_and_combine_diamond_csv_files(DATA_DIRECTORY)
    .pipe(simplify_and_fix_colnames)
    .pipe(clean_and_seperate_measurement_entries)
)

# Replace the raw price column with its cleaned values.
df["price"] = clean_prices(df["price"].values)

In [77]:
def map_images_with_df_ids(df: pd.DataFrame, image_directory: str) -> pd.DataFrame:
  """Attach the path of each diamond's image to ``df`` as ``file_name``.

  ``image_directory`` is expected to contain one level of sub-directories
  holding the image files. An image belongs to a row when the part of its
  file name before the first dot equals the row's stripped ``id``.

  The images are indexed once into a dictionary and looked up per row,
  instead of re-scanning every row for every image. The lookup is by value,
  so it does not depend on the frame having a 0..n-1 index.

  Args:
    df: Frame with an ``id`` column. ``id`` is converted to stripped strings
      and ``file_name`` is added, both in place.
    image_directory: Root directory of the images. Entries directly inside
      it that are not directories are ignored.

  Returns:
    The same ``df`` object. Rows without a matching image get ``""`` as
    ``file_name``.
  """
  df["id"] = df["id"].astype(str).str.strip()

  path_by_id: dict[str, str] = {}

  # Sorted traversal makes the winner deterministic when several images
  # share the same id (the last one visited is kept).
  for sub_directory in sorted(os.listdir(image_directory)):
    sub_directory_path = os.path.join(image_directory, sub_directory)

    if not os.path.isdir(sub_directory_path):
      continue

    for image_name in sorted(os.listdir(sub_directory_path)):
      path_by_id[image_name.split(".")[0]] = os.path.join(sub_directory_path, image_name)

  df["file_name"] = [path_by_id.get(row_id, "") for row_id in df["id"]]

  return df

In [78]:
# Add a "file_name" column with each row's image path ("" when no image matched).
df = map_images_with_df_ids(df, IMAGE_DIRECTORY)

In [79]:
# Shuffle the rows with a fixed seed, keep only the modelling columns plus the
# image path, and discard rows that contain missing values.
df = (
    df.sample(frac=1.0, random_state=42)
    .reset_index(drop=True)
    [FEATURES + TARGET + ["file_name"]]
    .dropna()
    .reset_index(drop=True)
)

In [80]:
def train_test_val(df: pd.DataFrame):
  """Split ``df`` into 70% train, 15% test and 15% validation rows.

  The split is seeded (``random_state=42``) and therefore repeatable for a
  given row order.

  The index is normalised to 0..n-1 before splitting. The remainder sets are
  computed with label-based ``drop``, so duplicate index labels (as produced
  by ``pd.concat`` without ``ignore_index``) would otherwise remove far more
  rows than were sampled. Sampling is positional, so frames that already
  have a unique index are split exactly as before.

  Args:
    df: Frame to split. It is not modified.

  Returns:
    A ``(train, test, val)`` tuple - note the order - of frames, each with
    its own 0..n-1 index.
  """
  df = df.reset_index(drop=True)

  train = df.sample(frac=0.7, random_state=42)
  val = df.drop(train.index)
  test = val.sample(frac=0.5, random_state=42)

  val = val.drop(test.index)

  train = train.reset_index(drop=True)
  val = val.reset_index(drop=True)
  test = test.reset_index(drop=True)

  return (train, test, val)

In [81]:
def read_images(df: pd.DataFrame):
  """Load, scale and resize the image of every row that has a usable file.

  Each image is decoded to three channels, converted to float32, divided by
  255 and resized to the height/width given by ``IMAGE_SHAPE``.

  A row is skipped (and removed from the returned frame) when its
  ``file_name`` is not an existing file or does not end in jpg/jpeg/png
  (case-insensitive). Unsupported extensions are skipped because the raw
  file contents cannot be cast to float.

  Rows are selected by position rather than by index label, so the returned
  frame stays aligned with the image tensor even when the frame's index is
  not 0..n-1.

  Args:
    df: Frame with a ``file_name`` column of image paths.

  Returns:
    ``(images, frame)`` where ``images`` is a float32 tensor with one image
    per kept row (shape ``(0, *IMAGE_SHAPE)`` when nothing could be loaded)
    and ``frame`` is ``df`` restricted to the kept rows, without the
    ``file_name`` column and with its original index labels.
  """
  images = []
  kept_positions: list[int] = []

  for position, file_name in enumerate(df["file_name"].to_numpy().astype(str)):
    lowered_name = file_name.lower()

    if lowered_name.endswith(("jpg", "jpeg")):
      decode = tf.io.decode_jpeg
    elif lowered_name.endswith("png"):
      decode = tf.io.decode_png
    else:
      continue

    # isfile (not exists) so that a directory path is skipped too.
    if not os.path.isfile(file_name):
      continue

    image = decode(tf.io.read_file(file_name), channels=3)
    image = tf.cast(image, dtype=tf.float32)
    image /= 255.0
    image = tf.image.resize(image, IMAGE_SHAPE[:-1])

    images.append(image)
    kept_positions.append(position)

  if images:
    images = tf.convert_to_tensor(images)
  else:
    # Keep the rank consistent for callers even when no image was readable.
    images = tf.zeros((0, *IMAGE_SHAPE), dtype=tf.float32)

  df = df.iloc[kept_positions].drop("file_name", axis=1)

  return images, df

In [82]:
# Tuple of frames in (train, test, val) order.
ttv_splits = train_test_val(df)

In [83]:
# read_images returns (images, frame), so train, test and val are each such a pair.
train, test, val = map(read_images, ttv_splits)

In [84]:
def preprocessing(train_df: pd.DataFrame):
  """Create one symbolic Keras input per feature column of ``train_df``.

  Columns with ``object`` dtype get a ``tf.string`` input; every other column
  gets a ``tf.float32`` input. The float inputs are then collected into a
  separate dictionary.

  NOTE(review): as written, the function builds these dictionaries but does
  not return them, and the target frame is not used - it looks unfinished.

  Args:
    train_df: Frame containing the feature columns and the ``TARGET`` column(s).
  """
  train_features = train_df.drop(TARGET, axis=1).copy()
  train_target = train_df[TARGET]  # currently unused

  def keras_dtype(column):
    # Text columns are fed as strings, everything else as float32.
    return tf.string if column.dtype == object else tf.float32

  inputs = {
      name: tf.keras.Input(shape=(1,), name=f"{name}", dtype=keras_dtype(column))
      for name, column in train_features.items()
  }

  numeric_inputs = {
      name: symbolic_input
      for name, symbolic_input in inputs.items()
      if symbolic_input.dtype == tf.float32
  }

  

Unnamed: 0,shape,weight,clarity,colour,cut,polish,symmetry,fluorescence,length,width,depth,price
0,ROUND,0.19,IF,D,EX,EX,EX,N,3.67,3.69,2.30,893.18
1,CUSHION,0.71,SI1,H,EX,EX,VG,N,5.35,4.89,3.30,2021.49
2,ROUND,0.23,IF,G,EX,EX,EX,F,3.95,3.98,2.45,879.44
3,HEART,0.40,VS1,J,GD,EX,VG,M,4.32,5.00,3.06,1001.16
4,CUSHION,0.81,VS2,J,EX,EX,VG,N,5.42,5.09,3.45,2149.77
...,...,...,...,...,...,...,...,...,...,...,...,...
2409,ROUND,0.23,VS1,E,EX,EX,EX,N,3.91,3.93,2.47,904.71
2410,CUSHION,0.50,SI1,FANCY,GD,EX,VG,F,4.20,4.14,3.11,1018.38
2411,ROUND,0.23,VS1,E,EX,EX,EX,N,3.90,3.93,2.46,909.74
2412,CUSHION,1.03,VS2,U-V,EX,EX,VG,N,5.88,5.54,3.72,2515.78
