<a href="https://colab.research.google.com/github/Nick-Yazdani/ml-notebooks-portfolio/blob/main/dnn_cnn_diamond_val.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [76]:
from google.colab import files

In [77]:
uploaded = files.upload()

Saving kaggle.json to kaggle.json


In [78]:
for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(name=fn, length=len(uploaded[fn])))

User uploaded file "kaggle.json" with length 70 bytes


In [79]:
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

In [80]:
!kaggle datasets download "harshitlakhani/natural-diamonds-prices-images"

Downloading natural-diamonds-prices-images.zip to /content
 96% 147M/154M [00:00<00:00, 223MB/s]
100% 154M/154M [00:00<00:00, 231MB/s]


In [81]:
!unzip natural-diamonds-prices-images.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: Diamonds2/images/cushion/1795562.png  
  inflating: Diamonds2/images/cushion/1795563.png  
  inflating: Diamonds2/images/cushion/1795564.png  
  inflating: Diamonds2/images/cushion/1795565.png  
  inflating: Diamonds2/images/cushion/1795566.png  
  inflating: Diamonds2/images/cushion/1795567.png  
  inflating: Diamonds2/images/cushion/1795568.png  
  inflating: Diamonds2/images/cushion/1795571.png  
  inflating: Diamonds2/images/cushion/1795572.png  
  inflating: Diamonds2/images/cushion/1795573.png  
  inflating: Diamonds2/images/cushion/1795575.png  
  inflating: Diamonds2/images/cushion/1795577.png  
  inflating: Diamonds2/images/cushion/1795578.png  
  inflating: Diamonds2/images/cushion/1795582.png  
  inflating: Diamonds2/images/cushion/1795583.png  
  inflating: Diamonds2/images/cushion/1795584.png  
  inflating: Diamonds2/images/cushion/1795587.png  
  inflating: Diamonds2/images/cushion/1796243.png  

In [82]:
!rm -rf Diamonds2

In [83]:
!rm natural-diamonds-prices-images.zip

In [84]:
import os
import re

import pandas as pd
import numpy as np

In [85]:
DATA_DIRECTORY = "Diamonds/Diamonds"
IMAGE_DIRECTORY = f"{DATA_DIRECTORY}/images"
FEATURES = ["shape", "weight", "clarity", "colour", "cut", "polish", "symmetry", "fluorescence", "length", "width", "depth"]
TARGET = ["price"]

In [86]:
def load_and_combine_diamond_csv_files(data_directory: str) -> pd.DataFrame:
  csv_list: list[str] = [f"{data_directory}/{csv_file}" for csv_file in os.listdir(data_directory) if csv_file.endswith(".csv")]
  diamond_dataframes: list[pd.DataFrame] = [pd.read_csv(csv_file) for csv_file in csv_list]
  df = pd.concat(diamond_dataframes)

  return df

In [87]:
def simplify_and_fix_colnames(df: pd.DataFrame) -> pd.DataFrame:
  simplified_column_names = [str.lower(col).replace(" ", "_") for col in df.columns.to_list()]

  df.columns = simplified_column_names

  df = df.rename(columns={"messurements": "measurements"})
  df = df.reset_index(drop=True)

  return df

In [88]:
def clean_and_seperate_measurement_entries(df: pd.DataFrame) -> pd.DataFrame:
  column_names = ["length", "width", "depth"]

  splitter_fn = lambda dimension: re.split(r"[^\d.]+", dimension)

  dimensions = list(map(splitter_fn, df.measurements.values))

  cols_to_add = list()

  for col_idx in range(3):
    col_to_add = list()

    for col in dimensions:
      col_to_add.append(col[col_idx])

    cols_to_add.append(np.asarray(col_to_add))

  cols_to_add = np.asarray(cols_to_add)

  df[column_names] = cols_to_add.T
  df[column_names] = df[column_names].astype(np.float32)

  return df

In [89]:
def clean_prices(price: np.ndarray) -> np.ndarray:
  price = price.astype(str).tolist()

  non_decimal = re.compile(r"[^\d.]+")

  price = [non_decimal.sub("", p) for p in price]

  return np.asarray(price)

In [90]:
df = load_and_combine_diamond_csv_files(DATA_DIRECTORY)
df = simplify_and_fix_colnames(df)
df = clean_and_seperate_measurement_entries(df)
df["price"] = clean_prices(df["price"].values) 

In [91]:
def map_images_with_df_ids(df: pd.DataFrame, image_directory: str) -> pd.DataFrame:
  df["file_name"] = ""
  df["id"] = df.apply(lambda x: str(x["id"]).strip(), axis=1)

  for dir in os.listdir(IMAGE_DIRECTORY):
    for img in os.listdir(os.path.join(os.getcwd(), IMAGE_DIRECTORY, dir)):
      for idx, row_id in enumerate(df["id"].values):
        if img.split(".")[0] == row_id:
          df.loc[idx, "file_name"] = os.path.join(os.getcwd(), IMAGE_DIRECTORY, dir, img)
  
  return df  

In [92]:
df = map_images_with_df_ids(df, IMAGE_DIRECTORY)