<a href="https://colab.research.google.com/github/Nick-Yazdani/ml-notebooks-portfolio/blob/main/dnn_cnn_diamond_val.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
from google.colab import files

In [7]:
# Open Colab's upload dialog; the result maps each uploaded file name to its contents.
# The Kaggle API token (kaggle.json) is the file expected here.
uploaded = files.upload()

Saving kaggle.json to kaggle.json


In [8]:
# Report the name and size in bytes of every file received from the upload dialog.
for fn, payload in uploaded.items():
  summary = 'User uploaded file "{name}" with length {length} bytes'.format(name=fn, length=len(payload))
  print(summary)

User uploaded file "kaggle.json" with length 70 bytes


In [9]:
# Move the uploaded Kaggle token into ~/.kaggle/ and restrict it to owner read/write (chmod 600).
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

In [10]:
# Download the diamond prices/images dataset archive with the kaggle CLI.
!kaggle datasets download "harshitlakhani/natural-diamonds-prices-images"

Downloading natural-diamonds-prices-images.zip to /content
 97% 149M/154M [00:01<00:00, 114MB/s]
100% 154M/154M [00:01<00:00, 95.4MB/s]


In [None]:
# Extract the downloaded archive into the current working directory.
!unzip natural-diamonds-prices-images.zip

In [14]:
# Delete the Diamonds2 directory (presumably an unused folder extracted from the archive — confirm).
!rm -rf Diamonds2

In [16]:
# Remove the zip archive once its contents have been extracted.
!rm natural-diamonds-prices-images.zip

In [40]:
import os
import re

import pandas as pd
import numpy as np

In [27]:
# Directory whose .csv files are read and combined into the working DataFrame.
DATA_DIRECTORY = "Diamonds/Diamonds"

In [30]:
def load_and_combine_diamond_csv_files(data_directory: str) -> pd.DataFrame:
  """Read every ``.csv`` file in ``data_directory`` and stack them into one frame.

  Files are read in sorted file-name order so that the row order of the
  result is reproducible; ``os.listdir`` alone returns entries in arbitrary
  order.

  Args:
    data_directory: Path of the directory that directly contains the CSV files.

  Returns:
    The row-wise concatenation of all CSV files. Each file keeps its own row
    index, so index labels may repeat across files.

  Raises:
    ValueError: If the directory contains no ``.csv`` files.
  """
  csv_paths: list[str] = sorted(
      os.path.join(data_directory, file_name)
      for file_name in os.listdir(data_directory)
      if file_name.endswith(".csv")
  )

  # Fail with an actionable message instead of pandas' "No objects to concatenate".
  if not csv_paths:
    raise ValueError(f"No .csv files found in {data_directory!r}")

  return pd.concat([pd.read_csv(csv_path) for csv_path in csv_paths])

In [34]:
def simplify_and_fix_colnames(df: pd.DataFrame) -> pd.DataFrame:
  """Return a copy of ``df`` with snake_case column names and a fresh index.

  Column names are lower-cased and spaces become underscores; the misspelt
  ``messurements`` header is corrected to ``measurements``. The row index is
  replaced by a default ``0..n-1`` range. The input frame is left untouched.

  Args:
    df: Frame whose column labels should be normalised.

  Returns:
    A new DataFrame with the renamed columns and a reset index.
  """
  def normalise(column_name) -> str:
    simplified = str(column_name).lower().replace(" ", "_")
    # Correct the misspelt header while renaming.
    return "measurements" if simplified == "messurements" else simplified

  # ``rename`` builds a new frame, so the caller's column labels are not mutated.
  return df.rename(columns=normalise).reset_index(drop=True)

In [41]:
def clean_and_seperate_measurement_entries(df: pd.DataFrame) -> pd.DataFrame:
  """Split the ``measurements`` text column into numeric dimension columns.

  Each entry is expected to hold three numbers separated by any run of
  characters other than digits and dots, e.g. ``"5.06-5.05×3.40"``. The first
  three numbers become the ``length``, ``width`` and ``depth`` columns
  (``float32``). Text before the first or after the last number is ignored.
  The input frame is not modified.

  Args:
    df: Frame containing a ``measurements`` column.

  Returns:
    A copy of ``df`` with ``length``, ``width`` and ``depth`` columns added.
    Rows whose measurement is not a string (e.g. missing) get NaN in all three.

  Raises:
    ValueError: If a string entry holds fewer than three numbers, or a token
      cannot be parsed as a float.
  """
  column_names = ["length", "width", "depth"]
  separator = re.compile(r"[^\d.]+")

  def parse_entry(entry) -> list[float]:
    if not isinstance(entry, str):
      return [np.nan] * len(column_names)

    # Splitting leaves empty tokens when the text starts or ends with a separator.
    tokens = [token for token in separator.split(entry) if token]
    if len(tokens) < len(column_names):
      raise ValueError(
          f"Expected {len(column_names)} numbers in measurement entry, got {entry!r}"
      )

    return [float(token) for token in tokens[:len(column_names)]]

  # ``reshape`` keeps the (rows, 3) layout even when the frame has no rows.
  parsed = np.asarray(
      [parse_entry(entry) for entry in df["measurements"]], dtype=np.float32
  ).reshape(-1, len(column_names))

  result = df.copy()
  result[column_names] = parsed

  return result

In [48]:
def clean_prices(price: np.ndarray) -> np.ndarray:
  """Convert raw price entries into a numeric ``float64`` array.

  String entries have every character other than digits and dots removed
  (currency symbols, thousands separators, whitespace) before being parsed.
  Entries that are already numeric are converted directly, so they never go
  through a string round-trip that could corrupt scientific notation.

  Args:
    price: One-dimensional array of raw prices (strings and/or numbers).

  Returns:
    A ``float64`` array of the same length. Entries with no digits left after
    cleaning (e.g. missing values) become NaN.

  Raises:
    ValueError: If a cleaned entry is still not a valid number, e.g. ``"1.2.3"``.
  """
  price = np.asarray(price)

  # Already numeric: nothing to strip.
  if np.issubdtype(price.dtype, np.number):
    return price.astype(np.float64)

  non_decimal = re.compile(r'[^\d.]+')

  def to_number(raw) -> float:
    if isinstance(raw, (int, float, np.number)):
      return float(raw)

    digits = non_decimal.sub('', str(raw))
    return float(digits) if digits else np.nan

  return np.asarray([to_number(p) for p in price.tolist()], dtype=np.float64)

In [49]:
# Build the working frame: load the raw CSVs, normalise the headers,
# then derive the length/width/depth columns from the measurements text.
df = (
    load_and_combine_diamond_csv_files(DATA_DIRECTORY)
    .pipe(simplify_and_fix_colnames)
    .pipe(clean_and_seperate_measurement_entries)
)

# Strip non-numeric characters from the price column.
df["price"] = clean_prices(df["price"].values)

In [50]:
# Shuffle every row and preview the first ten; no random_state is passed,
# so the rows shown differ between runs.
df.sample(frac=1.0).head(10)

Unnamed: 0,id,shape,weight,clarity,colour,cut,polish,symmetry,fluorescence,measurements,price,data_url,length,width,depth
586,1574489,CUSHION,0.77,VVS2,FANCY,EX,EX,VG,N,5.06-5.05×3.40,1856.86,https://capitalwholesalediamonds.com/product/0...,5.06,5.05,3.4
1065,1322822,EMERALD,0.29,VS2,E,EX,EX,VG,N,4.80-3.05×2.05,864.91,https://capitalwholesalediamonds.com/product/0...,4.8,3.05,2.05
587,1793042,CUSHION,0.55,VS2,E,EX,EX,EX,N,5.19-4.34×2.92,1857.85,https://capitalwholesalediamonds.com/product/0...,5.19,4.34,2.92
691,1781757,CUSHION,0.7,VS1,FANCY,EX,EX,GD,N,4.99-4.82×3.44,2026.14,https://capitalwholesalediamonds.com/product/0...,4.99,4.82,3.44
3026,1778230,ROUND,0.32,VVS1,K,EX,EX,EX,F,4.44-4.46×2.68,944.32,https://capitalwholesalediamonds.com/product/0...,4.44,4.46,2.68
724,1799595,CUSHION,0.71,SI2,I,EX,EX,EX,N,4.98-4.97×3.37,2088.89,https://capitalwholesalediamonds.com/product/0...,4.98,4.97,3.37
2742,1586068,ROUND,0.23,VS1,F,EX,EX,EX,N,3.93-3.95×2.47,935.79,https://capitalwholesalediamonds.com/product/0...,3.93,3.95,2.47
3293,1764900,ROUND,0.3,SI2,G,EX,EX,EX,F,4.36-4.39×2.60,925.14,https://capitalwholesalediamonds.com/product/0...,4.36,4.39,2.6
2917,1586192,ROUND,0.23,VS1,E,EX,EX,EX,N,3.94-3.96×2.41,941.25,https://capitalwholesalediamonds.com/product/0...,3.94,3.96,2.41
1304,1791042,EMERALD,0.3,VS1,D,VG,EX,VG,F,4.86-3.11×2.01,951.98,https://capitalwholesalediamonds.com/product/0...,4.86,3.11,2.01
