In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from scipy.linalg import svd

In [None]:
# Read the diamonds table once. `df_org` holds the data exactly as loaded,
# while `df` is the working copy that the cells below encode and extend.
df_org = pd.read_csv('data/diamond.csv')
df = df_org.copy()

# Preview the first five rows.
df.head()


# Data description


### Copied from source of dataset:
Content price price in US dollars ($326--$18,823)

carat weight of the diamond (0.2--5.01)

cut quality of the cut (Fair, Good, Very Good, Premium, Ideal)

color diamond colour, from J (worst) to D (best)

clarity a measurement of how clear the diamond is (I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best))

x length in mm (0--10.74)

y width in mm (0--58.9)

z depth in mm (0--31.8)

depth total depth percentage = z / mean(x, y) = 2 * z / (x + y) (43--79)

table width of top of diamond relative to widest point (43--95)

https://www.openml.org/search?type=data&sort=runs&id=42225&status=active

# Encoding

### Encoding color:

We encode the color grades as integers so that a higher value means a better color and a lower value means a worse one.

In [None]:
# List the distinct values currently stored in the color column.
pd.unique(df['color'])

In [None]:
# Ordinal codes for color: J is the worst grade and D the best (see the data
# description), so a higher code means a better color.
color_mapping = {'J': 1, 'I': 2, 'H': 3, 'G': 4, 'F': 5, 'E': 6, 'D': 7}

# Fail loudly on labels that are missing from the mapping; `.map` would
# otherwise turn them into NaN without any warning.
unmapped_colors = set(df_org['color'].dropna()) - set(color_mapping)
assert not unmapped_colors, f"Unmapped color grades: {sorted(map(str, unmapped_colors))}"

# Encode from the as-loaded copy `df_org` instead of from `df` itself, so that
# re-running this cell gives the same result rather than mapping the already
# encoded integers to NaN.
df['color'] = df_org['color'].map(color_mapping)


### Encoding Cut Quality

From the source description we are told that the cut quality is ordered from Fair (worst) to Ideal (best):

Fair, Good, Very Good, Premium, Ideal

In [None]:
# List the distinct values currently stored in the cut column.
pd.unique(df['cut'])

In [None]:
# Ordinal codes for cut quality, from Fair (worst) to Ideal (best), following
# the ordering given in the data description.
cut_mapping = {'Fair': 1, 'Good': 2, 'Very Good': 3, 'Premium': 4, 'Ideal': 5}

# Fail loudly on labels that are missing from the mapping; `.map` would
# otherwise turn them into NaN without any warning.
unmapped_cuts = set(df_org['cut'].dropna()) - set(cut_mapping)
assert not unmapped_cuts, f"Unmapped cut labels: {sorted(map(str, unmapped_cuts))}"

# Encode from the as-loaded copy `df_org` instead of from `df` itself, so that
# re-running this cell gives the same result rather than mapping the already
# encoded integers to NaN.
df['cut'] = df_org['cut'].map(cut_mapping)


### Encoding clarity

(I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best))

In [None]:
# List the distinct values currently stored in the clarity column.
pd.unique(df['clarity'])

In [None]:
# Ordinal codes for clarity, from I1 (worst) to IF (best), following the
# ordering given in the data description.
clarity_mapping = {'I1': 1, 'SI2': 2, 'SI1': 3, 'VS2': 4, 'VS1': 5, 'VVS2': 6, 'VVS1': 7, 'IF': 8}

# Fail loudly on labels that are missing from the mapping; `.map` would
# otherwise turn them into NaN without any warning.
unmapped_clarities = set(df_org['clarity'].dropna()) - set(clarity_mapping)
assert not unmapped_clarities, f"Unmapped clarity grades: {sorted(map(str, unmapped_clarities))}"

# Encode from the as-loaded copy `df_org` instead of from `df` itself, so that
# re-running this cell gives the same result rather than mapping the already
# encoded integers to NaN.
df['clarity'] = df_org['clarity'].map(clarity_mapping)

In [None]:
# Quick look at the three ordinal columns after encoding.
df.loc[:, ['cut', 'clarity', 'color']].head()

In [None]:
# Histograms of the continuous variables on their original scale.
hist_columns = ['carat', 'depth', 'table', 'price', 'x', 'z', 'y']
df.loc[:, hist_columns].hist(bins=50, figsize=(20, 15))
plt.show()

In [None]:
# Summary statistics of every column, rendered as a LaTeX table.
summary_latex = df.describe().to_latex()
print(summary_latex)

# Transformation:

In [None]:
# Add natural-log versions of carat and price as new columns.
for raw_col in ('carat', 'price'):
    df[f'log_{raw_col}'] = np.log(df[raw_col])

# Compare the raw and log-transformed distributions side by side.
transform_columns = ['price', 'carat', 'log_price', 'log_carat']
df[transform_columns].hist(bins=50, figsize=(20, 15))
plt.show()


In [None]:
# Pairwise correlations between the columns of the encoded frame.
corr_matrix = df.corr()

# Draw the heatmap. `coolwarm` is a diverging colormap, so the colour limits
# are pinned to the full correlation range [-1, 1]; this puts the neutral
# midpoint at zero correlation instead of at the middle of the observed values.
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', vmin=-1, vmax=1)
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# Used Copilot to help with creating noise.

# Plot a fixed random subset of 1000 rows rather than the full data set.
df_sampled = df.sample(n=1000, random_state=42)

columns_to_plot = ['log_price', 'cut', 'clarity', 'color', 'x', 'z', 'y']

# Add a little Gaussian jitter to the ordinal codes so that points sharing the
# same integer value do not sit exactly on top of each other. The generator is
# seeded so the figure is reproducible, in line with the seeded sample above.
jitter_rng = np.random.default_rng(42)
for ordinal_col in ('cut', 'clarity', 'color'):
    jitter = jitter_rng.normal(0, 0.1, df_sampled[ordinal_col].shape)
    df_sampled[ordinal_col] = df_sampled[ordinal_col] + jitter

sns.pairplot(df_sampled[columns_to_plot], plot_kws={'alpha': 0.5})
plt.show()


In [None]:
# Summary statistics of the modelling variables as a LaTeX table (3 decimals).
latex_columns = ['log_price', 'log_carat', 'cut', 'clarity', 'color', 'x', 'z', 'y']
latex_table = df[latex_columns].describe().to_latex(float_format="%.3f")
print(latex_table)

In [None]:
# Same summary statistics, shown as a rendered table in the notebook.
df.loc[:, ['cut', 'clarity', 'color', 'log_price', 'log_carat', 'x', 'z', 'y']].describe()

In [None]:
# We used a code example for IM ML exercises.

# Standardize every column to zero mean and unit sample standard deviation
# (ddof=1) so that all variables enter the PCA on the same scale.
df_norm = (df - df.mean()) / df.std(ddof=1)

# Leave price, log_price and raw carat out of the PCA input; log_carat is kept.
Y = df_norm.drop(columns=['price', 'log_price', 'carat']).to_numpy()

# Number of rows and number of columns of the PCA input matrix.
N, M = Y.shape

# Thin SVD, Y = U @ diag(S) @ Vh. The rows of Vh (columns of V) are the
# principal directions.
U, S, Vh = svd(Y, full_matrices=False)
V = Vh.T

# Project the standardized data onto the principal directions.
Z = Y @ V

# Compute variance explained by principal components
rho = S**2 / (S**2).sum()
print(rho[0:5])

In [None]:
# We used a code example for IM ML exercises.

# Scree plot: fraction of variance explained by each principal component,
# with components numbered from 1.
fig, ax = plt.subplots(figsize=(10, 6))
component_numbers = np.arange(1, rho.size + 1)
ax.plot(component_numbers, rho, 'o-', linewidth=2, markersize=8)
ax.set_title('Explained Variance by Principal Components')
ax.set_xlabel('Principal Component')
ax.set_ylabel('Explained Variance')
ax.grid(True)
plt.show()

In [None]:
# Used Copilot to help with structuring the plot.

# Draw 1000 distinct row positions with a seeded generator so the figure is
# the same on every run.
sample_rng = np.random.default_rng(42)
sample_indices = sample_rng.choice(Z.shape[0], size=1000, replace=False)

# Projected coordinates and encoded cut of the sampled rows. Positional
# indexing (`iloc`) is used because Z is a plain array built row by row from df.
Z_sample = Z[sample_indices, :]
colors = df['cut'].iloc[sample_indices]

# Encoded cut levels and the names they stand for (same ordering as the
# ordinal encoding applied to df['cut']).
unique_colors = [1, 2, 3, 4, 5]
cut_names = {1: 'Fair', 2: 'Good', 3: 'Very Good', 4: 'Premium', 5: 'Ideal'}

# Spread the five levels evenly over [0, 1], the input range of the colormap.
color_map = {color: idx / (len(unique_colors) - 1) for idx, color in enumerate(unique_colors)}
color_values = colors.map(color_map)

# vmin/vmax pin the colour scale to [0, 1]. Without them matplotlib rescales to
# the smallest and largest sampled value, and the point colours would stop
# matching the legend whenever the sample lacks the lowest or highest cut level.
scatter = plt.scatter(Z_sample[:, 0], Z_sample[:, 1], c=color_values, cmap='viridis', vmin=0, vmax=1, alpha=0.8)
plt.xlabel('PCA 1')
plt.ylabel('PCA 2')
plt.title('PCA 1 vs PCA 2, Color indicating type of cut')

# Create legend with one marker per cut level, labelled by cut name.
handles = [plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=plt.cm.viridis(color_map[color]), markersize=10) for color in unique_colors]
plt.legend(handles, [cut_names[color] for color in unique_colors], title="Cut", loc="best")

plt.show()

In [None]:
# Used Copilot to help with structuring the plot.

# Draw 1000 distinct row positions with a seeded generator so the figure is
# the same on every run.
sample_rng = np.random.default_rng(42)
sample_indices = sample_rng.choice(Z.shape[0], size=1000, replace=False)

# Projected coordinates, encoded cut and log price of the sampled rows.
# Positional indexing (`iloc`) is used because Z is a plain array built row by
# row from df.
Z_sample = Z[sample_indices, :]
colors = df['cut'].iloc[sample_indices]
prices = df['log_price'].iloc[sample_indices]

# Encoded cut levels and the names they stand for (same ordering as the
# ordinal encoding applied to df['cut']).
unique_colors = [1, 2, 3, 4, 5]
cut_names = {1: 'Fair', 2: 'Good', 3: 'Very Good', 4: 'Premium', 5: 'Ideal'}

# Spread the five levels evenly over [0, 1], the input range of the colormap.
color_map = {color: idx / (len(unique_colors) - 1) for idx, color in enumerate(unique_colors)}
color_values = colors.map(color_map)

fig, axes = plt.subplots(1, 2, figsize=(14, 6))  # 1 row, 2 columns

# Left panel: points coloured by cut. vmin/vmax pin the colour scale to [0, 1]
# so the point colours always match the legend, even if the sample lacks the
# lowest or highest cut level.
scatter1 = axes[0].scatter(Z_sample[:, 0], Z_sample[:, 1], c=color_values, cmap='viridis', vmin=0, vmax=1, alpha=0.8)
axes[0].set_xlabel('PCA 1')
axes[0].set_ylabel('PCA 2')
axes[0].set_title('PCA 1 vs PCA 2 (Cut Type)')

# Legend with one marker per cut level, labelled by cut name.
handles = [plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=plt.cm.viridis(color_map[color]), markersize=10) for color in unique_colors]
axes[0].legend(handles, [cut_names[color] for color in unique_colors], title="Cut", loc="best")

# Right panel: the same points coloured by log price; the title says so
# explicitly to agree with the colorbar label.
scatter2 = axes[1].scatter(Z_sample[:, 0], Z_sample[:, 1], c=prices, cmap='viridis', alpha=0.8)
axes[1].set_xlabel('PCA 1')
axes[1].set_ylabel('PCA 2')
axes[1].set_title('PCA 1 vs PCA 2 (Log Price)')

cbar = fig.colorbar(scatter2, ax=axes[1])
cbar.set_label('Log Price')

plt.tight_layout()
plt.show()
