In [1]:
import polars as pl

# Fetch the diamonds CSV over HTTP and keep it in `df`, the frame that the
# question cells below query. Progress messages bracket the download.
dataset_url = 'https://raw.githubusercontent.com/pycaret/pycaret/master/datasets/diamond.csv'

print("Loading diamonds dataset...")
df = pl.read_csv(dataset_url)
print("Dataset loaded successfully!")

Loading diamonds dataset...
Dataset loaded successfully!


In [2]:
# Question 1: The CEO needs to know the scale of data you're analyzing.
#
# describe() prints per-column summary statistics (count, null_count, mean,
# std, min, ...). Its printed table elides middle columns, so the size of the
# dataset is also reported explicitly afterwards.

print("The scale of the data is:")
print(df.describe())

# df.shape is (number of rows, number of columns) of the loaded frame.
row_count, column_count = df.shape
print(f"\nRows: {row_count}, Columns: {column_count}")



The scale of the data is:
shape: (9, 9)
┌────────────┬──────────────┬───────────┬───────┬───┬────────┬──────────┬────────┬──────────────┐
│ statistic  ┆ Carat Weight ┆ Cut       ┆ Color ┆ … ┆ Polish ┆ Symmetry ┆ Report ┆ Price        │
│ ---        ┆ ---          ┆ ---       ┆ ---   ┆   ┆ ---    ┆ ---      ┆ ---    ┆ ---          │
│ str        ┆ f64          ┆ str       ┆ str   ┆   ┆ str    ┆ str      ┆ str    ┆ f64          │
╞════════════╪══════════════╪═══════════╪═══════╪═══╪════════╪══════════╪════════╪══════════════╡
│ count      ┆ 6000.0       ┆ 6000      ┆ 6000  ┆ … ┆ 6000   ┆ 6000     ┆ 6000   ┆ 6000.0       │
│ null_count ┆ 0.0          ┆ 0         ┆ 0     ┆ … ┆ 0      ┆ 0        ┆ 0      ┆ 0.0          │
│ mean       ┆ 1.33452      ┆ null      ┆ null  ┆ … ┆ null   ┆ null     ┆ null   ┆ 11791.579333 │
│ std        ┆ 0.475696     ┆ null      ┆ null  ┆ … ┆ null   ┆ null     ┆ null   ┆ 10184.350051 │
│ min        ┆ 0.75         ┆ Fair      ┆ D     ┆ … ┆ EX     ┆ EX       ┆ AGSL

In [None]:
# Question 2: List all the characteristics we record.
#
# Each recorded characteristic is one column of `df`, so the answer is the
# list of column names.

recorded_characteristics = df.columns
print("The characteristics we record are:")
print(recorded_characteristics)

In [3]:
# Question 3:
#
# Show the first record of the frame as a concrete example of what a single
# row holds.

first_row = df.head(1)
print("Each row represents:")
print(first_row)

Each row represents:
shape: (1, 8)
┌──────────────┬───────┬───────┬─────────┬────────┬──────────┬────────┬───────┐
│ Carat Weight ┆ Cut   ┆ Color ┆ Clarity ┆ Polish ┆ Symmetry ┆ Report ┆ Price │
│ ---          ┆ ---   ┆ ---   ┆ ---     ┆ ---    ┆ ---      ┆ ---    ┆ ---   │
│ f64          ┆ str   ┆ str   ┆ str     ┆ str    ┆ str      ┆ str    ┆ i64   │
╞══════════════╪═══════╪═══════╪═════════╪════════╪══════════╪════════╪═══════╡
│ 1.1          ┆ Ideal ┆ H     ┆ SI1     ┆ VG     ┆ EX       ┆ GIA    ┆ 5169  │
└──────────────┴───────┴───────┴─────────┴────────┴──────────┴────────┴───────┘


In [4]:
# Question 4
# Add up the 'Carat Weight' column over every row and report the total,
# formatted to two decimal places.
carat_weights = df['Carat Weight']
total_carat_weight = carat_weights.sum()
print(
    f"Total carat weight in inventory: {total_carat_weight:.2f} carats"
)

Total carat weight in inventory: 8007.12 carats


In [None]:
# Question 5
# Collect the distinct values of the 'Cut' column, print them, then print
# how many distinct values there are.
unique_cuts = df['Cut'].unique()
number_of_cuts = len(unique_cuts)

print("The variety of cuts in the inventory is:")
print(unique_cuts)
print(f"\nNumber of unique cuts: {number_of_cuts}")

In [None]:
# Question 6
# Report the smallest and largest values found in the 'Price' column,
# each formatted as a dollar amount with two decimals.
prices = df['Price']
min_price, max_price = prices.min(), prices.max()

print(f"Least valuable diamond price: ${min_price:.2f}")
print(f"Most valuable diamond price: ${max_price:.2f}")

In [None]:
# Question 7
# Mean price per color grade: group rows by 'Color', average 'Price' inside
# each group, and order the result by the color label.
print("Typical prices across color grades:")
average_price_by_color = (
    df.group_by('Color')
    .agg(pl.col('Price').mean())
    .sort('Color')
)
print(average_price_by_color)


In [None]:
# Question 8
# Per-clarity summary of the inventory: number of diamonds, mean price and
# total carat weight for each 'Clarity' value. 'Clarity' is a string column,
# so the final sort orders the labels alphabetically.
print("Inventory breakdown by clarity grade:")
clarity_breakdown = df.group_by('Clarity').agg([
    # pl.len() counts the rows in each group. It replaces the deprecated
    # no-argument pl.count() and matches the pl.len() call used by the
    # Question 10 cell.
    pl.len().alias("Number of Diamonds"),
    pl.mean('Price').alias("Typical Price"),
    pl.sum('Carat Weight').alias("Total Carats")
]).sort('Clarity')

print(clarity_breakdown)

In [5]:
# Question 9
# Mean price for every (Cut, Color) pair present in the data, ordered by cut
# and then by color.
cut_color_keys = ['Cut', 'Color']

print("Pricing structure across cut-color combinations:")
pricing_by_cut_color = (
    df.group_by(cut_color_keys)
    .agg(pl.col('Price').mean())
    .sort(cut_color_keys)
)
print(pricing_by_cut_color)

Pricing structure across cut-color combinations:
shape: (30, 3)
┌───────────┬───────┬──────────────┐
│ Cut       ┆ Color ┆ Price        │
│ ---       ┆ ---   ┆ ---          │
│ str       ┆ str   ┆ f64          │
╞═══════════╪═══════╪══════════════╡
│ Fair      ┆ D     ┆ 6058.25      │
│ Fair      ┆ E     ┆ 5370.625     │
│ Fair      ┆ F     ┆ 6063.625     │
│ Fair      ┆ G     ┆ 7345.52381   │
│ Fair      ┆ H     ┆ 5908.5       │
│ …         ┆ …     ┆ …            │
│ Very Good ┆ E     ┆ 12101.910217 │
│ Very Good ┆ F     ┆ 12413.905495 │
│ Very Good ┆ G     ┆ 12354.013841 │
│ Very Good ┆ H     ┆ 10056.106132 │
│ Very Good ┆ I     ┆ 8930.031332  │
└───────────┴───────┴──────────────┘


In [7]:
# Question 10
# Rank (Color, Cut) segments by the summed price of their diamonds and keep
# the five largest; the row count of each segment is shown alongside.
print("Top 5 most valuable inventory segments by color-cut combination:")

segment_totals = df.group_by(['Color', 'Cut']).agg(
    [
        pl.col('Price').sum().alias("Total Value"),
        pl.len().alias("Number of Diamonds"),
    ]
)
# Largest total first, then truncate to the top five rows.
top_5_valuable_segments = segment_totals.sort(
    "Total Value", descending=True
).head(5)

print(top_5_valuable_segments)

Top 5 most valuable inventory segments by color-cut combination:
shape: (5, 4)
┌───────┬───────────┬─────────────┬────────────────────┐
│ Color ┆ Cut       ┆ Total Value ┆ Number of Diamonds │
│ ---   ┆ ---       ┆ ---         ┆ ---                │
│ str   ┆ str       ┆ i64         ┆ u32                │
╞═══════╪═══════════╪═════════════╪════════════════════╡
│ G     ┆ Ideal     ┆ 9363514     ┆ 690                │
│ G     ┆ Very Good ┆ 7140620     ┆ 578                │
│ F     ┆ Very Good ┆ 5648327     ┆ 455                │
│ F     ┆ Ideal     ┆ 5346782     ┆ 363                │
│ H     ┆ Ideal     ┆ 5279687     ┆ 458                │
└───────┴───────────┴─────────────┴────────────────────┘
