In [1]:
import polars as pl

# Fetch the diamonds dataset (a CSV file hosted in the PyCaret GitHub repository)
# and keep it in `df`, which the cells below read from.
DIAMONDS_CSV_URL = (
    "https://raw.githubusercontent.com/pycaret/pycaret/master/datasets/diamond.csv"
)

print("Loading diamonds dataset...")
df = pl.read_csv(DIAMONDS_CSV_URL)
print("Dataset loaded successfully!")

Loading diamonds dataset...
Dataset loaded successfully!


In [2]:
# Question 1: how large is the dataset?
# DataFrame.shape is a (rows, columns) tuple, so both dimensions come from one lookup.
num_rows, num_cols = df.shape
print(f"Number of rows: {num_rows}")
print(f"Number of columns: {num_cols}")


Number of rows: 6000
Number of columns: 8


In [4]:
# Question 2: which attributes are recorded for each diamond?
column_names = df.columns  # list of column-name strings
print("For every diamond, we track these characteristics:")
print(column_names)


For every diamond, we track these characteristics:
['Carat Weight', 'Cut', 'Color', 'Clarity', 'Polish', 'Symmetry', 'Report', 'Price']


In [None]:
# Question 3:
# Each row represents one unique diamond tracked in the inventory, together with all of its characteristics.

In [5]:
# Question 4: total carat weight across the whole inventory.
carat_weights = df.get_column("Carat Weight")
total_carats = carat_weights.sum()
print(f"The inventory contains {total_carats:.2f} carats of diamonds.")


The inventory contains 8007.12 carats of diamonds.


In [6]:
# Question 5: how many distinct cut grades exist, and what are they?
# NOTE(review): unique() is called without maintain_order, so the order of the
# printed cut names may differ between runs — confirm whether a stable order matters.
unique_cuts = df.get_column("Cut").unique()
num_cuts = len(unique_cuts)
print(f"We have {num_cuts} distinct cut varieties: {list(unique_cuts)}.")


We have 5 distinct cut varieties: ['Fair', 'Good', 'Signature-Ideal', 'Ideal', 'Very Good'].


In [7]:
# Question 6: price range of the inventory (most and least expensive diamond).
prices = df.get_column("Price")
max_price = prices.max()
min_price = prices.min()

# Build the full sentence first, then emit it with a single print call.
price_range_message = (
    f"Our most valuable diamond is priced at ${max_price:,.2f}, "
    f"while our least valuable diamond is priced at ${min_price:,.2f}."
)
print(price_range_message)


Our most valuable diamond is priced at $101,561.00, while our least valuable diamond is priced at $2,184.00.


In [9]:
# Question 7: average price per color grade.
# Grouping by "Color" collapses the data to one row per distinct color value.
mean_price_by_color = pl.col("Price").mean().alias("Average Price")
avg_prices = df.group_by("Color").agg(mean_price_by_color)

print(avg_prices)

shape: (6, 2)
┌───────┬───────────────┐
│ Color ┆ Average Price │
│ ---   ┆ ---           │
│ str   ┆ f64           │
╞═══════╪═══════════════╡
│ H     ┆ 10487.347544  │
│ E     ┆ 11539.190231  │
│ G     ┆ 12520.050633  │
│ F     ┆ 12712.241856  │
│ D     ┆ 15255.783661  │
│ I     ┆ 8989.636364   │
└───────┴───────────────┘


In [None]:
# Question 7:
# New grain = Each row now represents a single color grade, showing the average diamond price for that color across the entire inventory.


In [12]:
# Question 8: per-clarity summary — number of diamonds, average price and
# total carat weight, one row per distinct "Clarity" value.
# pl.len() counts the rows in each group; it replaces pl.count(), which is
# deprecated for row counting in recent Polars releases.
clarity_summary = df.group_by("Clarity").agg([
    pl.len().alias("Number of Diamonds"),
    pl.mean("Price").alias("Average Price"),
    pl.sum("Carat Weight").alias("Total Carats")
])

print(clarity_summary)

shape: (7, 4)
┌─────────┬────────────────────┬───────────────┬──────────────┐
│ Clarity ┆ Number of Diamonds ┆ Average Price ┆ Total Carats │
│ ---     ┆ ---                ┆ ---           ┆ ---          │
│ str     ┆ u32                ┆ f64           ┆ f64          │
╞═════════╪════════════════════╪═══════════════╪══════════════╡
│ VVS1    ┆ 285                ┆ 16845.680702  ┆ 389.43       │
│ VVS2    ┆ 666                ┆ 14142.177177  ┆ 876.58       │
│ FL      ┆ 4                  ┆ 63776.0       ┆ 7.87         │
│ SI1     ┆ 2059               ┆ 8018.864012   ┆ 2563.69      │
│ IF      ┆ 219                ┆ 22105.844749  ┆ 316.0        │
│ VS1     ┆ 1192               ┆ 13694.113255  ┆ 1682.74      │
│ VS2     ┆ 1575               ┆ 11809.053333  ┆ 2170.81      │
└─────────┴────────────────────┴───────────────┴──────────────┘


  pl.count().alias("Number of Diamonds"),


In [15]:
# Question 9: average price for every cut / color combination.
# Grouping on both keys yields one row per distinct (Cut, Color) pair.
segment_keys = ["Cut", "Color"]
cut_color_summary = df.group_by(segment_keys).agg(
    pl.col("Price").mean().alias("Average Price")
)

print(cut_color_summary)

shape: (30, 3)
┌─────────────────┬───────┬───────────────┐
│ Cut             ┆ Color ┆ Average Price │
│ ---             ┆ ---   ┆ ---           │
│ str             ┆ str   ┆ f64           │
╞═════════════════╪═══════╪═══════════════╡
│ Ideal           ┆ E     ┆ 12647.107914  │
│ Very Good       ┆ G     ┆ 12354.013841  │
│ Very Good       ┆ I     ┆ 8930.031332   │
│ Fair            ┆ D     ┆ 6058.25       │
│ Signature-Ideal ┆ F     ┆ 13247.947368  │
│ …               ┆ …     ┆ …             │
│ Ideal           ┆ H     ┆ 11527.700873  │
│ Signature-Ideal ┆ E     ┆ 11261.914286  │
│ Ideal           ┆ I     ┆ 9459.588378   │
│ Very Good       ┆ F     ┆ 12413.905495  │
│ Very Good       ┆ H     ┆ 10056.106132  │
└─────────────────┴───────┴───────────────┘


In [None]:
# Question 9:
# New grain = Each row now represents a unique cut–color combination, with the average price of diamonds in that group.

In [17]:
# Question 10: the five cut / color segments with the highest total value.
# For each (Cut, Color) pair, sum the prices and count the diamonds, then keep
# the five largest totals.
# pl.len() counts the rows in each group; it replaces pl.count(), which is
# deprecated for row counting in recent Polars releases.
top_segments = (
    df.group_by(["Cut", "Color"]).agg([
        pl.sum("Price").alias("Total Value"),
        pl.len().alias("Number of Diamonds")
    ])
    .sort("Total Value", descending=True)
    .head(5)
)

print(top_segments)

shape: (5, 4)
┌───────────┬───────┬─────────────┬────────────────────┐
│ Cut       ┆ Color ┆ Total Value ┆ Number of Diamonds │
│ ---       ┆ ---   ┆ ---         ┆ ---                │
│ str       ┆ str   ┆ i64         ┆ u32                │
╞═══════════╪═══════╪═════════════╪════════════════════╡
│ Ideal     ┆ G     ┆ 9363514     ┆ 690                │
│ Very Good ┆ G     ┆ 7140620     ┆ 578                │
│ Very Good ┆ F     ┆ 5648327     ┆ 455                │
│ Ideal     ┆ F     ┆ 5346782     ┆ 363                │
│ Ideal     ┆ H     ┆ 5279687     ┆ 458                │
└───────────┴───────┴─────────────┴────────────────────┘


  pl.count().alias("Number of Diamonds")
