In [2]:
import polars as pl

# Download the diamonds CSV and parse it into a polars DataFrame.
# The status messages bracket the network read so a slow fetch is visible.
DIAMONDS_CSV_URL = (
    "https://raw.githubusercontent.com/pycaret/pycaret/master/datasets/diamond.csv"
)

print("Loading diamonds dataset...")
df = pl.read_csv(DIAMONDS_CSV_URL)
print("Dataset loaded successfully!")

Loading diamonds dataset...
Dataset loaded successfully!


In [3]:
# Report the frame's size; `df.shape` is (number of rows, number of columns).
print(f"Rows (diamonds): {df.shape[0]}")
print(f"Columns (fields): {df.shape[1]}")

Rows (diamonds): 6000
Columns (fields): 8


In [4]:
# List every column with its polars dtype, in column order.
for column_name, column_dtype in df.schema.items():
    print(f"{column_name}: {column_dtype}")

Carat Weight: Float64
Cut: String
Color: String
Clarity: String
Polish: String
Symmetry: String
Report: String
Price: Int64


In [5]:
# Each row represents a single diamond in the inventory,
# with all its attributes (carat weight, cut, color, clarity, polish, symmetry, report, price).

In [7]:
# Total weight of all diamonds, expressed in carats, grams and kilograms.
# The carat -> gram factor (0.2) and gram -> kilogram divisor (1000) are named
# so the conversion chain reads explicitly.
GRAMS_PER_CARAT = 0.2
GRAMS_PER_KILOGRAM = 1000

carat_weight_sum = pl.col("Carat Weight").sum()

agg = df.select(
    total_carats=carat_weight_sum,
    total_grams=carat_weight_sum * GRAMS_PER_CARAT,
    total_kilograms=(carat_weight_sum * GRAMS_PER_CARAT) / GRAMS_PER_KILOGRAM,
)
print(agg)

shape: (1, 3)
┌──────────────┬─────────────┬─────────────────┐
│ total_carats ┆ total_grams ┆ total_kilograms │
│ ---          ┆ ---         ┆ ---             │
│ f64          ┆ f64         ┆ f64             │
╞══════════════╪═════════════╪═════════════════╡
│ 8007.12      ┆ 1601.424    ┆ 1.601424        │
└──────────────┴─────────────┴─────────────────┘


In [9]:
# How many distinct cut grades exist, and what they are (sorted alphabetically).
cuts_df = df.select(unique_cuts=pl.col("Cut").n_unique())
cut_list = df["Cut"].unique().sort().to_list()

print(cuts_df)
print("Cuts:", cut_list)

shape: (1, 1)
┌─────────────┐
│ unique_cuts │
│ ---         │
│ u32         │
╞═════════════╡
│ 5           │
└─────────────┘
Cuts: ['Fair', 'Good', 'Ideal', 'Signature-Ideal', 'Very Good']


In [11]:
# Find the highest and lowest price and show every diamond listed at each.
# The price column is looked up under either spelling: "price" if present,
# otherwise "Price".
price_col = "Price" if "price" not in df.columns else "price"

price_series = df.get_column(price_col)
max_price = price_series.max()
min_price = price_series.min()

# Row predicates selecting the diamonds at each extreme.
at_max_price = pl.col(price_col) == max_price
at_min_price = pl.col(price_col) == min_price

print(f"Max price: {max_price}")
print(df.filter(at_max_price))

print(f"Min price: {min_price}")
print(df.filter(at_min_price))

Max price: 101561
shape: (1, 8)
┌──────────────┬───────┬───────┬─────────┬────────┬──────────┬────────┬────────┐
│ Carat Weight ┆ Cut   ┆ Color ┆ Clarity ┆ Polish ┆ Symmetry ┆ Report ┆ Price  │
│ ---          ┆ ---   ┆ ---   ┆ ---     ┆ ---    ┆ ---      ┆ ---    ┆ ---    │
│ f64          ┆ str   ┆ str   ┆ str     ┆ str    ┆ str      ┆ str    ┆ i64    │
╞══════════════╪═══════╪═══════╪═════════╪════════╪══════════╪════════╪════════╡
│ 2.79         ┆ Ideal ┆ D     ┆ IF      ┆ EX     ┆ EX       ┆ GIA    ┆ 101561 │
└──────────────┴───────┴───────┴─────────┴────────┴──────────┴────────┴────────┘
Min price: 2184
shape: (1, 8)
┌──────────────┬──────┬───────┬─────────┬────────┬──────────┬────────┬───────┐
│ Carat Weight ┆ Cut  ┆ Color ┆ Clarity ┆ Polish ┆ Symmetry ┆ Report ┆ Price │
│ ---          ┆ ---  ┆ ---   ┆ ---     ┆ ---    ┆ ---      ┆ ---    ┆ ---   │
│ f64          ┆ str  ┆ str   ┆ str     ┆ str    ┆ str      ┆ str    ┆ i64   │
╞══════════════╪══════╪═══════╪═════════╪════════╪═════

In [13]:
# Per-color price summary: median, mean and number of diamonds,
# ordered alphabetically by color grade.
by_color = df.group_by("Color").agg(
    median_price=pl.col("Price").median(),
    mean_price=pl.col("Price").mean(),
    n=pl.len(),
).sort("Color")

print(by_color)

shape: (6, 4)
┌───────┬──────────────┬──────────────┬──────┐
│ Color ┆ median_price ┆ mean_price   ┆ n    │
│ ---   ┆ ---          ┆ ---          ┆ ---  │
│ str   ┆ f64          ┆ f64          ┆ u32  │
╞═══════╪══════════════╪══════════════╪══════╡
│ D     ┆ 7567.0       ┆ 15255.783661 ┆ 661  │
│ E     ┆ 7059.5       ┆ 11539.190231 ┆ 778  │
│ F     ┆ 8860.0       ┆ 12712.241856 ┆ 1013 │
│ G     ┆ 8800.0       ┆ 12520.050633 ┆ 1501 │
│ H     ┆ 7169.0       ┆ 10487.347544 ┆ 1079 │
│ I     ┆ 6655.0       ┆ 8989.636364  ┆ 968  │
└───────┴──────────────┴──────────────┴──────┘


In [15]:
# Per-clarity summary: number of diamonds, median price and total carat weight.
# Note: rows are sorted by the clarity code as text, not by clarity grade.
clarity_summary = df.group_by("Clarity").agg(
    count=pl.len(),
    median_price=pl.col("Price").median(),
    total_carats=pl.col("Carat Weight").sum(),
).sort("Clarity")

print(clarity_summary)

shape: (7, 4)
┌─────────┬───────┬──────────────┬──────────────┐
│ Clarity ┆ count ┆ median_price ┆ total_carats │
│ ---     ┆ ---   ┆ ---          ┆ ---          │
│ str     ┆ u32   ┆ f64          ┆ f64          │
╞═════════╪═══════╪══════════════╪══════════════╡
│ FL      ┆ 4     ┆ 62371.5      ┆ 7.87         │
│ IF      ┆ 219   ┆ 12647.0      ┆ 316.0        │
│ SI1     ┆ 2059  ┆ 5417.0       ┆ 2563.69      │
│ VS1     ┆ 1192  ┆ 9245.0       ┆ 1682.74      │
│ VS2     ┆ 1575  ┆ 7568.0       ┆ 2170.81      │
│ VVS1    ┆ 285   ┆ 11142.0      ┆ 389.43       │
│ VVS2    ┆ 666   ┆ 10266.0      ┆ 876.58       │
└─────────┴───────┴──────────────┴──────────────┘


In [18]:
# Median price and sample size for every (Cut, Color) pair, most expensive first.
#
# `group_by` does not guarantee the order of its groups, and `sort` does not
# keep the incoming order of rows with equal keys unless asked to. Sorting on
# `median_price` alone would therefore let tied rows swap places between runs,
# so Cut and Color are used as ascending tie-breakers to make the output
# reproducible.

# Minimum number of diamonds a combo needs to appear in the second table.
MIN_COMBO_SAMPLES = 20

combos = (
    df.group_by(["Cut", "Color"])
      .agg([
          pl.len().alias("count"),
          pl.col("Price").median().alias("median_price"),
      ])
      .sort(
          ["median_price", "Cut", "Color"],
          descending=[True, False, False],
      )
)
print(combos)

# Same ranking restricted to combos with enough samples to be meaningful.
print(f"\nTop combos with at least {MIN_COMBO_SAMPLES} samples:\n")
print(combos.filter(pl.col("count") >= MIN_COMBO_SAMPLES))

shape: (30, 4)
┌─────────────────┬───────┬───────┬──────────────┐
│ Cut             ┆ Color ┆ count ┆ median_price │
│ ---             ┆ ---   ┆ ---   ┆ ---          │
│ str             ┆ str   ┆ u32   ┆ f64          │
╞═════════════════╪═══════╪═══════╪══════════════╡
│ Signature-Ideal ┆ D     ┆ 30    ┆ 11712.5      │
│ Signature-Ideal ┆ E     ┆ 35    ┆ 11014.0      │
│ Signature-Ideal ┆ F     ┆ 38    ┆ 10433.0      │
│ Ideal           ┆ F     ┆ 363   ┆ 10360.0      │
│ Ideal           ┆ D     ┆ 280   ┆ 10338.0      │
│ …               ┆ …     ┆ …     ┆ …            │
│ Fair            ┆ E     ┆ 32    ┆ 4908.5       │
│ Fair            ┆ G     ┆ 21    ┆ 4646.0       │
│ Fair            ┆ F     ┆ 24    ┆ 4628.0       │
│ Fair            ┆ H     ┆ 24    ┆ 4358.0       │
│ Fair            ┆ I     ┆ 16    ┆ 3801.0       │
└─────────────────┴───────┴───────┴──────────────┘

Top combos with at least 20 samples:

shape: (28, 4)
┌─────────────────┬───────┬───────┬──────────────┐
│ Cut        

In [19]:
# The five (Color, Cut) segments with the highest summed price, together with
# their diamond count and median ("typical") price.
#
# `group_by` does not guarantee group order and `sort` does not preserve the
# order of equal keys by default, so a tie on `total_worth` at the cut-off
# could change which rows `.head(5)` keeps. Color and Cut are added as
# ascending tie-breakers to make the selection deterministic.
segments = (
    df.group_by(["Color", "Cut"])
      .agg([
          pl.len().alias("count"),
          pl.col("Price").sum().alias("total_worth"),
          pl.col("Price").median().alias("typical_price"),
      ])
      .sort(
          ["total_worth", "Color", "Cut"],
          descending=[True, False, False],
      )
      .head(5)
)
print(segments)

shape: (5, 5)
┌───────┬───────────┬───────┬─────────────┬───────────────┐
│ Color ┆ Cut       ┆ count ┆ total_worth ┆ typical_price │
│ ---   ┆ ---       ┆ ---   ┆ ---         ┆ ---           │
│ str   ┆ str       ┆ u32   ┆ i64         ┆ f64           │
╞═══════╪═══════════╪═══════╪═════════════╪═══════════════╡
│ G     ┆ Ideal     ┆ 690   ┆ 9363514     ┆ 9510.5        │
│ G     ┆ Very Good ┆ 578   ┆ 7140620     ┆ 8249.0        │
│ F     ┆ Very Good ┆ 455   ┆ 5648327     ┆ 8330.0        │
│ F     ┆ Ideal     ┆ 363   ┆ 5346782     ┆ 10360.0       │
│ H     ┆ Ideal     ┆ 458   ┆ 5279687     ┆ 8014.0        │
└───────┴───────────┴───────┴─────────────┴───────────────┘
