In [8]:
import polars as pl

# Load dataset
# The CSV is read directly from the PyCaret GitHub repository, so this cell
# needs network access. The two prints bracket the read.
diamonds_csv_url = "https://raw.githubusercontent.com/pycaret/pycaret/master/datasets/diamond.csv"
print("Loading diamonds dataset...")
df = pl.read_csv(diamonds_csv_url)
print("Dataset loaded successfully!")

Loading diamonds dataset...
Dataset loaded successfully!


In [9]:
# Section 1:
# Question 1: How extensive is our inventory?
# DataFrame.shape is a (rows, columns) tuple; unpack it so each part is named,
# then print it back as the same tuple.
row_count, column_count = df.shape
print("Q1:", (row_count, column_count))

Q1: (6000, 8)


In [10]:
# Question 2: What information do we track?
# The column names list every attribute stored in the table.
tracked_columns = df.columns
print("Q2:", tracked_columns)

Q2: ['Carat Weight', 'Cut', 'Color', 'Clarity', 'Polish', 'Symmetry', 'Report', 'Price']


In [11]:
# Question 3: What does one row represent?
# Written answer only (nothing is computed): it states the grain of the
# table as loaded.
row_meaning = "Q3: Each row represents a single diamond in our inventory with its characteristics and price."
print(row_meaning)


Q3: Each row represents a single diamond in our inventory with its characteristics and price.


In [12]:
# Section 2:
# Question 4: Total carats in inventory
# Add up the whole "Carat Weight" column. The result is a float, so the
# printed value can show floating-point noise in its last digits.
carat_weights = df.get_column("Carat Weight")
total_carats = carat_weights.sum()
print(f"Q4: Total carats = {total_carats}")

Q4: Total carats = 8007.120000000001


In [13]:
# Question 5: Variety of cuts
# unique() yields the distinct values of the column as a Series. It is called
# without maintain_order, so the order of the values is not guaranteed to be
# the same on every run.
cut_column = df.get_column("Cut")
cuts = cut_column.unique()
print("Q5: Unique cuts =", cuts)

Q5: Unique cuts = shape: (5,)
Series: 'Cut' [str]
[
	"Very Good"
	"Ideal"
	"Signature-Ideal"
	"Fair"
	"Good"
]


In [14]:
# Question 6: Most and least valuable diamond
# Both extremes come from the same "Price" column, so fetch it once and
# take its maximum and minimum.
price_column = df.get_column("Price")
max_price = price_column.max()
min_price = price_column.min()
print(f"Q6: Most valuable diamond price = {max_price}")
print(f"Q6: Least valuable diamond price = {min_price}")

Q6: Most valuable diamond price = 101561
Q6: Least valuable diamond price = 2184


In [15]:
# Question 7: Typical prices by color
# Collapse the table to one row per Color, holding the mean Price of that
# color. group_by is called without maintain_order, so the order of the
# result rows is not guaranteed.
color_mean_price = pl.col("Price").mean().alias("avg_price")
price_by_color = df.group_by("Color").agg(color_mean_price)
# sep="\n " reproduces the newline-plus-space that precedes the table.
print("Q7:", price_by_color, sep="\n ")
print("Reflection: Now each row represents a color grade with its average price.")

Q7:
 shape: (6, 2)
┌───────┬──────────────┐
│ Color ┆ avg_price    │
│ ---   ┆ ---          │
│ str   ┆ f64          │
╞═══════╪══════════════╡
│ I     ┆ 8989.636364  │
│ D     ┆ 15255.783661 │
│ G     ┆ 12520.050633 │
│ E     ┆ 11539.190231 │
│ H     ┆ 10487.347544 │
│ F     ┆ 12712.241856 │
└───────┴──────────────┘
Reflection: Now each row represents a color grade with its average price.


In [20]:
# Section 3:
# Question 8: Clarity breakdown
# For every Clarity grade, report three figures: the number of rows, the
# mean Price, and the summed Carat Weight.
clarity_row_count = pl.len().alias("count")
clarity_mean_price = pl.col("Price").mean().alias("avg_price")
clarity_carat_sum = pl.col("Carat Weight").sum().alias("total_carats")

clarity_info = df.group_by("Clarity").agg(
    clarity_row_count,
    clarity_mean_price,
    clarity_carat_sum,
)
# sep="\n " reproduces the newline-plus-space that precedes the table.
print("Q8:", clarity_info, sep="\n ")

Q8:
 shape: (7, 4)
┌─────────┬───────┬──────────────┬──────────────┐
│ Clarity ┆ count ┆ avg_price    ┆ total_carats │
│ ---     ┆ ---   ┆ ---          ┆ ---          │
│ str     ┆ u32   ┆ f64          ┆ f64          │
╞═════════╪═══════╪══════════════╪══════════════╡
│ VVS2    ┆ 666   ┆ 14142.177177 ┆ 876.58       │
│ VS2     ┆ 1575  ┆ 11809.053333 ┆ 2170.81      │
│ IF      ┆ 219   ┆ 22105.844749 ┆ 316.0        │
│ VS1     ┆ 1192  ┆ 13694.113255 ┆ 1682.74      │
│ VVS1    ┆ 285   ┆ 16845.680702 ┆ 389.43       │
│ FL      ┆ 4     ┆ 63776.0      ┆ 7.87         │
│ SI1     ┆ 2059  ┆ 8018.864012  ┆ 2563.69      │
└─────────┴───────┴──────────────┴──────────────┘


In [17]:
# Question 9: Pricing structure by cut and color
# Group on the (Cut, Color) pair and take the mean Price inside each pair,
# giving one row per combination that occurs in the data.
pair_mean_price = pl.col("Price").mean().alias("avg_price")
cut_color_price = df.group_by("Cut", "Color").agg(pair_mean_price)
# sep="\n " reproduces the newline-plus-space that precedes the table.
print("Q9:", cut_color_price, sep="\n ")

print("Reflection: Now each row is a specific cut-color combination with its average price.")

Q9:
 shape: (30, 3)
┌─────────────────┬───────┬──────────────┐
│ Cut             ┆ Color ┆ avg_price    │
│ ---             ┆ ---   ┆ ---          │
│ str             ┆ str   ┆ f64          │
╞═════════════════╪═══════╪══════════════╡
│ Signature-Ideal ┆ D     ┆ 19823.1      │
│ Very Good       ┆ D     ┆ 13218.826415 │
│ Fair            ┆ H     ┆ 5908.5       │
│ Ideal           ┆ H     ┆ 11527.700873 │
│ Very Good       ┆ I     ┆ 8930.031332  │
│ …               ┆ …     ┆ …            │
│ Good            ┆ E     ┆ 8969.545455  │
│ Good            ┆ H     ┆ 9535.132812  │
│ Ideal           ┆ F     ┆ 14729.426997 │
│ Ideal           ┆ I     ┆ 9459.588378  │
│ Signature-Ideal ┆ H     ┆ 9112.688889  │
└─────────────────┴───────┴──────────────┘
Reflection: Now each row is a specific cut-color combination with its average price.


In [19]:
# Question 10: Top 5 most valuable color-cut combinations
# Step 1: total the Price and count the rows within every (Color, Cut) segment.
segment_totals = df.group_by("Color", "Cut").agg(
    pl.col("Price").sum().alias("total_value"),
    pl.len().alias("count_diamonds"),
)
# Step 2: rank segments from highest to lowest total and keep the first five.
top_value_segments = segment_totals.sort("total_value", descending=True).head(5)
# sep="\n " reproduces the newline-plus-space that precedes the table.
print("Q10:", top_value_segments, sep="\n ")

Q10:
 shape: (5, 4)
┌───────┬───────────┬─────────────┬────────────────┐
│ Color ┆ Cut       ┆ total_value ┆ count_diamonds │
│ ---   ┆ ---       ┆ ---         ┆ ---            │
│ str   ┆ str       ┆ i64         ┆ u32            │
╞═══════╪═══════════╪═════════════╪════════════════╡
│ G     ┆ Ideal     ┆ 9363514     ┆ 690            │
│ G     ┆ Very Good ┆ 7140620     ┆ 578            │
│ F     ┆ Very Good ┆ 5648327     ┆ 455            │
│ F     ┆ Ideal     ┆ 5346782     ┆ 363            │
│ H     ┆ Ideal     ┆ 5279687     ┆ 458            │
└───────┴───────────┴─────────────┴────────────────┘
