In [28]:
# 1. Load the diamonds dataset and report its overall size.
# The import below was previously commented out by the cell-number prefix,
# which left `pl` undefined when this cell ran on a fresh kernel.
import polars as pl

# Load dataset (fetched over the network on every run)
print("Loading diamonds dataset...")
df = pl.read_csv("https://raw.githubusercontent.com/pycaret/pycaret/master/datasets/diamond.csv")
print("Dataset loaded successfully!\n")

# Check overall inventory size: df.shape unpacks to (row count, column count)
num_rows, num_columns = df.shape
print(f"Total records (rows): {num_rows}")
print(f"Total fields (columns): {num_columns}\n")

# Optionally, view a few sample rows
print("A preview of the first few records:")
print(df.head(5))

Loading diamonds dataset...
Dataset loaded successfully!

Total records (rows): 6000
Total fields (columns): 8

A preview of the first few records:
shape: (5, 8)
┌──────────────┬───────┬───────┬─────────┬────────┬──────────┬────────┬───────┐
│ Carat Weight ┆ Cut   ┆ Color ┆ Clarity ┆ Polish ┆ Symmetry ┆ Report ┆ Price │
│ ---          ┆ ---   ┆ ---   ┆ ---     ┆ ---    ┆ ---      ┆ ---    ┆ ---   │
│ f64          ┆ str   ┆ str   ┆ str     ┆ str    ┆ str      ┆ str    ┆ i64   │
╞══════════════╪═══════╪═══════╪═════════╪════════╪══════════╪════════╪═══════╡
│ 1.1          ┆ Ideal ┆ H     ┆ SI1     ┆ VG     ┆ EX       ┆ GIA    ┆ 5169  │
│ 0.83         ┆ Ideal ┆ H     ┆ VS1     ┆ ID     ┆ ID       ┆ AGSL   ┆ 3470  │
│ 0.85         ┆ Ideal ┆ H     ┆ SI1     ┆ EX     ┆ EX       ┆ GIA    ┆ 3183  │
│ 0.91         ┆ Ideal ┆ E     ┆ SI1     ┆ VG     ┆ VG       ┆ GIA    ┆ 4370  │
│ 0.83         ┆ Ideal ┆ G     ┆ SI1     ┆ EX     ┆ EX       ┆ GIA    ┆ 3171  │
└──────────────┴───────┴───────┴──────

In [2]:
# 2. List the attributes (columns) tracked for each diamond.
# The import is restored here; the cell-number prefix had turned it into a comment.
import polars as pl

# Reload the data so this cell runs on its own.
df = pl.read_csv("https://raw.githubusercontent.com/pycaret/pycaret/master/datasets/diamond.csv")
print("Tracked characteristics (columns):")
print(df.columns)

Tracked characteristics (columns):
['Carat Weight', 'Cut', 'Color', 'Clarity', 'Polish', 'Symmetry', 'Report', 'Price']


Each row in your diamonds inventory database corresponds to one individual diamond in stock. Think of it like a diamond’s digital identity card: every diamond is described by multiple key attributes — its carat weight, cut, color, clarity, polish, symmetry, grading report, and price.

Available columns: ['Carat Weight', 'Cut', 'Color', 'Clarity', 'Polish', 'Symmetry', 'Report', 'Price']
Column 'carat' not found — check spelling or column cleanup.
Column 'carat' not found in DataFrame.


  except pl.ColumnNotFoundError:


ModuleNotFoundError: No module named 'janitor'

In [5]:
%pip install -q janitor polars

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m259.8/259.8 kB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m60.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for janitor (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ipython 7.34.0 requires jedi>=0.16, which is not installed.[0m[31m
[0m

In [7]:
# 4. Verify that the 'Carat Weight' column exists, then normalise column names.
# The polars import is restored; the cell-number prefix had commented it out.
import polars as pl
import janitor.polars  # imported for its side effect: provides DataFrame.clean_names used below

# Load dataset
df = pl.read_csv("https://raw.githubusercontent.com/pycaret/pycaret/master/datasets/diamond.csv")

# 1. Check existing columns
print("Available columns:", df.columns)

# 2. Handle column safely: with default=None a missing column yields None
#    instead of raising.
col = df.get_column("Carat Weight", default=None)
if col is None:
    print("Column 'Carat Weight' not found — check spelling or column cleanup.")
else:
    print("Column 'Carat Weight' found — proceeding with analysis.")

# 3. Or catch explicitly. The exception is referenced through `pl.exceptions`
#    because the top-level `pl.ColumnNotFoundError` alias is deprecated.
try:
    df.select("Carat Weight")
    print("Successfully selected 'Carat Weight'.")
except pl.exceptions.ColumnNotFoundError:
    print("Column 'Carat Weight' not found in DataFrame.")

# 4. Optional: Clean names for consistency.
#    Note: this rebinds `df`, so the original column names no longer apply to it.
df = df.clean_names(remove_special=True, case_type="lower")
print("Cleaned columns:", df.columns)

Available columns: ['Carat Weight', 'Cut', 'Color', 'Clarity', 'Polish', 'Symmetry', 'Report', 'Price']
Column 'Carat Weight' found — proceeding with analysis.
Successfully selected 'Carat Weight'.

Total carats in inventory: 8007.120000000001
Total weight in grams: 1601.42 g
Total weight in kilograms: 1.601 kg


In [9]:
# 5. List the distinct values of the 'Cut' column and count them.
# The import is restored; the cell-number prefix had commented it out.
import polars as pl

# Load the dataset
df = pl.read_csv("https://raw.githubusercontent.com/pycaret/pycaret/master/datasets/diamond.csv")

# Confirm actual column names
print("Columns available:", df.columns)

# Column names are case-sensitive; this dataset spells it "Cut".
shape_col = "Cut"

if shape_col in df.columns:
    # unique() returns a Series of the distinct values; .len() is its length.
    unique_shapes = df[shape_col].unique()
    print("Unique diamond shapes (or cuts) in our inventory:", unique_shapes)
    print(f"Total variety of cuts (distinct values): {unique_shapes.len()}")
else:
    print(f"Error: Column '{shape_col}' not found. Please check column naming or capitalization.")

Columns available: ['Carat Weight', 'Cut', 'Color', 'Clarity', 'Polish', 'Symmetry', 'Report', 'Price']
Unique diamond shapes (or cuts) in our inventory: shape: (5,)
Series: 'Cut' [str]
[
	"Ideal"
	"Signature-Ideal"
	"Very Good"
	"Fair"
	"Good"
]
Total variety of cuts (distinct values): 5


In [11]:
# 6. Show the single highest-priced and single lowest-priced diamond.
# The import is restored; the cell-number prefix had commented it out.
import polars as pl

# Load the diamonds dataset
df = pl.read_csv("https://raw.githubusercontent.com/pycaret/pycaret/master/datasets/diamond.csv")

# Confirm available columns (case-sensitive)
print("Available columns:", df.columns)

# Most valuable diamond: sort by Price descending and keep the first row.
# If several rows share the top price, only one of them is shown.
most_expensive = df.sort("Price", descending=True).head(1)
print("Most valuable diamond:")
print(most_expensive)

# Least valuable diamond: sort by Price ascending and keep the first row.
least_expensive = df.sort("Price", descending=False).head(1)
print("Least valuable diamond:")
print(least_expensive)

Available columns: ['Carat Weight', 'Cut', 'Color', 'Clarity', 'Polish', 'Symmetry', 'Report', 'Price']
Most valuable diamond:
shape: (1, 8)
┌──────────────┬───────┬───────┬─────────┬────────┬──────────┬────────┬────────┐
│ Carat Weight ┆ Cut   ┆ Color ┆ Clarity ┆ Polish ┆ Symmetry ┆ Report ┆ Price  │
│ ---          ┆ ---   ┆ ---   ┆ ---     ┆ ---    ┆ ---      ┆ ---    ┆ ---    │
│ f64          ┆ str   ┆ str   ┆ str     ┆ str    ┆ str      ┆ str    ┆ i64    │
╞══════════════╪═══════╪═══════╪═════════╪════════╪══════════╪════════╪════════╡
│ 2.79         ┆ Ideal ┆ D     ┆ IF      ┆ EX     ┆ EX       ┆ GIA    ┆ 101561 │
└──────────────┴───────┴───────┴─────────┴────────┴──────────┴────────┴────────┘
Least valuable diamond:
shape: (1, 8)
┌──────────────┬──────┬───────┬─────────┬────────┬──────────┬────────┬───────┐
│ Carat Weight ┆ Cut  ┆ Color ┆ Clarity ┆ Polish ┆ Symmetry ┆ Report ┆ Price │
│ ---          ┆ ---  ┆ ---   ┆ ---     ┆ ---    ┆ ---      ┆ ---    ┆ ---   │
│ f64          ┆ 

In [30]:
# 7. Show the single highest-priced and single lowest-priced diamond.
# Previously this line read `7# import polars as pl`, which evaluates the bare
# literal 7 and never performs the import.
import polars as pl

# Load the dataset
df = pl.read_csv("https://raw.githubusercontent.com/pycaret/pycaret/master/datasets/diamond.csv")

# Confirm available columns
print("Available columns:", df.columns)

# Get the most valuable diamond: first row after sorting by Price descending
most_expensive = df.sort("Price", descending=True).head(1)
print("Most valuable diamond:")
print(most_expensive)

# Get the least valuable diamond: first row after sorting by Price ascending
least_expensive = df.sort("Price", descending=False).head(1)
print("Least valuable diamond:")
print(least_expensive)

Available columns: ['Carat Weight', 'Cut', 'Color', 'Clarity', 'Polish', 'Symmetry', 'Report', 'Price']
Most valuable diamond:
shape: (1, 8)
┌──────────────┬───────┬───────┬─────────┬────────┬──────────┬────────┬────────┐
│ Carat Weight ┆ Cut   ┆ Color ┆ Clarity ┆ Polish ┆ Symmetry ┆ Report ┆ Price  │
│ ---          ┆ ---   ┆ ---   ┆ ---     ┆ ---    ┆ ---      ┆ ---    ┆ ---    │
│ f64          ┆ str   ┆ str   ┆ str     ┆ str    ┆ str      ┆ str    ┆ i64    │
╞══════════════╪═══════╪═══════╪═════════╪════════╪══════════╪════════╪════════╡
│ 2.79         ┆ Ideal ┆ D     ┆ IF      ┆ EX     ┆ EX       ┆ GIA    ┆ 101561 │
└──────────────┴───────┴───────┴─────────┴────────┴──────────┴────────┴────────┘
Least valuable diamond:
shape: (1, 8)
┌──────────────┬──────┬───────┬─────────┬────────┬──────────┬────────┬───────┐
│ Carat Weight ┆ Cut  ┆ Color ┆ Clarity ┆ Polish ┆ Symmetry ┆ Report ┆ Price │
│ ---          ┆ ---  ┆ ---   ┆ ---     ┆ ---    ┆ ---      ┆ ---    ┆ ---   │
│ f64          ┆ 

In [29]:
# 8. Per (Cut, Color, Report) group: row count, mean Price, summed Carat Weight.
# Previously this line read `8# import polars as pl`, which never performs the import.
import polars as pl

# Load the diamonds dataset
df = pl.read_csv("https://raw.githubusercontent.com/pycaret/pycaret/master/datasets/diamond.csv")

# Verify available columns to ensure correct case-sensitive usage
print("Available columns:", df.columns)

# Group by 'Cut', 'Color', and 'Report', and calculate the required statistics.
# pl.len() replaces the deprecated pl.count() for counting rows per group.
# NOTE(review): "Avg Price per Carat" is the plain mean of Price, which is only a
# per-carat figure if Price is already quoted per carat — confirm against the data source.
diamond_stats = (
    df.group_by(["Cut", "Color", "Report"])
    .agg([
        pl.len().alias("Diamond Count"),
        pl.col("Price").mean().alias("Avg Price per Carat"),
        pl.col("Carat Weight").sum().alias("Total Carats")
    ])
    .sort("Avg Price per Carat", descending=True)
)

# Display the statistics
print("Diamond Characteristics Breakdown:")
print(diamond_stats)

Available columns: ['Carat Weight', 'Cut', 'Color', 'Clarity', 'Polish', 'Symmetry', 'Report', 'Price']
Diamond Characteristics Breakdown:
shape: (58, 6)
┌─────────────────┬───────┬────────┬───────────────┬─────────────────────┬──────────────┐
│ Cut             ┆ Color ┆ Report ┆ Diamond Count ┆ Avg Price per Carat ┆ Total Carats │
│ ---             ┆ ---   ┆ ---    ┆ ---           ┆ ---                 ┆ ---          │
│ str             ┆ str   ┆ str    ┆ u32           ┆ f64                 ┆ f64          │
╞═════════════════╪═══════╪════════╪═══════════════╪═════════════════════╪══════════════╡
│ Ideal           ┆ D     ┆ AGSL   ┆ 23            ┆ 22365.826087        ┆ 32.02        │
│ Signature-Ideal ┆ D     ┆ GIA    ┆ 10            ┆ 21750.9             ┆ 12.66        │
│ Signature-Ideal ┆ D     ┆ AGSL   ┆ 20            ┆ 18859.2             ┆ 26.41        │
│ Signature-Ideal ┆ F     ┆ AGSL   ┆ 17            ┆ 18712.0             ┆ 23.81        │
│ Ideal           ┆ D     ┆ GIA    ┆

  pl.count().alias("Diamond Count"),


In [24]:
# 9. Per (Cut, Color) group: row count, mean Price, summed Carat Weight.
# The import is restored; the cell-number prefix had commented it out.
import polars as pl

# Load your diamonds dataset
df = pl.read_csv("https://raw.githubusercontent.com/pycaret/pycaret/master/datasets/diamond.csv")

# Group by 'Cut' and 'Color' and aggregate the required metrics.
# pl.len() replaces the deprecated pl.count() for counting rows per group.
# NOTE(review): "Avg Price per Carat" is the plain mean of Price, which is only a
# per-carat figure if Price is already quoted per carat — confirm against the data source.
cut_color_pricing = (
    df.group_by(["Cut", "Color"])
    .agg([
        pl.len().alias("Diamond Count"),
        pl.col("Price").mean().alias("Avg Price per Carat"),
        pl.col("Carat Weight").sum().alias("Total Carats")
    ])
    .sort("Avg Price per Carat", descending=True)
)

# Display the results
print(cut_color_pricing)

shape: (30, 5)
┌─────────────────┬───────┬───────────────┬─────────────────────┬──────────────┐
│ Cut             ┆ Color ┆ Diamond Count ┆ Avg Price per Carat ┆ Total Carats │
│ ---             ┆ ---   ┆ ---           ┆ ---                 ┆ ---          │
│ str             ┆ str   ┆ u32           ┆ f64                 ┆ f64          │
╞═════════════════╪═══════╪═══════════════╪═════════════════════╪══════════════╡
│ Signature-Ideal ┆ D     ┆ 30            ┆ 19823.1             ┆ 39.07        │
│ Ideal           ┆ D     ┆ 280           ┆ 18461.953571        ┆ 372.84       │
│ Ideal           ┆ F     ┆ 363           ┆ 14729.426997        ┆ 502.01       │
│ Ideal           ┆ G     ┆ 690           ┆ 13570.310145        ┆ 966.9        │
│ Signature-Ideal ┆ F     ┆ 38            ┆ 13247.947368        ┆ 45.63        │
│ …               ┆ …     ┆ …             ┆ …                   ┆ …            │
│ Fair            ┆ F     ┆ 24            ┆ 6063.625            ┆ 24.2         │
│ Fair       

In [27]:
# 10. Top five (Color, Cut) segments ranked by summed "Total Value".
# This cell previously had no polars import and relied on `pl` left over from
# an earlier cell.
import polars as pl

# Load your diamonds dataset
df = pl.read_csv("https://raw.githubusercontent.com/pycaret/pycaret/master/datasets/diamond.csv")

# Add a per-row "Total Value" column computed as Price * Carat Weight.
# NOTE(review): this is a stone's value only if Price is quoted per carat; if Price
# is already the whole-stone price, summing Price alone would be correct — confirm.
df = df.with_columns(
    (pl.col("Price") * pl.col("Carat Weight")).alias("Total Value")
)

# Group by 'Color' and 'Cut' and aggregate the required metrics
top_segments = (
    df.group_by(["Color", "Cut"])
    .agg([
        pl.len().alias("Diamond Count"),
        pl.col("Total Value").sum().alias("Total Value"),
        pl.col("Carat Weight").sum().alias("Total Carats")
    ])
    .sort("Total Value", descending=True)
    .limit(5)  # keep only the five highest-value segments
)

# Display the top 5 segments
print(top_segments)

shape: (5, 5)
┌───────┬───────────┬───────────────┬─────────────┬──────────────┐
│ Color ┆ Cut       ┆ Diamond Count ┆ Total Value ┆ Total Carats │
│ ---   ┆ ---       ┆ ---           ┆ ---         ┆ ---          │
│ str   ┆ str       ┆ u32           ┆ f64         ┆ f64          │
╞═══════╪═══════════╪═══════════════╪═════════════╪══════════════╡
│ G     ┆ Ideal     ┆ 690           ┆ 1.6031e7    ┆ 966.9        │
│ G     ┆ Very Good ┆ 578           ┆ 1.2335e7    ┆ 795.4        │
│ F     ┆ Very Good ┆ 455           ┆ 9.4719e6    ┆ 592.77       │
│ D     ┆ Ideal     ┆ 280           ┆ 9.4040e6    ┆ 372.84       │
│ H     ┆ Ideal     ┆ 458           ┆ 9.2436e6    ┆ 653.15       │
└───────┴───────────┴───────────────┴─────────────┴──────────────┘
