In [2]:
import polars as pl

# Load the diamonds dataset
print("Loading diamonds dataset...")
df = pl.read_csv('https://raw.githubusercontent.com/pycaret/pycaret/master/datasets/diamond.csv')
print("Dataset loaded successfully!")

Loading diamonds dataset...
Dataset loaded successfully!


In [3]:
#q1
df.height

6000

In [4]:
#q2
col_names = df.columns
col_names

['Carat Weight',
 'Cut',
 'Color',
 'Clarity',
 'Polish',
 'Symmetry',
 'Report',
 'Price']

In [5]:
# q3
# each row represents the characteristics of a diamond (grain = 1 row is 1 diamond) -  such as its carat weight, color, cut, price etc
df.head()

Carat Weight,Cut,Color,Clarity,Polish,Symmetry,Report,Price
f64,str,str,str,str,str,str,i64
1.1,"""Ideal""","""H""","""SI1""","""VG""","""EX""","""GIA""",5169
0.83,"""Ideal""","""H""","""VS1""","""ID""","""ID""","""AGSL""",3470
0.85,"""Ideal""","""H""","""SI1""","""EX""","""EX""","""GIA""",3183
0.91,"""Ideal""","""E""","""SI1""","""VG""","""VG""","""GIA""",4370
0.83,"""Ideal""","""G""","""SI1""","""EX""","""EX""","""GIA""",3171


In [6]:
#q4
import polars as pl

carat_count = df.select(pl.col('Carat Weight').sum()).item()
carat_count

8007.120000000001

In [7]:
#q5
cut_variety = df.select(pl.col('Cut').unique())
cut_variety_count = df.select(pl.col('Cut').n_unique()).item()
print(cut_variety)
print(f'Total unique varieties of cut = {cut_variety_count}')

shape: (5, 1)
┌─────────────────┐
│ Cut             │
│ ---             │
│ str             │
╞═════════════════╡
│ Signature-Ideal │
│ Fair            │
│ Ideal           │
│ Good            │
│ Very Good       │
└─────────────────┘
Total unique varieties of cut = 5


In [8]:
#q6
most_expensive_rock = df.select(pl.col('Price').max()).item()
cheapests_rock = df.select(pl.col('Price').min()).item()
print(f'The costliest diamond costs ${most_expensive_rock:,.2f}')
print(f'and the cheapest diamond costs ${cheapests_rock:,.2f}')

The costliest diamond costs $101,561.00
and the cheapest diamond costs $2,184.00


In [9]:
#q7
price_by_color = df.group_by('Color').agg(pl.col('Price').mean())
price_by_color

# new grain = 1 row is 1 color
#now each row represents the avg price for all diamonds of a particular color

Color,Price
str,f64
"""G""",12520.050633
"""D""",15255.783661
"""E""",11539.190231
"""F""",12712.241856
"""H""",10487.347544
"""I""",8989.636364


In [14]:
#q8
clarity_based = df.group_by('Clarity').agg(
    pl.col('Clarity').count().alias('count'),
    pl.col('Price').mean().alias('avg_Price'),
     pl.col('Carat Weight').sum().alias('total_carat_weight'))

clarity_based

Clarity,count,avg_Price,total_carat_weight
str,u32,f64,f64
"""IF""",219,22105.844749,316.0
"""SI1""",2059,8018.864012,2563.69
"""VS2""",1575,11809.053333,2170.81
"""VVS2""",666,14142.177177,876.58
"""FL""",4,63776.0,7.87
"""VS1""",1192,13694.113255,1682.74
"""VVS1""",285,16845.680702,389.43


In [11]:
#q9
cut_color_based = df.group_by(['Cut', 'Color']).agg(pl.col('Price').mean().alias('avg_Price'))
cut_color_based

# new grain = 1 row is 1 cut-color combo
# each row represents the average price for cut-color combinations

Cut,Color,avg_Price
str,str,f64
"""Ideal""","""E""",12647.107914
"""Fair""","""G""",7345.52381
"""Good""","""E""",8969.545455
"""Signature-Ideal""","""H""",9112.688889
"""Signature-Ideal""","""G""",10248.296875
…,…,…
"""Very Good""","""I""",8930.031332
"""Very Good""","""H""",10056.106132
"""Fair""","""H""",5908.5
"""Good""","""G""",9988.614865


In [12]:
#q10
cut_color_combo = df.group_by(['Cut', 'Color']).agg(
    pl.col('Price').sum().alias('Total_Value'),
    pl.col('Cut').count().alias('Diamond_Count')
)
cut_color_combo.head().sort('Total_Value', descending=True)

Cut,Color,Total_Value,Diamond_Count
str,str,i64,u32
"""Ideal""","""I""",3906810,413
"""Very Good""","""D""",3502989,265
"""Good""","""E""",986650,110
"""Signature-Ideal""","""G""",655891,64
"""Fair""","""G""",154256,21
