In [11]:
!pip install polars
import polars as pl

In addition, using fork() with Python in general is a recipe for mysterious
deadlocks and crashes.

The most likely reason you are seeing this error is because you are using the
multiprocessing module on Linux, which uses fork() by default. This will be
fixed in Python 3.14. Until then, you want to use the "spawn" context instead.

See https://docs.pola.rs/user-guide/misc/multiprocessing/ for details.

or by setting POLARS_ALLOW_FORKING_THREAD=1.

  pid, fd = os.forkpty()




# Create DataFrame 

In [12]:
# One dict per row; every row carries the same three keys, in the same order.
data = [
    dict(fruit=name, count=quantity, price=unit_price)
    for name, quantity, unit_price in (
        ("apple", 10, 0.50),
        ("banana", 20, 0.25),
        ("apple", 15, 0.55),
    )
]

# Build a Polars DataFrame from the list of row dicts and show it.
df = pl.from_dicts(data)
print(df)

shape: (3, 3)
┌────────┬───────┬───────┐
│ fruit  ┆ count ┆ price │
│ ---    ┆ ---   ┆ ---   │
│ str    ┆ i64   ┆ f64   │
╞════════╪═══════╪═══════╡
│ apple  ┆ 10    ┆ 0.5   │
│ banana ┆ 20    ┆ 0.25  │
│ apple  ┆ 15    ┆ 0.55  │
└────────┴───────┴───────┘


# Expressions to select, filter, aggregate

In [13]:
## Filter: keep only the rows whose fruit is "apple"
filt = df.filter(pl.col("fruit").eq("apple"))
print(filt)

shape: (2, 3)
┌───────┬───────┬───────┐
│ fruit ┆ count ┆ price │
│ ---   ┆ ---   ┆ ---   │
│ str   ┆ i64   ┆ f64   │
╞═══════╪═══════╪═══════╡
│ apple ┆ 10    ┆ 0.5   │
│ apple ┆ 15    ┆ 0.55  │
└───────┴───────┴───────┘


In [14]:
##  group_by("fruit"): Groups the DataFrame by the fruit column.
##  .agg(): Specifies the aggregation operation.
##  pl.col("count").sum(): Calculates the sum of the count column.
##  .alias("total_count"): Renames the aggregated column.

In [15]:
## Group the filtered rows by fruit and sum their counts
agg = (
    filt
    .group_by("fruit")
    .agg(
        pl.col("count").sum().alias("total_count"),
    )
)
print(agg)

shape: (1, 2)
┌───────┬─────────────┐
│ fruit ┆ total_count │
│ ---   ┆ ---         │
│ str   ┆ i64         │
╞═══════╪═════════════╡
│ apple ┆ 25          │
└───────┴─────────────┘


In [16]:
import pandas as pd
import numpy as np

# Settings for the small dummy dataset
N = 1000  # number of rows to generate
fruits = ['apple', 'banana', 'strawberry', 'kiwi']

# Seed NumPy's global RNG so the random draws are reproducible
np.random.seed(1)


In [17]:
# Draw N random fruits first, then N uniform prices in [1, 10); keeping this
# draw order is what reproduces the same values from the seeded global RNG.
# Note: this rebinds `df`, which previously held the Polars frame.
df = pd.DataFrame(
    dict(
        fruit=np.random.choice(fruits, size=N),
        price=np.random.uniform(low=1, high=10, size=N),
    )
)

# Print the column summary, then display the frame as the cell's result.
df.info()
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   fruit   1000 non-null   object 
 1   price   1000 non-null   float64
dtypes: float64(1), object(1)
memory usage: 15.8+ KB


Unnamed: 0,fruit,price
0,banana,1.787340
1,kiwi,3.045788
2,apple,3.829390
3,apple,2.572893
4,kiwi,6.463847
...,...,...
995,apple,5.444422
996,apple,9.675260
997,strawberry,6.804475
998,strawberry,5.358842


In [18]:
def get_prices_by_fruit(df):
    """Summarise the 'price' column for each distinct 'fruit'.

    Parameters
    ----------
    df : pandas DataFrame containing 'fruit' and 'price' columns.

    Returns
    -------
    DataFrame indexed by fruit with the columns 'count', 'mean', 'min'
    and 'max' computed over that fruit's prices.
    """
    summary_stats = ['count', 'mean', 'min', 'max']
    prices_per_fruit = df.groupby('fruit')['price']
    return prices_per_fruit.agg(summary_stats)

In [19]:
# Per-fruit price statistics (count, mean, min, max) for the dummy data
prices = get_prices_by_fruit(df=df)
print(prices)

            count      mean       min       max
fruit                                          
apple         265  5.610179  1.047159  9.986685
banana        241  5.625379  1.071396  9.934308
kiwi          237  5.420487  1.006879  9.941144
strawberry    257  5.358464  1.028016  9.922119
