In [1]:
import polars as pl

In [4]:
data = {
'A': [1, 2, 3],
'B': ['foo', 'bar', 'baz'],
'C': [True, False, True]
}
df = pl.DataFrame(data)
display(df)

A,B,C
i64,str,bool
1,"""foo""",True
2,"""bar""",False
3,"""baz""",True


In [6]:
data = [
(1, 'foo', True),
(2, 'bar', False),
(3, 'baz', True)
]
df = pl.DataFrame(data, schema=['A', 'B', 'C'], orient='row')
print(df)

shape: (3, 3)
┌─────┬─────┬───────┐
│ A   ┆ B   ┆ C     │
│ --- ┆ --- ┆ ---   │
│ i64 ┆ str ┆ bool  │
╞═════╪═════╪═══════╡
│ 1   ┆ foo ┆ true  │
│ 2   ┆ bar ┆ false │
│ 3   ┆ baz ┆ true  │
└─────┴─────┴───────┘


In [7]:
import numpy as np

In [9]:
strings_data = ["apple", "banana", "cherry", "watermelon", "elderberry"]
integers_data = np.random.randint(1, 100, size=5)
floats_data = np.random.rand(5)
df = pl.DataFrame({
"fruit": strings_data,
"quantity": integers_data,
"price": floats_data
})
print(df.head(3))
print(df.dtypes)
print(df.schema)
print(df.describe())
print(df.schema)

shape: (3, 3)
┌────────┬──────────┬──────────┐
│ fruit  ┆ quantity ┆ price    │
│ ---    ┆ ---      ┆ ---      │
│ str    ┆ i32      ┆ f64      │
╞════════╪══════════╪══════════╡
│ apple  ┆ 59       ┆ 0.625647 │
│ banana ┆ 82       ┆ 0.525618 │
│ cherry ┆ 32       ┆ 0.865054 │
└────────┴──────────┴──────────┘
[String, Int32, Float64]
Schema({'fruit': String, 'quantity': Int32, 'price': Float64})
shape: (9, 4)
┌────────────┬────────────┬───────────┬──────────┐
│ statistic  ┆ fruit      ┆ quantity  ┆ price    │
│ ---        ┆ ---        ┆ ---       ┆ ---      │
│ str        ┆ str        ┆ f64       ┆ f64      │
╞════════════╪════════════╪═══════════╪══════════╡
│ count      ┆ 5          ┆ 5.0       ┆ 5.0      │
│ null_count ┆ 0          ┆ 0.0       ┆ 0.0      │
│ mean       ┆ null       ┆ 66.0      ┆ 0.587148 │
│ std        ┆ null       ┆ 24.423349 ┆ 0.322717 │
│ min        ┆ apple      ┆ 32.0      ┆ 0.071376 │
│ 25%        ┆ null       ┆ 59.0      ┆ 0.525618 │
│ 50%        ┆ null       

In [10]:
df[0,:] #первая строка, все колонки

fruit,quantity,price
str,i32,f64
"""apple""",59,0.625647


In [11]:
df[:,['fruit', "price"]] #все строки по колонкам 'fruit', "price"

fruit,price
str,f64
"""apple""",0.625647
"""banana""",0.525618
"""cherry""",0.865054
"""watermelon""",0.071376
"""elderberry""",0.848042


In [12]:
df.filter(pl.col("price") > 0.550).head() #фильтрует данные по значению в колонке

fruit,quantity,price
str,i32,f64
"""apple""",59,0.625647
"""cherry""",32,0.865054
"""elderberry""",61,0.848042


In [13]:
df.select(['fruit', 'price'])

fruit,price
str,f64
"""apple""",0.625647
"""banana""",0.525618
"""cherry""",0.865054
"""watermelon""",0.071376
"""elderberry""",0.848042


In [14]:
#добавление новой колонки
df.with_columns([
(pl.col("price") * pl.col("quantity")).alias("total")
])


fruit,quantity,price,total
str,i32,f64,f64
"""apple""",59,0.625647,36.91317
"""banana""",82,0.525618,43.100694
"""cherry""",32,0.865054,27.681725
"""watermelon""",96,0.071376,6.852137
"""elderberry""",61,0.848042,51.730587


In [17]:
#добавление новой колонки и указываем условия вывода
df.with_columns([
(pl.col("price") * pl.col("quantity")).alias("total")
]).filter(pl.col.total > 20).select(['fruit', 'total'])

fruit,total
str,f64
"""apple""",36.91317
"""banana""",43.100694
"""cherry""",27.681725
"""elderberry""",51.730587


In [None]:
.str.contains("Sales")) #для неточного поиска по строковым данным

In [21]:
import seaborn as sns

In [22]:
df = sns.load_dataset('titanic')
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [24]:
dfp = pl.from_pandas(df)
dfp.select(['class','sex','age']).head(3)

class,sex,age
cat,str,f64
"""Third""","""male""",22.0
"""First""","""female""",38.0
"""Third""","""female""",26.0


In [30]:
dfp.filter((pl.col('class') == 'First') & (pl.col.fare > 100))

survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
i64,i64,str,f64,i64,i64,f64,str,cat,str,bool,cat,str,str,bool
0,1,"""male""",19.0,3,2,263.0,"""S""","""First""","""man""",true,"""C""","""Southampton""","""no""",false
1,1,"""female""",,1,0,146.5208,"""C""","""First""","""woman""",false,"""B""","""Cherbourg""","""yes""",false
1,1,"""female""",23.0,3,2,263.0,"""S""","""First""","""woman""",false,"""C""","""Southampton""","""yes""",false
0,1,"""male""",24.0,0,1,247.5208,"""C""","""First""","""man""",true,"""B""","""Cherbourg""","""no""",false
1,1,"""female""",58.0,0,0,146.5208,"""C""","""First""","""woman""",false,"""B""","""Cherbourg""","""yes""",true
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
1,1,"""female""",21.0,2,2,262.375,"""C""","""First""","""woman""",false,"""B""","""Cherbourg""","""yes""",false
1,1,"""female""",36.0,1,2,120.0,"""S""","""First""","""woman""",false,"""B""","""Southampton""","""yes""",false
1,1,"""female""",43.0,0,1,211.3375,"""S""","""First""","""woman""",false,"""B""","""Southampton""","""yes""",false
1,1,"""male""",11.0,1,2,120.0,"""S""","""First""","""child""",false,"""B""","""Southampton""","""yes""",false


In [31]:
dfp.filter((pl.col.pclass == 1) & (pl.col.fare > 100))

survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
i64,i64,str,f64,i64,i64,f64,str,cat,str,bool,cat,str,str,bool
0,1,"""male""",19.0,3,2,263.0,"""S""","""First""","""man""",true,"""C""","""Southampton""","""no""",false
1,1,"""female""",,1,0,146.5208,"""C""","""First""","""woman""",false,"""B""","""Cherbourg""","""yes""",false
1,1,"""female""",23.0,3,2,263.0,"""S""","""First""","""woman""",false,"""C""","""Southampton""","""yes""",false
0,1,"""male""",24.0,0,1,247.5208,"""C""","""First""","""man""",true,"""B""","""Cherbourg""","""no""",false
1,1,"""female""",58.0,0,0,146.5208,"""C""","""First""","""woman""",false,"""B""","""Cherbourg""","""yes""",true
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
1,1,"""female""",21.0,2,2,262.375,"""C""","""First""","""woman""",false,"""B""","""Cherbourg""","""yes""",false
1,1,"""female""",36.0,1,2,120.0,"""S""","""First""","""woman""",false,"""B""","""Southampton""","""yes""",false
1,1,"""female""",43.0,0,1,211.3375,"""S""","""First""","""woman""",false,"""B""","""Southampton""","""yes""",false
1,1,"""male""",11.0,1,2,120.0,"""S""","""First""","""child""",false,"""B""","""Southampton""","""yes""",false


In [33]:
dfp = dfp.with_columns((pl.col.fare * 1.2).alias('Fareusd'))
dfp

survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,Fareusd
i64,i64,str,f64,i64,i64,f64,str,cat,str,bool,cat,str,str,bool,f64
0,3,"""male""",22.0,1,0,7.25,"""S""","""Third""","""man""",true,,"""Southampton""","""no""",false,8.7
1,1,"""female""",38.0,1,0,71.2833,"""C""","""First""","""woman""",false,"""C""","""Cherbourg""","""yes""",false,85.53996
1,3,"""female""",26.0,0,0,7.925,"""S""","""Third""","""woman""",false,,"""Southampton""","""yes""",true,9.51
1,1,"""female""",35.0,1,0,53.1,"""S""","""First""","""woman""",false,"""C""","""Southampton""","""yes""",false,63.72
0,3,"""male""",35.0,0,0,8.05,"""S""","""Third""","""man""",true,,"""Southampton""","""no""",true,9.66
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
0,2,"""male""",27.0,0,0,13.0,"""S""","""Second""","""man""",true,,"""Southampton""","""no""",true,15.6
1,1,"""female""",19.0,0,0,30.0,"""S""","""First""","""woman""",false,"""B""","""Southampton""","""yes""",true,36.0
0,3,"""female""",,1,2,23.45,"""S""","""Third""","""woman""",false,,"""Southampton""","""no""",false,28.14
1,1,"""male""",26.0,0,0,30.0,"""C""","""First""","""man""",true,"""C""","""Cherbourg""","""yes""",true,36.0


In [39]:
dfp.filter((pl.col.survived == 1) & (pl.col.sex == 'female') & (pl.col.age < 18))

survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,Fareusd
i64,i64,str,f64,i64,i64,f64,str,cat,str,bool,cat,str,str,bool,f64
1,2,"""female""",14.0,1,0,30.0708,"""C""","""Second""","""child""",false,,"""Cherbourg""","""yes""",false,36.08496
1,3,"""female""",4.0,1,1,16.7,"""S""","""Third""","""child""",false,"""G""","""Southampton""","""yes""",false,20.04
1,3,"""female""",15.0,0,0,8.0292,"""Q""","""Third""","""child""",false,,"""Queenstown""","""yes""",true,9.63504
1,3,"""female""",14.0,1,0,11.2417,"""C""","""Third""","""child""",false,,"""Cherbourg""","""yes""",false,13.49004
1,2,"""female""",3.0,1,2,41.5792,"""C""","""Second""","""child""",false,,"""Cherbourg""","""yes""",false,49.89504
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
1,3,"""female""",13.0,0,0,7.2292,"""C""","""Third""","""child""",false,,"""Cherbourg""","""yes""",true,8.67504
1,1,"""female""",17.0,1,0,57.0,"""S""","""First""","""woman""",false,"""B""","""Southampton""","""yes""",false,68.4
1,3,"""female""",15.0,1,0,14.4542,"""C""","""Third""","""child""",false,,"""Cherbourg""","""yes""",false,17.34504
1,1,"""female""",16.0,0,1,39.4,"""S""","""First""","""woman""",false,"""D""","""Southampton""","""yes""",false,47.28


In [45]:
dfp = dfp.with_columns((pl.col.sibsp + pl.col.parch).alias("Relateves"))
# dfp = dfp.drop("Relateves")
dfp

survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,Fareusd,Relateves
i64,i64,str,f64,i64,i64,f64,str,cat,str,bool,cat,str,str,bool,f64,i64
0,3,"""male""",22.0,1,0,7.25,"""S""","""Third""","""man""",true,,"""Southampton""","""no""",false,8.7,1
1,1,"""female""",38.0,1,0,71.2833,"""C""","""First""","""woman""",false,"""C""","""Cherbourg""","""yes""",false,85.53996,1
1,3,"""female""",26.0,0,0,7.925,"""S""","""Third""","""woman""",false,,"""Southampton""","""yes""",true,9.51,0
1,1,"""female""",35.0,1,0,53.1,"""S""","""First""","""woman""",false,"""C""","""Southampton""","""yes""",false,63.72,1
0,3,"""male""",35.0,0,0,8.05,"""S""","""Third""","""man""",true,,"""Southampton""","""no""",true,9.66,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
0,2,"""male""",27.0,0,0,13.0,"""S""","""Second""","""man""",true,,"""Southampton""","""no""",true,15.6,0
1,1,"""female""",19.0,0,0,30.0,"""S""","""First""","""woman""",false,"""B""","""Southampton""","""yes""",true,36.0,0
0,3,"""female""",,1,2,23.45,"""S""","""Third""","""woman""",false,,"""Southampton""","""no""",false,28.14,3
1,1,"""male""",26.0,0,0,30.0,"""C""","""First""","""man""",true,"""C""","""Cherbourg""","""yes""",true,36.0,0


In [49]:
dfp.filter((pl.col('sibsp') > 0) & (pl.col('survived') > 0))
dfp

survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,Fareusd,Relateves
i64,i64,str,f64,i64,i64,f64,str,cat,str,bool,cat,str,str,bool,f64,i64
0,3,"""male""",22.0,1,0,7.25,"""S""","""Third""","""man""",true,,"""Southampton""","""no""",false,8.7,1
1,1,"""female""",38.0,1,0,71.2833,"""C""","""First""","""woman""",false,"""C""","""Cherbourg""","""yes""",false,85.53996,1
1,3,"""female""",26.0,0,0,7.925,"""S""","""Third""","""woman""",false,,"""Southampton""","""yes""",true,9.51,0
1,1,"""female""",35.0,1,0,53.1,"""S""","""First""","""woman""",false,"""C""","""Southampton""","""yes""",false,63.72,1
0,3,"""male""",35.0,0,0,8.05,"""S""","""Third""","""man""",true,,"""Southampton""","""no""",true,9.66,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
0,2,"""male""",27.0,0,0,13.0,"""S""","""Second""","""man""",true,,"""Southampton""","""no""",true,15.6,0
1,1,"""female""",19.0,0,0,30.0,"""S""","""First""","""woman""",false,"""B""","""Southampton""","""yes""",true,36.0,0
0,3,"""female""",,1,2,23.45,"""S""","""Third""","""woman""",false,,"""Southampton""","""no""",false,28.14,3
1,1,"""male""",26.0,0,0,30.0,"""C""","""First""","""man""",true,"""C""","""Cherbourg""","""yes""",true,36.0,0


In [85]:
df=pl.read_csv('books_data.csv', null_values='NaN')
print(df.head())
print(df.describe())

shape: (5, 3)
┌─────────────┬────────┬──────────────┐
│ price       ┆ year   ┆ type         │
│ ---         ┆ ---    ┆ ---          │
│ f64         ┆ f64    ┆ str          │
╞═════════════╪════════╪══════════════╡
│ 707.529256  ┆ 2018.0 ┆ encyclopedia │
│ 1025.203348 ┆ 1992.0 ┆ textbook     │
│ 568.548657  ┆ 1990.0 ┆ encyclopedia │
│ 895.109864  ┆ 1995.0 ┆ newspaper    │
│ 206.532754  ┆ 1986.0 ┆ book         │
└─────────────┴────────┴──────────────┘
shape: (9, 4)
┌────────────┬─────────────┬─────────────┬──────────┐
│ statistic  ┆ price       ┆ year        ┆ type     │
│ ---        ┆ ---         ┆ ---         ┆ ---      │
│ str        ┆ f64         ┆ f64         ┆ str      │
╞════════════╪═════════════╪═════════════╪══════════╡
│ count      ┆ 50000.0     ┆ 48749.0     ┆ 50000    │
│ null_count ┆ 0.0         ┆ 1251.0      ┆ 0        │
│ mean       ┆ 1003.51169  ┆ 2004.470553 ┆ null     │
│ std        ┆ 1000.386795 ┆ 11.52278    ┆ null     │
│ min        ┆ 0.007358    ┆ 1985.0      ┆ boo

In [None]:
df = df.drop_nulls() # Отбрасываем строки с любыми пропущенными значениям

In [None]:
df = df.fill_null(0) # Замена отсутствующих значений на 0

In [55]:
df.with_columns(pl.col('year').forward_fill().alias('new_year')).describe() # Прямое заполнение недостающих значений в определенном столбце


statistic,price,year,type,new_year
str,f64,f64,str,f64
"""count""",50000.0,48749.0,"""50000""",50000.0
"""null_count""",0.0,1251.0,"""0""",0.0
"""mean""",1003.51169,2004.470553,,2004.4753
"""std""",1000.386795,11.52278,,11.518846
"""min""",0.007358,1985.0,"""book""",1985.0
"""25%""",292.38947,1994.0,,1994.0
"""50%""",697.459594,2004.0,,2004.0
"""75%""",1392.214487,2014.0,,2014.0
"""max""",9777.052347,2024.0,"""textbook""",2024.0


In [None]:
df.with_columns(pl.col('year').forward_fill()

In [59]:
df = df.with_columns(
df['year'].cast(pl.Int32) # or pl.Int64 if larger values are expected
)
df

price,year,type
f64,i32,str
707.529256,2018,"""encyclopedia"""
1025.203348,1992,"""textbook"""
568.548657,1990,"""encyclopedia"""
895.109864,1995,"""newspaper"""
206.532754,1986,"""book"""
…,…,…
1528.844126,2008,"""magazine"""
813.274714,2022,"""newspaper"""
1145.045726,1986,"""textbook"""
475.132386,2006,"""encyclopedia"""


In [60]:
#вместо такой записи лучше использовать df.describe()
df.select([
pl.col("price").count().alias("count"),
pl.col("price").null_count().alias("null_count"),
pl.col("price").mean().alias("mean"),
pl.col("price").std().alias("std_dev"),
pl.col("price").median().alias("median"),
pl.col("price").min().alias("min"),
pl.col("price").quantile(0.25).alias("25%"),
pl.col("price").quantile(0.5).alias("50%"),
pl.col("price").quantile(0.75).alias("75%"),
pl.col("price").max().alias("max"),
])

count,null_count,mean,std_dev,median,min,25%,50%,75%,max
u32,u32,f64,f64,f64,f64,f64,f64,f64,f64
50000,0,1003.51169,1000.386795,697.412055,0.007358,292.38947,697.459594,1392.214487,9777.052347


In [61]:
df.describe()

statistic,price,year,type
str,f64,f64,str
"""count""",50000.0,48749.0,"""50000"""
"""null_count""",0.0,1251.0,"""0"""
"""mean""",1003.51169,2004.470553,
"""std""",1000.386795,11.52278,
"""min""",0.007358,1985.0,"""book"""
"""25%""",292.38947,1994.0,
"""50%""",697.459594,2004.0,
"""75%""",1392.214487,2014.0,
"""max""",9777.052347,2024.0,"""textbook"""


In [65]:
df.group_by("type").agg(
[
pl.mean("price").alias("mean_price"),
pl.median("year").alias("median_year"),
pl.len().alias('count')
])

type,mean_price,median_year,count
str,f64,f64,u32
"""newspaper""",993.727707,2004.0,9966
"""textbook""",999.892266,2005.0,10034
"""book""",1000.189494,2005.0,9953
"""magazine""",1014.972812,2004.0,10019
"""encyclopedia""",1008.703284,2004.0,10028


In [70]:
df.group_by(["type", "year"]).agg([
pl.len()
]).pivot(
values="len",
index=["year"],
on=['type'])

year,textbook,encyclopedia,magazine,book,newspaper
i32,u32,u32,u32,u32,u32
1989,265,251,252,266,257
1995,247,235,242,222,243
2013,249,222,244,225,222
2017,252,248,255,231,243
1985,240,243,241,239,247
…,…,…,…,…,…
2004,258,256,243,213,255
2011,234,254,281,227,230
2005,233,254,224,253,233
2012,260,255,227,238,246


In [68]:
df.group_by(["type", "year"]).agg([
pl.len()
])

type,year,len
str,i32,u32
"""book""",2016,232
"""textbook""",2003,274
"""magazine""",1994,245
"""encyclopedia""",2024,215
"""book""",2018,264
…,…,…
"""magazine""",2008,257
"""magazine""",1985,241
"""magazine""",1992,242
"""textbook""",1999,244


In [71]:
df

price,year,type
f64,i32,str
707.529256,2018,"""encyclopedia"""
1025.203348,1992,"""textbook"""
568.548657,1990,"""encyclopedia"""
895.109864,1995,"""newspaper"""
206.532754,1986,"""book"""
…,…,…
1528.844126,2008,"""magazine"""
813.274714,2022,"""newspaper"""
1145.045726,1986,"""textbook"""
475.132386,2006,"""encyclopedia"""


In [73]:
#находим среднее значение цены для каждого типа продукции (оконные функции)
df.with_columns([
pl.col("price").mean().over("type").alias("mean_price_by_type")
]).head(10)

price,year,type,mean_price_by_type
f64,i32,str,f64
707.529256,2018,"""encyclopedia""",1008.703284
1025.203348,1992,"""textbook""",999.892266
568.548657,1990,"""encyclopedia""",1008.703284
895.109864,1995,"""newspaper""",993.727707
206.532754,1986,"""book""",1000.189494
3383.637351,1995,"""magazine""",1014.972812
9.753627,2011,"""newspaper""",993.727707
2809.215763,1994,"""book""",1000.189494
575.332756,1991,"""encyclopedia""",1008.703284
300.534013,1990,"""textbook""",999.892266


In [76]:
df.select(
pl.col("year").n_unique().alias("unique"),
pl.approx_n_unique("year").alias("unique_approx"), #приблизительный подсчет кол-ва
)

unique,unique_approx
u32,u32
41,41


In [None]:
df.filter(~(pl.col("category") == "High")) # "тильда" означает отрицание, так же как и "!="
df.filter((pl.col("category") != "High"))

In [77]:
def this_mil(year):
    return year > 2000

df.filter(this_mil(pl.col('year'))) # аналог WHERE в SQL для сложных условий

price,year,type
f64,i32,str
707.529256,2018,"""encyclopedia"""
9.753627,2011,"""newspaper"""
541.135894,2009,"""magazine"""
312.145612,2020,"""book"""
899.770155,2017,"""book"""
…,…,…
2178.280054,2003,"""encyclopedia"""
1528.844126,2008,"""magazine"""
813.274714,2022,"""newspaper"""
475.132386,2006,"""encyclopedia"""


In [79]:
df.select(
pl.col("year"),
pl.when(pl.col("year") > 2014)
.then(pl.lit(True))
.otherwise(pl.lit(False)) #lit нужна, чтобы отправить булевское значение
.alias("conditional"),
)

year,conditional
i32,bool
2018,true
1992,false
1990,false
1995,false
1986,false
…,…
2008,false
2022,true
1986,false
2006,false


In [86]:
df.with_columns([
pl.col("price").sum().over("type").alias("sum_price_by_type")
]).head(10)

price,year,type,sum_price_by_type
f64,f64,str,f64
707.529256,2018.0,"""encyclopedia""",10115000.0
1025.203348,1992.0,"""textbook""",10033000.0
568.548657,1990.0,"""encyclopedia""",10115000.0
895.109864,1995.0,"""newspaper""",9903500.0
206.532754,1986.0,"""book""",9954900.0
3383.637351,1995.0,"""magazine""",10169000.0
9.753627,2011.0,"""newspaper""",9903500.0
2809.215763,1994.0,"""book""",9954900.0
575.332756,1991.0,"""encyclopedia""",10115000.0
300.534013,1990.0,"""textbook""",10033000.0


In [None]:
Practice

In [80]:
data = {'building_type': ['A', 'B', 'C', 'C', 'A'],
        'sqft': [1000, 900, 400, 500, 800],
        'year': [2009, 2007, 2005, 2001, 2010]}
df = pl.DataFrame(data)
df

building_type,sqft,year
str,i64,i64
"""A""",1000,2009
"""B""",900,2007
"""C""",400,2005
"""C""",500,2001
"""A""",800,2010


In [83]:
df.group_by('building_type').agg([
    pl.col.sqft.mean().alias('mean_sqft'), 
    pl.col.year.median().alias('median_year'),
    pl.len()])

building_type,mean_sqft,median_year,len
str,f64,f64,u32
"""A""",900.0,2009.5,2
"""B""",900.0,2007.0,1
"""C""",450.0,2003.0,2


In [94]:
data = pl.read_csv('events.csv',
null_values='NaN')
display(data.head(3))
data.describe()

event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
str,str,i64,i64,str,str,f64,i64,str
"""2020-09-24 11:57:06 UTC""","""view""",1996170,2144415922528452715,"""electronics.telephone""",,31.9,1515915625519388267,"""LJuJVLEjPT"""
"""2020-09-24 11:57:26 UTC""","""view""",139905,2144415926932472027,"""computers.components.cooler""","""zalman""",17.16,1515915625519380411,"""tdicluNnRY"""
"""2020-09-24 11:57:27 UTC""","""view""",215454,2144415927158964449,,,9.81,1515915625513238515,"""4TMArHtXQy"""


statistic,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
str,str,str,f64,f64,str,str,f64,f64,str
"""count""","""885129""","""885129""",885129.0,885129.0,"""648910""","""672765""",885129.0,885129.0,"""884964"""
"""null_count""","""0""","""0""",0.0,0.0,"""236219""","""212364""",0.0,0.0,"""165"""
"""mean""",,,1906600.0,2.1444e+18,,,146.328713,1.5159e+18,
"""std""",,,1458700.0,616510000000000.0,,,296.807683,35549000.0,
"""min""","""2020-09-24 11:57:06 UTC""","""cart""",102.0,2.1444e+18,"""accessories.bag""","""a-data""",0.22,1.5159e+18,"""000AMhYaQu"""
"""25%""",,,698803.0,2.1444e+18,,,26.46,1.5159e+18,
"""50%""",,,1452883.0,2.1444e+18,,,65.71,1.5159e+18,
"""75%""",,,3721194.0,2.1444e+18,,,190.49,1.5159e+18,
"""max""","""2021-02-28 23:59:09 UTC""","""view""",4183880.0,2.2278e+18,"""stationery.stapler""","""zyxel""",64771.06,1.5159e+18,"""zzzYMiLcf7"""


In [99]:
data.with_columns(pl.col.event_time.str.to_datetime('%Y-%m-%d %H:%M:%S UTC')).describe()\
.select('event_time')



event_time
str
"""885129"""
"""0"""
"""2020-12-14 11:05:10.680594"""
""
"""2020-09-24 11:57:06"""
"""2020-11-05 20:48:22"""
"""2020-12-14 15:34:14"""
"""2021-01-23 07:16:12"""
"""2021-02-28 23:59:09"""


In [100]:
data.select(pl.col.brand.n_unique(), pl.col.brand.approx_n_unique().alias('Brand_approx'))

brand,Brand_approx
u32,u32
1000,1003


In [103]:
df  = data.with_columns(pl.col.brand.fill_null('zyxel'))

In [104]:
df.select(pl.col.brand.n_unique(), pl.col.brand.approx_n_unique().alias('Brand_approx'))

brand,Brand_approx
u32,u32
999,1002
