## PACKAGE POLARS

https://pola-rs.github.io/polars-book/user-guide/quickstart/quick-exploration-guide.html

Date : 07-03-23

In [2]:
# One-time setup: uncomment the next line to install polars into the kernel's environment.
# %pip install polars

In [2]:
from datetime import datetime, timedelta
from pathlib import Path

import numpy as np
import polars as pl

ModuleNotFoundError: No module named 'polars'

### <font color='yellow'>Création d'objets</font>

#### En partant de zéro

##### D'une Series

In [23]:
# Build a Series named "a" from a Python list (the values are a list, not a tuple).
series = pl.Series("a", [1, 2, 3, 4, 5])
series

a
i64
1
2
3
4
5


In [21]:
# Build a Series from a list without giving it a name.
# This rebinds `series`, replacing the named Series created in the previous cell.
series = pl.Series([1, 2, 3, 4, 5])
series

1
2
3
4
5


##### D'une DF

In [22]:
# Build a 3-row DataFrame with one integer, one datetime and one float column.
# The dates are 1, 2 and 3 January 2022 at midnight.
dataframe = pl.DataFrame(
    {
        "integer": [1, 2, 3],
        "date": [datetime(2022, 1, day) for day in (1, 2, 3)],
        "float": [4.0, 5.0, 6.0],
    }
)
dataframe

integer,date,float
i64,datetime[μs],f64
1,2022-01-01 00:00:00,4.0
2,2022-01-02 00:00:00,5.0
3,2022-01-03 00:00:00,6.0


#### À partir de fichiers

##### CSV

In [25]:
# Write the DataFrame to disk as a CSV file.
# The target folder is created first so this cell also works on a fresh checkout,
# where `data/` does not exist yet (exist_ok=True makes re-running the cell harmless).
Path('data').mkdir(parents=True, exist_ok=True)
dataframe.write_csv('data/output.csv')

In [26]:
# Read the CSV file back into a DataFrame.
# Without date parsing, the 'date' column is loaded as plain strings (dtype str in the output).
df_csv = pl.read_csv('data/output.csv')
df_csv

integer,date,float
i64,str,f64
1,"""2022-01-01T00:...",4.0
2,"""2022-01-02T00:...",5.0
3,"""2022-01-03T00:...",6.0


In [28]:
# Re-read the CSV and let polars convert date-like columns automatically:
# the 'date' column now comes back as datetime[μs] instead of str.
# NOTE(review): newer polars releases name this option `try_parse_dates` — confirm against the installed version.
df_csv_with_dates = pl.read_csv('data/output.csv', parse_dates=True)

print(df_csv_with_dates)

shape: (3, 3)
┌─────────┬─────────────────────┬───────┐
│ integer ┆ date                ┆ float │
│ ---     ┆ ---                 ┆ ---   │
│ i64     ┆ datetime[μs]        ┆ f64   │
╞═════════╪═════════════════════╪═══════╡
│ 1       ┆ 2022-01-01 00:00:00 ┆ 4.0   │
│ 2       ┆ 2022-01-02 00:00:00 ┆ 5.0   │
│ 3       ┆ 2022-01-03 00:00:00 ┆ 6.0   │
└─────────┴─────────────────────┴───────┘


##### JSON

In [29]:
# Write the DataFrame to disk as a JSON file.
dataframe.write_json('data/output.json')

In [31]:
# Read the JSON file back into a DataFrame.
# The printed schema shows that the 'date' column is a datetime again after the round trip.
df_json = pl.read_json('data/output.json')

print(df_json)

shape: (3, 3)
┌─────────┬─────────────────────┬───────┐
│ integer ┆ date                ┆ float │
│ ---     ┆ ---                 ┆ ---   │
│ i64     ┆ datetime[μs]        ┆ f64   │
╞═════════╪═════════════════════╪═══════╡
│ 1       ┆ 2022-01-01 00:00:00 ┆ 4.0   │
│ 2       ┆ 2022-01-02 00:00:00 ┆ 5.0   │
│ 3       ┆ 2022-01-03 00:00:00 ┆ 6.0   │
└─────────┴─────────────────────┴───────┘


##### Parquet
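Un fichier Parquet joue le même rôle qu'un fichier .csv (stocker des données tabulaires), mais dans un format binaire, en colonnes et compressé : il prend beaucoup moins de place et conserve le type des colonnes. À l'avenir, les fichiers .csv devraient donc être de plus en plus remplacés par ce type de fichier.
Le concepteur de Pandas a, entre autres, participé à la création de ce format.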

In [32]:
# Write the DataFrame to disk as a Parquet file.
dataframe.write_parquet('data/output.parquet')

In [33]:
# Read the Parquet file back into a DataFrame.
# The printed schema shows the column dtypes (i64, datetime[μs], f64) are the same as in the original frame.
df_parquet = pl.read_parquet('data/output.parquet')
print(df_parquet)

shape: (3, 3)
┌─────────┬─────────────────────┬───────┐
│ integer ┆ date                ┆ float │
│ ---     ┆ ---                 ┆ ---   │
│ i64     ┆ datetime[μs]        ┆ f64   │
╞═════════╪═════════════════════╪═══════╡
│ 1       ┆ 2022-01-01 00:00:00 ┆ 4.0   │
│ 2       ┆ 2022-01-02 00:00:00 ┆ 5.0   │
│ 3       ┆ 2022-01-03 00:00:00 ┆ 6.0   │
└─────────┴─────────────────────┴───────┘


### <font color='yellow'>Affichage des données</font>

In [34]:
# New 8-row DataFrame used by the following cells:
#   a: the integers 0..7
#   b: 8 random floats (no seed is set in this cell, so the values differ between runs)
#   c: 8 consecutive days starting on 2022-12-01
#   d: floats mixing NaN (not-a-number) and None (missing value, shown as null)
# np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed.
df = pl.DataFrame({"a": np.arange(0, 8),
                   "b": np.random.rand(8),
                   "c": [datetime(2022, 12, 1) + timedelta(days=idx) for idx in range(8)],
                   "d": [1, 2.0, np.nan, np.nan, 0, -5, -42, None]
                  })

print(df)

shape: (8, 4)
┌─────┬──────────┬─────────────────────┬───────┐
│ a   ┆ b        ┆ c                   ┆ d     │
│ --- ┆ ---      ┆ ---                 ┆ ---   │
│ i32 ┆ f64      ┆ datetime[μs]        ┆ f64   │
╞═════╪══════════╪═════════════════════╪═══════╡
│ 0   ┆ 0.054095 ┆ 2022-12-01 00:00:00 ┆ 1.0   │
│ 1   ┆ 0.58278  ┆ 2022-12-02 00:00:00 ┆ 2.0   │
│ 2   ┆ 0.967882 ┆ 2022-12-03 00:00:00 ┆ NaN   │
│ 3   ┆ 0.853834 ┆ 2022-12-04 00:00:00 ┆ NaN   │
│ 4   ┆ 0.9598   ┆ 2022-12-05 00:00:00 ┆ 0.0   │
│ 5   ┆ 0.270633 ┆ 2022-12-06 00:00:00 ┆ -5.0  │
│ 6   ┆ 0.499073 ┆ 2022-12-07 00:00:00 ┆ -42.0 │
│ 7   ┆ 0.239617 ┆ 2022-12-08 00:00:00 ┆ null  │
└─────┴──────────┴─────────────────────┴───────┘


In [37]:
# Show the first rows of the DataFrame (5 rows are displayed here).
df.head()

a,b,c,d
i32,f64,datetime[μs],f64
0,0.054095,2022-12-01 00:00:00,1.0
1,0.58278,2022-12-02 00:00:00,2.0
2,0.967882,2022-12-03 00:00:00,
3,0.853834,2022-12-04 00:00:00,
4,0.9598,2022-12-05 00:00:00,0.0


In [36]:
# Show the last rows of the DataFrame (5 rows are displayed here).
df.tail()

a,b,c,d
i32,f64,datetime[μs],f64
3,0.853834,2022-12-04 00:00:00,
4,0.9598,2022-12-05 00:00:00,0.0
5,0.270633,2022-12-06 00:00:00,-5.0
6,0.499073,2022-12-07 00:00:00,-42.0
7,0.239617,2022-12-08 00:00:00,


In [38]:
# Show 3 rows picked at random; the rows returned change from one run to the next.
df.sample(n=3)

a,b,c,d
i32,f64,datetime[μs],f64
4,0.9598,2022-12-05 00:00:00,0.0
1,0.58278,2022-12-02 00:00:00,2.0
3,0.853834,2022-12-04 00:00:00,


In [39]:
# Summary statistics for every column
# (count, null_count, mean, std, min, max and median, as listed in the output).
df.describe()

describe,a,b,c,d
str,f64,f64,str,f64
"""count""",8.0,8.0,"""8""",8.0
"""null_count""",0.0,0.0,"""0""",1.0
"""mean""",3.5,0.553464,,
"""std""",2.44949,0.350256,,
"""min""",0.0,0.054095,"""2022-12-01 00:...",-42.0
"""max""",7.0,0.967882,"""2022-12-08 00:...",2.0
"""median""",3.5,0.540927,,1.0


### <font color='yellow'>Expressions</font>

#### select()

In [40]:
# Select every column: '*' is a wildcard that matches all column names.
df.select(
    pl.col('*'))

a,b,c,d
i32,f64,datetime[μs],f64
0,0.054095,2022-12-01 00:00:00,1.0
1,0.58278,2022-12-02 00:00:00,2.0
2,0.967882,2022-12-03 00:00:00,
3,0.853834,2022-12-04 00:00:00,
4,0.9598,2022-12-05 00:00:00,0.0
5,0.270633,2022-12-06 00:00:00,-5.0
6,0.499073,2022-12-07 00:00:00,-42.0
7,0.239617,2022-12-08 00:00:00,


In [41]:
# Select only some columns by passing a list of column names to pl.col.
df.select(
    pl.col(['a', 'b'])
)

a,b
i32,f64
0,0.054095
1,0.58278
2,0.967882
3,0.853834
4,0.9598
5,0.270633
6,0.499073
7,0.239617


In [42]:
# Select columns 'a' and 'b' (one expression per column), then keep only the first 3 rows.
df.select([
    pl.col('a'),
    pl.col('b')
]).limit(3)

a,b
i32,f64
0,0.054095
1,0.58278
2,0.967882


In [43]:
# Select every column except 'a'.
df.select([
    pl.exclude('a')
])

b,c,d
f64,datetime[μs],f64
0.054095,2022-12-01 00:00:00,1.0
0.58278,2022-12-02 00:00:00,2.0
0.967882,2022-12-03 00:00:00,
0.853834,2022-12-04 00:00:00,
0.9598,2022-12-05 00:00:00,0.0
0.270633,2022-12-06 00:00:00,-5.0
0.499073,2022-12-07 00:00:00,-42.0
0.239617,2022-12-08 00:00:00,


#### filter()

In [44]:
# Keep the rows whose 'c' timestamp lies between 2 and 8 December 2022.
# The output shows that rows equal to either bound are kept.
window_start = datetime(2022, 12, 2)
window_end = datetime(2022, 12, 8)
df.filter(pl.col("c").is_between(window_start, window_end))

a,b,c,d
i32,f64,datetime[μs],f64
1,0.58278,2022-12-02 00:00:00,2.0
2,0.967882,2022-12-03 00:00:00,
3,0.853834,2022-12-04 00:00:00,
4,0.9598,2022-12-05 00:00:00,0.0
5,0.270633,2022-12-06 00:00:00,-5.0
6,0.499073,2022-12-07 00:00:00,-42.0
7,0.239617,2022-12-08 00:00:00,


In [45]:
# Keep the rows where 'a' is at most 3 and 'd' is not NaN.
# is_not_nan() tests for the floating-point NaN value; the frame's printed output
# shows NaN and null (missing) as two different things in column 'd'.
a_at_most_three = pl.col('a') <= 3
d_is_not_nan = pl.col('d').is_not_nan()
df.filter(a_at_most_three & d_is_not_nan)

a,b,c,d
i32,f64,datetime[μs],f64
0,0.054095,2022-12-01 00:00:00,1.0
1,0.58278,2022-12-02 00:00:00,2.0


#### with_columns()

In [46]:
# Add two columns to the frame:
#   'e'    -> the sum of column 'b'; the single total is repeated on every row (see output)
#   'b+42' -> column 'b' with 42 added to each value
sum_of_b = pl.col('b').sum().alias('e')
b_shifted = (pl.col('b') + 42).alias('b+42')
df.with_columns([sum_of_b, b_shifted])


a,b,c,d,e,b+42
i32,f64,datetime[μs],f64,f64,f64
0,0.054095,2022-12-01 00:00:00,1.0,4.427713,42.054095
1,0.58278,2022-12-02 00:00:00,2.0,4.427713,42.58278
2,0.967882,2022-12-03 00:00:00,,4.427713,42.967882
3,0.853834,2022-12-04 00:00:00,,4.427713,42.853834
4,0.9598,2022-12-05 00:00:00,0.0,4.427713,42.9598
5,0.270633,2022-12-06 00:00:00,-5.0,4.427713,42.270633
6,0.499073,2022-12-07 00:00:00,-42.0,4.427713,42.499073
7,0.239617,2022-12-08 00:00:00,,4.427713,42.239617


#### groupby()

In [47]:
# Second example frame for the grouping cells:
# 'x' holds the integers 0..7 and 'y' holds one group label per row.
x_values = np.arange(0, 8)
y_labels = ['A', 'A', 'A', 'B', 'B', 'C', 'X', 'X']
df2 = pl.DataFrame({"x": x_values, "y": y_labels})

print(df2)


shape: (8, 2)
┌─────┬─────┐
│ x   ┆ y   │
│ --- ┆ --- │
│ i32 ┆ str │
╞═════╪═════╡
│ 0   ┆ A   │
│ 1   ┆ A   │
│ 2   ┆ A   │
│ 3   ┆ B   │
│ 4   ┆ B   │
│ 5   ┆ C   │
│ 6   ┆ X   │
│ 7   ┆ X   │
└─────┴─────┘


In [48]:
# Count the number of rows for each distinct value of column 'y'.
# Per the polars docs, maintain_order=True keeps the groups in the order they appear in the input;
# it does not sort them (the labels here simply happen to be in alphabetical order already).
# NOTE(review): newer polars releases spell this method `group_by` — confirm against the installed version.
df2.groupby("y", maintain_order=True).count()

y,count
str,u32
"""A""",3
"""B""",2
"""C""",1
"""X""",2


In [49]:
# For each value of 'y', compute two aggregates over the remaining column(s):
# the number of rows ("count") and the total ("sum"). Groups keep their input order.
# NOTE(review): newer polars releases spell this method `group_by` — confirm against the installed version.
per_group_aggregations = [
    pl.col("*").count().alias("count"),
    pl.col("*").sum().alias("sum"),
]
df2.groupby("y", maintain_order=True).agg(per_group_aggregations)


y,count,sum
str,u32,i32
"""A""",3,3
"""B""",2,7
"""C""",1,5
"""X""",2,13


### <font color='yellow'>Combinaison de DF</font>

#### join() : fusion par assemblage de colonnes

In [50]:
# Rebuild the two example DataFrames used by the join / concat cells.
# Column 'b' is drawn again at random, so its values differ from the ones shown earlier.
# np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed.
df = pl.DataFrame({"a": np.arange(0, 8),
                   "b": np.random.rand(8),
                   "c": [datetime(2022, 12, 1) + timedelta(days=idx) for idx in range(8)],
                   "d": [1, 2.0, np.nan, np.nan, 0, -5, -42, None]
                  })

# 'x' takes the same values as df['a'], so it can serve as the join key.
df2 = pl.DataFrame({
                    "x": np.arange(0, 8),
                    "y": ['A', 'A', 'A', 'B', 'B', 'C', 'X', 'X'],
})


In [51]:
# Join the two frames on a shared key: df['a'] is matched with df2['x'].
# The output keeps 'a' as the key column and brings in 'y' from df2.
df.join(df2, left_on="a", right_on="x")

a,b,c,d,y
i32,f64,datetime[μs],f64,str
0,0.897464,2022-12-01 00:00:00,1.0,"""A"""
1,0.065847,2022-12-02 00:00:00,2.0,"""A"""
2,0.99112,2022-12-03 00:00:00,,"""A"""
3,0.586549,2022-12-04 00:00:00,,"""B"""
4,0.594266,2022-12-05 00:00:00,0.0,"""B"""
5,0.115763,2022-12-06 00:00:00,-5.0,"""C"""
6,0.504195,2022-12-07 00:00:00,-42.0,"""X"""
7,0.515351,2022-12-08 00:00:00,,"""X"""


#### concat() : concaténation par ajout de lignes

In [52]:
# Situation exceptionnelle ! 
# Concaténation par colonne c'est fou 😮
pl.concat([df,df2], how="horizontal")

a,b,c,d,x,y
i32,f64,datetime[μs],f64,i32,str
0,0.897464,2022-12-01 00:00:00,1.0,0,"""A"""
1,0.065847,2022-12-02 00:00:00,2.0,1,"""A"""
2,0.99112,2022-12-03 00:00:00,,2,"""A"""
3,0.586549,2022-12-04 00:00:00,,3,"""B"""
4,0.594266,2022-12-05 00:00:00,0.0,4,"""B"""
5,0.115763,2022-12-06 00:00:00,-5.0,5,"""C"""
6,0.504195,2022-12-07 00:00:00,-42.0,6,"""X"""
7,0.515351,2022-12-08 00:00:00,,7,"""X"""
