# Exercices sur polars

Exercices copiés de pandas (voir dossier pandas_tuto - cours PandasAXXX)

Date : 13/02/25 <br>
Editeur : Laurent Reynaud

In [15]:
import polars as pl
import numpy as np

## DataFrame basics

### A few of the fundamental routines for selecting, sorting, adding and aggregating data in DataFrames

Difficulty: *easy*

consider the following Python dictionary `data` and Python list `labels`:

``` python
data = {'animal': ['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog', 'dog'],
        'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3],
        'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
        'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no']}
```
(This is just some meaningless data I made up with the theme of animals and trips to a vet.)

**4.** Create a DataFrame `df` from this dictionary `data`.

In [16]:
data = {'animal': ['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog', 'dog'],
        'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3],
        'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
        'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no']}
df = pl.DataFrame(data=data)
df

animal,age,visits,priority
str,f64,i64,str
"""cat""",2.5,1,"""yes"""
"""cat""",3.0,3,"""yes"""
"""snake""",0.5,2,"""no"""
"""dog""",,3,"""yes"""
"""dog""",5.0,2,"""no"""
"""cat""",2.0,3,"""no"""
"""snake""",4.5,1,"""no"""
"""cat""",,1,"""yes"""
"""dog""",7.0,2,"""no"""
"""dog""",3.0,1,"""no"""


**5.** Display a summary of the basic information about this DataFrame and its data (*hint: there is a single method that can be called on the DataFrame*).

In [17]:
df.describe()

statistic,animal,age,visits,priority
str,str,f64,f64,str
"""count""","""10""",10.0,10.0,"""10"""
"""null_count""","""0""",0.0,0.0,"""0"""
"""mean""",,,1.9,
"""std""",,,0.875595,
"""min""","""cat""",0.5,1.0,"""no"""
"""25%""",,2.5,1.0,
"""50%""",,4.5,2.0,
"""75%""",,7.0,3.0,
"""max""","""snake""",7.0,3.0,"""yes"""


**6.** Return the first 3 rows of the DataFrame `df`.

In [18]:
df.head(3)

animal,age,visits,priority
str,f64,i64,str
"""cat""",2.5,1,"""yes"""
"""cat""",3.0,3,"""yes"""
"""snake""",0.5,2,"""no"""


**7.** Select just the 'animal' and 'age' columns from the DataFrame `df`.

In [19]:
df.select('animal', 'age')

animal,age
str,f64
"""cat""",2.5
"""cat""",3.0
"""snake""",0.5
"""dog""",
"""dog""",5.0
"""cat""",2.0
"""snake""",4.5
"""cat""",
"""dog""",7.0
"""dog""",3.0


**8.** Select the data in rows `['cat']` with columns 'animal' *and* in columns `['animal', 'age']`.

In [20]:
df.filter(pl.col('animal').str.contains('cat')).select('animal', 'age')

animal,age
str,f64
"""cat""",2.5
"""cat""",3.0
"""cat""",2.0
"""cat""",


**9.** Select only the rows where the number of visits is greater than 2.

In [21]:
df.filter(pl.col('visits') > 2)

animal,age,visits,priority
str,f64,i64,str
"""cat""",3.0,3,"""yes"""
"""dog""",,3,"""yes"""
"""cat""",2.0,3,"""no"""


**10.** Select the rows where the age is missing, i.e. it is `NaN`.

In [22]:
df.filter(pl.col('age').is_nan())

animal,age,visits,priority
str,f64,i64,str
"""dog""",,3,"""yes"""
"""cat""",,1,"""yes"""


**11.** Select the rows where the animal is a cat *and* the age is less than 3.

In [23]:
df.filter((pl.col('animal').str.contains('cat')) 
          & (pl.col('age') < 3))

animal,age,visits,priority
str,f64,i64,str
"""cat""",2.5,1,"""yes"""
"""cat""",2.0,3,"""no"""


**12.** Select the rows the age is between 2 and 4 (inclusive).

In [24]:
df.filter((pl.col('age') >= 2) & (pl.col('age') <= 4))

animal,age,visits,priority
str,f64,i64,str
"""cat""",2.5,1,"""yes"""
"""cat""",3.0,3,"""yes"""
"""cat""",2.0,3,"""no"""
"""dog""",3.0,1,"""no"""


**14.** Calculate the sum of all visits in `df` (i.e. find the total number of visits).

In [25]:
df['visits'].sum()

19

**15.** Calculate the mean age for each different animal in `df`.

In [26]:
df.group_by('animal').agg(pl.col('age').mean())

animal,age
str,f64
"""snake""",2.5
"""cat""",
"""dog""",


**16.** Append a new row to `df` with your choice of values for each column. Then delete that row to return the original DataFrame.


In [27]:
# Append a new row
new_row = {'animal': 'rabbit', 'age': 2.0, 'visits': 1, 'priority': 'no'}
df = df.vstack(pl.DataFrame([new_row]))
df.tail()

animal,age,visits,priority
str,f64,i64,str
"""snake""",4.5,1,"""no"""
"""cat""",,1,"""yes"""
"""dog""",7.0,2,"""no"""
"""dog""",3.0,1,"""no"""
"""rabbit""",2.0,1,"""no"""


In [28]:
# Delete the new row to return to the original DataFrame
df = df.head(-1)
df.tail()

animal,age,visits,priority
str,f64,i64,str
"""cat""",2.0,3,"""no"""
"""snake""",4.5,1,"""no"""
"""cat""",,1,"""yes"""
"""dog""",7.0,2,"""no"""
"""dog""",3.0,1,"""no"""


**17.** Count the number of each type of animal in `df`.

In [29]:
df['animal'].value_counts()

animal,count
str,u32
"""dog""",4
"""cat""",4
"""snake""",2


**18.** Sort `df` first by the values in the 'age' in *decending* order, then by the value in the 'visits' column in *ascending* order

In [30]:
df.sort(['age', 'visits'], descending=[True, False])

animal,age,visits,priority
str,f64,i64,str
"""cat""",,1,"""yes"""
"""dog""",,3,"""yes"""
"""dog""",7.0,2,"""no"""
"""dog""",5.0,2,"""no"""
"""snake""",4.5,1,"""no"""
"""dog""",3.0,1,"""no"""
"""cat""",3.0,3,"""yes"""
"""cat""",2.5,1,"""yes"""
"""cat""",2.0,3,"""no"""
"""snake""",0.5,2,"""no"""


**19.** The 'priority' column contains the values 'yes' and 'no'. Replace this column with a column of boolean values: 'yes' should be `True` and 'no' should be `False`.

In [31]:
df.with_columns(
    pl.when(pl.col('priority') == 'yes')
    .then(True)
    .otherwise(False)
    .alias('priority'),
    )

animal,age,visits,priority
str,f64,i64,bool
"""cat""",2.5,1,True
"""cat""",3.0,3,True
"""snake""",0.5,2,False
"""dog""",,3,True
"""dog""",5.0,2,False
"""cat""",2.0,3,False
"""snake""",4.5,1,False
"""cat""",,1,True
"""dog""",7.0,2,False
"""dog""",3.0,1,False


**20.** In the 'animal' column, change the 'snake' entries to 'python'.

In [32]:
df.with_columns(
    pl.col('animal').replace("snake", "python").alias('animal')
)

animal,age,visits,priority
str,f64,i64,str
"""cat""",2.5,1,"""yes"""
"""cat""",3.0,3,"""yes"""
"""python""",0.5,2,"""no"""
"""dog""",,3,"""yes"""
"""dog""",5.0,2,"""no"""
"""cat""",2.0,3,"""no"""
"""python""",4.5,1,"""no"""
"""cat""",,1,"""yes"""
"""dog""",7.0,2,"""no"""
"""dog""",3.0,1,"""no"""


**21.** For each animal type and each number of visits, find the mean age. In other words, each row is an animal, each column is a number of visits and the values are the mean ages.

In [33]:
df.pivot(
    values='age', index='animal', columns='visits', 
    aggregate_function='mean')

  df.pivot(


animal,1,3,2
str,f64,f64,f64
"""cat""",,2.5,
"""snake""",4.5,,0.5
"""dog""",3.0,,6.0
