## **Librerías**

In [1]:
import polars as pl

In [2]:
import datetime as dt

## **Introducción**

### **Creación de DataFrames**

In [119]:
# Build the example DataFrame: each dict key becomes a column, each list its values.
# NOTE(review): the hard-coded 'Age' values do not line up with the 'Birthdate'
# values (e.g. born 1978 is listed as 28) — confirm which one is intended.
df = pl.DataFrame(
    data={
        'Name': ['Alice', 'Bob', 'Charlie', 'Dianne'],
        'Birthdate': [
            dt.date(year=1978, month=1, day=30),
            dt.date(year=1998, month=7, day=3),
            dt.date(year=1967, month=12, day=14),
            dt.date(year=1965, month=10, day=13),
        ],
        'Gender': ['F', 'M', 'M', 'F'],
        'Weight': [57.9, 72.5, 53.6, 83.1],
        'Height': [1.65, 1.80, 1.70, 1.75],
        'Age': [28, 27, 26, 25],
    }
)

# Display the freshly built frame as the cell output.
df

Name,Birthdate,Gender,Weight,Height,Age
str,date,str,f64,f64,i64
"""Alice""",1978-01-30,"""F""",57.9,1.65,28
"""Bob""",1998-07-03,"""M""",72.5,1.8,27
"""Charlie""",1967-12-14,"""M""",53.6,1.7,26
"""Dianne""",1965-10-13,"""F""",83.1,1.75,25


In [None]:
# Show only the first record of the DataFrame.
df.head(n=1)

Name,Birthdate,Weight,Age
str,date,f64,i64
"""Alice""",1997-01-30,57.9,28


In [None]:
# Show only the last record of the DataFrame.
df.tail(n=1)

Name,Birthdate,Weight,Age
str,date,f64,i64
"""Dianne""",1994-10-13,83.1,25


In [None]:
# Show one randomly chosen record from the DataFrame.
# The fixed seed makes the sampled row reproducible, so a fresh
# "Restart Kernel -> Run All" displays the same record every time.
df.sample(n=1, seed=42)

Name,Birthdate,Weight,Age
str,date,f64,i64
"""Bob""",1998-07-03,72.5,27


In [8]:
# Inspect the dimensions of the DataFrame (its shape tuple).
df.shape

(4, 4)

In [None]:
# List the column names of our DataFrame.
df.columns

['Name', 'Birthdate', 'Weight', 'Height', 'Age']

In [13]:
# Summarise the distribution of every column: count, null count, mean, std,
# min, quartiles and max, as shown in the resulting table.
df.describe()

statistic,Name,Birthdate,Weight,Height,Age
str,str,str,f64,f64,f64
"""count""","""4""","""4""",4.0,4.0,4.0
"""null_count""","""0""","""0""",0.0,0.0,0.0
"""mean""",,"""1997-07-30 18:00:00""",66.775,1.725,26.5
"""std""",,,13.560082,0.06455,1.290994
"""min""","""Alice""","""1994-10-13""",53.6,1.65,25.0
"""25%""",,"""1997-01-30""",57.9,1.7,26.0
"""50%""",,"""1998-07-03""",72.5,1.75,27.0
"""75%""",,"""1998-07-03""",72.5,1.75,27.0
"""max""","""Dianne""","""1999-12-14""",83.1,1.8,28.0


In [14]:
# Save the DataFrame to disk in three formats (CSV, Excel and Parquet).
# All paths are relative and point into a sibling '../Data/' folder.
# NOTE(review): presumably that folder must already exist before this cell runs — confirm.
df.write_csv('../Data/Dataframe.csv')
df.write_excel('../Data/Dataframe.xlsx')
df.write_parquet('../Data/Dataframe.parquet')

In [15]:
# Load the DataFrame back from the Parquet file saved by the previous cell.
# This rebinds `df` to the frame read from disk.
df = pl.read_parquet(source='../Data/Dataframe.parquet')
df

Name,Birthdate,Weight,Height,Age
str,date,f64,f64,i64
"""Alice""",1997-01-30,57.9,1.65,28
"""Bob""",1998-07-03,72.5,1.8,27
"""Charlie""",1999-12-14,53.6,1.7,26
"""Dianne""",1994-10-13,83.1,1.75,25


### **Expresiones**

#### **select**

In [61]:
# `select` builds a new DataFrame out of expressions evaluated against `df`;
# only the expressions listed here end up as columns of the result.
# NOTE(review): '_cm' is appended to Weight and Height without converting them;
# given the BMI formula below the values look like kg and metres — confirm the suffix.
result = df.select(
    'Name',  # a plain column name selects that column unchanged
    pl.col('Birthdate').alias('Birth_Date'),  # rename a column
    pl.col('Birthdate').dt.year().alias('Birth_Year'),  # transform: extract the year
    pl.col(['Weight', 'Height']).name.suffix('_cm'),  # add a suffix to several columns
    (pl.col('Weight') / pl.col('Height').pow(2)).alias('BMI'),  # arithmetic across columns
)

result

Name,Birth_Date,Birth_Year,Weight_cm,Height_cm,BMI
str,date,i32,f64,f64,f64
"""Alice""",1997-01-30,1997,57.9,1.65,21.267218
"""Bob""",1998-07-03,1998,72.5,1.8,22.376543
"""Charlie""",1999-12-14,1999,53.6,1.7,18.546713
"""Dianne""",1994-10-13,1994,83.1,1.75,27.134694


#### **with_columns**

In [44]:
# `with_columns` keeps every existing column of `df` and appends the new ones;
# the outcome is stored in `result`, `df` itself is not reassigned here.
result = df.with_columns(
    pl.col('Birthdate').dt.year().alias('birth_year'),
    (pl.col('Weight') / pl.col('Height').pow(2)).alias('bmi'),
)

result

Name,Birthdate,Weight,Height,Age,birth_year,bmi
str,date,f64,f64,i64,i32,f64
"""Alice""",1997-01-30,57.9,1.65,28,1997,21.267218
"""Bob""",1998-07-03,72.5,1.8,27,1998,22.376543
"""Charlie""",1999-12-14,53.6,1.7,26,1999,18.546713
"""Dianne""",1994-10-13,83.1,1.75,25,1994,27.134694


#### **filter**

In [82]:
# Keep only the rows with Weight above 60 AND Height below 1.8,
# combining both conditions into a single predicate with `&`.
result = df.filter(
    pl.col('Weight').gt(60) & pl.col('Height').lt(1.8)
)

result

Name,Birthdate,Weight,Height,Age
str,date,f64,f64,i64
"""Dianne""",1994-10-13,83.1,1.75,25


In [86]:
# Same filter as the previous cell, but each condition is passed as its own
# predicate; the output matches the `&` version, so all of them must hold.
result = df.filter(
    pl.col('Weight').gt(60),
    pl.col('Height').lt(1.8),
)

result

Name,Birthdate,Weight,Height,Age
str,date,f64,f64,i64
"""Dianne""",1994-10-13,83.1,1.75,25


#### **group_by**

In [121]:
# Add a 'Decade' column: the birth year floored to a multiple of ten
# (integer-divide by 10, then multiply back), e.g. 1978 -> 1970.
result = df.with_columns(
    (pl.col('Birthdate').dt.year() // 10 * 10).alias('Decade')
)

In [123]:
# Count the rows in each (Decade, Gender) group.
# maintain_order=True keeps the groups in the order they first appear.
result.group_by('Decade', 'Gender', maintain_order=True).len()

Decade,Gender,len
i32,str,u32
1970,"""F""",1
1990,"""M""",1
1960,"""M""",1
1960,"""F""",1


In [139]:
# Per-decade summary: number of rows plus the mean and the maximum of
# Weight and Height, with 'Avg_' / 'Max_' prefixes on the output column names.
result.group_by('Decade', maintain_order=True).agg(
    pl.len().alias('Count'),
    pl.col(['Weight', 'Height']).mean().name.prefix('Avg_'),
    pl.col(['Weight', 'Height']).max().name.prefix('Max_'),
)

Decade,Count,Avg_Weight,Avg_Height,Max_Weight,Max_Height
i32,u32,f64,f64,f64,f64
1970,1,57.9,1.65,57.9,1.65
1990,1,72.5,1.8,72.5,1.8
1960,2,68.35,1.725,83.1,1.75
