# DataFrame

In [30]:
import polars as pl

## Create DataFrame

Ref:

- https://docs.pola.rs/api/python/stable/reference/dataframe/index.html

In [31]:
# Column-oriented input: every key is a column name, every list holds that
# column's values. No schema is given, so the dtypes are inferred.
data = {
    "a": [1, 2],
    "b": [3, 4],
}
df = pl.DataFrame(data=data)
df

a,b
i64,i64
1,3
2,4


In [32]:
# List the dtype of each column of the frame built above.
df.dtypes

[Int64, Int64]

In [33]:
# Show the column names paired with their dtypes.
df.schema

Schema([('a', Int64), ('b', Int64)])

In [34]:
# Input shaped as {"col1": [...], "col2": [...]}.
# This is the most efficient form because polars is column oriented.
# The schema is a mapping of column name -> polars dtype.
data = {
    "col1": [0, 2],
    "col2": [3, 7],
}
df = pl.DataFrame(
    data=data,
    schema={
        "col1": pl.Float32,
        "col2": pl.Int64,
    },
)
df

col1,col2
f32,i64
0.0,3
2.0,7


In [35]:
# Same idea, but the schema is written as a list of (name, dtype) pairs
# instead of a mapping.
data = {
    "col1": [1, 2],
    "col2": [3, 4],
}
df = pl.DataFrame(
    data=data,
    schema=[
        ("col1", pl.Float32),
        ("col2", pl.Int64),
    ],
)
df

col1,col2
f32,i64
1.0,3
2.0,4


In [36]:
# Python builtin types can be used in the schema mapping as well; in the
# output below `float` shows up as f64 and `int` as i64.
data = {
    "col1": [0, 2],
    "col2": [3, 7],
}
df = pl.DataFrame(
    data=data,
    schema={
        "col1": float,
        "col2": int,
    },
)
df

col1,col2
f64,i64
0.0,3
2.0,7


In [37]:
# Python builtin types in the list-of-pairs schema form.
data = {
    "col1": [1, 2],
    "col2": [3, 4],
}
df = pl.DataFrame(
    data=data,
    schema=[
        ("col1", float),
        ("col2", int),
    ],
)
df

col1,col2
f64,i64
1.0,3
2.0,4


In [38]:
# Row-oriented input: each inner list is one record. orient="row" tells
# polars to read the lists as rows; the schema only supplies column names,
# so the dtypes are inferred.
data = [
    [1, "Alice"],
    [2, "Bob"],
]
df = pl.DataFrame(
    data=data,
    schema=["id", "name"],
    orient="row",
)
df

id,name
i64,str
1,"""Alice"""
2,"""Bob"""


In [39]:
# For row-oriented input of the form [{key: value}, ...], polars tries to
# infer each column's data type from the values.
data = [
    {"id": 1, "name": "Alice"},
    {"id": 2, "name": "Bob"},
]
df = pl.DataFrame(data)
df

id,name
i64,str
1,"""Alice"""
2,"""Bob"""


In [40]:
# For [{key: value}, ...] input, defining the schema explicitly is
# recommended rather than relying on type inference.
data = [
    {"id": 1, "name": "Alice"},
    {"id": 2, "name": "Bob"},
]
df = pl.DataFrame(
    data=data,
    schema={
        "id": int,
        "name": str,
    },
)
df

id,name
i64,str
1,"""Alice"""
2,"""Bob"""


## Handle Type Mismatch

In [41]:
# No schema is passed here, so polars has to infer the dtypes.
# "bank_account" is a str in the first two rows but an int in Cathy's row;
# in the output below the column ends up as str.
# NOTE(review): inference on mixed-type columns may vary between polars
# versions — confirm against the installed version.
data = [
    {"id": 1, "name": "Alice", "bank_account": "1111111111"},
    {"id": 2, "name": "Bob", "bank_account": "2222222222"},
    {"id": 3, "name": "Cathy", "bank_account": 3333333333},
]
df = pl.DataFrame(data)
df

id,name,bank_account
i64,str,str
1,"""Alice""","""1111111111"""
2,"""Bob""","""2222222222"""
3,"""Cathy""","""3333333333"""


In [42]:
# Same mixed-type records, this time with an explicit schema that declares
# "bank_account" as str, so the column type no longer depends on inference.
data = [
    {"id": 1, "name": "Alice", "bank_account": "1111111111"},
    {"id": 2, "name": "Bob", "bank_account": "2222222222"},
    {"id": 3, "name": "Cathy", "bank_account": 3333333333},
]
df = pl.DataFrame(
    data=data,
    schema={
        "id": int,
        "name": str,
        "bank_account": str,
    },
)
df

id,name,bank_account
i64,str,str
1,"""Alice""","""1111111111"""
2,"""Bob""","""2222222222"""
3,"""Cathy""","""3333333333"""
