In [2]:
import polars as pl
import faker

# NOTE: Faker is not seeded here, so the generated names and phone
# numbers can differ on every fresh run of this cell.
fake = faker.Faker()
n_row = 50

# One entry per column; ids run from 1 to n_row inclusive.
data = dict(
    id=[*range(1, n_row + 1)],
    name=[fake.name() for _ in range(n_row)],
    phone=[fake.phone_number() for _ in range(n_row)],
)
df = pl.DataFrame(data)
df

id,name,phone
i64,str,str
1,"""Maria Powell""","""836.602.8286x06763"""
2,"""Jacqueline Fletcher PhD""","""+1-801-639-6835"""
3,"""Mary Wilson""","""(999)905-1935"""
4,"""Kenneth Davis""","""332-275-7129x0289"""
5,"""Christopher Martin""","""(792)727-8878x82819"""
…,…,…
46,"""William Keller""","""(695)744-1587x0883"""
47,"""Vickie Perez""","""+1-297-963-9194x6132"""
48,"""William Howard""","""630.391.0772x26252"""
49,"""Laura Nash""","""(495)680-1361x6763"""


## Select by Rows

### Take First N

In [3]:
# First five rows, in their existing order
df.head(n=5)

id,name,phone
i64,str,str
1,"""Maria Powell""","""836.602.8286x06763"""
2,"""Jacqueline Fletcher PhD""","""+1-801-639-6835"""
3,"""Mary Wilson""","""(999)905-1935"""
4,"""Kenneth Davis""","""332-275-7129x0289"""
5,"""Christopher Martin""","""(792)727-8878x82819"""


### Take Last N

In [4]:
# Last five rows, in their existing order
df.tail(n=5)

id,name,phone
i64,str,str
46,"""William Keller""","""(695)744-1587x0883"""
47,"""Vickie Perez""","""+1-297-963-9194x6132"""
48,"""William Howard""","""630.391.0772x26252"""
49,"""Laura Nash""","""(495)680-1361x6763"""
50,"""Donald Johnson""","""(282)973-8109x024"""


### Select By Index Range

Note: [Polars does not have a multi-index/index](https://docs.pola.rs/user-guide/migration/pandas/#polars-does-not-have-a-multi-indexindex), so rows are selected by their integer position.

In [5]:
# Method 1: skip the first 5 rows, then take the following 10
df.slice(5, 10)

id,name,phone
i64,str,str
6,"""Patricia Brown""","""363-719-4855x7138"""
7,"""Richard Hodges""","""(831)255-4407x13496"""
8,"""Steve Green""","""671-700-3796x127"""
9,"""Luis Smith""","""844.513.7054x4915"""
10,"""Morgan Hensley""","""001-741-524-3690x958"""
11,"""Tanya Peck""","""819-449-4406"""
12,"""Mr. Joseph Parrish""","""409-954-9495x85424"""
13,"""Juan Frazier""","""282.948.5514x337"""
14,"""David Murray""","""902-780-8331x8359"""
15,"""Ryan Campbell""","""7713098760"""


In [6]:
# Method 2: turn a 1-based, inclusive row range into offset/length
start_idx, end_idx = 6, 15
df.slice(start_idx - 1, end_idx - start_idx + 1)

id,name,phone
i64,str,str
6,"""Patricia Brown""","""363-719-4855x7138"""
7,"""Richard Hodges""","""(831)255-4407x13496"""
8,"""Steve Green""","""671-700-3796x127"""
9,"""Luis Smith""","""844.513.7054x4915"""
10,"""Morgan Hensley""","""001-741-524-3690x958"""
11,"""Tanya Peck""","""819-449-4406"""
12,"""Mr. Joseph Parrish""","""409-954-9495x85424"""
13,"""Juan Frazier""","""282.948.5514x337"""
14,"""David Murray""","""902-780-8331x8359"""
15,"""Ryan Campbell""","""7713098760"""


### Select One Row By Index

In [7]:
# row() gives one row as a plain tuple; df[5] would return a dataframe instead
df.row(index=5)

(6, 'Patricia Brown', '363-719-4855x7138')

In [8]:
# named=True returns the row as a dict keyed by column name
df.row(index=5, named=True)

{'id': 6, 'name': 'Patricia Brown', 'phone': '363-719-4855x7138'}

### Select Rows by Multiple Indices

In [9]:
# Rows at 0-based positions 1, 3 and 5, keeping every column
df[[1, 3, 5], :]

id,name,phone
i64,str,str
2,"""Jacqueline Fletcher PhD""","""+1-801-639-6835"""
4,"""Kenneth Davis""","""332-275-7129x0289"""
6,"""Patricia Brown""","""363-719-4855x7138"""


### Randomly Sample By Rows

In [10]:
# Five rows drawn at random; no seed is passed, so the rows picked can differ between runs
df.sample(5)

id,name,phone
i64,str,str
24,"""Erica Keller""","""864.464.0633"""
46,"""William Keller""","""(695)744-1587x0883"""
18,"""James Miller""","""369.515.1819x923"""
21,"""Daniel Walker MD""","""+1-832-758-6101x9324"""
12,"""Mr. Joseph Parrish""","""409-954-9495x85424"""


In [11]:
# fraction is relative to the number of rows: 0.1 of the 50 rows gives 5 rows
df.sample(fraction=0.1)

id,name,phone
i64,str,str
30,"""Sarah Pittman""","""6854545835"""
11,"""Tanya Peck""","""819-449-4406"""
8,"""Steve Green""","""671-700-3796x127"""
17,"""Andrew Lee""","""754-587-2094"""
40,"""Francisco Gonzalez""","""001-427-821-4543x436"""


In [12]:
# with_replacement=True allows the same row to be drawn more than once,
# so the sampled frame may contain duplicate rows
df_res = df.sample(fraction=0.5, with_replacement=True)
df_res

id,name,phone
i64,str,str
14,"""David Murray""","""902-780-8331x8359"""
12,"""Mr. Joseph Parrish""","""409-954-9495x85424"""
24,"""Erica Keller""","""864.464.0633"""
20,"""Lindsay Meza""","""466-888-0910x674"""
35,"""Patricia Simmons""","""(816)705-8827x9632"""
…,…,…
24,"""Erica Keller""","""864.464.0633"""
24,"""Erica Keller""","""864.464.0633"""
10,"""Morgan Hensley""","""001-741-524-3690x958"""
42,"""Tina Allen""","""669.671.7571"""


In [13]:
# Number of distinct rows in the sample. Because rows were drawn with
# replacement, most likely it won't be n_row / 2
df_res.n_unique()

16

## Select By Columns

### Select One Column

In [14]:
# Pick a single column by name, using an explicit column expression
df.select(pl.col("name"))

name
str
"""Maria Powell"""
"""Jacqueline Fletcher PhD"""
"""Mary Wilson"""
"""Kenneth Davis"""
"""Christopher Martin"""
…
"""William Keller"""
"""Vickie Perez"""
"""William Howard"""
"""Laura Nash"""


### Select Multiple Columns

In [15]:
# Column names can also be passed as separate positional arguments
df.select("id", "name")

id,name
i64,str
1,"""Maria Powell"""
2,"""Jacqueline Fletcher PhD"""
3,"""Mary Wilson"""
4,"""Kenneth Davis"""
5,"""Christopher Martin"""
…,…
46,"""William Keller"""
47,"""Vickie Perez"""
48,"""William Howard"""
49,"""Laura Nash"""


In [16]:
# Column names in frame order; slice the list to keep the first two
columns = df.columns
df.select(columns[:2])

id,name
i64,str
1,"""Maria Powell"""
2,"""Jacqueline Fletcher PhD"""
3,"""Mary Wilson"""
4,"""Kenneth Davis"""
5,"""Christopher Martin"""
…,…
46,"""William Keller"""
47,"""Vickie Perez"""
48,"""William Howard"""
49,"""Laura Nash"""


In [17]:
# Negative slicing on the ``columns`` name list keeps the last two columns
df.select(columns[-2:])

name,phone
str,str
"""Maria Powell""","""836.602.8286x06763"""
"""Jacqueline Fletcher PhD""","""+1-801-639-6835"""
"""Mary Wilson""","""(999)905-1935"""
"""Kenneth Davis""","""332-275-7129x0289"""
"""Christopher Martin""","""(792)727-8878x82819"""
…,…
"""William Keller""","""(695)744-1587x0883"""
"""Vickie Perez""","""+1-297-963-9194x6132"""
"""William Howard""","""630.391.0772x26252"""
"""Laura Nash""","""(495)680-1361x6763"""


## Select By Both Rows and Columns

In [18]:
# Row positions first, column names second, in a single indexing step
df[[1, 3, 5], ["id", "name"]]

id,name
i64,str
2,"""Jacqueline Fletcher PhD"""
4,"""Kenneth Davis"""
6,"""Patricia Brown"""


### Select a Specific Cell

In [19]:
# Single cell: row position 0, column addressed by name
df.item(0, "id")

1

In [20]:
# Single cell: row position 0, column addressed by position
df.item(0, 0)

1

### Iterate Over Columns

In [21]:
# Each column is yielded as a polars Series
for series in df.iter_columns():
    print(
        f"--- {series.name = }",
        f"{type(series) = }",
        f"{series = }",
        sep="\n",
    )

--- series.name = 'id'
type(series) = <class 'polars.series.series.Series'>
series = shape: (50,)
Series: 'id' [i64]
[
	1
	2
	3
	4
	5
	…
	46
	47
	48
	49
	50
]
--- series.name = 'name'
type(series) = <class 'polars.series.series.Series'>
series = shape: (50,)
Series: 'name' [str]
[
	"Maria Powell"
	"Jacqueline Fletcher PhD"
	"Mary Wilson"
	"Kenneth Davis"
	"Christopher Martin"
	…
	"William Keller"
	"Vickie Perez"
	"William Howard"
	"Laura Nash"
	"Donald Johnson"
]
--- series.name = 'phone'
type(series) = <class 'polars.series.series.Series'>
series = shape: (50,)
Series: 'phone' [str]
[
	"836.602.8286x06763"
	"+1-801-639-6835"
	"(999)905-1935"
	"332-275-7129x0289"
	"(792)727-8878x82819"
	…
	"(695)744-1587x0883"
	"+1-297-963-9194x6132"
	"630.391.0772x26252"
	"(495)680-1361x6763"
	"(282)973-8109x024"
]


### Iterate Over Rows


In [22]:
# Each row is yielded as a plain tuple of Python values
for row in df.iter_rows():
    print(
        f"--- {row[0] = }",
        f"{type(row) = }",
        f"{row = }",
        sep="\n",
    )

--- row[0] = 1
type(row) = <class 'tuple'>
row = (1, 'Maria Powell', '836.602.8286x06763')
--- row[0] = 2
type(row) = <class 'tuple'>
row = (2, 'Jacqueline Fletcher PhD', '+1-801-639-6835')
--- row[0] = 3
type(row) = <class 'tuple'>
row = (3, 'Mary Wilson', '(999)905-1935')
--- row[0] = 4
type(row) = <class 'tuple'>
row = (4, 'Kenneth Davis', '332-275-7129x0289')
--- row[0] = 5
type(row) = <class 'tuple'>
row = (5, 'Christopher Martin', '(792)727-8878x82819')
--- row[0] = 6
type(row) = <class 'tuple'>
row = (6, 'Patricia Brown', '363-719-4855x7138')
--- row[0] = 7
type(row) = <class 'tuple'>
row = (7, 'Richard Hodges', '(831)255-4407x13496')
--- row[0] = 8
type(row) = <class 'tuple'>
row = (8, 'Steve Green', '671-700-3796x127')
--- row[0] = 9
type(row) = <class 'tuple'>
row = (9, 'Luis Smith', '844.513.7054x4915')
--- row[0] = 10
type(row) = <class 'tuple'>
row = (10, 'Morgan Hensley', '001-741-524-3690x958')
--- row[0] = 11
type(row) = <class 'tuple'>
row = (11, 'Tanya Peck', '819-449

### Iterate Over Slices

Iterate over sub-DataFrames, each containing fewer rows than the original.

In [23]:
# When the total number of rows is a multiple of ``n_rows``
for ith_df, sub_df in enumerate(df.iter_slices(n_rows=n_row // 5), 1):
    print(f"--- {ith_df = }", sub_df, sep="\n")

--- ith_df = 1
shape: (10, 3)
┌─────┬─────────────────────────┬──────────────────────┐
│ id  ┆ name                    ┆ phone                │
│ --- ┆ ---                     ┆ ---                  │
│ i64 ┆ str                     ┆ str                  │
╞═════╪═════════════════════════╪══════════════════════╡
│ 1   ┆ Maria Powell            ┆ 836.602.8286x06763   │
│ 2   ┆ Jacqueline Fletcher PhD ┆ +1-801-639-6835      │
│ 3   ┆ Mary Wilson             ┆ (999)905-1935        │
│ 4   ┆ Kenneth Davis           ┆ 332-275-7129x0289    │
│ 5   ┆ Christopher Martin      ┆ (792)727-8878x82819  │
│ 6   ┆ Patricia Brown          ┆ 363-719-4855x7138    │
│ 7   ┆ Richard Hodges          ┆ (831)255-4407x13496  │
│ 8   ┆ Steve Green             ┆ 671-700-3796x127     │
│ 9   ┆ Luis Smith              ┆ 844.513.7054x4915    │
│ 10  ┆ Morgan Hensley          ┆ 001-741-524-3690x958 │
└─────┴─────────────────────────┴──────────────────────┘
--- ith_df = 2
shape: (10, 3)
┌─────┬────────────────────┬

In [24]:
# When the total number of rows is NOT a multiple of ``n_rows``,
# it's ok that the last sub dataframe doesn't have enough rows
for ith_df, sub_df in enumerate(df.iter_slices(n_rows=13), 1):
    print(f"--- {ith_df = }", sub_df, sep="\n")

--- ith_df = 1
shape: (13, 3)
┌─────┬─────────────────────────┬──────────────────────┐
│ id  ┆ name                    ┆ phone                │
│ --- ┆ ---                     ┆ ---                  │
│ i64 ┆ str                     ┆ str                  │
╞═════╪═════════════════════════╪══════════════════════╡
│ 1   ┆ Maria Powell            ┆ 836.602.8286x06763   │
│ 2   ┆ Jacqueline Fletcher PhD ┆ +1-801-639-6835      │
│ 3   ┆ Mary Wilson             ┆ (999)905-1935        │
│ 4   ┆ Kenneth Davis           ┆ 332-275-7129x0289    │
│ 5   ┆ Christopher Martin      ┆ (792)727-8878x82819  │
│ …   ┆ …                       ┆ …                    │
│ 9   ┆ Luis Smith              ┆ 844.513.7054x4915    │
│ 10  ┆ Morgan Hensley          ┆ 001-741-524-3690x958 │
│ 11  ┆ Tanya Peck              ┆ 819-449-4406         │
│ 12  ┆ Mr. Joseph Parrish      ┆ 409-954-9495x85424   │
│ 13  ┆ Juan Frazier            ┆ 282.948.5514x337     │
└─────┴─────────────────────────┴──────────────────────┘
-

## Concatenate

### Concatenate Vertically (More rows, same columns)

``DataFrame.extend`` will edit the first DataFrame **in-place**!

In [25]:
# Two single-column frames that share the same column
df1 = pl.DataFrame({"id": [1, 2, 3]})
df2 = pl.DataFrame({"id": [4, 5, 6]})
# extend() appends df2's rows onto df1 itself; the combined frame is displayed
df1.extend(df2)

id
i64
1
2
3
4
5
6


In [26]:
# df1 now holds all six rows, because extend() modified it in place
df1

id
i64
1
2
3
4
5
6


``DataFrame.vstack`` **DOES NOT** edit the first DataFrame **in-place**!

In [27]:
# Recreate both frames so df1 starts from its original three rows again
df1 = pl.DataFrame({"id": [1, 2, 3]})
df2 = pl.DataFrame({"id": [4, 5, 6]})
# vstack() returns the stacked frame and leaves df1 as it was
df1.vstack(df2)

id
i64
1
2
3
4
5
6


In [28]:
# df1 still has only its original three rows after vstack()
df1

id
i64
1
2
3


In [30]:
# It won't work because the columns are different:
# df1 has one column while df2 has two.
df1 = pl.DataFrame({"id": [1, 2, 3]})
df2 = pl.DataFrame({"id": [4, 5, 6], "name": ["d", "e", "f"]})
try:
    df1.vstack(df2)
# Catch only the error being demonstrated, so that any unrelated failure
# still surfaces instead of being printed as if it were expected.
except pl.exceptions.ShapeError as e:
    print(repr(e))

ShapeError('unable to append to a DataFrame of width 1 with a DataFrame of width 2')


### Concatenate Horizontally (More columns, same rows)

``DataFrame.hstack`` **DOES NOT** edit the first DataFrame **in-place**!

In [31]:
# Two frames with the same number of rows but different columns
df1 = pl.DataFrame({"id": [1, 2, 3]})
df2 = pl.DataFrame({"name": ["a", "b", "c"]})
# hstack() returns a frame with the columns of both and leaves df1 as it was
df1.hstack(df2)

id,name
i64,str
1,"""a"""
2,"""b"""
3,"""c"""


In [32]:
# df1 still has only its original "id" column after hstack()
df1

id
i64
1
2
3


In [33]:
# It won't work because the rows are different:
# df1 has three rows while df2 has five.
df1 = pl.DataFrame({"id": [1, 2, 3]})
df2 = pl.DataFrame({"name": ["a", "b", "c", "d", "e"]})
try:
    df1.hstack(df2)
# Catch only the error being demonstrated, so that any unrelated failure
# still surfaces instead of being printed as if it were expected.
except pl.exceptions.ShapeError as e:
    print(repr(e))

ShapeError('could not create a new DataFrame: series "id" has length 3 while series "name" has length 5')
