## Selection and assignment

In [1]:
import pandas as pd
import numpy as np
import pyarrow as pa

## Basic selection of pd.Series

In [2]:
# Build a nine-element Series by repeating the letters a, b, c three times.
ser = pd.Series(["a", "b", "c"] * 3)
ser

0    a
1    b
2    c
3    a
4    b
5    c
6    a
7    b
8    c
dtype: object

In [10]:
# A scalar key returns the element itself; a list key returns a Series,
# even when the list holds a single entry.
print(ser[3])
print(ser[[3]])
print(ser[[0,2]])
# Slices behave like Python list slices: the stop is exclusive, a negative
# start counts from the end, and an optional third value is the step.
print(ser[:3])
print(ser[-4:])
print(ser[2:6])
print(ser[1:8:3])

a
3    a
dtype: object
0    a
2    c
dtype: object
0    a
1    b
2    c
dtype: object
5    c
6    a
7    b
8    c
dtype: object
2    c
3    a
4    b
5    c
dtype: object
1    b
4    b
7    b
dtype: object


In [11]:
# Series of 0, 1, 2 labelled by name instead of by the default integer index.
ser = pd.Series({"Jack": 0, "Jill": 1, "Jayne": 2})
ser

Jack     0
Jill     1
Jayne    2
dtype: int64

In [13]:
# A scalar label returns the stored value; a one-element list of labels
# returns a one-row Series that keeps the label.
print(ser['Jack'])
print(ser[['Jill']])

0
Jill    1
dtype: int64


In [15]:
# With a non-default integer index, ser[2] is matched against the *label* 2
# (value "a" here), not against the third position.
ser = pd.Series(list("abc"), index=[2, 42, 21])
print(ser)
print(ser[2])

2     a
42    b
21    c
dtype: object
a


In [16]:
# Index labels do not have to be unique: label 1 is used twice on purpose.
ser = pd.Series(
    data=["apple", "banana", "orange"],
    index=[0, 1, 1],
)
ser

0     apple
1    banana
1    orange
dtype: object

In [17]:
# Label 1 occurs twice in the index, so this lookup returns a Series with
# both matching rows rather than a single value.
ser[1]

1    banana
1    orange
dtype: object

## Selection of pd.DataFrame

In [18]:
# 3x3 frame holding 0..8, filled row by row, with columns a, b, c.
df = pd.DataFrame(np.arange(9).reshape(3, 3), columns=list("abc"))
df

Unnamed: 0,a,b,c
0,0,1,2
1,3,4,5
2,6,7,8


In [22]:
# A scalar column label returns a Series; a one-element list returns a
# one-column DataFrame.
print(df["a"])
print(df[["a"]])
print("-----------")
# A list of labels selects several columns.
print(df[["a", "b"]])
print("-----------")
# A slice inside [] selects *rows* (here the first two), not columns.
print(df[0:2])
print("-----------")
# Columns come back in the order the list names them.
print(df[["b", "a"]])

0    0
1    3
2    6
Name: a, dtype: int64
   a
0  0
1  3
2  6
-----------
   a  b
0  0  1
1  3  4
2  6  7
-----------
   a  b  c
0  0  1  2
1  3  4  5
-----------
   b  a
0  1  0
1  4  3
2  7  6


## Position-based selection of pd.Series

In [23]:
# Same fruit Series as before; the repeated label 1 makes label lookups
# ambiguous, which is where position-based access helps.
ser = pd.Series(
    data=["apple", "banana", "orange"],
    index=[0, 1, 1],
)
ser

0     apple
1    banana
1    orange
dtype: object

In [29]:
# .iloc selects by position and ignores the labels, so position 1 is the
# single value "banana" even though label 1 is duplicated in the index.
print(ser.iloc[1])
# A list of positions keeps the result as a Series.
print(ser.iloc[[1]])
print("-----------")
# Position slices exclude the stop, like Python list slices.
print(ser.iloc[0:2]) 
print(ser.iloc[[0,2]])
print("-----------")
print(ser.iloc[:2])

banana
1    banana
dtype: object
-----------
0     apple
1    banana
dtype: object
0     apple
1    orange
dtype: object
-----------
0     apple
1    banana
dtype: object


## Position-based selection of pd.DataFrame

In [33]:
# 5x4 frame holding 0..19, filled row by row, with columns a through d.
df = pd.DataFrame(np.arange(20).reshape(5, 4), columns=["a", "b", "c", "d"])
df

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [41]:
# One position selects a row, returned as a Series; a (row, column) pair of
# positions returns a single value.
print(df.iloc[1])
print(df.iloc[2,2])
print("-----------")
# ":" means "everything along this axis": first column as a Series...
print(df.iloc[:,0])
print("-----------")
# ...and first row as a Series.
print(df.iloc[0,:])
print("-----------")
# Wrapping the position in a list keeps the result as a DataFrame.
print(df.iloc[:,[0]])
print("-----------")
print(df.iloc[[0], :])
print("-----------")
# Negative positions count from the end; columns come back in the listed order.
print(df.iloc[[0, 1], [-1, -2]])

a    4
b    5
c    6
d    7
Name: 1, dtype: int64
10
-----------
0     0
1     4
2     8
3    12
4    16
Name: a, dtype: int64
-----------
a    0
b    1
c    2
d    3
Name: 0, dtype: int64
-----------
    a
0   0
1   4
2   8
3  12
4  16
-----------
   a  b  c  d
0  0  1  2  3
-----------
   d  c
0  3  2
1  7  6


## Label-based selection of pd.Series

In [42]:
# Fruit Series with the duplicated label 1, used to show how .loc treats
# repeated labels.
ser = pd.Series(
    data=["apple", "banana", "orange"],
    index=[0, 1, 1],
)
ser

0     apple
1    banana
1    orange
dtype: object

In [43]:
# .loc matches labels, so the duplicated label 1 returns both rows.
print(ser.loc[1])

1    banana
1    orange
dtype: object


In [44]:
# Named Series mapping each animal to its number of legs.
ser = pd.Series({"dog": 2, "cat": 2, "human": 4}, name="num_legs")
ser

dog      2
cat      2
human    4
Name: num_legs, dtype: int64

In [45]:
# A scalar label returns the value; a list of labels returns a Series that
# keeps the Series name.
print(ser.loc['dog'])
print(ser.loc[['cat','human']])

2
cat      2
human    4
Name: num_legs, dtype: int64


In [46]:
# Keep the plain list around so list slicing can be compared with Series
# slicing in the next cell.
values = ["Jack", "Jill", "Jayne"]
ser = pd.Series(values)
ser

0     Jack
1     Jill
2    Jayne
dtype: object

In [48]:
# Position slicing with .iloc returns the same first two elements as slicing
# the original Python list; the stop position is excluded in both.
print(values[:2])
print(ser.iloc[:2])


['Jack', 'Jill']
0    Jack
1    Jill
dtype: object


In [49]:
# A label slice with .loc *includes* the stop label: both rows labelled 2 are
# returned, and the slice ends there, so the trailing row labelled 0 is left out.
repeats_2 = pd.Series(range(5), index=[0, 1, 2, 2, 0])
repeats_2.loc[:2]

0    0
1    1
2    2
2    3
dtype: int64

In [50]:
# The same inclusive behaviour with string labels: every "xxx" row is
# returned and "yyy" is excluded.
ser = pd.Series(range(4), index=["zzz", "xxx", "xxx", "yyy"])
ser.loc[:"xxx"]

zzz    0
xxx    1
xxx    2
dtype: int64

## Label-based selection of pd.DataFrame

In [54]:
# One row per person, labelled by first name; one column per attribute.
df = pd.DataFrame(
    {
        "age": [24, 42, 22],
        "height_cm": [180, 166, 160],
        "eye_color": ["blue", "brown", "green"],
    },
    index=["Jack", "Jill", "Jayne"],
)
df

Unnamed: 0,age,height_cm,eye_color
Jack,24,180,blue
Jill,42,166,brown
Jayne,22,160,green


In [56]:
# (row label, column label) returns a single value.
print(df.loc["Jayne", "eye_color"])
# ":" selects everything on that axis: one column, then one row, each as a Series.
print(df.loc[:, "age"])
print(df.loc["Jack", :])
# A list of labels keeps the result as a DataFrame.
print(df.loc[:, ["age"]])
print(df.loc[["Jack", "Jill"], ["age", "eye_color"]])

green
Jack     24
Jill     42
Jayne    22
Name: age, dtype: int64
age            24
height_cm     180
eye_color    blue
Name: Jack, dtype: object
       age
Jack    24
Jill    42
Jayne   22
      age eye_color
Jack   24      blue
Jill   42     brown


## Mixing position-based and label-based selection of DataFrame

In [57]:
# Same people data, but with the default integer row index, so rows are
# addressed by position and columns by label.
df = pd.DataFrame(
    {
        "age": [24, 42, 22],
        "height_cm": [180, 166, 160],
        "eye_color": ["blue", "brown", "green"],
    }
)

df

Unnamed: 0,age,height_cm,eye_color
0,24,180,blue
1,42,166,brown
2,22,160,green


In [59]:
# Translate column labels into integer positions ("age" -> 0, "eye_color" -> 2)
# so they can be used with .iloc.
col_idxer = df.columns.get_indexer(["age", "eye_color"])
col_idxer

array([0, 2])

In [61]:
# Option 1: a single .iloc call using the positions computed from the labels.
print(df.iloc[[0, 1], col_idxer])
print("--------------")
# Option 2: select the columns by label first, then the rows by position.
# Both produce the same rows and columns.
df[["age", "eye_color"]].iloc[[0, 1]]

   age eye_color
0   24      blue
1   42     brown
--------------


Unnamed: 0,age,eye_color
0,24,blue
1,42,brown


In [65]:
import timeit

def get_indexer_approach():
    """Look up the column positions by label, then select rows and columns in one .iloc call."""
    positions = df.columns.get_indexer(["age", "eye_color"])
    df.iloc[[0, 1], positions]

# Total time in seconds for 10,000 calls.
timeit.timeit(get_indexer_approach, number=10_000)

6.730973799989442

In [67]:
def two_step_approach():
    """Select the columns by label first, then the first two rows by position."""
    # A named def instead of a lambda bound to a name: it mirrors
    # get_indexer_approach and carries its own name and docstring.
    return df[["age", "eye_color"]].iloc[[0, 1]]

# Total time in seconds for 10,000 calls.
timeit.timeit(two_step_approach, number=10_000)

7.41815339999448

## pd.DataFrame.filter

In [68]:
# People frame labelled by first name, used to demonstrate DataFrame.filter.
df = pd.DataFrame(
    {
        "age": [24, 42, 22],
        "height_cm": [180, 166, 160],
        "eye_color": ["blue", "brown", "green"],
    },
    index=["Jack", "Jill", "Jayne"],
)
df

Unnamed: 0,age,height_cm,eye_color
Jack,24,180,blue
Jill,42,166,brown
Jayne,22,160,green


In [71]:
# With no axis given, a list of labels filters the columns.
print(df.filter(["age", "eye_color"]))
print("--------------")
# axis=0 applies the same list to the row labels.
print(df.filter(["Jack", "Jill"], axis=0))
print("--------------")
# like= keeps labels containing the substring (here the columns with "_").
print(df.filter(like="_"))
print("--------------")
# regex= keeps row labels that start with "Ja" and do not end in "e":
# "Jack" matches, "Jayne" does not.
print(df.filter(regex=r"^Ja.*(?<!e)$", axis=0))

       age eye_color
Jack    24      blue
Jill    42     brown
Jayne   22     green
--------------
      age  height_cm eye_color
Jack   24        180      blue
Jill   42        166     brown
--------------
       height_cm eye_color
Jack         180      blue
Jill         166     brown
Jayne        160     green
--------------
      age  height_cm eye_color
Jack   24        180      blue


## Selection by data type (pd.DataFrame)

In [72]:
# One integer column, one float column and one column of digit strings.
df = pd.DataFrame(
    {
        "int_col": [0, 4],
        "float_col": [1.0, 8.0],
        "string_col": ["2", "16"],
    }
)
df

Unnamed: 0,int_col,float_col,string_col
0,0,1.0,2
1,4,8.0,16


In [76]:
# Keep only the columns whose dtype matches the given name.
# NOTE(review): whether the "int" alias matches an int64 column may depend on
# the platform / NumPy version - confirm; in the recorded output it matches.
print(df.select_dtypes("int"))
print("--------------")
# include= takes several dtypes at once...
print(df.select_dtypes(include=["int", "float"]))
print("--------------")
# ...and exclude= returns the remaining columns (here only the string column).
print(df.select_dtypes(exclude=["int", "float"]))

   int_col
0        0
1        4
--------------
   int_col  float_col
0        0        1.0
1        4        8.0
--------------
  string_col
0          2
1         16


#### Selection / filtering via Boolean arrays (Series/DataFrame)

In [77]:
# Three-element Series plus a boolean mask of the same length; True marks
# the rows to keep.
ser = pd.Series([0, 1, 2])
mask = [True, False, True]
ser

0    0
1    1
2    2
dtype: int64

In [79]:
# A boolean mask can be applied through plain [] or through .loc; both keep
# only the rows whose mask entry is True, so the two printouts are identical.
# (The second statement was a verbatim repeat of the first, which compared nothing.)
print(ser[mask])
print("--------------")
print(ser.loc[mask])

0    0
2    2
dtype: int64
--------------
0    0
2    2
dtype: int64


In [80]:
# Applied to a DataFrame with [], the boolean mask filters rows:
# rows 0 and 2 are kept.
df = pd.DataFrame(np.arange(6).reshape(3, -1))
df[mask]

Unnamed: 0,0,1
0,0,1
2,4,5


In [81]:
# .loc accepts one mask per axis: `mask` filters the rows and `col_mask`
# keeps only the first column.
col_mask = [True, False]
df.loc[mask, col_mask]

Unnamed: 0,0
0,0
2,4


In [82]:
# People frame labelled by first name, used to build boolean masks from
# column comparisons.
df = pd.DataFrame(
    {
        "age": [24, 42, 22],
        "height_cm": [180, 166, 160],
        "eye_color": ["blue", "brown", "green"],
    },
    index=["Jack", "Jill", "Jayne"],
)
df

Unnamed: 0,age,height_cm,eye_color
Jack,24,180,blue
Jill,42,166,brown
Jayne,22,160,green


In [83]:
# Comparing a column with a scalar gives a boolean Series, one entry per row.
blue_eyes = df["eye_color"] == "blue"
blue_eyes

Jack      True
Jill     False
Jayne    False
Name: eye_color, dtype: bool

In [84]:
# Second condition, built the same way: True only for rows with green eyes.
green_eyes = df["eye_color"] == "green"
green_eyes

Jack     False
Jill     False
Jayne     True
Name: eye_color, dtype: bool

In [85]:
# "|" combines the two masks element-wise: True where either condition holds.
mask = blue_eyes | green_eyes
mask

Jack      True
Jill     False
Jayne     True
Name: eye_color, dtype: bool

In [86]:
# Keep only the rows where the combined mask is True (Jack and Jayne).
df[mask]

Unnamed: 0,age,height_cm,eye_color
Jack,24,180,blue
Jayne,22,160,green


In [87]:
# Numeric comparisons produce boolean masks in the same way.
age_lt_40 = df["age"] < 40
age_lt_40

Jack      True
Jill     False
Jayne     True
Name: age, dtype: bool

In [88]:
# True only for rows whose height is above 170 cm.
height_gt_170 = df["height_cm"] > 170
height_gt_170

Jack      True
Jill     False
Jayne    False
Name: height_cm, dtype: bool

In [91]:
# "&" requires both conditions to hold; only Jack qualifies.
print(df[age_lt_40 & height_gt_170])
print("-------------- --------------")
# "~" inverts the combined mask, returning every other row.
print(df[~(age_lt_40 & height_gt_170)])

      age  height_cm eye_color
Jack   24        180      blue
-------------- --------------
       age  height_cm eye_color
Jill    42        166     brown
Jayne   22        160     green


### Selection with a pd.MultiIndex - single level

In [92]:
# Two-level index (first name, last name), built from one array per level.
index = pd.MultiIndex.from_arrays(
    [
        ["John", "John", "Jane", "Stephen"],
        ["Smith", "Doe", "Doe", "Smith"],
    ],
    names=["first_name", "last_name"],
)
ser = pd.Series(range(4), index=index)
ser

first_name  last_name
John        Smith        0
            Doe          1
Jane        Doe          2
Stephen     Smith        3
dtype: int64

In [95]:
# A scalar key on the first level drops that level: the result is indexed by
# last_name only.
print(ser.loc["John"])
print("--------------")
# A list key keeps both index levels in the result.
print(ser.loc[["John"]])

last_name
Smith    0
Doe      1
dtype: int64
--------------
first_name  last_name
John        Smith        0
            Doe          1
dtype: int64


### Selection with a pd.MultiIndex - multiple levels

In [96]:
# Same two-level (first name, last name) Series, rebuilt for the
# multi-level selection examples.
index = pd.MultiIndex.from_arrays(
    [
        ["John", "John", "Jane", "Stephen"],
        ["Smith", "Doe", "Doe", "Smith"],
    ],
    names=["first_name", "last_name"],
)
ser = pd.Series(range(4), index=index)
ser

first_name  last_name
John        Smith        0
            Doe          1
Jane        Doe          2
Stephen     Smith        3
dtype: int64

In [101]:
# A full (first, last) tuple addresses one row and returns the scalar.
print(ser.loc[("Jane", "Doe")])
print("--------------")
# Wrapping a level's key in a list keeps the result as a Series with both levels.
print(ser.loc[(["Jane"], "Doe")])
print("--------------")
# Lists on both levels: only combinations present in the index are returned
# (there is no Jane/Smith row).
print(ser.loc[(["Jane", "John"], ["Doe", "Smith"])])
print("--------------")
# slice(None) means "everything" on the first level; the scalar "Doe" drops
# the last_name level from the result...
print(ser.loc[(slice(None), "Doe")])
print("--------------")
# ...while the list ["Doe"] keeps it.
print(ser.loc[(slice(None), ["Doe"])])

2
--------------
first_name  last_name
Jane        Doe          2
dtype: int64
--------------
first_name  last_name
Jane        Doe          2
John        Doe          1
            Smith        0
dtype: int64
--------------
first_name
John    1
Jane    2
dtype: int64
--------------
first_name  last_name
John        Doe          1
Jane        Doe          2
dtype: int64


In [102]:
# A bare ":" slice on a plain list returns every element.
alist = ["a", "b", "c"]
alist[:]

['a', 'b', 'c']

In [105]:
# slice(None) is the explicit spelling of ":" - it also returns the whole list.
print(alist[slice(None)])
# pd.IndexSlice allows the ":" syntax inside .loc in place of slice(None); the
# result matches ser.loc[(slice(None), ["Doe"])] from the earlier cell.
ixsl = pd.IndexSlice
print(ser.loc[ixsl[:, ["Doe"]]])

['a', 'b', 'c']
first_name  last_name
John        Doe          1
Jane        Doe          2
dtype: int64


### Selection with a pd.MultiIndex - pd.DataFrame

In [106]:
# Rows are indexed by (first_name, last_name).
row_index = pd.MultiIndex.from_arrays(
    [
        ["John", "John", "Jane", "Stephen"],
        ["Smith", "Doe", "Doe", "Smith"],
    ],
    names=["first_name", "last_name"],
)
# Columns are indexed by (art_type, category).
col_index = pd.MultiIndex.from_arrays(
    [
        ["music", "music", "art"],
        ["favorite", "last_seen_live", "favorite"],
    ],
    names=["art_type", "category"],
)
# One row of answers per person, in the same order as row_index.
records = [
    ["Swift", "Swift", "Matisse"],
    ["Mozart", "T. Swift", "Van Gogh"],
    ["Beatles", "Wonder", "Warhol"],
    ["Jackson", "Dylan", "Picasso"],
]
df = pd.DataFrame(records, index=row_index, columns=col_index)
df

Unnamed: 0_level_0,art_type,music,music,art
Unnamed: 0_level_1,category,favorite,last_seen_live,favorite
first_name,last_name,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
John,Smith,Swift,Swift,Matisse
John,Doe,Mozart,T. Swift,Van Gogh
Jane,Doe,Beatles,Wonder,Warhol
Stephen,Smith,Jackson,Dylan,Picasso


In [107]:
# One indexer tuple per axis: any first_name with last_name "Smith" for the
# rows, any art_type with category "favorite" for the columns.
row_idxer = (slice(None), "Smith")
col_idxer = (slice(None), "favorite")
df.loc[row_idxer, col_idxer]

Unnamed: 0_level_0,art_type,music,art
Unnamed: 0_level_1,category,favorite,favorite
first_name,last_name,Unnamed: 2_level_2,Unnamed: 3_level_2
John,Smith,Swift,Matisse
Stephen,Smith,Jackson,Picasso


In [108]:
# The same selection with both indexer tuples written inline.
print(df.loc[(slice(None), "Smith"), (slice(None), "favorite")])

art_type                music      art
category             favorite favorite
first_name last_name                  
John       Smith        Swift  Matisse
Stephen    Smith      Jackson  Picasso


### Item Assignment with .loc and .iloc

In [109]:
# Small letter-labelled Series to assign into.
ser = pd.Series({"a": 0, "b": 1, "c": 2})

In [110]:
# Assign by label: the element labelled "b" is overwritten in place.
ser.loc["b"] = 42
ser

a     0
b    42
c     2
dtype: int64

In [111]:
# Assign by position: the third element (label "c") is overwritten in place.
ser.iloc[2] = -42
ser

a     0
b    42
c   -42
dtype: int64

In [112]:
# Single-column frame that the following cells extend with new columns.
df = pd.DataFrame([1, 2, 3], columns=["col1"])
df

Unnamed: 0,col1
0,1
1,2
2,3


In [113]:
# Assigning a scalar to a new column label creates the column and repeats the
# value in every row.
df["new_column1"] = 42
df

Unnamed: 0,col1,new_column1
0,1,42
1,2,42
2,3,42


In [114]:
# Assigning a list with one element per row fills the new column in row order.
df["new_column2"] = list("abc")
df

Unnamed: 0,col1,new_column1,new_column2
0,1,42,a
1,2,42,b
2,3,42,c


In [115]:
# Assigning a Series: the frame and the new Series both use the default
# 0..2 index here, so the values land row for row.
df["new_column3"] = pd.Series(["dog", "cat", "human"])
df

Unnamed: 0,col1,new_column1,new_column2,new_column3
0,1,42,a,dog
1,2,42,b,cat
2,3,42,c,human
