## Data Types

In [1]:
import pandas as pd
import numpy as np
import pyarrow as pa

#### Integral types

In [8]:
print(pd.Series(range(3), dtype=pd.Int64Dtype()))
print("--------------")
print(pd.Series(range(3), dtype=pd.Int8Dtype()))
print("--------------")
print(pd.Series([1, pd.NA, 2], dtype=pd.Int64Dtype()))
print("--------------")
print(pd.Series([1, None, 2], dtype=pd.Int64Dtype()))
print("--------------")
print(pd.Series(range(555, 558), dtype=pd.Int16Dtype()))
print("--------------")
print(pd.Series(range(3), dtype=pd.UInt8Dtype()))


0    0
1    1
2    2
dtype: Int64
--------------
0    0
1    1
2    2
dtype: Int8
--------------
0       1
1    <NA>
2       2
dtype: Int64
--------------
0       1
1    <NA>
2       2
dtype: Int64
--------------
0    555
1    556
2    557
dtype: Int16
--------------
0    0
1    1
2    2
dtype: UInt8


### Floating point types

In [12]:
print(pd.Series([3.14, .333333333, -123.456], dtype=pd.Float64Dtype()))
print("--------------")
print(pd.Series([3.14, None, pd.NA], dtype=pd.Float64Dtype()))
print("--------------")
ser1 = pd.Series([1_000_000.123], dtype=pd.Float32Dtype())
ser2 = pd.Series([1_000_000.124], dtype=pd.Float32Dtype())
print(ser1.eq(ser2))

0        3.14
1    0.333333
2    -123.456
dtype: Float64
--------------
0    3.14
1    <NA>
2    <NA>
dtype: Float64
--------------
0    True
dtype: boolean


### Boolean types

In [13]:
pd.Series([True, False, True], dtype=pd.BooleanDtype())

0     True
1    False
2     True
dtype: boolean

In [14]:
pd.Series([1, 0, 1], dtype=pd.BooleanDtype())

0     True
1    False
2     True
dtype: boolean

### String types

In [20]:
print(pd.Series(["foo", "bar", "baz"], dtype=pd.StringDtype()))
print("--------------")
ser = pd.Series(["xx", "YyY", "zZzZ"], dtype=pd.StringDtype())
print(ser.str.len())
print("--------------")
print(ser.str.upper(),"\n",ser.str.lower(),"\n", ser.str.title())

0    foo
1    bar
2    baz
dtype: string
--------------
0    2
1    3
2    4
dtype: Int64
--------------
0      XX
1     YYY
2    ZZZZ
dtype: string 
 0      xx
1     yyy
2    zzzz
dtype: string 
 0      Xx
1     Yyy
2    Zzzz
dtype: string


In [22]:
ser = pd.Series(["foo", "bar", "baz"], dtype=pd.StringDtype())
print(ser.str.contains("o"))
print("--------------")
ser.str.contains(r"^ba[rz]$", case=False, regex=True)

0     True
1    False
2    False
dtype: boolean
--------------


0    False
1     True
2     True
dtype: boolean

### Missing value handling

In [23]:
ser = pd.Series(range(3))
ser

0    0
1    1
2    2
dtype: int64

In [26]:
ser.iloc[1] = None
print(ser)
print("--------------")
print(pd.isna(pd.Series([1, np.nan, 2])))
print("--------------")
print(pd.isna(pd.Series([1, pd.NA, 2], dtype=pd.Int64Dtype())))

0    0.0
1    NaN
2    2.0
dtype: float64
--------------
0    False
1     True
2    False
dtype: bool
--------------
0    False
1     True
2    False
dtype: bool


In [27]:
ser = pd.Series(range(3), dtype=pd.Int64Dtype())
mask = pd.Series([True, pd.NA, False], dtype=pd.BooleanDtype())
ser[mask]

0    0
dtype: Int64

### Categorical types

In [29]:
values = ["foo", "bar", "baz"]
values_ser = pd.Series(values, dtype=pd.StringDtype())
ser = values_ser.astype(pd.CategoricalDtype())
ser

0    foo
1    bar
2    baz
dtype: category
Categories (3, string): [bar, baz, foo]

In [30]:
cat = pd.CategoricalDtype(values_ser)
ser = pd.Series(values, dtype=cat)
ser

0    foo
1    bar
2    baz
dtype: category
Categories (3, string): [foo, bar, baz]

In [31]:
ser.iloc[2] = "foo"
ser

0    foo
1    bar
2    foo
dtype: category
Categories (3, string): [foo, bar, baz]

In [32]:
shirt_sizes = pd.Series(["S", "M", "L", "XL"], dtype=pd.StringDtype())
cat = pd.CategoricalDtype(shirt_sizes, ordered=True)
ser = pd.Series(["XL", "L", "S", "L", "S", "M"], dtype=cat)
ser < "L"

0    False
1    False
2     True
3    False
4     True
5     True
dtype: bool

In [33]:
accepted_values = pd.Series(["foo", "bar"], dtype=pd.StringDtype())
cat = pd.CategoricalDtype(accepted_values)
ser = pd.Series(["foo", "bar", "foo"], dtype=cat)
ser

0    foo
1    bar
2    foo
dtype: category
Categories (2, string): [foo, bar]

In [35]:
print(ser.cat.codes)
print("--------------")
print(ser.cat.categories)

0    0
1    1
2    0
dtype: int8
--------------
Index(['foo', 'bar'], dtype='string')


In [37]:
print(pd.Series(["foo", "bar", "baz"] * 100, dtype=pd.StringDtype()).memory_usage())
print("--------------")
print(pd.Series(["foo", "bar", "baz"] * 100, dtype=cat).memory_usage())
print("--------------")
print(pd.Series(["foo", "bar", pd.NA], dtype=pd.CategoricalDtype()))

2532
--------------
556
--------------
0    foo
1    bar
2    NaN
dtype: category
Categories (2, object): ['bar', 'foo']


### Temporal types - datetime

In [38]:
ser = pd.Series([
    "2024-01-01 00:00:00",
    "2024-01-02 00:00:01",
    "2024-01-03 00:00:02"
], dtype="datetime64[ns]")
ser

0   2024-01-01 00:00:00
1   2024-01-02 00:00:01
2   2024-01-03 00:00:02
dtype: datetime64[ns]

In [39]:
ser = pd.Series([
    "2024-01-01",
    "2024-01-02",
    "2024-01-03"
], dtype="datetime64[ns]")
ser

0   2024-01-01
1   2024-01-02
2   2024-01-03
dtype: datetime64[ns]

In [44]:
ser.iloc[1] = "2024-01-04 00:00:42"
print(ser)
print("--------------")
print(ser.dt.year)
print("--------------")
print(ser.dt.month)
print("--------------")
print(ser.dt.day_of_week)
print("--------------")
print(ser.dt.day)
print("--------------")
print(ser.dt.hour)
print("--------------")
print(ser.dt.minute)

0   2024-01-01 00:00:00
1   2024-01-04 00:00:42
2   2024-01-03 00:00:00
dtype: datetime64[ns]
--------------
0    2024
1    2024
2    2024
dtype: int32
--------------
0    1
1    1
2    1
dtype: int32
--------------
0    0
1    3
2    2
dtype: int32
--------------
0    1
1    4
2    3
dtype: int32
--------------
0    0
1    0
2    0
dtype: int32
--------------
0    0
1    0
2    0
dtype: int32


In [45]:
pd.Series([
    "2024-01-01 00:00:01",
    "2024-01-02 00:00:01",
    "2024-01-03 00:00:01"
], dtype=pd.DatetimeTZDtype(tz="UTC"))

0   2024-01-01 00:00:01+00:00
1   2024-01-02 00:00:01+00:00
2   2024-01-03 00:00:01+00:00
dtype: datetime64[ns, UTC]

In [46]:
pd.Series([
    "2024-01-01 00:00:01",
    "2024-01-02 00:00:01",
    "2024-01-03 00:00:01"
], dtype=pd.DatetimeTZDtype(tz="America/New_York"))

0   2024-01-01 00:00:01-05:00
1   2024-01-02 00:00:01-05:00
2   2024-01-03 00:00:01-05:00
dtype: datetime64[ns, America/New_York]

In [47]:
pd.Series([
    "2024-01-01 00:00:01",
    "2024-01-02 00:00:01",
    "2024-01-03 00:00:01"
], dtype=pd.DatetimeTZDtype(tz="-05:00"))

0   2024-01-01 00:00:01-05:00
1   2024-01-02 00:00:01-05:00
2   2024-01-03 00:00:01-05:00
dtype: datetime64[ns, UTC-05:00]

In [48]:
ser_no_tz = pd.Series([
    "2024-01-01 00:00:00",
    "2024-01-01 00:01:10",
    "2024-01-01 00:02:42"
], dtype="datetime64[ns]")
ser_et = ser_no_tz.dt.tz_localize("America/New_York")
ser_et

0   2024-01-01 00:00:00-05:00
1   2024-01-01 00:01:10-05:00
2   2024-01-01 00:02:42-05:00
dtype: datetime64[ns, America/New_York]

In [49]:
ser_pt = ser_et.dt.tz_convert("America/Los_Angeles")
ser_pt

0   2023-12-31 21:00:00-08:00
1   2023-12-31 21:01:10-08:00
2   2023-12-31 21:02:42-08:00
dtype: datetime64[ns, America/Los_Angeles]

In [50]:
ser_pt.dt.normalize()

0   2023-12-31 00:00:00-08:00
1   2023-12-31 00:00:00-08:00
2   2023-12-31 00:00:00-08:00
dtype: datetime64[ns, America/Los_Angeles]

In [51]:
ser = pd.Series([
    "2024-01-01",
    None,
    "2024-01-03"
], dtype="datetime64[ns]")
ser

0   2024-01-01
1          NaT
2   2024-01-03
dtype: datetime64[ns]

In [52]:
pd.isna(ser)

0    False
1     True
2    False
dtype: bool

In [53]:
pd.Series([
    "1500-01-01 00:00:01",
    "2500-01-01 00:00:01",
], dtype="datetime64[us]")

0   1500-01-01 00:00:01
1   2500-01-01 00:00:01
dtype: datetime64[us]

### Temporal types - Timedelta

In [54]:
ser = pd.Series([
    "2024-01-01",
    "2024-01-02",
    "2024-01-03"
], dtype="datetime64[ns]")
ser - pd.Timestamp("2023-12-31 12:00:00")

0   0 days 12:00:00
1   1 days 12:00:00
2   2 days 12:00:00
dtype: timedelta64[ns]

In [55]:
ser + pd.Timedelta("3 days")

0   2024-01-04
1   2024-01-05
2   2024-01-06
dtype: datetime64[ns]

In [56]:
pd.Series([
    "-1 days",
    "6 hours",
    "42 minutes",
    "12 seconds",
    "8 milliseconds",
    "4 microseconds",
    "300 nanoseconds",
], dtype="timedelta64[ns]")

0           -1 days +00:00:00
1             0 days 06:00:00
2             0 days 00:42:00
3             0 days 00:00:12
4      0 days 00:00:00.008000
5      0 days 00:00:00.000004
6   0 days 00:00:00.000000300
dtype: timedelta64[ns]

### Temporal PyArrow types

In [57]:
ser = pd.Series([
    "2024-01-01",
    "2024-01-02",
    "2024-01-03",
], dtype=pd.ArrowDtype(pa.date32()))
ser

0    2024-01-01
1    2024-01-02
2    2024-01-03
dtype: date32[day][pyarrow]

In [58]:
ser = pd.Series([
    "9999-12-29",
    "9999-12-30",
    "9999-12-31",
], dtype=pd.ArrowDtype(pa.date32()))
ser

0    9999-12-29
1    9999-12-30
2    9999-12-31
dtype: date32[day][pyarrow]

### PyArrow List types

In [60]:
df = pd.DataFrame({
    "name": ["Alice", "Bob", "Janice", "Jim", "Michael"],
    "years_exp": [10, 2, 4, 8, 6],
})
df

Unnamed: 0,name,years_exp
0,Alice,10
1,Bob,2
2,Janice,4
3,Jim,8
4,Michael,6


In [61]:
ser = pd.Series([
    ["Bob", "Michael"],
    None,
    None,
    ["Janice"],
    None,
], dtype=pd.ArrowDtype(pa.list_(pa.string())))
df["direct_reports"] = ser
df

Unnamed: 0,name,years_exp,direct_reports
0,Alice,10,['Bob' 'Michael']
1,Bob,2,
2,Janice,4,
3,Jim,8,['Janice']
4,Michael,6,


In [64]:
print(ser.list.len())
print("--------------")
print(ser.list[0])
print("--------------")
print(ser.list.flatten())

0       2
1    <NA>
2    <NA>
3       1
4    <NA>
dtype: int32[pyarrow]
--------------
0       Bob
1      <NA>
2      <NA>
3    Janice
4      <NA>
dtype: string[pyarrow]
--------------
0        Bob
1    Michael
2     Janice
dtype: string[pyarrow]


### NumPy type system, the object type, and pitfalls

In [66]:
print(pd.Series([0, 1, 2]))
print("--------------")
print(pd.Series([0, None, 2]))

0    0
1    1
2    2
dtype: int64
--------------
0    0.0
1    NaN
2    2.0
dtype: float64


In [70]:
ser = pd.Series([0, None, 2])
ser.fillna(0).astype(int)

0    0
1    0
2    2
dtype: int64

In [72]:
print(pd.Series([0, None, 2]).fillna(0).astype(int).mean())
print("--------------")
print(pd.Series([0, None, 2], dtype=pd.Int64Dtype()).mean())

0.6666666666666666
--------------
1.0


In [73]:
import timeit
func = lambda: pd.Series([0, None, 2]).fillna(0).astype(int).mean()
timeit.timeit(func, number=10_000)

2.7745231999870157

In [74]:
func = lambda: pd.Series([0, None, 2], dtype=pd.Int64Dtype()).mean()
timeit.timeit(func, number=10_000)

2.1524763999914285

In [75]:
print(pd.Series([True, False]))
print("--------------")
print(pd.Series([True, False, None]))

0     True
1    False
dtype: bool
--------------
0     True
1    False
2     None
dtype: object


In [76]:
pd.Series([True, False, None, "one of these things", ["is not like"], ["the other"]])

0                   True
1                  False
2                   None
3    one of these things
4          [is not like]
5            [the other]
dtype: object

In [77]:
pd.Series([True, False, None], dtype=pd.BooleanDtype())

0     True
1    False
2     <NA>
dtype: boolean

In [78]:
pd.Series(["foo", "bar", "baz"])

0    foo
1    bar
2    baz
dtype: object

In [79]:
ser = pd.Series(["foo", "bar", "baz"])
ser.iloc[2] = 42
ser

0    foo
1    bar
2     42
dtype: object

In [80]:
alist = [42, "foo", ["sub", "list"], {"key": "value"}]
ser = pd.Series(alist)
ser

0                  42
1                 foo
2         [sub, list]
3    {'key': 'value'}
dtype: object

In [81]:
df = pd.DataFrame([
    ["foo", 1, 123.45],
    ["bar", 2, 333.33],
    ["baz", 3, 999.99],
], columns=list("abc"))
df

Unnamed: 0,a,b,c
0,foo,1,123.45
1,bar,2,333.33
2,baz,3,999.99


In [82]:
df.dtypes

a     object
b      int64
c    float64
dtype: object

In [83]:
df.astype({
    "a": pd.StringDtype(),
    "b": pd.Int64Dtype(),
    "c": pd.Float64Dtype(),
}).dtypes

a    string[python]
b             Int64
c           Float64
dtype: object

In [84]:
df.convert_dtypes(dtype_backend="numpy_nullable").dtypes

a    string[python]
b             Int64
c           Float64
dtype: object