## Data Types

In [1]:
import pandas as pd
import numpy as np
# import pyarrow as pa

#### Integral types

In [3]:
print(pd.Series(range(3), dtype=pd.Int64Dtype()))
print("--------------")
print(pd.Series([1, pd.NA, 2,None], dtype=pd.Int64Dtype()))
print("--------------")
print(pd.Series(range(3), dtype=pd.UInt8Dtype()))


0    0
1    1
2    2
dtype: Int64
--------------
0       1
1    <NA>
2       2
3    <NA>
dtype: Int64
--------------
0    0
1    1
2    2
dtype: UInt8


### Floating point types

In [4]:
print(pd.Series([3.14, None, pd.NA], dtype=pd.Float64Dtype()))
print("--------------")
ser1 = pd.Series([1_000_000.123], dtype=pd.Float32Dtype())
ser2 = pd.Series([1_000_000.124], dtype=pd.Float32Dtype())
print(ser1.eq(ser2))

0    3.14
1    <NA>
2    <NA>
dtype: Float64
--------------
0    True
dtype: boolean


### Boolean types

In [9]:
pd.Series([True, False, True], dtype=pd.BooleanDtype())
pd.Series([1, 0, 1], dtype=pd.BooleanDtype())

0     True
1    False
2     True
dtype: boolean

### String types

In [12]:
print(pd.Series(["foo", "bar", "baz"], dtype=pd.StringDtype()))
print("--------------")
ser = pd.Series(["xx", "YyY", "zZzZ"], dtype=pd.StringDtype())
print(ser.str.len())
print("--------------")
print(ser.str.upper(),"\n",ser.str.lower(),"\n", ser.str.title())

0    foo
1    bar
2    baz
dtype: string
--------------
0    2
1    3
2    4
dtype: Int64
--------------
0      XX
1     YYY
2    ZZZZ
dtype: string 
 0      xx
1     yyy
2    zzzz
dtype: string 
 0      Xx
1     Yyy
2    Zzzz
dtype: string


In [15]:
ser = pd.Series(["foo", "bar", "baz"], dtype=pd.StringDtype())
print(ser.str.contains("o"))
print("--------------")
ser.str.contains(r"^ba[rz]$", case=False, regex=True)
# ^ = start, ba = literal "ba", [rz] = r or z, $ = end → matches "bar"/"baz" (case-insensitive)

0     True
1    False
2    False
dtype: boolean
--------------


0    False
1     True
2     True
dtype: boolean

### Missing value handling

In [17]:
ser = pd.Series(range(3))
print(ser)
ser.iloc[1] = None
print(ser)
print("--------------")
print(pd.isna(pd.Series([1, np.nan, 2])))
print("--------------")
print(pd.isna(pd.Series([1, pd.NA, 2], dtype=pd.Int64Dtype())))

0    0
1    1
2    2
dtype: int64
0    0.0
1    NaN
2    2.0
dtype: float64
--------------
0    False
1     True
2    False
dtype: bool
--------------
0    False
1     True
2    False
dtype: bool


### Categorical types

In [18]:
values = ["foo", "bar", "baz"]
values_ser = pd.Series(values, dtype=pd.StringDtype())
ser = values_ser.astype(pd.CategoricalDtype())
print(ser)
cat = pd.CategoricalDtype(values_ser)
ser = pd.Series(values, dtype=cat)
print(ser)

0    foo
1    bar
2    baz
dtype: category
Categories (3, string): [bar, baz, foo]
0    foo
1    bar
2    baz
dtype: category
Categories (3, string): [foo, bar, baz]


In [20]:
shirt_sizes = pd.Series(["S", "M", "L", "XL"], dtype=pd.StringDtype())
cat = pd.CategoricalDtype(shirt_sizes, ordered=True)
ser = pd.Series(["XL", "L", "S", "L", "S", "M"], dtype=cat)
ser < "L"

0    False
1    False
2     True
3    False
4     True
5     True
dtype: bool

In [23]:
print(pd.Series(["foo", "bar", "baz"] * 100, dtype=pd.StringDtype()).memory_usage())
print("--------------")
print(pd.Series(["foo", "bar", "baz"] * 100, dtype=cat).memory_usage())

2532
--------------
556


### Temporal types - datetime

In [24]:
ser = pd.Series([
    "2024-01-01 00:00:00",
    "2024-01-02 00:00:01",
    "2024-01-03 00:00:02"
], dtype="datetime64[ns]")
ser

0   2024-01-01 00:00:00
1   2024-01-02 00:00:01
2   2024-01-03 00:00:02
dtype: datetime64[ns]

In [27]:
ser.iloc[1] = "2024-01-04 00:00:42"
print(ser)
print("--------------")
print(ser.dt.year) # year, minute, second, hour, day, month, week, day_of_year, day_of_week

0   2024-01-01 00:00:00
1   2024-01-04 00:00:42
2   2024-01-03 00:00:02
dtype: datetime64[ns]
--------------
0    2024
1    2024
2    2024
dtype: int32


In [29]:
pd.Series([
    "2024-01-01 00:00:01",
    "2024-01-02 00:00:01",
    "2024-01-03 00:00:01"
], dtype=pd.DatetimeTZDtype(tz="UTC")) # UTC timezone, tz="America/New_York"

0   2024-01-01 00:00:01+00:00
1   2024-01-02 00:00:01+00:00
2   2024-01-03 00:00:01+00:00
dtype: datetime64[ns, UTC]

In [34]:
pd.isna(ser)

0    False
1     True
2    False
dtype: bool

### Temporal types - Timedelta

In [38]:
ser = pd.Series([
    "2024-01-01",
    "2024-01-02",
    "2024-01-03"
], dtype="datetime64[ns]")
ser - pd.Timestamp("2023-12-31 12:00:00")

0   0 days 12:00:00
1   1 days 12:00:00
2   2 days 12:00:00
dtype: timedelta64[ns]

In [37]:
ser + pd.Timedelta("3 days")

0   2024-01-04
1   2024-01-05
2   2024-01-06
dtype: datetime64[ns]

In [36]:
pd.Series([
    "-1 days",
    "6 hours",
    "42 minutes",
    "12 seconds",
    "8 milliseconds",
    "4 microseconds",
    "300 nanoseconds",
], dtype="timedelta64[ns]")

0           -1 days +00:00:00
1             0 days 06:00:00
2             0 days 00:42:00
3             0 days 00:00:12
4      0 days 00:00:00.008000
5      0 days 00:00:00.000004
6   0 days 00:00:00.000000300
dtype: timedelta64[ns]

### NumPy type system, the object type, and pitfalls

In [42]:
ser = pd.Series([0, None, 2])
ser.fillna(0).astype(int)

0    0
1    0
2    2
dtype: int64

In [43]:
print(pd.Series([0, None, 2]).fillna(0).astype(int).mean())
print("--------------")
print(pd.Series([0, None, 2], dtype=pd.Int64Dtype()).mean())

0.6666666666666666
--------------
1.0


In [44]:
import timeit
func = lambda: pd.Series([0, None, 2]).fillna(0).astype(int).mean()
timeit.timeit(func, number=10_000)

3.6394585000234656

In [45]:
func = lambda: pd.Series([0, None, 2], dtype=pd.Int64Dtype()).mean()
timeit.timeit(func, number=10_000)

2.2208314000163227

In [46]:
ser = pd.Series(["foo", "bar", "baz"])
ser.iloc[2] = 42
ser

0    foo
1    bar
2     42
dtype: object

In [47]:
alist = [42, "foo", ["sub", "list"], {"key": "value"}]
ser = pd.Series(alist)
ser

0                  42
1                 foo
2         [sub, list]
3    {'key': 'value'}
dtype: object