## General Usage and Performance Tips

In [1]:
import pandas as pd
import numpy as np
import pyarrow as pa

### Avoid dtype=object

In [2]:
# The same 30,000 strings stored twice: once as generic Python objects and
# once with the dedicated string extension dtype.
ser_obj, ser_str = (
    pd.Series(["foo", "bar", "baz"] * 10_000, dtype=dtype)
    for dtype in (object, pd.StringDtype())
)

In [None]:
# Left commented out on purpose: running it against the string-dtype Series
# raises an error instead of storing a non-string value.
# ser_str.iloc[0] = False

# The object-dtype Series accepts the same assignment without complaint.
ser_obj.iloc[0] = False

In [5]:
# The non-string stored at position 0 does not raise here; the string
# accessor returns NaN for that element.
ser_obj.str.capitalize().head()

0    NaN
1    Bar
2    Baz
3    Foo
4    Bar
dtype: object

### Be cognizant of data sizes

In [6]:
# One column per magnitude, so each column could fit a progressively wider
# integer type; every column holds a single repeated value.
df = pd.DataFrame(
    {
        name: [value] * 100_000
        for name, value in zip("abcd", (0, 2 ** 8, 2 ** 16, 2 ** 32))
    }
).convert_dtypes(dtype_backend="numpy_nullable")

df.head()

Unnamed: 0,a,b,c,d
0,0,256,65536,4294967296
1,0,256,65536,4294967296
2,0,256,65536,4294967296
3,0,256,65536,4294967296
4,0,256,65536,4294967296


In [7]:
# Bytes used by the index and by each column before any downcasting.
df.memory_usage()

Index       132
a        900000
b        900000
c        900000
d        900000
dtype: int64

In [8]:
# Hand-pick the narrowest nullable integer type per column; "d" is left
# untouched. A new frame is produced, `df` itself is not modified.
df.astype(
    {
        "a": pd.Int8Dtype(),
        "b": pd.Int16Dtype(),
        "c": pd.Int32Dtype(),
    }
).memory_usage()

Index       132
a        200000
b        300000
c        500000
d        900000
dtype: int64

In [9]:
# Downcast every numeric column to the smallest signed nullable integer type
# that can hold its values. The comprehension iterates the *selected* numeric
# columns rather than the full frame, so a non-numeric column can never be
# handed to pd.to_numeric or be re-added to the result by assign().
df.select_dtypes("number").pipe(
    lambda numeric: numeric.assign(
        **{
            name: pd.to_numeric(
                column, downcast="signed", dtype_backend="numpy_nullable"
            )
            for name, column in numeric.items()
        }
    )
).memory_usage()

Index       132
a        200000
b        300000
c        500000
d        900000
dtype: int64

### Use vectorized functions instead of loops

In [11]:
# 0..99_999 as a nullable 64-bit integer Series, totalled with the built-in
# vectorized reduction.
ser = pd.Series(range(100_000), dtype="Int64")
print(ser.sum())

4999950000


In [None]:
# Accumulate the same total one element at a time in plain Python, then show
# it next to the vectorized sum so the two can be compared.
result = 0
for value in ser:
    result = result + value

print(result, ser.sum(), sep="\n")

4999950000


In [16]:
# NOTE(review): datetime is not referenced in the cells visible here —
# confirm nothing else needs it before removing.
import datetime
import timeit
# Total elapsed seconds for 1,000 calls of the vectorized reduction.
timeit.timeit(ser.sum, number=1000)

0.16033769998466596

In [17]:
def loop_sum():
    """Sum the module-level ``ser`` one element at a time in pure Python.

    This is the deliberately slow counterpart to ``ser.sum`` used for timing.

    Returns:
        The accumulated total of every element in ``ser`` (``0`` when ``ser``
        is empty), so the result can be checked against ``ser.sum()``.
    """
    result = 0
    for x in ser:
        result += x
    # Hand the total back instead of discarding it; timeit ignores the value.
    return result

timeit.timeit(loop_sum, number=1000)

19.834238899988122

In [18]:
df = pd.DataFrame(
    {"column": list("aabab"), "value": [0, 1, 2, 4, 8]}
).convert_dtypes(dtype_backend="numpy_nullable")

# Iterating a GroupBy yields one (label, sub-frame) pair per distinct key.
for label, group in df.groupby("column"):
    print(f"The group for label {label} is:\n{group}\n")

The group for label a is:
  column  value
0      a      0
1      a      1
3      a      4

The group for label b is:
  column  value
2      b      2
4      b      8



### Avoid mutating data

In [19]:
def mutate_after():
    """Build the string Series first, then overwrite one element in place."""
    ser = pd.Series(["foo", "bar", "baz"], dtype=pd.StringDtype())
    ser.iloc[1] = "BAR"

timeit.timeit(mutate_after, number=1000)

0.13156370000797324

In [20]:
def mutate_before():
    """Fix up the plain Python list first, then build the string Series once."""
    words = ["foo", "bar", "baz"]
    words[1] = "BAR"
    pd.Series(words, dtype=pd.StringDtype())

timeit.timeit(mutate_before, number=1000)

0.06508870000834577

### Dictionary-encode low cardinality data

In [21]:
values = ["foo", "bar", "baz"]
# 300,000 strings drawn from only three distinct values, kept as strings.
ser = pd.Series(values * 100_000, dtype="string")
ser.memory_usage()

2400132

In [22]:
# Same data, dictionary-encoded: the three labels become the categories of an
# unordered categorical dtype.
cat = pd.CategoricalDtype(categories=values, ordered=False)
ser = pd.Series(values * 100_000, dtype=cat)
ser.memory_usage()

300264

### Test-driven development features

In [23]:
import unittest

class MyTests(unittest.TestCase):
    """Smallest possible test case: one arithmetic check."""

    def test_42(self):
        # Doubling 21 must give 42.
        self.assertEqual(21 * 2, 42)


def suite():
    """Return a suite containing only ``MyTests.test_42``."""
    return unittest.TestSuite([MyTests("test_42")])


runner = unittest.TextTestRunner()
runner.run(suite())

.
----------------------------------------------------------------------
Ran 1 test in 0.001s

OK


<unittest.runner.TextTestResult run=1 errors=0 failures=0>

In [24]:
def some_cool_numbers():
    """Return a nullable Int64 Series holding 42, 555 and a missing value."""
    return pd.Series([42, 555, pd.NA], dtype=pd.Int64Dtype())

class MyTests(unittest.TestCase):

    def test_cool_numbers(self):
        result = some_cool_numbers()
        expected = pd.Series([42, 555, pd.NA], dtype=pd.Int64Dtype())
        # Deliberately the wrong tool: assertEqual evaluates
        # `not first == second`, and the truth value of a Series is
        # ambiguous, so this test errors with ValueError instead of passing.
        self.assertEqual(result, expected)

def suite():
    # Suite containing only the single Series-comparison test above.
    suite = unittest.TestSuite()
    suite.addTest(MyTests("test_cool_numbers"))
    return suite

runner = unittest.TextTestRunner()
runner.run(suite())

E
ERROR: test_cool_numbers (__main__.MyTests.test_cool_numbers)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "C:\Users\Kamran\AppData\Local\Temp\ipykernel_8500\2361126771.py", line 9, in test_cool_numbers
    self.assertEqual(result, expected)
    ~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^
  File "c:\Users\Kamran\miniconda3\envs\data_analysis\Lib\unittest\case.py", line 907, in assertEqual
    assertion_func(first, second, msg=msg)
    ~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Kamran\miniconda3\envs\data_analysis\Lib\unittest\case.py", line 897, in _baseAssertEqual
    if not first == second:
           ^^^^^^^^^^^^^^^
  File "c:\Users\Kamran\miniconda3\envs\data_analysis\Lib\site-packages\pandas\core\generic.py", line 1577, in __nonzero__
    raise ValueError(
    ...<2 lines>...
    )
ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

--------------------

<unittest.runner.TextTestResult run=1 errors=1 failures=0>

In [25]:
result = some_cool_numbers()
expected = pd.Series([42, 555, pd.NA], dtype=pd.Int64Dtype())
# Element-wise comparison yields a boolean Series rather than a single bool,
# and the missing position compares as <NA> instead of True.
result == expected

0    True
1    True
2    <NA>
dtype: boolean

In [26]:
import pandas.testing as tm

def some_cool_numbers():
    """Return a nullable Int64 Series holding 42, 555 and a missing value."""
    return pd.Series([42, 555, pd.NA], dtype=pd.Int64Dtype())


class MyTests(unittest.TestCase):
    """Checks ``some_cool_numbers`` with the pandas-aware Series assertion."""

    def test_cool_numbers(self):
        # assert_series_equal compares values, dtype and index, and treats
        # matching missing values as equal.
        wanted = pd.Series([42, 555, pd.NA], dtype=pd.Int64Dtype())
        actual = some_cool_numbers()
        tm.assert_series_equal(actual, wanted)


def suite():
    """Return a suite containing only ``MyTests.test_cool_numbers``."""
    return unittest.TestSuite([MyTests("test_cool_numbers")])


runner = unittest.TextTestRunner()
runner.run(suite())

.
----------------------------------------------------------------------
Ran 1 test in 0.004s

OK


<unittest.runner.TextTestResult run=1 errors=0 failures=0>