# Speed up your Pandas Code!


In [1]:
import numpy as np
import pandas as pd

# Create our dataset.


In [17]:
def get_data(size=10_000, seed=None):
    """Build a random demo DataFrame with one row per person.

    Columns, in order:
        age            -- random integer in [0, 99]
        time_in_bed    -- random integer in [0, 8]
        pct_sleeping   -- random float in [0, 1)
        favorite_food  -- one of "pizza", "taco", "ice-cream"
        hate_food      -- one of "broccoli", "candy corn", "eggs"

    Args:
        size: Number of rows to generate.
        seed: Optional seed. When given, a dedicated ``RandomState`` is used so
            the same seed always yields the same frame. When ``None`` (the
            default), values are drawn from NumPy's global random state,
            exactly as before this parameter existed.

    Returns:
        A ``pd.DataFrame`` with ``size`` rows and the five columns above.
    """
    # `np.random` (the module) and `RandomState` expose the same
    # randint / rand / choice functions, so both paths share one code body.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Dict order fixes both the column order and the order of the random draws.
    return pd.DataFrame(
        {
            "age": rng.randint(0, 100, size),
            "time_in_bed": rng.randint(0, 9, size),
            "pct_sleeping": rng.rand(size),
            "favorite_food": rng.choice(["pizza", "taco", "ice-cream"], size),
            "hate_food": rng.choice(["broccoli", "candy corn", "eggs"], size),
        }
    )

# The problem.

Reward calculation:

- If they were in bed for more than 5 hours AND were sleeping for more than 50% of the time, we give them their favorite food.
- Otherwise, we give them the food they hate.
- If they are over 90 years old, we give them their favorite food regardless.


In [20]:
def reward_cal(row):
    """Return the reward food for a single person.

    Rules, in priority order:
      1. Anyone over 90 years old gets their favorite food.
      2. Anyone who was in bed for more than 5 hours AND slept more than
         50% of the time gets their favorite food.
      3. Everyone else gets the food they hate.

    Args:
        row: A mapping-like record (for example a DataFrame row) that
            provides "age", "time_in_bed", "pct_sleeping", "favorite_food"
            and "hate_food".

    Returns:
        The value of ``row["favorite_food"]`` or ``row["hate_food"]``.
    """
    # Strictly greater than 90, matching the written rule ("over 90") and the
    # vectorized implementation, so every approach yields the same rewards.
    if row["age"] > 90:
        return row["favorite_food"]
    # Scalar comparisons: plain `and` is the idiomatic, short-circuiting form.
    if row["time_in_bed"] > 5 and row["pct_sleeping"] > 0.5:
        return row["favorite_food"]
    return row["hate_food"]

# Level 1 - Loop


In [25]:
# Generate a fresh frame so the loop timing starts without a "reward" column.
df = get_data()

In [26]:
%%timeit
# Level 1: visit every row one at a time and write its reward back by label.
for row_label, record in df.iterrows():
    df.loc[row_label, "reward"] = reward_cal(record)

502 ms ± 8.46 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# Level 2 - Apply


In [27]:
# Regenerate the data so the apply timing does not reuse the frame mutated above.
df = get_data()

In [29]:
%%timeit
# Level 2: let pandas call reward_cal once per row, then store all results at once.
row_rewards = df.apply(reward_cal, axis="columns")
df["reward"] = row_rewards

78.7 ms ± 1.42 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


# Level 3 - Vectorized


In [None]:
# Regenerate the data so the vectorized timing starts from a fresh frame.
df = get_data()

In [34]:
%%timeit

# Level 3: compute the reward for all rows at once with boolean masks.
# Start by giving everyone the food they hate, then upgrade the rows that qualify.
df["reward"] = df["hate_food"]

old_man_condition = df["age"] > 90
# Each comparison needs its own parentheses: `&` binds tighter than `>`, so
# without them this parses as `((pct_sleeping > 0.5) & time_in_bed) > 5`,
# which is never true and silently drops the sleep rule.
young_man_condition = (df["pct_sleeping"] > 0.5) & (df["time_in_bed"] > 5)

df.loc[young_man_condition | old_man_condition, "reward"] = df["favorite_food"]

554 µs ± 19 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


# Extra

In [None]:
# Regenerate the data for the chained-indexing demonstration below.
df = get_data()

In [35]:
%%timeit

# Same masks as the vectorized version, but written with chained indexing to
# show the pitfall.
df["reward"] = df["hate_food"]

old_man_condition = df["age"] > 90
# Each comparison needs its own parentheses: `&` binds tighter than `>`, so
# without them this parses as `((pct_sleeping > 0.5) & time_in_bed) > 5`,
# which is never true.
young_man_condition = (df["pct_sleeping"] > 0.5) & (df["time_in_bed"] > 5)

# Deliberate anti-pattern: `df[mask]["reward"] = ...` indexes twice, so pandas
# warns that the value is being set on a copy of a slice.
# The correct form is `df.loc[mask, "reward"] = ...`.
df[young_man_condition | old_man_condition]["reward"] = df["favorite_food"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


6.25 ms ± 214 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
