# Efficient Memory use in Pandas


In [27]:
import numpy as np
import pandas as pd

# Create our Data


In [28]:
def get_data(size=1_000_000):
    df = pd.DataFrame()

    df["position"] = np.random.choice(["left", "middel", "right"], size)
    df["age"] = np.random.randint(1, 50, size)
    df["team"] = np.random.choice(["rad", "blue", "yellow", "green"], size)
    df["win"] = np.random.choice(["yes", "no"], size)
    df["prob"] = np.random.uniform(0, 1, size)

    return df

In [29]:
df = get_data()

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 5 columns):
 #   Column    Non-Null Count    Dtype  
---  ------    --------------    -----  
 0   position  1000000 non-null  object 
 1   age       1000000 non-null  int32  
 2   team      1000000 non-null  object 
 3   win       1000000 non-null  object 
 4   prob      1000000 non-null  float64
dtypes: float64(1), int32(1), object(3)
memory usage: 34.3+ MB


In [31]:
df = get_data()

In [32]:
%timeit df["age_rank"] = df.groupby(["team", "position"])["age"].rank()
%timeit df["age_rank"] = df.groupby(["team", "position"])["prob"].rank()
%timeit df["wim_prob_rank"] = df.groupby(["team", "position","win"])["prob"].rank()

209 ms ± 13.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
263 ms ± 11 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
304 ms ± 14.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [38]:
def set_dtype(df):
    df["position"] = df["position"].astype("category")
    df["team"] = df["team"].astype("category")
    df["age"] = df["age"].astype("int8")
    df["prob"] = df["prob"].astype("float16")
    df["win"] = df["win"].map({"yes": True, "no": False}).astype("bool")

    return df

In [58]:
df = get_data()
%timeit set_dtype(df)
df = set_dtype(df)

74.7 ms ± 1.49 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 5 columns):
 #   Column    Non-Null Count    Dtype   
---  ------    --------------    -----   
 0   position  1000000 non-null  category
 1   age       1000000 non-null  int8    
 2   team      1000000 non-null  category
 3   win       1000000 non-null  bool    
 4   prob      1000000 non-null  float16 
dtypes: bool(1), category(2), float16(1), int8(1)
memory usage: 5.7 MB


In [60]:
%timeit df["age_rank"] = df.groupby(["team", "position"])["age"].rank()
%timeit df["age_rank"] = df.groupby(["team", "position"])["prob"].rank()
%timeit df["wim_prob_rank"] = df.groupby(["team", "position","win"])["prob"].rank()

139 ms ± 5.93 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
178 ms ± 5.44 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
184 ms ± 4.02 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


# Extra

In [61]:
%timeit df["win_temp1"] = df["win"].map({"yes": True, "no": False}).astype("bool")
%timeit df["win_temp2"] = df["win"].apply(lambda x: x == "yes").astype("bool")
%timeit df["win_temp3"] = pd.Series([i == "yes" for i in df["win"]], dtype="bool")

72.4 ms ± 749 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
58.1 ms ± 173 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
67.3 ms ± 555 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [62]:
%%timeit 
df["win_temp4"] = False
df.loc[df["win"] == "yes", "win_temp4"] = True

184 µs ± 4.3 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [64]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 11 columns):
 #   Column         Non-Null Count    Dtype   
---  ------         --------------    -----   
 0   position       1000000 non-null  category
 1   age            1000000 non-null  int8    
 2   team           1000000 non-null  category
 3   win            1000000 non-null  bool    
 4   prob           1000000 non-null  float16 
 5   age_rank       1000000 non-null  float64 
 6   wim_prob_rank  1000000 non-null  float64 
 7   win_temp1      1000000 non-null  bool    
 8   win_temp2      1000000 non-null  bool    
 9   win_temp3      1000000 non-null  bool    
 10  win_temp4      1000000 non-null  bool    
dtypes: bool(5), category(2), float16(1), float64(2), int8(1)
memory usage: 24.8 MB
