In [1]:
import numpy as np
import pandas as pd

## Eficient Memory use in Python

##### Create our data

In [25]:
def get_dataset(size):
    df = pd.DataFrame()
    df["position"] = np.random.choice(['left', 'middle', 'right'], size)
    df['age'] = np.random.randint(1, 50, size)
    df['team'] = np.random.choice(['red', 'blue', 'yellow', 'green'], size)
    df['win'] = np.random.choice(['yes', 'no'], size)
    df['prob'] = np.random.uniform(0, 1, size)
    return df

In [26]:
df = get_dataset(1_000_000)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 5 columns):
 #   Column    Non-Null Count    Dtype  
---  ------    --------------    -----  
 0   position  1000000 non-null  object 
 1   age       1000000 non-null  int32  
 2   team      1000000 non-null  object 
 3   win       1000000 non-null  object 
 4   prob      1000000 non-null  float64
dtypes: float64(1), int32(1), object(3)
memory usage: 34.3+ MB


In [31]:
%timeit df['age_rank'] = df.groupby(['team', 'position'])['age'].rank()
%timeit df['prob_rank'] = df.groupby(['team', 'position'])['prob'].rank()
%timeit df['win_prob_rank'] = df.groupby(['team', 'position', 'win'])['prob'].rank()

1.19 s ± 235 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.05 s ± 10.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.18 s ± 6.69 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [29]:
df.head()

Unnamed: 0,position,age,team,win,prob,age_rank,prob_rank
0,left,42,yellow,no,0.671124,70281.5,55904.0
1,left,44,blue,no,0.457396,74479.0,38342.0
2,left,24,red,no,0.996856,40035.0,83267.0
3,left,35,yellow,no,0.628515,58436.5,52346.0
4,middle,9,yellow,yes,0.742326,14404.5,62056.0


#### Making it more efficient by changing the data type of positions from string to categorical

In [36]:
df = get_dataset(1_000_000)
df['position'] = df['position'].astype('category')
df['team'] = df['team'].astype('category')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 5 columns):
 #   Column    Non-Null Count    Dtype   
---  ------    --------------    -----   
 0   position  1000000 non-null  category
 1   age       1000000 non-null  int32   
 2   team      1000000 non-null  category
 3   win       1000000 non-null  object  
 4   prob      1000000 non-null  float64 
dtypes: category(2), float64(1), int32(1), object(1)
memory usage: 21.0+ MB


In [38]:
df['age'].max()

49

### Downcasting Ints 

In [41]:
df['age'] = df['age'].astype('int8')

In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 5 columns):
 #   Column    Non-Null Count    Dtype   
---  ------    --------------    -----   
 0   position  1000000 non-null  category
 1   age       1000000 non-null  int8    
 2   team      1000000 non-null  category
 3   win       1000000 non-null  object  
 4   prob      1000000 non-null  float64 
dtypes: category(2), float64(1), int8(1), object(1)
memory usage: 18.1+ MB


In [53]:
df['prob'] = df['prob'].astype("float32")

In [54]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 5 columns):
 #   Column    Non-Null Count    Dtype   
---  ------    --------------    -----   
 0   position  1000000 non-null  category
 1   age       1000000 non-null  int8    
 2   team      1000000 non-null  category
 3   win       1000000 non-null  object  
 4   prob      1000000 non-null  float32 
dtypes: category(2), float32(1), int8(1), object(1)
memory usage: 14.3+ MB


### Casting bool (true/false)

In [55]:
df['win'] = df['win'].map({'yes': True, 'no': False})

In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 5 columns):
 #   Column    Non-Null Count    Dtype   
---  ------    --------------    -----   
 0   position  1000000 non-null  category
 1   age       1000000 non-null  int8    
 2   team      1000000 non-null  category
 3   win       1000000 non-null  bool    
 4   prob      1000000 non-null  float32 
dtypes: bool(1), category(2), float32(1), int8(1)
memory usage: 7.6 MB


In [60]:
def set_dtypes(df):
    df['position'] = df['position'].astype('category')
    df['team'] = df['team'].astype('category')
    df['prob'] = df['prob'].astype("float32")
    df['win'] = df['win'].map({'yes': True, 'no': False})
    return df;

In [58]:
df = get_dataset(1_000_000)
%timeit df['age_rank'] = df.groupby(['team', 'position'])['age'].rank()
%timeit df['prob_rank'] = df.groupby(['team', 'position'])['prob'].rank()
%timeit df['win_prob_rank'] = df.groupby(['team', 'position', 'win'])['prob'].rank()
df.info()

939 ms ± 11.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.09 s ± 192 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.09 s ± 40.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 8 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   position       1000000 non-null  object 
 1   age            1000000 non-null  int32  
 2   team           1000000 non-null  object 
 3   win            1000000 non-null  object 
 4   prob           1000000 non-null  float64
 5   age_rank       1000000 non-null  float64
 6   prob_rank      1000000 non-null  float64
 7   win_prob_rank  1000000 non-null  float64
dtypes: float64(4), int32(1), object(3)
memory usage: 57.2+ MB


In [61]:
df = get_dataset(1_000_000)
df = set_dtypes(df)
%timeit df['age_rank'] = df.groupby(['team', 'position'])['age'].rank()
%timeit df['prob_rank'] = df.groupby(['team', 'position'])['prob'].rank()
%timeit df['win_prob_rank'] = df.groupby(['team', 'position', 'win'])['prob'].rank()
df.info()

761 ms ± 6.89 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
789 ms ± 8.51 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
926 ms ± 75.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 8 columns):
 #   Column         Non-Null Count    Dtype   
---  ------         --------------    -----   
 0   position       1000000 non-null  category
 1   age            1000000 non-null  int32   
 2   team           1000000 non-null  category
 3   win            1000000 non-null  bool    
 4   prob           1000000 non-null  float32 
 5   age_rank       1000000 non-null  float64 
 6   prob_rank      1000000 non-null  float64 
 7   win_prob_rank  1000000 non-null  float64 
dtypes: bool(1), category(2), float32(1), float64(3), int32(1)
memory usage: 33.4 MB


## Larger Data

In [63]:
df = get_dataset(10_000_000)
df = set_dtypes(df)
%timeit df['age_rank'] = df.groupby(['team', 'position'])['age'].rank()
%timeit df['prob_rank'] = df.groupby(['team', 'position'])['prob'].rank()
%timeit df['win_prob_rank'] = df.groupby(['team', 'position', 'win'])['prob'].rank()
df.info()

12.3 s ± 126 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
12.5 s ± 40.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
13.5 s ± 185 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000000 entries, 0 to 9999999
Data columns (total 8 columns):
 #   Column         Dtype   
---  ------         -----   
 0   position       category
 1   age            int32   
 2   team           category
 3   win            bool    
 4   prob           float32 
 5   age_rank       float64 
 6   prob_rank      float64 
 7   win_prob_rank  float64 
dtypes: bool(1), category(2), float32(1), float64(3), int32(1)
memory usage: 333.8 MB


In [None]:
df = get_dataset(10_000_000)
%timeit df['age_rank'] = df.groupby(['team', 'position'])['age'].rank()
%timeit df['prob_rank'] = df.groupby(['team', 'position'])['prob'].rank()
%timeit df['win_prob_rank'] = df.groupby(['team', 'position', 'win'])['prob'].rank()
df.info()