In [24]:
import pandas as pd
import gc

In [47]:
# Load the fines dataset from CSV and display the resulting frame.
df = pd.read_csv('../../datasets/fines.csv')
df

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2.0,3200.0,Ford,Focus,1989
1,E432XX77RUS,1.0,6500.0,Toyota,Camry,1995
2,7184TT36RUS,1.0,2100.0,Ford,Focus,1984
3,X582HE161RUS,2.0,2000.0,Ford,Focus,2015
4,92918M178RUS,1.0,5700.0,Ford,Focus,2014
...,...,...,...,...,...,...
925,A111AA111RUS,1.0,4500.0,Tesla,Model 3,2010
926,B222BB222RUS,2.0,3200.0,Ford,Bronco,2015
927,C333CC333RUS,1.0,2800.0,Volkswagen,Terramont,2018
928,D444DD444RUS,2.0,5100.0,Lexus,LFA,2012


## Iterations

In [26]:
%%timeit
result = []
for i in range(0, len(df)):
    row = df.iloc[i]
    result.append(row['Fines']/ row['Refund'] * row['Year'])
df['Calculated'] = result

104 ms ± 15.9 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [27]:
%%timeit
result = []
for _, row in df.iterrows():
    result.append(row['Fines']/ row['Refund'] * row['Year'])
df['Calculated'] = result

78.5 ms ± 6.92 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [28]:
%%timeit
df['Calculated'] = df.apply(lambda row: (row['Fines']/ row['Refund'] * row['Year']),axis = 1)

23.8 ms ± 3.85 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [29]:
%%timeit
df['Calculated'] = df['Fines']/ df['Refund'] * df['Year']

369 μs ± 17.9 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [30]:
%%timeit
df['Calculated'] = (df['Fines']/ df['Refund'] * df['Year']).values

351 μs ± 13.7 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


## Indexing

In [31]:
%%timeit
row = df[df['CarNumber'] == 'O136HO197RUS']

1.37 ms ± 633 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [32]:
# Build a second frame whose index is the CarNumber column, for label-based lookups.
dfi = df.set_index(keys='CarNumber')

In [33]:
%%timeit
# Label-based lookup of a car number through the index of `dfi`.
row = dfi.loc['O136HO197RUS']

334 μs ± 95.5 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [34]:
# Display the frame indexed by CarNumber.
dfi

Unnamed: 0_level_0,Refund,Fines,Make,Model,Year,Calculated
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Y163O8161RUS,2.0,3200.0,Ford,Focus,1989,3182400.0
E432XX77RUS,1.0,6500.0,Toyota,Camry,1995,12967500.0
7184TT36RUS,1.0,2100.0,Ford,Focus,1984,4166400.0
X582HE161RUS,2.0,2000.0,Ford,Focus,2015,2015000.0
92918M178RUS,1.0,5700.0,Ford,Focus,2014,11479800.0
...,...,...,...,...,...,...
A111AA111RUS,1.0,4500.0,Tesla,Model 3,2010,9045000.0
B222BB222RUS,2.0,3200.0,Ford,Bronco,2015,3224000.0
C333CC333RUS,1.0,2800.0,Volkswagen,Terramont,2018,5650400.0
D444DD444RUS,2.0,5100.0,Lexus,LFA,2012,5130600.0


## Downcasting

In [35]:
# Report dtypes and memory use before any downcasting; 'deep' requests the
# deep memory_usage mode of DataFrame.info.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 930 entries, 0 to 929
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   CarNumber   930 non-null    object 
 1   Refund      930 non-null    float64
 2   Fines       930 non-null    float64
 3   Make        930 non-null    object 
 4   Model       918 non-null    object 
 5   Year        930 non-null    int64  
 6   Calculated  930 non-null    float64
dtypes: float64(3), int64(1), object(3)
memory usage: 182.1 KB


In [36]:
# Work on an independent copy so the dtype changes below leave `df` untouched.
optimized = df.copy(deep=True)

### Float downcast

In [38]:
# Convert every float64 column to float32 with a single astype call.
float_cols = optimized.select_dtypes(include='float64').columns
optimized = optimized.astype({name: 'float32' for name in float_cols})


### Int downcast

In [39]:
# Downcast each int64 column with pd.to_numeric(downcast='integer'), which picks
# the integer dtype for the column itself.
int_cols = optimized.select_dtypes(include='int64').columns
optimized = optimized.assign(
    **{name: pd.to_numeric(optimized[name], downcast='integer') for name in int_cols}
)

In [40]:
# Report dtypes and memory use after the float and integer downcasts.
optimized.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 930 entries, 0 to 929
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   CarNumber   930 non-null    object 
 1   Refund      930 non-null    float32
 2   Fines       930 non-null    float32
 3   Make        930 non-null    object 
 4   Model       918 non-null    object 
 5   Year        930 non-null    int16  
 6   Calculated  930 non-null    float32
dtypes: float32(3), int16(1), object(3)
memory usage: 165.7 KB


### Categories

In [41]:
# Convert every object-dtype column to the category dtype.
object_cols = optimized.select_dtypes(include='object').columns
optimized = optimized.astype(dict.fromkeys(object_cols, 'category'))

In [42]:
# Report dtypes and memory use after the category conversion.
optimized.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 930 entries, 0 to 929
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   CarNumber   930 non-null    category
 1   Refund      930 non-null    float32 
 2   Fines       930 non-null    float32 
 3   Make        930 non-null    category
 4   Model       918 non-null    category
 5   Year        930 non-null    int16   
 6   Calculated  930 non-null    float32 
dtypes: category(3), float32(3), int16(1)
memory usage: 66.5 KB


In [43]:
%reset_selective -f df

In [44]:
# Show that `df` no longer exists. The NameError is the point of this cell, so catch
# and print it instead of letting it abort a "Restart & Run All".
try:
    df
except NameError as exc:
    print(f'{type(exc).__name__}: {exc}')

NameError: name 'df' is not defined

In [48]:
# `df` was already removed from the namespace earlier in the notebook, so load it
# again here; otherwise `del df` raises NameError when the cells run top to bottom.
df = pd.read_csv('../../datasets/fines.csv')

# Second way to free a frame: drop the name, then ask the garbage collector to run.
# gc.collect() returns the number of unreachable objects it found.
del df
gc.collect()

1967

In [49]:
# Show that `df` is gone after `del`. The NameError is the point of this cell, so
# catch and print it instead of letting it abort a "Restart & Run All".
try:
    df
except NameError as exc:
    print(f'{type(exc).__name__}: {exc}')

NameError: name 'df' is not defined