# Exercise 05 : Pandas optimizations

## Imports

In [1]:
import pandas as pd
import gc

## Read CSV file

In [2]:
df = pd.read_csv('../data/fines.csv')
df

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2,3200.0,Ford,Focus,1989
1,E432XX77RUS,1,6500.0,Toyota,Camry,1995
2,7184TT36RUS,1,2100.0,Ford,Focus,1984
3,X582HE161RUS,2,2000.0,Ford,Focus,2015
4,92918M178RUS,1,5700.0,Ford,Focus,2014
...,...,...,...,...,...,...
925,SCHOOL21111RUS,2,2400.0,Ford,Focus,2000
926,SCHOOL21222RUS,1,2000.0,Volkswagen,Focus,1996
927,SCHOOL21333RUS,2,1100.0,Ford,Focus,2013
928,SCHOOL21444RUS,1,1600.0,Ford,Focus,1992


## Iterations: loop

In [3]:
def iterations_loop(df_loop):
    result = []
    for i in range(0, len(df_loop)):
        result.append(df_loop.iloc[i]['Fines'] / df_loop.iloc[i]['Refund'] * df_loop.iloc[i]['Year'])
    df_loop['Calculated'] = result

In [4]:
%%timeit

df['Sum1'] = iterations_loop(df)

211 ms ± 7.87 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Iterations: *iterrows()*

In [5]:
def iterations_iterrows(df_rows):
    result = []
    for row in df_rows.iterrows():
        result.append(row[1]['Fines'] / row[1]['Refund'] * row[1]['Year'])
    df_rows['Calculated'] = result

In [6]:
%%timeit

df['Sum2'] = iterations_iterrows(df)

39.6 ms ± 154 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Iterations: *apply()*

In [7]:
def iterations_apply(df_apply):
    df_apply['Calculated'] = df_apply.apply(lambda row: row['Fines'] / row['Refund'] * row['Year'], axis='columns')

In [8]:
%%timeit

df['Sum3'] = iterations_apply(df)

9.97 ms ± 69.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## Iterations: Series

In [9]:
def iterations_series(df_series):
    df_series['Calculated'] = df_series['Fines'] / df_series['Refund'] * df_series['Year']

In [10]:
%%timeit

df['Sum4'] = iterations_series(df)

326 µs ± 1.27 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


## Iterations: Series and values

In [11]:
def iterations_series_values(df_series_values):
    df_series_values['Calculated'] = df_series_values['Fines'].values / df_series_values['Refund'].values * df_series_values['Year'].values

In [12]:
%%timeit

df['Sum5'] = iterations_series_values(df)

156 µs ± 1.19 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


## Index

In [13]:
%%timeit

df[df['CarNumber'] == 'O136HO197RUS']

300 µs ± 2.94 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [14]:
df.set_index('CarNumber', inplace=True)

In [15]:
%%timeit

df.loc['O136HO197RUS']

138 µs ± 791 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


## Downcasting

In [16]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, Y163O8161RUS to SCHOOL21555RUS
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Refund      930 non-null    int64  
 1   Fines       930 non-null    float64
 2   Make        930 non-null    object 
 3   Model       919 non-null    object 
 4   Year        930 non-null    int64  
 5   Calculated  930 non-null    float64
 6   Sum1        0 non-null      object 
 7   Sum2        0 non-null      object 
 8   Sum3        0 non-null      object 
 9   Sum4        0 non-null      object 
 10  Sum5        0 non-null      object 
dtypes: float64(2), int64(2), object(7)
memory usage: 345.0 KB


In [17]:
copy = df.copy()

In [18]:
fcols = copy.select_dtypes('float').columns
icols = copy.select_dtypes('integer').columns

copy[fcols] = copy[fcols].apply(pd.to_numeric, downcast='float')
copy[icols] = copy[icols].apply(pd.to_numeric, downcast='integer')

In [19]:
copy.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, Y163O8161RUS to SCHOOL21555RUS
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Refund      930 non-null    int8   
 1   Fines       930 non-null    float32
 2   Make        930 non-null    object 
 3   Model       919 non-null    object 
 4   Year        930 non-null    int16  
 5   Calculated  930 non-null    float64
 6   Sum1        0 non-null      object 
 7   Sum2        0 non-null      object 
 8   Sum3        0 non-null      object 
 9   Sum4        0 non-null      object 
 10  Sum5        0 non-null      object 
dtypes: float32(1), float64(1), int16(1), int8(1), object(7)
memory usage: 329.5 KB


## Categories

In [20]:
df[df.select_dtypes(['object']).columns] = df.select_dtypes(['object']).apply(lambda x: x.astype('category'))

In [21]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, Y163O8161RUS to SCHOOL21555RUS
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   Refund      930 non-null    int64   
 1   Fines       930 non-null    float64 
 2   Make        930 non-null    category
 3   Model       919 non-null    category
 4   Year        930 non-null    int64   
 5   Calculated  930 non-null    float64 
 6   Sum1        0 non-null      category
 7   Sum2        0 non-null      category
 8   Sum3        0 non-null      category
 9   Sum4        0 non-null      category
 10  Sum5        0 non-null      category
dtypes: category(7), float64(2), int64(2)
memory usage: 132.3 KB


##  Memory clean

In [22]:
%reset_selective df

gc.collect()

7