In [119]:
import pandas as pd
import gc

## read the fines.csv that you saved in the previous exercise

In [105]:
# Load the fines dataset produced by the previous exercise.
df = pd.read_csv('../data/fines.csv')

In [106]:
# Preview the first rows to check the available columns.
df.head()

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,year
0,Y163O8161RUS,2,3200.0,Ford,Focus,2007
1,E432XX77RUS,1,6500.0,Toyota,Camry,1987
2,7184TT36RUS,1,2100.0,Ford,Focus,1991
3,X582HE161RUS,2,2000.0,Ford,Focus,2012
4,92918M178RUS,1,5700.0,Ford,Focus,2014


## iterations: in each of the following subtasks, calculate Fines / Refund * year for each row, store the result in a new column of the dataframe, and measure the time using the %%timeit cell magic

loop: write a function that iterates through the dataframe using
`for i in range(0, len(df))`, `iloc` and `append()` to a list; assign the result of the function to
a new column in the dataframe

In [66]:
%%timeit
def calc_val(df):
    col = list()
    for i in range(0, len(df)):
        row = df.iloc[i]
        col.append(row['Fines'] / row['Refund'] * row['year'])
    return col
calc_val(df)

535 ms ± 45.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


do it using iterrows()

In [67]:
%%timeit
col = list()
for row in df.iterrows():
    col.append(row[1]['Fines'] / row[1]['Refund'] * row[1]['year'])

345 ms ± 43.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


do it using apply() and lambda function

In [68]:
%%timeit
df.apply(lambda row: row['Fines'] / row['Refund'] * row['year'], axis=1)

124 ms ± 2.6 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [69]:
%%timeit
df['Fines'] / df['Refund'] * df['year']

1.12 ms ± 29.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [70]:
%%timeit
df['Fines'].values / df['Refund'].values * df['year'].values

58.9 µs ± 134 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


## indexing: measure the time using the magic command %%timeit in the cell

In [71]:
# Lookup key for the indexing timings: the CarNumber found 42 rows from the
# end of the frame (a fixed position, so the key is the same on every run).
random_num = df.iloc[-42]['CarNumber']

In [73]:
%%timeit
# Get the row(s) for a specific CarNumber (the value held in random_num) by
# filtering on the column; `@random_num` refers to that Python variable.
df.query('CarNumber == @random_num')

6.63 ms ± 69.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [74]:
# Set CarNumber as the index of the dataframe. set_index() drops the column
# by default and the result is assigned back to `df`, so re-running this
# cell on the same `df` fails because the column no longer exists.
df = df.set_index('CarNumber')

In [82]:
%%timeit
# Again, get the row(s) for the same CarNumber, this time by index label,
# since CarNumber is the index of `df` at this point.
df.loc[random_num]

4.66 ms ± 71.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## downcasting

In [107]:
# Remove the re-indexed frame; the next cell reloads the data from disk.
del df

In [108]:
# Reload the original data for the memory-usage experiments.
df = pd.read_csv('../data/fines.csv')

In [109]:
# Baseline: note the Dtype of each column and the total memory usage.
# memory_usage='deep' also measures the contents of object columns.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 930 entries, 0 to 929
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   CarNumber  930 non-null    object 
 1   Refund     930 non-null    int64  
 2   Fines      930 non-null    float64
 3   Make       930 non-null    object 
 4   Model      918 non-null    object 
 5   year       930 non-null    int64  
dtypes: float64(1), int64(2), object(3)
memory usage: 196.5 KB


In [110]:
# Make a copy of the initial dataframe; the optimizations below are applied
# to the copy so the initial dataframe stays available for comparison.
df2 = df.copy()

In [112]:
# Downcast every numeric column: float64 columns with downcast='float' and
# int64 columns with downcast='integer' (the smallest dtype that fits).
downcast_for = {'float64': 'float', 'int64': 'integer'}
for name in df2.columns:
    target = downcast_for.get(str(df2[name].dtype))
    if target is not None:
        df2[name] = pd.to_numeric(df2[name], downcast=target)

In [113]:
# downcast from int64 to the smallest numerical dtype possible
# The converted values are written back into the column: rebinding `df2`
# itself would replace the whole dataframe with a single Series.
for col in df2:
    if df2[col].dtype == 'int64':
        df2[col] = pd.to_numeric(df2[col], downcast='integer')

In [114]:
# Inspect the optimized dataframe: compare the Dtype column and the memory
# usage with the info() output of the initial dataframe.
df2.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 930 entries, 0 to 929
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   CarNumber  930 non-null    object 
 1   Refund     930 non-null    int8   
 2   Fines      930 non-null    float32
 3   Make       930 non-null    object 
 4   Model      918 non-null    object 
 5   year       930 non-null    int16  
dtypes: float32(1), int16(1), int8(1), object(3)
memory usage: 181.1 KB


## Categories

In [115]:
# Convert every object-dtype column to the category dtype, one column at a
# time, writing each converted column back into df2.
object_columns = df2.columns[df2.dtypes == 'object']
for name in object_columns:
    df2[name] = df2[name].astype('category')

In [116]:
# Check the memory usage again: it should now be roughly 2-3 times lower
# than for the initial dataframe.
df2.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 930 entries, 0 to 929
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   CarNumber  930 non-null    category
 1   Refund     930 non-null    int8    
 2   Fines      930 non-null    float32 
 3   Make       930 non-null    category
 4   Model      918 non-null    category
 5   year       930 non-null    int16   
dtypes: category(3), float32(1), int16(1), int8(1)
memory usage: 67.7 KB


## Memory clean

In [117]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [120]:
# %reset removed every name from the namespace, including the `gc` module
# imported at the top of the notebook, so it has to be imported again here.
import gc
gc.collect()

97