# Exercise 04 : Enrichment and transformations
## Required data

In [1]:
%ls ../data/fines.csv

../data/fines.csv


## Imports

In [2]:
import pandas as pd
import gc

## Read the data

In [3]:
df = pd.read_csv('../data/fines.csv')

df

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2,3200.0,Ford,Focus,1989
1,E432XX77RUS,1,6500.0,Toyota,Camry,1995
2,7184TT36RUS,1,2100.0,Ford,Focus,1984
3,X582HE161RUS,2,2000.0,Ford,Focus,2015
4,92918M178RUS,1,5700.0,Ford,Focus,2014
...,...,...,...,...,...,...
925,X008RUS,2,5224.2,Toyota,Camry,2014
926,1111RUS,1,14.2,Toyota,Camry,2014
927,5AAA9RUS,3,512524.2,Toyota,Camry,2014
928,566RUS,2,124.2,Toyota,Camry,2014


## Iterations
### Calculate `fines / refund * year` for each row with various methods:
#### 1. Loop

In [4]:
def iterations_loop_test(df):
    """Compute ``Fines / Refund * Year`` row by row using a positional loop.

    Every row is fetched with ``df.iloc`` and the per-row results are written
    to a ``Calculated`` column. ``df`` is modified in place; nothing is
    returned.
    """
    calculated = [
        df.iloc[pos]['Fines'] / df.iloc[pos]['Refund'] * df.iloc[pos]['Year']
        for pos in range(len(df))
    ]
    df['Calculated'] = calculated

In [5]:
%%timeit

iterations_loop_test(df)
df

334 ms ± 6.56 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### 2. `iterrows()`

In [6]:
def iterations_iterrows_test(df):
    """Compute ``Fines / Refund * Year`` for each row via ``DataFrame.iterrows()``.

    The per-row results are written to a ``Calculated`` column. ``df`` is
    modified in place; nothing is returned.
    """
    # iterrows() yields (index label, row Series) pairs; only the row is needed.
    df['Calculated'] = [
        record['Fines'] / record['Refund'] * record['Year']
        for _label, record in df.iterrows()
    ]

In [7]:
%%timeit

iterations_iterrows_test(df)
df

54 ms ± 3.14 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


#### 3. `apply()`

In [8]:
def iterations_apply_test(df):
    """Compute ``Fines / Refund * Year`` for each row via ``DataFrame.apply``.

    The per-row results are written to a ``Calculated`` column. ``df`` is
    modified in place; nothing is returned.
    """
    def fines_per_refund_times_year(row):
        return row['Fines'] / row['Refund'] * row['Year']

    # axis=1 hands each row to the function (same as axis='columns').
    df['Calculated'] = df.apply(fines_per_refund_times_year, axis=1)

In [9]:
%%timeit

iterations_apply_test(df)
df

15.7 ms ± 363 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


#### 4. using `Series` objects from the `df`

In [10]:
def iterations_series_test(df):
    """Compute ``Fines / Refund * Year`` with whole-column ``Series`` arithmetic.

    The result is written to a ``Calculated`` column. ``df`` is modified in
    place; nothing is returned.
    """
    fines, refund, year = df['Fines'], df['Refund'], df['Year']
    df['Calculated'] = fines / refund * year

In [11]:
%%timeit

iterations_series_test(df)
df

362 µs ± 4.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


#### 5. using `Series` objects from the `df` + attribute `.values`

In [12]:
def iterations_series_values_test(df):
    """Compute ``Fines / Refund * Year`` on the arrays returned by ``.values``.

    The arithmetic is done on the ``.values`` of each column rather than on
    the ``Series`` objects themselves. The result is written to a
    ``Calculated`` column. ``df`` is modified in place; nothing is returned.
    """
    fines = df['Fines'].values
    refund = df['Refund'].values
    year = df['Year'].values
    df['Calculated'] = fines / refund * year

In [13]:
%%timeit

iterations_series_values_test(df)
df

179 µs ± 19.9 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


## Indexing
### Get a row for a specific CarNumber, for example, `O136HO197RUS`

In [14]:
%%timeit

df[df['CarNumber'] == 'O136HO197RUS']

389 µs ± 26.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [15]:
# Move CarNumber into the index so rows can be looked up with df.loc[...].
# Guarded and non-in-place: re-running this cell is a no-op instead of a
# KeyError, because 'CarNumber' stops being a column after the first run.
if 'CarNumber' in df.columns:
    df = df.set_index('CarNumber')

In [16]:
%%timeit

df.loc['O136HO197RUS']

144 µs ± 5.73 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


## Downcasting

In [17]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, Y163O8161RUS to A9999RUS
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Refund      930 non-null    int64  
 1   Fines       930 non-null    float64
 2   Make        930 non-null    object 
 3   Model       919 non-null    object 
 4   Year        930 non-null    int64  
 5   Calculated  930 non-null    float64
dtypes: float64(2), int64(2), object(2)
memory usage: 236.0 KB


In [18]:
copy = df.copy()

In [19]:
# Labels of the float and integer columns of the working copy.
fcols = copy.select_dtypes('float').columns
icols = copy.select_dtypes('integer').columns

# Downcast column by column; pd.to_numeric picks the smallest dtype of the
# requested kind that can hold each column's values.
copy[fcols] = copy[fcols].apply(lambda col: pd.to_numeric(col, downcast='float'))
copy[icols] = copy[icols].apply(lambda col: pd.to_numeric(col, downcast='integer'))

In [20]:
copy.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, Y163O8161RUS to A9999RUS
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Refund      930 non-null    int8   
 1   Fines       930 non-null    float32
 2   Make        930 non-null    object 
 3   Model       919 non-null    object 
 4   Year        930 non-null    int16  
 5   Calculated  930 non-null    float32
dtypes: float32(2), int16(1), int8(1), object(2)
memory usage: 216.9 KB


## Categories

In [21]:
# Convert every object-dtype column to the category dtype.
object_cols = df.select_dtypes(['object']).columns
df[object_cols] = df[object_cols].astype('category')

In [22]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, Y163O8161RUS to A9999RUS
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   Refund      930 non-null    int64   
 1   Fines       930 non-null    float64 
 2   Make        930 non-null    category
 3   Model       919 non-null    category
 4   Year        930 non-null    int64   
 5   Calculated  930 non-null    float64 
dtypes: category(2), float64(2), int64(2)
memory usage: 127.2 KB


## Memory clean

In [23]:
%reset_selective df

gc.collect()

Nothing done.


129