In [90]:
import pandas as pd
import gc

## Прочитайте файл fines.csv, который вы сохранили в предыдущем упражнении

In [91]:
# Load the fines dataset that was saved by the previous exercise.
df = pd.read_csv('../data/fines.csv')
df

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2,3200.0,Ford,Focus,1989
1,E432XX77RUS,1,6500.0,Toyota,Camry,1995
2,7184TT36RUS,1,2100.0,Ford,Focus,1984
3,X582HE161RUS,2,2000.0,Ford,Focus,2015
4,92918M178RUS,1,5700.0,Ford,Focus,2014
...,...,...,...,...,...,...
920,8182XX154RUS,2,75900.0,Ford,Focus,1996
921,X796TH96RUS,1,3100.0,Ford,Focus,2002
922,T011MY163RUS,1,69600.0,Ford,Focus,1996
923,T341CC96RUS,1,27800.0,Volkswagen,Passat,2012


## Итерации: во всех следующих подзадачах нужно посчитать `Fines/Refund*Year` для каждой строки, создать новый столбец с рассчитанными данными и измерить время с помощью магической команды `%%timeit` в ячейке

- Loop

In [92]:
def loop(df):
    """Add a 'Calculated' column computed row by row with positional indexing.

    For every row position the value ``Fines / Refund * Year`` is computed
    from three separate ``df.iloc[...]`` look-ups, and the collected values
    are stored in ``df['Calculated']``.

    Args:
        df: DataFrame with 'Fines', 'Refund' and 'Year' columns.
            It is modified in place.

    Returns:
        None.
    """
    # Each field is fetched with its own iloc call on purpose: this function is
    # the plain-loop variant that the notebook times against the others.
    df['Calculated'] = [
        df.iloc[pos]['Fines'] / df.iloc[pos]['Refund'] * df.iloc[pos]['Year']
        for pos in range(len(df))
    ]

In [93]:
%%timeit
loop(df)
df

283 ms ± 6.81 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


- Iterrows

In [94]:
def iter(df):
    """Add a 'Calculated' column computed row by row via ``DataFrame.iterrows``.

    For every row the value ``Fines / Refund * Year`` is computed and the
    collected values are stored in ``df['Calculated']``.

    NOTE(review): this function's name shadows the builtin ``iter`` in the
    notebook namespace; the name is kept because another cell calls it.

    Args:
        df: DataFrame with 'Fines', 'Refund' and 'Year' columns.
            It is modified in place.

    Returns:
        None.
    """
    calculated = [
        record['Fines'] / record['Refund'] * record['Year']
        for _, record in df.iterrows()
    ]
    df['Calculated'] = calculated

In [95]:
%%timeit
iter(df)
df

84.1 ms ± 1.37 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


- apply и lambda-функция

In [96]:
def apply_lambda(df):
    """Add a 'Calculated' column using ``DataFrame.apply`` with a lambda.

    The lambda receives one row at a time and returns
    ``Fines / Refund * Year``; the result is stored in ``df['Calculated']``.

    Args:
        df: DataFrame with 'Fines', 'Refund' and 'Year' columns.
            It is modified in place.

    Returns:
        None.
    """
    calculated = df.apply(
        lambda row: row['Fines'] / row['Refund'] * row['Year'],
        axis='columns',  # same as axis=1: the function is applied to each row
    )
    df['Calculated'] = calculated

In [97]:
%%timeit
apply_lambda(df)
df

15.3 ms ± 135 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


- Series

In [98]:
def series(df):
    """Add a 'Calculated' column using whole-column Series arithmetic.

    Computes ``Fines / Refund * Year`` on the pandas Series themselves and
    stores the result in ``df['Calculated']``.

    Args:
        df: DataFrame with 'Fines', 'Refund' and 'Year' columns.
            It is modified in place.

    Returns:
        None.
    """
    fines_per_refund = df['Fines'] / df['Refund']
    df['Calculated'] = fines_per_refund * df['Year']

In [99]:
%%timeit
series(df)
df

412 µs ± 12.1 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


- series + атрибут `.values`

In [100]:
def series_values(df):
    """Add a 'Calculated' column using arithmetic on the columns' ``.values``.

    Takes ``.values`` of the 'Fines', 'Refund' and 'Year' columns, computes
    ``Fines / Refund * Year`` on them and stores the result in
    ``df['Calculated']``.

    Args:
        df: DataFrame with 'Fines', 'Refund' and 'Year' columns.
            It is modified in place.

    Returns:
        None.
    """
    fines = df['Fines'].values
    refunds = df['Refund'].values
    years = df['Year'].values
    df['Calculated'] = fines / refunds * years

In [101]:
%%timeit
series_values(df)
df

179 µs ± 2.77 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


## Индексирование: измерьте время с помощью магической команды `%%timeit` в ячейке

- Получим строки, соответствующие номеру машины O136HO197RUS


In [102]:
%%timeit
# Boolean-mask selection: compare the CarNumber column with the target plate
# and keep the rows where the comparison is True.
df.loc[df['CarNumber'] == 'O136HO197RUS']

452 µs ± 3.88 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


- установите индекс в вашем фрейме данных с помощью CarNumber
- снова получите строку для того же CarNumber

In [103]:
# Move CarNumber from a regular column to the index.
# The guard keeps this cell re-runnable: once CarNumber is the index it is no
# longer a column, so calling set_index('CarNumber') again would raise KeyError.
if df.index.name != 'CarNumber':
    df = df.set_index('CarNumber')
df

Unnamed: 0_level_0,Refund,Fines,Make,Model,Year,Calculated
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Y163O8161RUS,2,3200.0,Ford,Focus,1989,3182400.0
E432XX77RUS,1,6500.0,Toyota,Camry,1995,12967500.0
7184TT36RUS,1,2100.0,Ford,Focus,1984,4166400.0
X582HE161RUS,2,2000.0,Ford,Focus,2015,2015000.0
92918M178RUS,1,5700.0,Ford,Focus,2014,11479800.0
...,...,...,...,...,...,...
8182XX154RUS,2,75900.0,Ford,Focus,1996,75748200.0
X796TH96RUS,1,3100.0,Ford,Focus,2002,6206200.0
T011MY163RUS,1,69600.0,Ford,Focus,1996,138921600.0
T341CC96RUS,1,27800.0,Volkswagen,Passat,2012,55933600.0


In [104]:
%%timeit
df.loc[df.index == 'O136HO197RUS']

379 µs ± 13 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


## Downcasting

In [105]:
# Memory report for the frame before any downcasting is applied;
# memory_usage='deep' asks pandas to measure the real size of object columns.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 925 entries, Y163O8161RUS to T119CT96RUS
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Refund      925 non-null    int64  
 1   Fines       925 non-null    float64
 2   Make        925 non-null    object 
 3   Model       914 non-null    object 
 4   Year        925 non-null    int64  
 5   Calculated  925 non-null    float64
dtypes: float64(2), int64(2), object(2)
memory usage: 202.6 KB


In [106]:
# Create a copy of the original DataFrame
df_copy = df.copy()
df_copy

Unnamed: 0_level_0,Refund,Fines,Make,Model,Year,Calculated
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Y163O8161RUS,2,3200.0,Ford,Focus,1989,3182400.0
E432XX77RUS,1,6500.0,Toyota,Camry,1995,12967500.0
7184TT36RUS,1,2100.0,Ford,Focus,1984,4166400.0
X582HE161RUS,2,2000.0,Ford,Focus,2015,2015000.0
92918M178RUS,1,5700.0,Ford,Focus,2014,11479800.0
...,...,...,...,...,...,...
8182XX154RUS,2,75900.0,Ford,Focus,1996,75748200.0
X796TH96RUS,1,3100.0,Ford,Focus,2002,6206200.0
T011MY163RUS,1,69600.0,Ford,Focus,1996,138921600.0
T341CC96RUS,1,27800.0,Volkswagen,Passat,2012,55933600.0


In [107]:
# Reduce the numeric precision to lower memory consumption (baseline = 202.6 KB).
# Memory reported after each step, in execution order:
#   Refund     -> 196.3 KB
#   Year       -> 190.9 KB
#   Fines      -> 187.2 KB
#   Calculated -> 187.2 KB (no change; the column is still float64 in the report)
numeric_downcasts = (
    ('Refund', 'unsigned'),
    ('Year', 'unsigned'),
    ('Fines', 'float'),
    ('Calculated', 'float'),
)
for column, target in numeric_downcasts:
    df_copy[column] = pd.to_numeric(df_copy[column], downcast=target)

# Convert the object columns to category:
#   Make  -> 133.2 KB
#   Model -> 79.1 KB
for column in ('Make', 'Model'):
    df_copy[column] = df_copy[column].astype('category')

df_copy.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 925 entries, Y163O8161RUS to T119CT96RUS
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   Refund      925 non-null    uint8   
 1   Fines       925 non-null    float32 
 2   Make        925 non-null    category
 3   Model       914 non-null    category
 4   Year        925 non-null    uint16  
 5   Calculated  925 non-null    float64 
dtypes: category(2), float32(1), float64(1), uint16(1), uint8(1)
memory usage: 79.1 KB
