##### Load libs:

In [1]:
import pandas as pd
import gc

##### 1.Read data csv:

In [2]:
# Load the fines dataset; the path is relative to the notebook's working directory.
fines = pd.read_csv('../data/fines.csv')
# Plain-text preview of the frame (pandas truncates the middle rows when printing).
print(fines)

        CarNumber  Refund    Fines        Make   Model  Year
0       ABC123RUS     1.0   2500.0      Toyota   Camry  2015
1       XYZ789RUS     2.0   1800.0       Honda   Civic  2018
2       DEF456RUS     1.0   3200.0        Ford   Focus  2016
3       GHI789RUS     2.0   1500.0      Nissan  Altima  2019
4       JKL012RUS     1.0   2800.0   Chevrolet  Malibu  2017
..            ...     ...      ...         ...     ...   ...
925  8182XX154RUS     1.0    500.0        Ford   Focus  1980
926   X796TH96RUS     2.0  40600.0        Ford   Focus  2001
927  T011MY163RUS     1.0   2300.0        Ford   Focus  2015
928   T341CC96RUS     2.0   7100.0  Volkswagen  Passat  2013
929   T119CT96RUS     2.0    200.0        Ford   Focus  1997

[930 rows x 6 columns]


##### 2.Iteration:

Use a `for` loop with `iloc`:

In [3]:
%%timeit 
def method_iloc(df):
    """Compute ``Fines / Refund * Year`` row by row using positional ``iloc`` access.

    Args:
        df: DataFrame providing ``Fines``, ``Refund`` and ``Year`` columns.

    Returns:
        A list with one entry per row, in row order. Rows whose ``Refund``
        equals zero yield ``None`` instead of a computed value.
    """
    result = []
    for position in range(len(df)):
        record = df.iloc[position]
        # Check the divisor first so a zero Refund is never divided by.
        if record['Refund'] != 0:
            value = record['Fines'] / record['Refund'] * record['Year']
        else:
            value = None
        result.append(value)
    return result

# Column assignment mutates the global `fines` frame, so `calc_1` is still present
# after the %%timeit runs finish (it shows up in the later printout of `fines`).
fines['calc_1'] = method_iloc(fines)

27.6 ms ± 594 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


Use iterrows:

In [4]:
%%timeit 
def method_iterrows(df):
    """Compute ``Fines / Refund * Year`` for every row using ``DataFrame.iterrows``.

    Args:
        df: DataFrame providing ``Fines``, ``Refund`` and ``Year`` columns.

    Returns:
        A list with one entry per row, in row order. Rows whose ``Refund``
        equals zero yield ``None`` (the same convention ``method_iloc`` uses),
        so a zero divisor neither raises nor produces ``inf``.
    """
    result = []
    # The row label is not needed, only the row contents.
    for _, row in df.iterrows():
        refund = row['Refund']
        # Guard the division so this variant agrees with method_iloc on zero refunds.
        result.append(row['Fines'] / refund * row['Year'] if refund != 0 else None)
    return result

# Store the iterrows result as a new column on the global `fines` frame;
# the assignment survives the %%timeit runs (see the later printout of `fines`).
fines['calc_2'] = method_iterrows(fines)

23.9 ms ± 749 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


Use apply:

In [5]:
%%timeit
# Row-wise apply: with axis=1 the lambda receives one row at a time
# and evaluates Fines / Refund * Year for it.
fines['calc_3'] = fines.apply(lambda row: row['Fines'] / row['Refund'] * row['Year'], axis=1)

5.3 ms ± 69.6 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


Series objects:

In [6]:
%%timeit
# Whole-column arithmetic on the three columns: no explicit Python loop over rows.
fines['calc_4'] = fines['Fines'] / fines['Refund'] * fines['Year']

138 μs ± 4.45 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


Use `.values`:

In [7]:
%%timeit
# Same arithmetic, but performed on the arrays returned by `.values`
# rather than on the Series objects themselves.
fines['calc_5'] = fines['Fines'].values / fines['Refund'].values * fines['Year'].values

61 μs ± 1.18 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


CHECK:

In [8]:
# Print the frame to check that the calc_1 .. calc_5 columns were added.
print(fines)

        CarNumber  Refund    Fines        Make   Model  Year      calc_1  \
0       ABC123RUS     1.0   2500.0      Toyota   Camry  2015   5037500.0   
1       XYZ789RUS     2.0   1800.0       Honda   Civic  2018   1816200.0   
2       DEF456RUS     1.0   3200.0        Ford   Focus  2016   6451200.0   
3       GHI789RUS     2.0   1500.0      Nissan  Altima  2019   1514250.0   
4       JKL012RUS     1.0   2800.0   Chevrolet  Malibu  2017   5647600.0   
..            ...     ...      ...         ...     ...   ...         ...   
925  8182XX154RUS     1.0    500.0        Ford   Focus  1980    990000.0   
926   X796TH96RUS     2.0  40600.0        Ford   Focus  2001  40620300.0   
927  T011MY163RUS     1.0   2300.0        Ford   Focus  2015   4634500.0   
928   T341CC96RUS     2.0   7100.0  Volkswagen  Passat  2013   7146150.0   
929   T119CT96RUS     2.0    200.0        Ford   Focus  1997    199700.0   

         calc_2      calc_3      calc_4      calc_5  
0     5037500.0   5037500.0   503

##### 3.Indexing:

Without using the index:

In [9]:
# Baseline lookup: build a boolean mask over the CarNumber column and filter with it.
%timeit fines[fines['CarNumber']=='O136HO197RUS']
# Show the matching rows (this plate occurs twice in the data).
print(fines[fines['CarNumber']=='O136HO197RUS'])

221 μs ± 1.56 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
        CarNumber  Refund   Fines    Make    Model  Year     calc_1  \
720  O136HO197RUS     2.0  7800.0  Toyota  Corolla  2018  7870200.0   
907  O136HO197RUS     1.0   300.0  Toyota  Corolla  2001   600300.0   

        calc_2     calc_3     calc_4     calc_5  
720  7870200.0  7870200.0  7870200.0  7870200.0  
907   600300.0   600300.0   600300.0   600300.0  


Use index:

In [10]:
temp_fines = fines.copy().set_index('CarNumber')
%timeit temp_fines[temp_fines.index=='O136HO197RUS']
print(temp_fines[temp_fines.index=='O136HO197RUS'])

125 μs ± 720 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
              Refund   Fines    Make    Model  Year     calc_1     calc_2  \
CarNumber                                                                   
O136HO197RUS     2.0  7800.0  Toyota  Corolla  2018  7870200.0  7870200.0   
O136HO197RUS     1.0   300.0  Toyota  Corolla  2001   600300.0   600300.0   

                 calc_3     calc_4     calc_5  
CarNumber                                      
O136HO197RUS  7870200.0  7870200.0  7870200.0  
O136HO197RUS   600300.0   600300.0   600300.0  


##### 4.Downcasting:

Info:

In [11]:
# Baseline dtypes and memory footprint before any optimisation;
# memory_usage='deep' asks pandas to inspect object columns for their real size.
fines.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 930 entries, 0 to 929
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   CarNumber  930 non-null    object 
 1   Refund     930 non-null    float64
 2   Fines      930 non-null    float64
 3   Make       930 non-null    object 
 4   Model      919 non-null    object 
 5   Year       930 non-null    int64  
 6   calc_1     930 non-null    float64
 7   calc_2     930 non-null    float64
 8   calc_3     930 non-null    float64
 9   calc_4     930 non-null    float64
 10  calc_5     930 non-null    float64
dtypes: float64(7), int64(1), object(3)
memory usage: 211.2 KB


Copy:

In [12]:
# Take an independent (deep) copy so the dtype changes applied to
# `optimized_df` leave the original `fines` frame untouched.
optimized_df = fines.copy(deep=True)

float64 to float32:

In [13]:
# Convert every float64 column to float32 in a single astype call,
# using a {column: dtype} mapping built from the selected column names.
float_columns = optimized_df.select_dtypes(include='float64').columns
optimized_df = optimized_df.astype(dict.fromkeys(float_columns, 'float32'))

int64 to the smallest integer type that fits the values:

In [14]:
# Pick the int64 columns via a dtype mask and let pandas choose the
# smallest integer dtype able to hold each column's values.
int_columns = optimized_df.columns[optimized_df.dtypes == 'int64']
for int_col in int_columns:
    optimized_df[int_col] = pd.to_numeric(optimized_df[int_col], downcast='integer')

result:

In [15]:
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" line.
optimized_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 930 entries, 0 to 929
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   CarNumber  930 non-null    object 
 1   Refund     930 non-null    float32
 2   Fines      930 non-null    float32
 3   Make       930 non-null    object 
 4   Model      919 non-null    object 
 5   Year       930 non-null    int16  
 6   calc_1     930 non-null    float32
 7   calc_2     930 non-null    float32
 8   calc_3     930 non-null    float32
 9   calc_4     930 non-null    float32
 10  calc_5     930 non-null    float32
dtypes: float32(7), int16(1), object(3)
memory usage: 180.3 KB
None


##### 5.Use Category:

In [16]:
# Re-encode every object (string) column as a pandas categorical.
object_columns = optimized_df.select_dtypes(include=['object']).columns
for col in object_columns:
    optimized_df[col] = optimized_df[col].astype('category')

# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" line.
optimized_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 930 entries, 0 to 929
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   CarNumber  930 non-null    category
 1   Refund     930 non-null    float32 
 2   Fines      930 non-null    float32 
 3   Make       930 non-null    category
 4   Model      919 non-null    category
 5   Year       930 non-null    int16   
 6   calc_1     930 non-null    float32 
 7   calc_2     930 non-null    float32 
 8   calc_3     930 non-null    float32 
 9   calc_4     930 non-null    float32 
 10  calc_5     930 non-null    float32 
dtypes: category(3), float32(7), int16(1)
memory usage: 80.8 KB
None


##### 6.Clean:

CHECK:

In [17]:
# For each frame name, report whether it is currently bound:
# first in locals(), then in globals().
for name in ('fines', 'optimized_df'):
    print(name in locals())
    print(name in globals())

True
True
True
True


Clean:

In [20]:
# NOTE(review): IPython documents this magic as `%reset_selective [-f] regex`, i.e. one
# regular expression. The argument here, `fines optimized_df`, contains a space, so it
# presumably matches no variable name and removes nothing. The following cell's
# `del fines` succeeding without a NameError is consistent with that. If the intent is
# to drop both frames here, a pattern such as `fines|optimized_df` is likely what was
# meant. TODO: confirm, and adjust the `del` cell accordingly.
%reset_selective -f fines optimized_df
# gc.collect() returns the number of unreachable objects it found; as the last
# expression of the cell, that number is shown as the cell output.
gc.collect()

59

In [22]:
# Unbind both frames in one statement, then run a collection pass;
# the collector's return value is displayed as the cell output.
del fines, optimized_df
gc.collect()

0

CHECK:

In [24]:
# Confirm the clean-up: for each frame name, report whether it is still bound,
# first in locals(), then in globals().
for name in ('fines', 'optimized_df'):
    print(name in locals())
    print(name in globals())

False
False
False
False


CHECK DATA:

In [None]:
	# Both names were unbound by the clean-up cells, so this access is expected to fail.
	# Catch the NameError and print it, so the demonstration stays visible without
	# leaving a cell that aborts "Restart & Run All".
	try:
		print(fines)
		print(optimized_df)
	except NameError as exc:
		print(f"NameError: {exc}")

NameError: name 'fines' is not defined