In [1]:
import numpy as np
import pandas as pd
import time

In [2]:
# https://towardsdatascience.com/different-ways-to-iterate-over-rows-in-a-pandas-dataframe-performance-comparison-dc0d5dcef8fe

def my_compute(x):
    return x + 1

def timeit(method):
    def timed(*args, **kw):
        ts = time.time()
        result = method(*args, **kw)
        te = time.time()
        kw['log_time'].append(int((te - ts) * 1000))
        return result
    return timed

In [3]:
# 1 - pandas column operation

@timeit
def use_column(dataset, col1, col2, **kw):
    dataset[col2] = dataset[col1] + 1

In [4]:
# 2 - pandas apply

@timeit
def use_apply(dataset, col1, col2, **kw):
    dataset[col2] = dataset.apply(my_compute)

In [5]:
# 3 - use loc indexing

@timeit
def use_loc(dataset, col1, col2, **kw):
    dataset[col2] = np.nan
    for i in range(len(dataset)):
        dataset.loc[i, col2] = my_compute(dataset.loc[i, col1])

In [6]:
# 4 - use at indexing - accessing single value

@timeit
def use_at(dataset, col1, col2, **kw):
    dataset[col2] = np.nan
    for i in range(len(dataset)):
        dataset.at[i, col2] = my_compute(dataset.at[i, col1])

In [7]:
# 5 - use iat indexing - accessing single value

@timeit
def use_iat(dataset, col1, col2, **kw):
    dataset[col2] = np.nan
    for i in range(len(dataset)):
        dataset.iat[i, dataset.columns.get_loc(col2)] = my_compute(dataset.iat[i, dataset.columns.get_loc(col1)])

In [8]:
# 6 - use numpy for loop

@timeit
def use_numpy_for_loop(dataset, col1, col2, **kw):
    temp = np.empty(len(dataset))
    # get numpy array from a column
    original = dataset[col1]
    # iterate and compute values
    for i in range(len(dataset)):
        temp[i] = my_compute(original[i])
    # assign values as new column to df
    dataset[col2] = temp

In [9]:
# 7 - use pandas iterrows

@timeit
def use_iterrows(dataset, col1, col2, **kw):
    temp = np.empty(len(dataset))
    # generator object to iterate
    for index, row in dataset.iterrows():
        temp = my_compute(row[col1])
    dataset[col2] = temp

In [10]:
# 8 - use zip

@timeit
def use_zip(dataset, col1, col2, **kw):
    temp = np.empty(len(dataset))
    for i, (x) in enumerate(zip(dataset[col1])):
        temp[i] = my_compute(x[0])
    dataset[col2] = temp

In [11]:
def time_this(func, method_name, N=1000):
    """ Execute the given function 100 times and measure the execution time for each run.
        Returns a dictionary containing the statistics based on the execution times
    """
    repeats = 100
    a = np.repeat(1000, N)
    pd_dataset = pd.DataFrame({'a': a})

    timing = []
    for i in range(repeats):
        func(pd_dataset.copy(), 'a', 'b', log_time=timing)
    return {'method': method_name, 'average': np.average(timing), 'min': np.min(timing), 'max': np.max(timing)}

In [12]:
def measure_time(dataset_size):
    all_timing = pd.DataFrame()
    all_timing = all_timing.append([time_this(use_column,'use_column')])
    all_timing = all_timing.append([time_this(use_apply,'use_pandas_apply')])

    all_timing = all_timing.append([time_this(use_loc,'use_for_loop_loc')])
    all_timing = all_timing.append([time_this(use_at,'use_for_loop_at')])
    all_timing = all_timing.append([time_this(use_iat,'use_for_loop_iat')])
    all_timing = all_timing.append([time_this(use_numpy_for_loop,'use_numpy_for_loop')])
    all_timing = all_timing.append([time_this(use_iterrows,'use_pandas_iterrows')])
    all_timing = all_timing.append([time_this(use_zip,'use_zip')])
    print(all_timing[['method', 'average', 'min', 'max']])
    
    
measure_time(1000)

                method  average  min  max
0           use_column     0.61    0    7
0     use_pandas_apply     2.19    2    6
0     use_for_loop_loc   652.28  618  825
0      use_for_loop_at    29.82   29   32
0     use_for_loop_iat    38.67   38   41
0   use_numpy_for_loop    21.26   21   23
0  use_pandas_iterrows   146.14  139  216
0              use_zip     1.00    1    1


1. Column operation and apply are both relatively fast.
2. Select using at() and iat() is faster than loc().
3. Location-based indexing on numpy array is faster than locating-based indexing on a pandas dataframe.

**Avoid using loc() for updating or accessing single value, use iat() and at() instead**

**Extract values as a numpy array then perform the processing/analysing**

**zip() Faster than iterrows()**

*https://www.tutorialspoint.com/python_pandas/python_pandas_iteration.htm*

Basic iteration (for i in object) produces:
1. Series - values
2. Dataframe - columns labels
3. Panel - item labels

In [13]:
N = 20
df = pd.DataFrame({
    'A': pd.date_range(start='2016-01-01',periods=N,freq='D'),
    'x': np.linspace(start=0, stop=N-1, num=N),
    'y': np.random.rand(N),
    'C': np.random.choice(['Low', 'Medium', 'High'], N).tolist(),
    'B': np.random.normal(100, 10, size=(N)).tolist()
})

for col in df:
    print(col)
    
display(df.head())

A
x
y
C
B


Unnamed: 0,A,x,y,C,B
0,2016-01-01,0.0,0.280728,Low,94.781248
1,2016-01-02,1.0,0.453214,Medium,90.575415
2,2016-01-03,2.0,0.201301,Low,87.349228
3,2016-01-04,3.0,0.595245,Medium,88.583258
4,2016-01-05,4.0,0.089842,Medium,100.939415


To iterate over rows in DataFrame:
1. iterrows() - iterate over (index, series) pairs (slow)
2. iteritems() - iterate over each column as (key, value) pairs - key:label, value: Series
3. itertuples() - iterate over rows as namedtuples

In [None]:
for idx, series in df.iterrows():
    print(f'Index:{idx}\nSeries:\n{series}')
    print(f'Index type: {type(idx)}, row type: {type(row)}\n')
    
"""
Index:0
Series:
A    2016-01-01 00:00:00
x                      0
y               0.620163
C                    Low
B                 98.594
Name: 0, dtype: object
Index type: <class 'int'>, row type: <class 'pandas.core.series.Series'>

Index:1
Series:
A    2016-01-02 00:00:00
x                      1
y               0.899402
C                 Medium
B                80.3832
Name: 1, dtype: object
Index type: <class 'int'>, row type: <class 'pandas.core.series.Series'>
"""

for key, value in df.iteritems():
    print(f'Key: {key}\nValue:\n{value}')
    print(f'Key type: {type(key)}, value type: {type(value)}')

"""
Key: A
Value:
0    2016-01-01
1    2016-01-02
2    2016-01-03
3    2016-01-04
4    2016-01-05
5    2016-01-06
6    2016-01-07
7    2016-01-08
8    2016-01-09
9    2016-01-10
10   2016-01-11
11   2016-01-12
12   2016-01-13
13   2016-01-14
14   2016-01-15
15   2016-01-16
16   2016-01-17
17   2016-01-18
18   2016-01-19
19   2016-01-20
Name: A, dtype: datetime64[ns]
Key type: <class 'str'>, value type: <class 'pandas.core.series.Series'>
"""

for row in df.itertuples():
    print(f'Row: {row}\nType:{type(row)}\n')
    
"""
Row: Pandas(Index=0, A=Timestamp('2016-01-01 00:00:00'), x=0.0, y=0.6201632475317321, C='Low', B=98.59404442587926)
Type:<class 'pandas.core.frame.Pandas'>
"""