# Reviewing the apply function
You can check the performance of a piece of code with the `%%timeit` cell magic, which averages the run time over many repetitions.<br>
`%%time` reports the run time of a single execution.<br>
You should always give priority to the NumPy implementation.<br>
If the equivalent DataFrame method gives almost the same performance, use it instead to avoid an extra layer.<br>
You should avoid the `apply` function on a large DataFrame.<br>

In [1]:
# Create sample dataframe
import pandas as pd
import random

def create_sample(num_of_rows=1000, seed=None):
    """Build a DataFrame of random sample data for the timing experiments.

    Parameters
    ----------
    num_of_rows : int, default 1000
        Number of rows to generate. ``0`` yields an empty frame that still
        has the three columns.
    seed : int or None, default None
        When given, values are drawn from a private ``random.Random(seed)``
        so the same frame is produced on every call. When ``None`` the
        module-level ``random`` generator is used, exactly as before.

    Returns
    -------
    pandas.DataFrame
        Columns ``x1`` (ints in 1..10), ``x2`` (floats in 1.0..100.0) and
        ``x3`` (floats in 1.0..1000.0).
    """
    # Fall back to the shared module-level generator so existing callers
    # (and any global random.seed(...) they rely on) behave as before.
    rng = random if seed is None else random.Random(seed)
    data = {
        'x1': [rng.randint(1, 10) for _ in range(num_of_rows)],
        # Multiplying by 1.0 turns the drawn integers into floats.
        'x2': [rng.randint(1, 100) * 1.0 for _ in range(num_of_rows)],
        'x3': [rng.randint(1, 1000) * 1.0 for _ in range(num_of_rows)],
    }
    return pd.DataFrame(data)

In [4]:
df = create_sample(1000)

In [5]:
df['y'] = df['x1'] + df['x2']
df.head()

Unnamed: 0,x1,x2,x3,y
0,9,20.0,274.0,29.0
1,3,4.0,461.0,7.0
2,6,28.0,427.0,34.0
3,1,46.0,322.0,47.0
4,3,60.0,294.0,63.0


In [6]:
%%timeit
# Column-wise maximum of the DataFrame, reshaped into a one-row frame
df.max().to_frame().T

1.38 ms ± 14.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [7]:
%%timeit
# Row-wise maximum via apply (axis=1). The previous cell computed the column-wise
# maximum, so these two timings do not measure the same operation.
df.apply(max, axis=1)

8.13 ms ± 17.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [8]:
%%timeit
# Row-wise maximum of x1 and x2 using the built-in DataFrame.max, stored as a new column
df["max_x1_x2"] = df[["x1", "x2"]].max(axis=1)

1.23 ms ± 5.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [9]:
df.head(5)

Unnamed: 0,x1,x2,x3,y,max_x1_x2
0,9,20.0,274.0,29.0,20.0
1,3,4.0,461.0,7.0,4.0
2,6,28.0,427.0,34.0,28.0
3,1,46.0,322.0,47.0,46.0
4,3,60.0,294.0,63.0,60.0


## When working with pandas, remember that it is built on top of the NumPy package
You should always give preference to the NumPy implementation.

In [10]:
import numpy as np

# Four different ways of getting the maximum of two columns
df['y2'] = df[['x1', 'x2']].apply(max, axis = 1) # Using the apply function
# Note: with axis = 1 the lambda's `df` argument below is a single row, not the outer DataFrame
df['y1'] = df.apply(lambda df: max(df['x1'], df['x2']), axis = 1)  # Using the apply function and specifying the variables with a lambda

df['y3'] = np.max(df[['x1', 'x2']], axis = 1) # Using a function from the numpy library
df['y4'] = df[['x1', 'x2']].max(axis = 1) # Using the max function defined in the DataFrame class
df.head()

Unnamed: 0,x1,x2,x3,y,max_x1_x2,y2,y1,y3,y4
0,9,20.0,274.0,29.0,20.0,20.0,20.0,20.0,20.0
1,3,4.0,461.0,7.0,4.0,4.0,4.0,4.0,4.0
2,6,28.0,427.0,34.0,28.0,28.0,28.0,28.0,28.0
3,1,46.0,322.0,47.0,46.0,46.0,46.0,46.0,46.0
4,3,60.0,294.0,63.0,60.0,60.0,60.0,60.0,60.0


In [11]:
%%timeit
df['y2'] = df[['x1', 'x2']].apply(max, axis = 1) # Using the apply function

8.84 ms ± 27.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [12]:
%%time
df['y2'] = df[['x1', 'x2']].apply(max, axis = 1) # Using the apply function

CPU times: user 11.1 ms, sys: 57 µs, total: 11.2 ms
Wall time: 10.5 ms


In [13]:
%%time
df['y1'] = df.apply(lambda df: max(df['x1'], df['x2']), axis = 1)

CPU times: user 15.8 ms, sys: 0 ns, total: 15.8 ms
Wall time: 15.3 ms


In [14]:
%%time
df['y3'] = np.max(df[['x1', 'x2']], axis = 1) # Using a function from the numpy library

CPU times: user 1.97 ms, sys: 8 µs, total: 1.98 ms
Wall time: 1.73 ms


In [15]:
%%time
df['y4'] = df[['x1', 'x2']].max(axis = 1) # Using the max function defined in the dataframes class

CPU times: user 1.89 ms, sys: 14 µs, total: 1.9 ms
Wall time: 1.7 ms


In [16]:
df=create_sample(1000)
df.shape
df.head(5)

Unnamed: 0,x1,x2,x3
0,4,6.0,232.0
1,5,12.0,891.0
2,2,8.0,573.0
3,4,29.0,565.0
4,3,14.0,509.0


In [18]:
import math
# Three different ways to create a column equal to an existing column raised to the power of another existing column
df['y1'] = df.apply(lambda x: math.pow(x['x1'], x['x2']), axis = 1)  # Looping over the dataframe and applying the math.pow function to each row
df['y2'] = np.power(df['x1'], df['x2'])  # Using the numpy implementation of the power function
df['y3'] = df['x1'] ** df['x2']   #  Using vectorisation
df

Unnamed: 0,x1,x2,x3,y1,y2,y3
0,4,6.0,232.0,4.096000e+03,4.096000e+03,4.096000e+03
1,5,12.0,891.0,2.441406e+08,2.441406e+08,2.441406e+08
2,2,8.0,573.0,2.560000e+02,2.560000e+02,2.560000e+02
3,4,29.0,565.0,2.882304e+17,2.882304e+17,2.882304e+17
4,3,14.0,509.0,4.782969e+06,4.782969e+06,4.782969e+06
...,...,...,...,...,...,...
995,3,53.0,176.0,1.938325e+25,1.938325e+25,1.938325e+25
996,9,66.0,65.0,9.550050e+62,9.550050e+62,9.550050e+62
997,8,73.0,770.0,8.424983e+65,8.424983e+65,8.424983e+65
998,7,10.0,836.0,2.824752e+08,2.824752e+08,2.824752e+08


In [19]:
%%timeit
df['y1'] = df.apply(lambda x: math.pow(x['x1'], x['x2']), axis = 1)  # looping over the dataframe and apply the math pow function to each row

14 ms ± 29.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [20]:
%%timeit
df['y2'] = np.power(df['x1'], df['x2'])  # Using the numpy implementation of the power function

471 µs ± 434 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [21]:
%%timeit
df['y3'] = df['x1'] ** df['x2']   #  Using vectorisation

463 µs ± 4.41 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [22]:
# Re-Create test data for the next example
df = create_sample(1000)
df.head(5)

Unnamed: 0,x1,x2,x3
0,7,7.0,446.0
1,2,46.0,753.0
2,6,72.0,996.0
3,2,99.0,970.0
4,9,17.0,362.0


In [23]:
# Three different ways of calculating sin(x1 + x2)
#df['y'] = math.sin(df['x1'] + df['x2'])  # This does not work because you cannot pass whole columns into the math.sin function
df['y1'] = df.apply(lambda x: math.sin(x['x1'] + x['x2']), axis = 1)  #  Apply the math.sin function row by row
df['y2'] = (df['x1'] + df['x2']).apply(math.sin)  #  First create a column that is the sum of x1 and x2 and then apply the sin function
df['y3'] = np.sin(df['x1']+df['x2'])   #  Use the numpy implementation of the sin function
df

Unnamed: 0,x1,x2,x3,y1,y2,y3
0,7,7.0,446.0,0.990607,0.990607,0.990607
1,2,46.0,753.0,-0.768255,-0.768255,-0.768255
2,6,72.0,996.0,0.513978,0.513978,0.513978
3,2,99.0,970.0,0.452026,0.452026,0.452026
4,9,17.0,362.0,0.762558,0.762558,0.762558
...,...,...,...,...,...,...
995,10,34.0,663.0,0.017702,0.017702,0.017702
996,6,33.0,191.0,0.963795,0.963795,0.963795
997,5,80.0,334.0,-0.176076,-0.176076,-0.176076
998,9,71.0,676.0,-0.993889,-0.993889,-0.993889


In [24]:
%%timeit
df['y1'] = df.apply(lambda x: math.sin(x['x1'] + x['x2']), axis = 1)  #  Apply the math.sin function row by row

13.9 ms ± 9.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [25]:
%%timeit
df['y2'] = (df['x1'] + df['x2']).apply(math.sin)  #  First create a column that is the sum of x1 and x2 and then apply the sin function

893 µs ± 10.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [26]:
%%timeit
df['y3'] = np.sin(df['x1']+df['x2'])   #  Use the numpy implementation of the sin function.  Best way!!!!

573 µs ± 419 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [27]:
import datetime as dt

# Number of times the three-row pattern below is repeated (3 * 1000 = 3000 rows)
multiplier = 1000
# Separate year / month / day columns that the next cells combine into dates
data = {
    'year' : [2018, 2019, 2020]*multiplier,
    'month' : [1, 2, 3]*multiplier,
    'day': [1, 10, 30]*multiplier
}

df = pd.DataFrame(data)
print(df.shape)
df.head(5)

(3000, 3)


Unnamed: 0,year,month,day
0,2018,1,1
1,2019,2,10
2,2020,3,30
3,2018,1,1
4,2019,2,10


In [28]:
%%timeit
# Option 1: build each datetime row by row with apply. Is this the only way to do it?
# With axis = 1 the lambda's `df` argument is a single row, not the outer DataFrame.
df['date'] = df.apply(lambda df : dt.datetime(df['year'], df['month'], df['day']), axis = 1)
# NOTE(review): a bare expression is not displayed under %%timeit, so this line shows nothing - confirm and consider removing
df

61 ms ± 231 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [29]:
%%timeit
# Option 2: pd.to_datetime assembles the dates from the year/month/day columns in one call.
# Roughly 16 times faster than option 1 in the timings recorded in this notebook (61 ms vs 3.77 ms).
df['date2'] = pd.to_datetime(df[['year', 'month', 'day']])
df

3.77 ms ± 3.98 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [30]:
df.head(5)

Unnamed: 0,year,month,day,date,date2
0,2018,1,1,2018-01-01,2018-01-01
1,2019,2,10,2019-02-10,2019-02-10
2,2020,3,30,2020-03-30,2020-03-30
3,2018,1,1,2018-01-01,2018-01-01
4,2019,2,10,2019-02-10,2019-02-10


In [31]:
# Re-Create test data for the next example
df = create_sample(1000)
df.head()

Unnamed: 0,x1,x2,x3
0,1,7.0,689.0
1,2,31.0,858.0
2,6,25.0,414.0
3,3,49.0,883.0
4,6,86.0,833.0


In [32]:
# Applying a user defined function
def my_func(a, b, c):
    """Return the sum of ``a`` and ``b`` scaled by ``c``, i.e. ``(a + b) * c``."""
    total = a + b
    return total * c

In [33]:
%%timeit
df['y1'] = df.apply(lambda df: my_func(df['x1'], df['x2'], df['x3']), axis = 1) 

17.6 ms ± 28.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [34]:
df.head(5)

Unnamed: 0,x1,x2,x3,y1
0,1,7.0,689.0,5512.0
1,2,31.0,858.0,28314.0
2,6,25.0,414.0,12834.0
3,3,49.0,883.0,45916.0
4,6,86.0,833.0,76636.0


In [35]:
def my_func2(df,col1,col2,col3,col_to_create):
    """Vectorised ``(col1 + col2) * col3`` written into ``df[col_to_create]``.

    The frame is modified in place and the same object is returned.
    """
    summed = df[col1] + df[col2]
    product = summed * df[col3]
    df[col_to_create] = product
    return df

In [36]:
%%timeit
my_func2(df,"x1","x2","x3","y1")

640 µs ± 16 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [37]:
df = my_func2(df,"x1","x2","x3","y1")
df.head(5)

Unnamed: 0,x1,x2,x3,y1
0,1,7.0,689.0,5512.0
1,2,31.0,858.0,28314.0
2,6,25.0,414.0,12834.0
3,3,49.0,883.0,45916.0
4,6,86.0,833.0,76636.0


In [38]:
# Re-Create test data for the next example
multiplier = 1000
data = {
    'x1' : [1.0, 3.0, 5.0]*multiplier,
    'x2' : [6.0, 2.0, 2.0]*multiplier,
    'x3' : [4.0, 4.0, 3.0]*multiplier,
    'x4' : ['A', 'B', 'C']*multiplier
}
df = pd.DataFrame(data)
df.head(5)

Unnamed: 0,x1,x2,x3,x4
0,1.0,6.0,4.0,A
1,3.0,2.0,4.0,B
2,5.0,2.0,3.0,C
3,1.0,6.0,4.0,A
4,3.0,2.0,4.0,B


In [39]:
# Apply a very simple two factor scorecard
def my_scorecard(a, b):
    """Score one record from a numeric factor ``a`` and a categorical factor ``b``.

    ``a`` contributes 20 when below 2, 40 when below 4 and 60 otherwise;
    ``b`` contributes 10 for 'A' or 'B' and 50 for anything else. The two
    contributions are added together and returned.
    """
    if a < 2:
        numeric_part = 20
    else:
        numeric_part = 40 if a < 4 else 60

    category_part = 10 if b in ['A', 'B'] else 50
    return numeric_part + category_part

In [40]:
%%timeit
df["y1"]=df.apply(lambda x: my_scorecard(x['x1'], x['x4']), axis = 1) 

40.1 ms ± 920 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [41]:
df.head()

Unnamed: 0,x1,x2,x3,x4,y1
0,1.0,6.0,4.0,A,30
1,3.0,2.0,4.0,B,50
2,5.0,2.0,3.0,C,110
3,1.0,6.0,4.0,A,30
4,3.0,2.0,4.0,B,50


In [42]:
def my_scorecard2(df):
    """Vectorised two-factor scorecard; the result is stored in ``df["y2"]``.

    The ``x1`` column contributes 20 when below 2, 40 when below 4 and 60
    otherwise; the ``x4`` column contributes 10 for 'A' or 'B' and 50
    otherwise. The frame is modified in place and returned.
    """
    # np.select takes the first matching condition, so the order matters here.
    numeric_part = np.select([df["x1"] < 2, df["x1"] < 4], [20, 40], 60)
    category_part = np.select([df["x4"].isin(["A", "B"])], [10], 50)
    df["y2"] = numeric_part + category_part
    return df

In [43]:
%%timeit
my_scorecard2(df)

1.06 ms ± 596 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
