Apply function to every row in a Pandas DataFrame :

In [3]:
import pandas as pd

# Function to add
def add_values(row):
	return row['A'] + row['B'] + row['C']

def main():
	# Create a dictionary with three fields each
	data = {
		'A': [1, 2, 3],
		'B': [4, 5, 6],
		'C': [7, 8, 9]}

	# Convert the dictionary into DataFrame
	df = pd.DataFrame(data)
	print("Original DataFrame:\n", df)

	# Apply the user-defined function to every row
	df['add'] = df.apply(add_values, axis=1)

	print('\nAfter Applying Function: ')
	# Print the new DataFrame
	print(df)

if __name__ == '__main__':
	main()


Original DataFrame:
    A  B  C
0  1  4  7
1  2  5  8
2  3  6  9

After Applying Function: 
   A  B  C  add
0  1  4  7   12
1  2  5  8   15
2  3  6  9   18


In [4]:
# Import pandas package
import pandas as pd
# Function to add

def add(a, b, c):
	return a + b + c

def main():
	# create a dictionary with
	# three fields each
	data = {
		'A': [1, 2, 3],
		'B': [4, 5, 6],
		'C': [7, 8, 9]}

	# Convert the dictionary into DataFrame
	df = pd.DataFrame(data)
	print("Original DataFrame:\n", df)

	df['add'] = df.apply(lambda row: add(row['A'],
										row['B'], row['C']), axis=1)

	print('\nAfter Applying Function: ')
	# printing the new dataframe
	print(df)

if __name__ == '__main__':
	main()


Original DataFrame:
    A  B  C
0  1  4  7
1  2  5  8
2  3  6  9

After Applying Function: 
   A  B  C  add
0  1  4  7   12
1  2  5  8   15
2  3  6  9   18


Apply NumPy.sum() to Every Row :

In [5]:
import pandas as pd
import numpy as np

def main():
	# create a dictionary with
	# five fields each
	data = {
		'A': [1, 2, 3],
		'B': [4, 5, 6],
		'C': [7, 8, 9]}

	# Convert the dictionary into DataFrame
	df = pd.DataFrame(data)
	print("Original DataFrame:\n", df)

	# applying function to each row in the dataframe
	# and storing result in a new column
	df['add'] = df.apply(np.sum, axis=1)

	print('\nAfter Applying Function: ')
	# printing the new dataframe
	print(df)

if __name__ == '__main__':
	main()


Original DataFrame:
    A  B  C
0  1  4  7
1  2  5  8
2  3  6  9

After Applying Function: 
   A  B  C  add
0  1  4  7   12
1  2  5  8   15
2  3  6  9   18


Normalizing DataFrame Column Values Using Custom Function in Pandas :

In [6]:
# Import pandas package
import pandas as pd

def normalize(x, y):
	x_new = ((x - np.mean([x, y])) /
			(max(x, y) - min(x, y)))

	# print(x_new)
	return x_new

def main():
	# create a dictionary with three fields each
	data = {
		'X': [1, 2, 3],
		'Y': [45, 65, 89]}

	# Convert the dictionary into DataFrame
	df = pd.DataFrame(data)
	print("Original DataFrame:\n", df)

	df['X'] = df.apply(lambda row: normalize(row['X'],
											row['Y']), axis=1)

	print('\nNormalized:')
	print(df)

if __name__ == '__main__':
	main()


Original DataFrame:
    X   Y
0  1  45
1  2  65
2  3  89

Normalized:
     X   Y
0 -0.5  45
1 -0.5  65
2 -0.5  89


Applying Range Generation Function to DataFrame Rows in Pandas :

In [7]:
import pandas as pd
import numpy as np

pd.options.mode.chained_assignment = None

# Function to generate range

def generate_range(n):

	# printing the range for eg:
	# input is 67 output is 60-70
	n = int(n)

	lower_limit = n//10 * 10
	upper_limit = lower_limit + 10

	return str(str(lower_limit) + '-' + str(upper_limit))


def replace(row):
	for i, item in enumerate(row):

		# updating the value of the row
		row[i] = generate_range(item)
	return row

def main():
	# create a dictionary with
	# three fields each
	data = {
		'A': [0, 2, 3],
		'B': [4, 15, 6],
		'C': [47, 8, 19]}

	# Convert the dictionary into DataFrame
	df = pd.DataFrame(data)

	print('Before applying function: ')
	print(df)

	# applying function to each row in
	# dataframe and storing result in a new column
	df = df.apply(lambda row: replace(row))

	print('After Applying Function: ')
	# printing the new dataframe
	print(df)


if __name__ == '__main__':
	main()


Before applying function: 
   A   B   C
0  0   4  47
1  2  15   8
2  3   6  19
After Applying Function: 
      A      B      C
0  0-10   0-10  40-50
1  0-10  10-20   0-10
2  0-10   0-10  10-20


  row[i] = generate_range(item)
  row[i] = generate_range(item)
  row[i] = generate_range(item)


Pandas Series.apply() :

In [8]:
# importing pandas as pd 
import pandas as pd 

# Creating the Series 
sr = pd.Series(['New York', 'Chicago', 'Toronto', 'Lisbon', 'Rio']) 

# Create the Index 
index = ['City 1', 'City 2', 'City 3', 'City 4', 'City 5'] 

# set the index 
sr.index = index 

# Print the series 
print(sr) 


City 1    New York
City 2     Chicago
City 3     Toronto
City 4      Lisbon
City 5         Rio
dtype: object


In [9]:
# change 'Rio' to 'Montreal' 
# we have used a lambda function 
result = sr.apply(lambda x : 'Montreal' if x =='Rio' else x ) 

# Print the result 
print(result) 


City 1    New York
City 2     Chicago
City 3     Toronto
City 4      Lisbon
City 5    Montreal
dtype: object


In [10]:
# importing pandas as pd 
import pandas as pd 

# Creating the Series 
sr = pd.Series([11, 21, 8, 18, 65, 18, 32, 10, 5, 32, None]) 

# Create the Index 
# apply yearly frequency 
index_ = pd.date_range('2010-10-09 08:45', periods = 11, freq ='Y') 

# set the index 
sr.index = index_ 

# Print the series 
print(sr) 


2010-12-31 08:45:00    11.0
2011-12-31 08:45:00    21.0
2012-12-31 08:45:00     8.0
2013-12-31 08:45:00    18.0
2014-12-31 08:45:00    65.0
2015-12-31 08:45:00    18.0
2016-12-31 08:45:00    32.0
2017-12-31 08:45:00    10.0
2018-12-31 08:45:00     5.0
2019-12-31 08:45:00    32.0
2020-12-31 08:45:00     NaN
Freq: A-DEC, dtype: float64


In [11]:
# return True if greater than 30 
# else return False 
result = sr.apply(lambda x : True if x>30 else False) 

# Print the result 
print(result) 


2010-12-31 08:45:00    False
2011-12-31 08:45:00    False
2012-12-31 08:45:00    False
2013-12-31 08:45:00    False
2014-12-31 08:45:00     True
2015-12-31 08:45:00    False
2016-12-31 08:45:00     True
2017-12-31 08:45:00    False
2018-12-31 08:45:00    False
2019-12-31 08:45:00     True
2020-12-31 08:45:00    False
Freq: A-DEC, dtype: bool


Pandas DataFrame mean() Method :

In [12]:
# importing pandas as pd 
import pandas as pd 

# Creating the dataframe 
df = pd.DataFrame({"A":[12, 4, 5, 44, 1], 
				"B":[5, 2, 54, 3, 2], 
				"C":[20, 16, 7, 3, 8], 
				"D":[14, 3, 17, 2, 6]}) 

# Print the dataframe 
df 


Unnamed: 0,A,B,C,D
0,12,5,20,14
1,4,2,16,3
2,5,54,7,17
3,44,3,3,2
4,1,2,8,6


In [13]:
# Even if we do not specify axis = 0, 
# the method will return the mean over 
# the index axis by default 
df.mean(axis = 0) 


A    13.2
B    13.2
C    10.8
D     8.4
dtype: float64

In [15]:
# importing pandas as pd 
import pandas as pd 

# Creating the dataframe 
df = pd.DataFrame({"A":[12, 4, 5, None, 1], 
				"B":[7, 2, 54, 3, None], 
				"C":[20, 16, 11, 3, 8], 
				"D":[14, 3, None, 2, 6]}) 

# skip the Na values while finding the mean 
df.mean(axis = 1, skipna = True) 


0    13.250000
1     6.250000
2    23.333333
3     2.666667
4     5.000000
dtype: float64

Pandas Series.mean() :

In [17]:
# importing pandas as pd 
import pandas as pd 

# Creating the Series 
sr = pd.Series([10, 25, 3, 25, 24, 6]) 

# Create the Index 
index_ = ['Coca Cola', 'Sprite', 'Coke', 'Fanta', 'Dew', 'ThumbsUp'] 

# set the index 
sr.index = index_ 

# Print the series 
print(sr) 


Coca Cola    10
Sprite       25
Coke          3
Fanta        25
Dew          24
ThumbsUp      6
dtype: int64


In [18]:
# return the mean 
result = sr.mean() 

# Print the result 
print(result) 


15.5


In [19]:
# importing pandas as pd 
import pandas as pd 

# Creating the Series 
sr = pd.Series([19.5, 16.8, None, 22.78, 16.8, 20.124, None, 18.1002, 19.5]) 

# Print the series 
print(sr) 


0    19.5000
1    16.8000
2        NaN
3    22.7800
4    16.8000
5    20.1240
6        NaN
7    18.1002
8    19.5000
dtype: float64


In [20]:
# return the mean 
# skip all the missing values 
result = sr.mean(skipna = True) 

# Print the result 
print(result) 


19.086314285714284


Pandas dataframe.mad() :

In [21]:
# importing pandas as pd 
import pandas as pd 

# Creating the dataframe 
df = pd.DataFrame({"A":[12, 4, 5, 44, 1], 
				"B":[5, 2, 54, 3, 2], 
				"C":[20, 16, 7, 3, 8], 
				"D":[14, 3, 17, 2, 6]}) 

# Print the dataframe 
df 
# find the mean absolute deviation  
# over the index axis 
df.mad(axis = 0) 

Unnamed: 0,A,B,C,D
0,12,5,20,14
1,4,2,16,3
2,5,54,7,17
3,44,3,3,2
4,1,2,8,6


Pandas Series.mad() to calculate Mean Absolute Deviation of a Series :

In [None]:
# importing pandas module  
import pandas as pd  
    
# importing numpy module  
import numpy as np  
    
# creating list 
list =[5, 12, 1, 0, 4, 22, 15, 3, 9] 
  
# creating series 
series = pd.Series(list) 
  
# calling .mad() method 
result = series.mad() 
  
# display 
result 


Output:

5.876543209876543


Pandas dataframe.sem() :

In [29]:
# importing pandas as pd 
import pandas as pd 

# Creating the dataframe 
df = pd.read_csv("homeprices.csv") 

# Print the dataframe 
df 


Unnamed: 0,area,bedrooms,age,price
0,2600,3.0,20,550000
1,3000,4.0,15,565000
2,3200,,18,610000
3,3600,3.0,30,595000
4,4000,5.0,8,760000
5,4100,6.0,8,810000


In [30]:
# importing pandas as pd 
import pandas as pd 

# Creating the dataframe 
df = pd.read_csv("homeprices.csv") 

# Calculate the standard error of 
# the mean of all the rows in dataframe 
df.sem(axis = 1, skipna = False) 


0    137282.772462
1    141000.178531
2              NaN
3    148449.653008
4    189667.919016
5    202159.468583
dtype: float64

Pandas Series.value_counts() :

In [31]:
# importing pandas as pd 
import pandas as pd 

# Creating the Series 
sr = pd.Series(['New York', 'Chicago', 'Toronto', 'Lisbon', 'Rio', 'Chicago', 'Lisbon']) 

# Print the series 
print(sr) 


0    New York
1     Chicago
2     Toronto
3      Lisbon
4         Rio
5     Chicago
6      Lisbon
dtype: object


In [32]:
# find the value counts 
sr.value_counts() 


Chicago     2
Lisbon      2
New York    1
Toronto     1
Rio         1
Name: count, dtype: int64

Pandas Index.value_counts() :

In [33]:
# importing pandas as pd
import pandas as pd

# Creating the index
idx = pd.Index(['Harry', 'Mike', 'Arther', 'Nick',
				'Harry', 'Arther'], name ='Student')

# Print the Index
print(idx)


Index(['Harry', 'Mike', 'Arther', 'Nick', 'Harry', 'Arther'], dtype='object', name='Student')


In [34]:
# find the count of unique values in the index
idx.value_counts()


Student
Harry     2
Arther    2
Mike      1
Nick      1
Name: count, dtype: int64

In [35]:
# importing pandas as pd
import pandas as pd

# Creating the index
idx = pd.Index([21, 10, 30, 40, 50, 10, 50])

# Print the Index
print(idx)


Index([21, 10, 30, 40, 50, 10, 50], dtype='int64')


In [36]:
# for finding the count of all 
# unique values in the index.
idx.value_counts()


10    2
50    2
21    1
30    1
40    1
Name: count, dtype: int64

Applying Lambda functions to Pandas Dataframe :

In [None]:
# importing pandas library
import pandas as pd

# creating and initializing a list
values= [['Rohan',455],['Elvish',250],['Deepak',495],
		['Soni',400],['Radhika',350],['Vansh',450]] 

# creating a pandas dataframe
df = pd.DataFrame(values,columns=['Name','Total_Marks'])

# Applying lambda function to find 
# percentage of 'Total_Marks' column 
# using df.assign()
df = df.assign(Percentage = lambda x: (x['Total_Marks'] /500 * 100))

# displaying the data frame
df
