# Modifying DataFrames

We will use the same data as before, [federal support to all Canadian provinces and territories](https://www.fin.gc.ca/fedprov/mtp-eng.asp):

In [11]:
import pandas as pd
import numpy as np

# Load the support table from CSV. The first line of the file is skipped
# (presumably its original header row) and header=None tells pandas not to
# look for another header, so the labels given in `names` are used instead.
prov_support = pd.read_csv(
    'pandas_ex1.csv',
    sep=',',
    skiprows=1,
    header=None,
    names=['province_name', 'province', '2016', '2017', '2018'],
    index_col='province',  # the 'province' column becomes the row index
)

prov_support

Unnamed: 0_level_0,province_name,2016,2017,2018
province,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NL,Newfoundland and Labrador,724,734,750
PE,Prince Edward Island,584,601,638
NS,Nova Scotia,3060,3138,3201
NB,New Brunswick,2741,2814,2956
QC,Quebec,21372,22720,23749
ON,Ontario,21347,21101,21420
MB,Manitoba,3531,3675,3965
SK,Saskatchewan,1565,1613,1673
AB,Alberta,5772,5943,6157
BC,British Columbia,6482,6680,6925


## Add a column

In [12]:
# Create a new column '2016-2018 change' holding, for each province,
# the '2018' value minus the '2016' value.
# Assigning to a column label that does not exist yet adds the column on the right.
prov_support['2016-2018 change'] = prov_support['2018'] - prov_support['2016']

prov_support

Unnamed: 0_level_0,province_name,2016,2017,2018,2016-2018 change
province,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
NL,Newfoundland and Labrador,724,734,750,26
PE,Prince Edward Island,584,601,638,54
NS,Nova Scotia,3060,3138,3201,141
NB,New Brunswick,2741,2814,2956,215
QC,Quebec,21372,22720,23749,2377
ON,Ontario,21347,21101,21420,73
MB,Manitoba,3531,3675,3965,434
SK,Saskatchewan,1565,1613,1673,108
AB,Alberta,5772,5943,6157,385
BC,British Columbia,6482,6680,6925,443


## Update a value

In [13]:
# Update the '2017' value for Ontario from 21101 to 22222.
# .loc[row_label, column_label] addresses the cell by its labels; assigning to it
# overwrites the value in prov_support itself.
prov_support.loc['ON', '2017'] = 22222

prov_support

Unnamed: 0_level_0,province_name,2016,2017,2018,2016-2018 change
province,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
NL,Newfoundland and Labrador,724,734,750,26
PE,Prince Edward Island,584,601,638,54
NS,Nova Scotia,3060,3138,3201,141
NB,New Brunswick,2741,2814,2956,215
QC,Quebec,21372,22720,23749,2377
ON,Ontario,21347,22222,21420,73
MB,Manitoba,3531,3675,3965,434
SK,Saskatchewan,1565,1613,1673,108
AB,Alberta,5772,5943,6157,385
BC,British Columbia,6482,6680,6925,443


at[ ]

We can get the same result with the at[] accessor, which provides access to a single value. We will now change the 2017 value for Ontario from 22222 back to 21101:

In [14]:
# Read the value of a single cell: .at[row_label, column_label]
prov_support.at['ON', '2017']

22222

In [15]:
# Write a single cell with .at: set Ontario's '2017' value back to 21101.
prov_support.at['ON', '2017'] = 21101

prov_support

Unnamed: 0_level_0,province_name,2016,2017,2018,2016-2018 change
province,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
NL,Newfoundland and Labrador,724,734,750,26
PE,Prince Edward Island,584,601,638,54
NS,Nova Scotia,3060,3138,3201,141
NB,New Brunswick,2741,2814,2956,215
QC,Quebec,21372,22720,23749,2377
ON,Ontario,21347,21101,21420,73
MB,Manitoba,3531,3675,3965,434
SK,Saskatchewan,1565,1613,1673,108
AB,Alberta,5772,5943,6157,385
BC,British Columbia,6482,6680,6925,443


## Delete a row or column

drop( )

In [16]:
# Delete the column labelled '2016'; axis=1 tells drop() that the label is a column.
prov_support.drop('2016', axis=1) 

# drop() returns a copy of the DataFrame with '2016' removed. The result is not
# assigned here, so prov_support itself still contains the column.

Unnamed: 0_level_0,province_name,2017,2018,2016-2018 change
province,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NL,Newfoundland and Labrador,734,750,26
PE,Prince Edward Island,601,638,54
NS,Nova Scotia,3138,3201,141
NB,New Brunswick,2814,2956,215
QC,Quebec,22720,23749,2377
ON,Ontario,21101,21420,73
MB,Manitoba,3675,3965,434
SK,Saskatchewan,1613,1673,108
AB,Alberta,5943,6157,385
BC,British Columbia,6680,6925,443


In [17]:
# Delete the rows for Ontario and Quebec by their index labels.
# The result is a new DataFrame; prov_support itself is left unchanged.
prov_support.drop(['ON', 'QC']) # axis=0 (rows) is the default, so it can be omitted

Unnamed: 0_level_0,province_name,2016,2017,2018,2016-2018 change
province,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
NL,Newfoundland and Labrador,724,734,750,26
PE,Prince Edward Island,584,601,638,54
NS,Nova Scotia,3060,3138,3201,141
NB,New Brunswick,2741,2814,2956,215
MB,Manitoba,3531,3675,3965,434
SK,Saskatchewan,1565,1613,1673,108
AB,Alberta,5772,5943,6157,385
BC,British Columbia,6482,6680,6925,443
YT,Yukon,946,973,1006,60
NT,Northwest Territories,1281,1294,1319,38


When the drop() function is called, pandas creates a new DataFrame object; the original DataFrame is not modified. If we need to modify the original DataFrame, we have to set the parameter inplace=True:

In [18]:
# inplace=True removes '2016' from prov_support itself instead of returning a copy.
# NOTE: this cell is not idempotent: running it a second time raises a KeyError,
# because the '2016' column is already gone.
prov_support.drop('2016', axis=1, inplace= True)

In [19]:
# Display the DataFrame to confirm that the in-place drop removed the '2016' column.
prov_support

Unnamed: 0_level_0,province_name,2017,2018,2016-2018 change
province,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NL,Newfoundland and Labrador,734,750,26
PE,Prince Edward Island,601,638,54
NS,Nova Scotia,3138,3201,141
NB,New Brunswick,2814,2956,215
QC,Quebec,22720,23749,2377
ON,Ontario,21101,21420,73
MB,Manitoba,3675,3965,434
SK,Saskatchewan,1613,1673,108
AB,Alberta,5943,6157,385
BC,British Columbia,6680,6925,443


# Applying functions

Here is the DataFrame that we start with:

In [20]:
# Starting point for this section: prov_support without the '2016' column,
# which was dropped in place earlier in the notebook.
prov_support

Unnamed: 0_level_0,province_name,2017,2018,2016-2018 change
province,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NL,Newfoundland and Labrador,734,750,26
PE,Prince Edward Island,601,638,54
NS,Nova Scotia,3138,3201,141
NB,New Brunswick,2814,2956,215
QC,Quebec,22720,23749,2377
ON,Ontario,21101,21420,73
MB,Manitoba,3675,3965,434
SK,Saskatchewan,1613,1673,108
AB,Alberta,5943,6157,385
BC,British Columbia,6680,6925,443


In [21]:
# Custom helper used with DataFrame.apply() to compute a percent change:
def percent_change(years):
    """Return the percent change from the first value to the second.

    Parameters
    ----------
    years : iterable of two numbers
        The earlier value followed by the later value. Anything that
        unpacks into exactly two items is accepted.

    Returns
    -------
    The difference between the two values, relative to the first value,
    expressed in percent.
    """
    earlier, later = years
    difference = later - earlier
    return difference / earlier * 100

In [48]:
prov_support[['2017', '2018']].apply(percent_change, axis=1)


# The axis parameter is the axis along which the function is applied:
# with axis=1, percent_change is called once for every row.


# For example, for the row corresponding to Ontario:

# The values in the '2017' and '2018' columns are 21101 and 21420, respectively.
# They are passed to percent_change together as one row (a pandas Series),
# which the function unpacks into the pair (21101, 21420).

province
NL    2.179837
PE    6.156406
NS    2.007648
NB    5.046198
QC    4.529049
ON    1.511777
MB    7.891156
SK    3.719777
AB    3.600875
BC    3.667665
YT    3.391572
NT    1.931994
NU    3.221731
dtype: float64

In [52]:
# Create a new column 'per_change' for the values calculated above and add it to the DataFrame.
# The Series returned by apply() is matched to the rows by its province index.
prov_support['per_change'] = prov_support[['2017', '2018']].apply(percent_change, axis=1)
prov_support

Unnamed: 0_level_0,province_name,2017,2018,2016-2018 change,per_change
province,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
NL,Newfoundland and Labrador,734,750,26,2.179837
PE,Prince Edward Island,601,638,54,6.156406
NS,Nova Scotia,3138,3201,141,2.007648
NB,New Brunswick,2814,2956,215,5.046198
QC,Quebec,22720,23749,2377,4.529049
ON,Ontario,21101,21420,73,1.511777
MB,Manitoba,3675,3965,434,7.891156
SK,Saskatchewan,1613,1673,108,3.719777
AB,Alberta,5943,6157,385,3.600875
BC,British Columbia,6680,6925,443,3.667665


## map( )

The map( ) function (named applymap( ) before pandas 2.1) is used to apply another function to every element of a DataFrame, for example to format all numeric columns as floating-point numbers with two decimal places. We can use a lambda function for this operation:

In [66]:

# DataFrame.applymap() is deprecated; DataFrame.map() is its replacement.
# Deprecated form, kept for reference:
#   prov_support.loc[:, '2017':'per_change'].applymap(lambda x: '%.2f' %x)
# The lambda formats each value of the selected columns as a string with two decimals.
prov_support.loc[:, '2017':'per_change'].map(lambda x: '%.2f' %x)

Unnamed: 0_level_0,2017,2018,2016-2018 change,per_change
province,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NL,734.0,750.0,26.0,2.18
PE,601.0,638.0,54.0,6.16
NS,3138.0,3201.0,141.0,2.01
NB,2814.0,2956.0,215.0,5.05
QC,22720.0,23749.0,2377.0,4.53
ON,21101.0,21420.0,73.0,1.51
MB,3675.0,3965.0,434.0,7.89
SK,1613.0,1673.0,108.0,3.72
AB,5943.0,6157.0,385.0,3.6
BC,6680.0,6925.0,443.0,3.67


In [67]:
# map() on a single column (a Series) applies the function to each of its values.
# The formatted values are strings, so the result has dtype object.
prov_support['per_change'].map(lambda x: '%.2f' %x)

province
NL    2.18
PE    6.16
NS    2.01
NB    5.05
QC    4.53
ON    1.51
MB    7.89
SK    3.72
AB    3.60
BC    3.67
YT    3.39
NT    1.93
NU    3.22
Name: per_change, dtype: object

In [None]:
'''
Since applymap() is deprecated, it might be better to just use map(),
whether applying a function to a single column/row or to multiple columns/rows.
'''