![pandas](pandas.png "pandas")

In [1]:
import pandas as pd

# Comparison of dataframes

Let's see how two DataFrames can be compared with each other.

## Test data

df

In [2]:
# Reference frame: one row per patient id, with the columns ordered age, gender
patients = [10010, 10011, 10012]
data = {
    'gender': ['M', 'F', 'F'],
    'age': [20, 25, 23],
}
df = pd.DataFrame(
    data,
    index=patients,
    columns=['age', 'gender'],
)
df

Unnamed: 0,age,gender
10010,20,M
10011,25,F
10012,23,F


df2

In [3]:
# Same labels as df, but one value differs in each column
# (gender of patient 10010 and age of patient 10011)
patients = [10010, 10011, 10012]
data = {
    'gender': ['F', 'F', 'F'],
    'age': [20, 24, 23],
}
df2 = pd.DataFrame(
    data,
    index=patients,
    columns=['age', 'gender'],
)
df2

Unnamed: 0,age,gender
10010,20,F
10011,24,F
10012,23,F


df3

In [4]:
# Same data as df2, but with the columns listed in the opposite order
# (reuses `data` and `patients` as defined in the df2 cell)
df3 = pd.DataFrame(
    data,
    index=patients,
    columns=['gender', 'age'],
)
df3

Unnamed: 0,gender,age
10010,F,20
10011,F,24
10012,F,23


df4

In [5]:
# Same rows as df2, but with the row index sorted in the opposite (descending) order
print('df2 id=', id(df2))
df4 = df2
print('df4 id=', id(df4)) # plain assignment: df4 is a second name for the very same object as df2 (same id), not a copy
df4 = df4.sort_index(ascending=False)
print('new df4 id=', id(df4)) # the sorted result is a different object (different id), so df4 no longer refers to df2
df4

df2 id= 139791248411712
df4 id= 139791248411712
new df4 id= 139791248412048


Unnamed: 0,age,gender
10012,23,F
10011,24,F
10010,20,F


## Element-wise

In [6]:
# Element-wise comparison with `==`.
# Both frames must carry identical row and column labels, in the same order;
# the result is a dataframe of booleans with those same labels.
df == df2 # Returns a dataframe of booleans

Unnamed: 0,age,gender
10010,True,False
10011,False,True
10012,True,True


In [7]:
# Look at the comparison as a plain NumPy boolean array:
# a full match would show True in every cell
(df == df2).to_numpy()

array([[ True, False],
       [False,  True],
       [ True,  True]])

In [8]:
# Collapse the boolean array to a single answer: True only if every cell matches
(df == df2).to_numpy().all()

False

In [8]:
# `==` fails here because the dataframes are not aligned:
# df4 holds the same rows as df2, but its row index is sorted the other way round.
# The exception is the point of this cell, so it is caught and displayed;
# that way "Restart & Run All" can continue with the cells below.
try:
    df2 == df4
except ValueError as err:
    print(f'{type(err).__name__}: {err}')

ValueError: Can only compare identically-labeled DataFrame objects

## Dataframe-wise

In [9]:
# Are the two dataframes identical? `equals` answers for the whole frame at once.
pairs = [
    (df, df2),   # of course not: the values differ
    (df2, df3),  # no: same data, but the columns are not in the same order
    (df2, df4),  # no: same data, but the rows are not in the same order
]
for left, right in pairs:
    print(left.equals(right))

False
False
False


In [9]:
# Example of a useful utility

def proc_compare(
    df1: pd.DataFrame,
    df2: pd.DataFrame,
) -> bool:
    '''Compare two dataframes while ignoring row order and column order.

    Both frames are sorted by their row index and df2's columns are put in
    df1's column order before the values are compared, so two frames that
    hold the same data under the same labels are reported as equal however
    their rows and columns happen to be ordered.

    Parameters
    ----------
    df1, df2 : pd.DataFrame
        The dataframes to compare. Neither one is modified.

    Returns
    -------
    bool
        True if both frames have the same row labels, the same column labels
        and equal contents once aligned; False otherwise. When the row count
        or the labels differ, a short explanation is printed before returning.
    '''
    if len(df1) != len(df2):
        print(f'Dataframes have different sizes: {len(df1)} vs {len(df2)} ')
        return False

    if sorted(df1.index.tolist()) != sorted(df2.index.tolist()):
        diff = set(df1.index).symmetric_difference(df2.index)
        print(f'Dataframes have different row indexes: {diff}')
        return False

    if sorted(df1.columns.tolist()) != sorted(df2.columns.tolist()):
        # Symmetric difference, so that columns present only in df2 are reported too
        diff = set(df1.columns).symmetric_difference(df2.columns)
        print(f'Dataframes have different columns: {diff}')
        return False

    # Same row labels and same column labels from here on.
    # sort_index returns a new frame, so the callers' frames are left untouched.
    _df1 = df1.sort_index()
    _df2 = df2.sort_index()

    # Apply df1's column order onto _df2
    _df2 = _df2[df1.columns.tolist()]

    # Compare the two *aligned* frames; comparing the unsorted df1 here would
    # wrongly report a difference whenever df1's rows are not already sorted.
    return _df1.equals(_df2)

# Run the helper on the three pairs of test frames
for question, left, right in [
    ('Are df and df2 equal?', df, df2),
    ('Are df2 and df3 equal?', df2, df3),
    ('Are df2 and df4 equal?', df2, df4),
]:
    print(question, proc_compare(left, right))

Are df and df2 equal? False
Are df2 and df3 equal? True
Are df2 and df4 equal? True


__________________________________________________
Nicolas Dupuis, Methodology and Innovation (IDAR C&SP), 2020+