In [1]:
import pyspark.sql.functions as fn
#from pyspark.sql.types import StringType, IntegerType, DecimalType, FloatType, LongType, DoubleType, StructType, StructField
import pandas as pd

# Let's start with a naive DataFrame comparison and work up from there

In [3]:
# Fixture pair: values_2 holds the first two rows of values_1, so df_1 has
# exactly one row ("three", 3) that df_2 lacks.
# NOTE(review): `spark` is not defined in the visible cells - presumably the
# SparkSession is provided by the notebook environment; confirm.
values_1 = {"String": ["one", "two", "three"], "Value": [1, 2, 3]}
values_2 = {"String": ["one", "two"], "Value": [1, 2]}

df_1 = spark.createDataFrame(pd.DataFrame(values_1))
df_2 = spark.createDataFrame(pd.DataFrame(values_2))

In [4]:
# naive function
def are_dataframes_equal(df_actual, df_expected):
  """Naive, one-directional comparison of two DataFrames.

  Returns True when subtracting ``df_expected`` from ``df_actual`` leaves no
  rows. Rows that exist only in ``df_expected`` are never looked at, so the
  result depends on argument order.
  """
  only_in_actual = df_actual.subtract(df_expected)
  return only_in_actual.rdd.isEmpty()

# df_1 contains a row ("three", 3) that df_2 lacks, so the frames are NOT equal.
# The naive check only computes df_actual.subtract(df_expected), so its answer
# depends on which frame is passed first.
print(are_dataframes_equal(df_1, df_2)) # returns False correctly
print(are_dataframes_equal(df_2, df_1)) # returns True INCORRECTLY

In [5]:
# improved function
def are_dataframes_equal(df_actual, df_expected):
  """Two-directional comparison of two DataFrames.

  Returns True only when subtracting in both directions leaves no rows:
  ``df_actual`` minus ``df_expected`` and ``df_expected`` minus ``df_actual``.
  The second subtraction is skipped when the first already found a difference.
  """
  only_in_actual = df_actual.subtract(df_expected)
  if not only_in_actual.rdd.isEmpty():
    return False
  only_in_expected = df_expected.subtract(df_actual)
  return only_in_expected.rdd.isEmpty()

# With the subtraction checked in both directions the result no longer depends
# on argument order, and a frame compared with itself is reported as equal.
print(are_dataframes_equal(df_1, df_2)) # returns False correctly
print(are_dataframes_equal(df_2, df_1)) # returns False correctly
print(are_dataframes_equal(df_2, df_2)) # returns True correctly

# What if we have duplicate rows?

In [7]:
# Fixtures with duplicate rows: values_3 repeats ("two", 2) and values_4
# repeats ("one", 1). Both contain the same two distinct rows.
# NOTE(review): `spark` is not defined in the visible cells - presumably the
# SparkSession is provided by the notebook environment; confirm.
values_3 = {"String": ["one", "two", "two"], "Value": [1, 2, 2]}
values_4 = {"String": ["one", "one", "two"], "Value": [1, 1, 2]}

df_3 = spark.createDataFrame(pd.DataFrame(values_3))
df_4 = spark.createDataFrame(pd.DataFrame(values_4))

In [8]:
# Duplicate rows still fool the two-directional equality check: subtract()
# behaves like SQL EXCEPT DISTINCT, so repeated rows collapse and frames that
# differ only in how often a row occurs are reported as equal.
print(are_dataframes_equal(df_1, df_3)) # returns False correctly
print(are_dataframes_equal(df_2, df_3)) # returns True INCORRECTLY
print(are_dataframes_equal(df_4, df_3)) # returns True INCORRECTLY

In [9]:
# let's use a groupby to improve on this further

def are_dataframes_equal(df_actual, df_expected):
  """Order-insensitive, duplicate-aware equality check for two Spark DataFrames.

  Two frames are equal when they have the same column names and column types
  (column order is ignored) and every distinct row occurs the same number of
  times in both.

  Args:
    df_actual: DataFrame under test.
    df_expected: DataFrame holding the expected rows.

  Returns:
    True if schema and row multiset match, otherwise False. Column types are
    compared exactly as reported by ``DataFrame.dtypes``, so e.g. ``int`` and
    ``bigint`` columns are treated as different.
  """
  # Compare schemas explicitly. subtract() is a SQL-style EXCEPT, which pairs
  # columns by position rather than by name, so without this guard a renamed
  # or re-typed column is only detected if the values happen to differ.
  if sorted(df_actual.dtypes) != sorted(df_expected.dtypes):
    return False

  # Sorting gives both frames the same column order regardless of how they
  # were built; the names are known to be identical at this point.
  cols = sorted(df_actual.columns)

  # Count rows per distinct combination of values. Counting a literal instead
  # of a data column works for single-column frames and also counts rows that
  # contain NULLs (count(<column>) skips NULL values).
  row_count = fn.count(fn.lit(1)).alias("__row_count__")
  df_a = df_actual.groupby(cols).agg(row_count)
  df_e = df_expected.groupby(cols).agg(row_count)

  # then perform our equality checks on the dataframes with the row counts
  if df_a.subtract(df_e).rdd.isEmpty():
    return df_e.subtract(df_a).rdd.isEmpty()
  return False

In [10]:
# The groupby-based check compares per-row counts, so frames that differ only
# in duplicated rows are now told apart and every case below is correct.
print(are_dataframes_equal(df_1, df_3)) # returns False correctly
print(are_dataframes_equal(df_2, df_3)) # returns False correctly
print(are_dataframes_equal(df_4, df_3)) # returns False correctly
print(are_dataframes_equal(df_1, df_1)) # returns True correctly
print(are_dataframes_equal(df_4, df_4)) # returns True correctly

# Finally we need to check we can handle schema mismatches without crashing

In [12]:
# Fixtures for schema mismatches; each reuses the three-row shape of values_1.
# NOTE(review): `spark` is not defined in the visible cells - presumably the
# SparkSession is provided by the notebook environment; confirm.

# check differing column names works: the numeric column is called
# "Property" instead of "Value"
values_5 = {"String": ["one", "two", "three"], "Property": [1, 2, 3]}
df_5 = spark.createDataFrame(pd.DataFrame(values_5))

# check differing column types work: "String" holds integers, not strings
values_6 = {"String": [1, 2, 3], "Value": [1, 2, 3]}
df_6 = spark.createDataFrame(pd.DataFrame(values_6))


In [13]:
# df_5 differs from df_1 in a column name and df_6 in a column type; both
# comparisons should complete without raising and report the frames as unequal.
print(are_dataframes_equal(df_1, df_5)) # returns False correctly
print(are_dataframes_equal(df_1, df_6)) # returns False correctly

# Be aware that unsorted map columns may incorrectly return false when compared