In [2]:
from pyspark.sql.functions import mean, udf, col
import pandas as pd
import time

## import csv files into spark dataframes
Note: both files contain records from all 5 boroughs

In [3]:
# Load both CSV versions, treating the first row of each file as the header.
# inferSchema is left at its default, so no column types are inferred here.
df18v11 = spark.read.option('header', True).csv('test_data/dcp_pluto_18v11.csv')
df18v1 = spark.read.option('header', True).csv('test_data/18v1.csv')

## Change column names and joining dataframes
* since we are doing a column-by-column comparison, we need to change the column names of one df to avoid confusion
* in this case, we add "_1" to the name, e.g. "BBL" ==> "BBL_1"
* here we are doing an inner join because we only care about the BBLs that appeared in both dataframes

In [4]:
# Append "_1" to every 18v11 column name so the two versions can sit side by
# side in one joined frame without name clashes.
suffixed_columns = [col(name).alias(name + '_1') for name in df18v11.columns]
df18v11 = df18v11.select(*suffixed_columns)

In [8]:
# Pair each 18v1 column with the (already "_1"-suffixed) 18v11 column that sits
# at the same position.
# NOTE(review): the pairing is purely positional, so it assumes both files list
# their columns in the same order -- confirm against the source files.
df18v1_names = df18v1.schema.names
df18v11_names = df18v11.schema.names
# zip() returns a one-shot iterator in Python 3. Keep the pairs in a list so
# they can be inspected or iterated more than once without silently coming up
# empty the second time.
colnames = list(zip(df18v1_names, df18v11_names))

In [9]:
# Inner join on BBL: only records present in both versions are kept.
bbl_matches = df18v1['BBL'] == df18v11['bbl_1']
df18 = df18v1.join(df18v11, on=bbl_matches, how='inner')

## Create UDF (user defined function) for column comparison
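* the ```@udf``` decorator wraps this plain Python function as a Spark user-defined function, so it can be called on DataFrame columns; Spark then applies it to every row, working on the data's partitions in parallel. Note that Spark cannot optimize the Python code inside a UDF, so built-in column functions are usually faster when they can express the same logic.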

In [6]:
@udf(returnType='int')
def compare(col1, col2):
    """Flag whether two values taken from the same record differ.

    When called on two DataFrame columns, Spark invokes this function once per
    row with that row's values from each column.

    Args:
        col1: The value from the first column.
        col2: The value from the second column.

    Returns:
        1 if the two values are not equal, otherwise 0. Two nulls both arrive
        as None and therefore count as equal.
    """
    # A bare @udf declares a string return type; the integer type is declared
    # explicitly above so the 0/1 flags are stored as integers.
    return 1 if col1 != col2 else 0

## Generate results
1. generate a mismatch dummy column (1 = values differ, 0 = values match) for each pair of columns
2. drop the original value columns
3. calculate the mean of each dummy column (here, the fraction of records that mismatch) and produce the output

In [7]:
# Make column-name resolution case sensitive for the comparisons below.
spark.sql('set spark.sql.caseSensitive=true')

# Materialise the pairs first: `colnames` may be a one-shot iterator.
column_pairs = list(colnames)
compared_names = {name for pair in column_pairs for name in pair}

# Build every "<name>%" mismatch flag in a single projection. Calling
# withColumn()/drop() once per pair adds one projection per column to the
# query plan, which gets slow for wide tables.
flag_columns = [compare(col(A), col(B)).alias(A + '%') for A, B in column_pairs]

# Columns that take part in no comparison are carried over unchanged and placed
# first, which is exactly what the per-column withColumn/drop sequence produced.
untouched_columns = [col(name) for name in df18.columns if name not in compared_names]
df18 = df18.select(untouched_columns + flag_columns)

# The mean of a 0/1 flag is the fraction of joined records that mismatch.
results = df18.select([mean(col(name)).alias(name) for name in df18.columns])

## Convert spark dataframe to pandas dataframe 
so we can easily export as a csv or conduct further analysis/visualizations

In [8]:
# Spark evaluates lazily, so toPandas() is the action that actually runs the
# join, the per-column comparisons and the aggregation; the elapsed time covers
# that whole job, not just the conversion to pandas.
# perf_counter() is monotonic, so the measurement is unaffected by system clock
# adjustments that can distort time.time() differences.
start_time = time.perf_counter()
results_df = results.toPandas()
elapsed_time = time.perf_counter() - start_time

In [9]:
# Display the pandas frame holding the mean of each mismatch flag column.
results_df

Unnamed: 0,Borough%,Block%,Lot%,CD%,CT2010%,CB2010%,SchoolDist%,Council%,ZipCode%,FireComp%,...,ZMCode%,Sanborn%,TaxMap%,EDesigNum%,APPBBL%,APPDate%,PLUTOMapID%,FIRM07_FLAG%,PFIRM15_FLAG%,Version%
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000867,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [10]:
# Seconds taken by the results.toPandas() call timed above.
elapsed_time

140.76924014091492