In [1]:
import datetime
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pyspark.sql.functions import mean, udf, col, round
from pyspark.sql.functions import rank, col
from pyspark.sql.types import DoubleType
from pyspark.sql.window import Window

%matplotlib inline
now = datetime.datetime.now()
print(now)
os.environ["PYSPARK_PYTHON"]="/anaconda3/bin/python3"
df1 = spark.read.csv('data/pluto_18v2.csv', header=True)
df2 = spark.read.csv('data/pluto.csv', header=True)

df1 = df1.select([col(A).alias(A.lower()) for A in df1.schema.names])
df2 = df2.select([col(A).alias(A.lower()) for A in df2.schema.names])

double_columns = ['bldgarea', 'facilfar','residfar', 'commfar', 
                  'numbldgs', 'numfloors', 'bldgdepth','bldgfront', 
                  'lotdepth', 'lotfront','exemptland', 'exempttot', 
                  'assessland', 'assesstot','builtfar']
cols = df2.columns
df1 = df1.select(cols)
df2 = df2.select(cols)

for A in double_columns: 
    df1 = df1.withColumn(A, round(col(A).cast(DoubleType()), 2))
    df2 = df2.withColumn(A, round(col(A).cast(DoubleType()), 2))
    

df1 = df1.select([col(A).alias(A+'_1') for A in df1.schema.names])

colnames = zip(df1.columns, df2.columns)

df = df2.join(df1, df2['bbl'] == df1['bbl_1'])

2019-03-28 09:39:53.225574


In [2]:
@udf
def compare(col1,col2):
    """Return 1 when the two values are unequal, otherwise 0."""
    return 1 if col1 != col2 else 0
    
# Columns whose pairs are compared with compare_a (difference threshold)
# instead of the exact-inequality compare.
access_cols = [
    'exemptland',
    'exempttot',
    'assessland',
    'assesstot',
]

@udf #special access column comparison
def compare_a(col1,col2):
    """Return 1 when the two values differ by more than 10, otherwise 0.

    Values that cannot be subtracted (for example when either side is None)
    are reported as 0, i.e. "no difference".
    NOTE(review): this means a value present on only one side is not counted
    as a change here, unlike `compare` -- confirm that is intended.
    """
    try:
        return 1 if abs(col1 - col2) > 10 else 0
    except TypeError:
        # Only the "operands cannot be subtracted" case is treated as
        # best-effort; a bare except would also swallow KeyboardInterrupt
        # and SystemExit.
        return 0

In [3]:
spark.sql('set spark.sql.caseSensitive=true')

# Build every '<name>%' change flag in a single projection. Chaining one
# withColumn()/drop() per column pair stacks a projection per column in the
# query plan; one select() yields the same columns in the same order (only the
# flags remain, since every original column belongs to exactly one pair).
# Pairs listed in access_cols use the threshold comparison compare_a; all
# other pairs use the exact-inequality compare.
df = df.select([
    (compare_a if B in access_cols else compare)(col(A), col(B)).alias(B+'%')
    for A, B in colnames
])

# Mean of each 0/1 flag column, i.e. the fraction of joined rows flagged as
# different for that column.
results = df.select(*[mean(col(A)).alias(A) for A in df.columns])

In [4]:
# Wall-clock timing of the conversion of `results` to a pandas DataFrame.
start_time = time.time()
results_df = results.toPandas()
# Seconds elapsed for the toPandas() call above.
elapsed_time = time.time() - start_time

In [5]:
# First (only displayed) row of the per-column means, largest first, top 30.
results_df.iloc[0].sort_values(ascending=False).head(30)

bldgarea%      0.007035
areasource%    0.002527
geom%          0.000000
resarea%       0.000000
splitzone%     0.000000
bldgclass%     0.000000
landuse%       0.000000
easements%     0.000000
ownertype%     0.000000
ownername%     0.000000
lotarea%       0.000000
comarea%       0.000000
officearea%    0.000000
lotdepth%      0.000000
retailarea%    0.000000
garagearea%    0.000000
strgearea%     0.000000
factryarea%    0.000000
otherarea%     0.000000
numbldgs%      0.000000
numfloors%     0.000000
unitsres%      0.000000
unitstotal%    0.000000
ltdheight%     0.000000
spdist3%       0.000000
spdist2%       0.000000
policeprct%    0.000000
block%         0.000000
lot%           0.000000
cd%            0.000000
Name: 0, dtype: float64