In [21]:
from pyspark.sql.functions import mean, udf, col, round
from pyspark.sql.types import DoubleType
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, col
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import datetime
# Print the current wall-clock time so the rendered output records when this run happened.
print(datetime.datetime.now())
# IPython line magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

2019-02-01 14:19:43.369716


## import csv files into spark dataframes
Note: both files contain records from all 5 boroughs

In [2]:
# Load the two PLUTO CSV exports: df1 <- 18v2, df2 <- 18v1.
# header=True takes the column names from the first row of each file.
df1, df2 = (
    spark.read.csv(csv_path, header=True)
    for csv_path in ('../data/pluto_18v2.csv', '../data/pluto_18v1.csv')
)

In [3]:
# Normalise every column name to lower case in both frames.
df1 = df1.select(*(col(name).alias(name.lower()) for name in df1.columns))
df2 = df2.select(*(col(name).alias(name.lower()) for name in df2.columns))

In [4]:
# Columns that the type-conversion step casts to double and rounds.
double_columns = [
    'bldgarea',
    'facilfar',
    'residfar',
    'commfar',
    'numbldgs',
    'numfloors',
    'bldgdepth',
    'bldgfront',
    'lotdepth',
    'lotfront',
    'exempttot',
    'exemptland',
    'assessland',
    'assesstot',
    'builtfar',
]

In [5]:
# Column names of df2, in order; used to align both frames to the same layout.
cols = list(df2.columns)

In [6]:
# Restrict both frames to the column list in `cols`, in that order.
df1, df2 = df1.select(*cols), df2.select(*cols)

## Type Conversion

In [7]:
# Cast each listed column to double and round it to 2 decimal places in both frames.
# NOTE: `round` here is pyspark.sql.functions.round (imported at the top), not the builtin.
for column_name in double_columns:
    rounded_double = round(col(column_name).cast(DoubleType()), 2)
    df1 = df1.withColumn(column_name, rounded_double)
    df2 = df2.withColumn(column_name, rounded_double)

## Change column names and join dataframes
* since we are doing a column-by-column comparison, we need to rename the columns of one df to avoid confusion
* in this case, we add "_1" to the name, e.g. "bbl" ==> "bbl_1"
* here we are doing an inner join because we only care about the BBLs that appeared in both dataframes

In [8]:
# Suffix every df1 column with "_1" so its columns stay distinguishable from df2's.
df1 = df1.select(*(col(name).alias(name + '_1') for name in df1.columns))

In [9]:
# Pair each df1 column name with the df2 column name at the same position.
df1_names = df1.schema.names
df2_names = df2.schema.names
# Materialise the pairs as a list: in Python 3 a bare zip() is a one-shot
# iterator, so a second loop over it (or re-running a cell that consumes it)
# would silently see no pairs at all.
colnames = list(zip(df1_names, df2_names))

In [10]:
# Inner join on BBL: only rows whose BBL is present in both frames survive.
df = df2.join(df1, on=df2.bbl == df1.bbl_1, how='inner')
# Second handle on the joined frame; the "*_diff" columns are added to this one.
df0 = df

## Inspect top differences

In [11]:
@udf(returnType=DoubleType())
def diff(col1, col2):
    """Return the absolute difference |col2 - col1| as a double.

    Declaring the return type matters: a bare ``@udf`` defaults to a string
    column, which sorts lexicographically and forces callers to re-cast.

    Parameters
    ----------
    col1, col2 : numeric or None
        The two values to compare (one row at a time).

    Returns
    -------
    float or None
        ``abs(col2 - col1)``; ``None`` (SQL NULL) when either input is null
        or the operands cannot be subtracted.
    """
    # Nulls propagate explicitly instead of relying on an exception.
    if col1 is None or col2 is None:
        return None
    try:
        # float(): a DoubleType UDF must return a Python float; returning an
        # int would be turned into NULL by Spark.
        return float(abs(col2 - col1))
    except TypeError:
        # Best-effort, like the original: non-numeric operands yield NULL.
        # Unlike a bare `except:`, this no longer hides unrelated errors.
        return None

In [12]:
# Make column-name resolution case-sensitive for this Spark session.
spark.sql('set spark.sql.caseSensitive=true')

# Columns whose change between the two frames is inspected.
targets = [
    'unitsres',
    'lotarea',
    'bldgarea',
    'comarea',
    'resarea',
    'officearea',
    'retailarea',
    'garagearea',
    'strgearea',
    'factryarea',
    'otherarea',
]

# For every target, add "<target>_diff" = diff(<target>, <target>_1), with
# both operands cast to double first.
for target in targets:
    unsuffixed_value = col(target).cast(DoubleType())
    suffixed_value = col(target + '_1').cast(DoubleType())
    df0 = df0.withColumn(target + '_diff', diff(unsuffixed_value, suffixed_value))

In [13]:
# Five BBLs with the largest change in unitsres (cast to double so the sort is numeric).
(
    df0.select('bbl', col('unitsres_diff').cast(DoubleType()))
    .orderBy(col('unitsres_diff').desc())
    .show(5)
)

+----------+-------------+
|       bbl|unitsres_diff|
+----------+-------------+
|1008317502|       2916.0|
|1007290050|        844.0|
|1002487501|        816.0|
|1010517502|        747.0|
|3001667502|        712.0|
+----------+-------------+
only showing top 5 rows



In [14]:
# Five BBLs with the largest change in comarea (cast to double so the sort is numeric).
(
    df0.select('bbl', col('comarea_diff').cast(DoubleType()))
    .orderBy(col('comarea_diff').desc())
    .show(5)
)

+----------+------------+
|       bbl|comarea_diff|
+----------+------------+
|1007559040|   1378125.0|
|1005967502|    963445.0|
|5017600035|    904400.0|
|5009550100|    686130.0|
|3007150001|    679585.0|
+----------+------------+
only showing top 5 rows



In [15]:
# Five BBLs with the largest change in resarea (cast to double so the sort is numeric).
(
    df0.select('bbl', col('resarea_diff').cast(DoubleType()))
    .orderBy(col('resarea_diff').desc())
    .show(5)
)

+----------+------------+
|       bbl|resarea_diff|
+----------+------------+
|1007297502|    858611.0|
|3001570001|    590018.0|
|2025260090|    475438.0|
|3024720050|    378392.0|
|4008907501|    346781.0|
+----------+------------+
only showing top 5 rows



In [16]:
# Five BBLs with the largest change in officearea (cast to double so the sort is numeric).
(
    df0.select('bbl', col('officearea_diff').cast(DoubleType()))
    .orderBy(col('officearea_diff').desc())
    .show(5)
)

+----------+---------------+
|       bbl|officearea_diff|
+----------+---------------+
|1005967502|       963445.0|
|4000717501|       654962.0|
|4004207501|       505416.0|
|1000057501|       492719.0|
|1009220046|       331459.0|
+----------+---------------+
only showing top 5 rows



In [17]:
# Five BBLs with the largest change in retailarea (cast to double so the sort is numeric).
(
    df0.select('bbl', col('retailarea_diff').cast(DoubleType()))
    .orderBy(col('retailarea_diff').desc())
    .show(5)
)

+----------+---------------+
|       bbl|retailarea_diff|
+----------+---------------+
|5024000180|       317994.0|
|1021760017|       141600.0|
|3027800001|        86375.0|
|4097557501|        79076.0|
|3085900031|        55200.0|
+----------+---------------+
only showing top 5 rows



In [18]:
# Five BBLs with the largest change in garagearea (cast to double so the sort is numeric).
(
    df0.select('bbl', col('garagearea_diff').cast(DoubleType()))
    .orderBy(col('garagearea_diff').desc())
    .show(5)
)

+----------+---------------+
|       bbl|garagearea_diff|
+----------+---------------+
|4009260001|       368000.0|
|2051350210|       296715.0|
|1009910060|       294656.0|
|1012640005|       255714.0|
|3001570001|       248600.0|
+----------+---------------+
only showing top 5 rows



In [19]:
# Five BBLs with the largest change in factryarea (cast to double so the sort is numeric).
(
    df0.select('bbl', col('factryarea_diff').cast(DoubleType()))
    .orderBy(col('factryarea_diff').desc())
    .show(5)
)

+----------+---------------+
|       bbl|factryarea_diff|
+----------+---------------+
|5017600035|       855000.0|
|2026040270|       195551.0|
|5070900001|       140830.0|
|3053010001|       124954.0|
|2037087501|       121160.0|
+----------+---------------+
only showing top 5 rows



In [20]:
# Five BBLs with the largest change in otherarea (cast to double so the sort is numeric).
(
    df0.select('bbl', col('otherarea_diff').cast(DoubleType()))
    .orderBy(col('otherarea_diff').desc())
    .show(5)
)

+----------+--------------+
|       bbl|otherarea_diff|
+----------+--------------+
|1007559040|     1378125.0|
|3007150001|      679585.0|
|3078810033|      480000.0|
|3006440001|      427123.0|
|1009220046|      331459.0|
+----------+--------------+
only showing top 5 rows

