In [1]:
from pyspark.sql.functions import sum, mean, udf, col, round, max, greatest, count
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import rank, col
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# ! curl -O https://raw.githubusercontent.com/NYCPlanning/db-pluto/master/pluto_build/data/dcp_zoning_maxfar.csv
# ! mv dcp_zoning_maxfar.csv data/dcp_zoning_maxfar.csv

In [3]:
df = spark.read.csv('data/pluto.csv', header=True)
new_far = spark.read.csv('data/dcp_zoning_maxfar.csv', header=True)
df = df.select([col(A).alias(A.lower()) for A in df.columns])

#type conversion, '-' --> null
for A in ['residfar','commfar']:
    new_far = new_far.withColumn(A, col(A).cast(DoubleType()))
    df = df.withColumn(A, col(A).cast(DoubleType()))

#create two copies
new_far1 = new_far.select([col(A).alias(A.lower()+'_1') for A in ['zonedist','residfar','commfar']])
new_far2 = new_far.select([col(A).alias(A.lower()+'_2') for A in ['zonedist','residfar','commfar']])

In [4]:
@udf
def pick_value(A,B):
    if not A: 
        return B
    else: 
        return A

In [7]:
df = df.join(new_far1, df['zonedist1'] == new_far1['zonedist_1'], how='left')\
       .join(new_far2, df['zonedist2'] == new_far2['zonedist_2'], how='left')\
       .withColumn('residfar_new', pick_value(col('residfar_1'), col('residfar_2')))\
       .withColumn('commfar_new', pick_value(col('commfar_1'), col('commfar_2')))\
       .withColumn('maxfar', greatest(col('residfar'), col('commfar')))\
       .withColumn('pctunbuilt', ((col('maxfar') - col('builtfar'))/col('maxfar')))\
       .withColumn('maxfar_new', greatest(col('residfar_new'), col('commfar_new')))\
       .withColumn('pctunbuilt_new', ((col('maxfar_new') - col('builtfar'))/col('maxfar_new')))\
       .filter(col('unitsres').cast(DoubleType()) <= 6)\
       .filter(col('landmark').isNull())\
       .filter(col('irrlotcode') != 'Y')\
       .filter(~col ('bldgclass').like('M%'))\
       .filter(col('landuse') != '08')\
       .filter(col('easements').cast(DoubleType()) <= 0)\

In [127]:
df.select('maxfar', 'maxfar_new')\
  .filter(col('maxfar') < col('maxfar_new')).agg(count('maxfar_new').alias('increase')).show()

+--------+
|increase|
+--------+
|    1472|
+--------+



In [153]:
df.select('maxfar', 'maxfar_new')\
  .filter((col('maxfar') > col('maxfar_new')) &
          (col('pctunbuilt') > 0.5))\
    .agg(count('maxfar_new').alias('increase'))\
.show()

+--------+
|increase|
+--------+
|   43285|
+--------+



In [154]:
df.select('maxfar', 'maxfar_new')\
  .filter((col('maxfar') > col('maxfar_new')) &
          (col('pctunbuilt') == 0.5))\
    .agg(count('maxfar_new').alias('increase'))\
.show()

+--------+
|increase|
+--------+
|    2821|
+--------+



In [155]:
df.select('maxfar', 'maxfar_new')\
  .filter((col('maxfar') > col('maxfar_new')) &
          (col('pctunbuilt') < 0.5))\
    .agg(count('maxfar_new').alias('increase'))\
.show()

+--------+
|increase|
+--------+
|  268385|
+--------+



In [15]:
@udf
def Larger(A,B): 
    try:
        if float(A)>float(B): 
            return 1
        else: 
            return 0
    except: 
        pass

@udf
def Equal(A,B):
    try:
        if float(A)==float(B): 
            return 1
        else: 
            return 0
    except: 
        pass
        return 0
    
@udf
def Less(A,B):
    try:
        if float(A)<float(B): 
            return 1
        else: 
            return 0
    except: 
        pass

In [16]:
df.select('maxfar', 'maxfar_new')\
  .withColumn('Larger', Larger(col('maxfar').cast(DoubleType()),col('maxfar_new')))\
  .withColumn('Equal', Equal(col('maxfar').cast(DoubleType()),col('maxfar_new')))\
  .withColumn('Less', Less(col('maxfar').cast(DoubleType()),col('maxfar_new')))\
  .agg(sum(col('Larger')).alias('decrease'),
      sum(col('Equal')).alias('no change'),
      sum(col('Less')).alias('increase')).show()

+--------+---------+--------+
|decrease|no change|increase|
+--------+---------+--------+
|315182.0| 356009.0|  1472.0|
+--------+---------+--------+



In [21]:
df.select('maxfar', 'maxfar_new')\
  .filter(col('pctunbuilt') > 0.5)\
  .withColumn('Larger', Larger(col('maxfar').cast(DoubleType()),col('maxfar_new')))\
  .withColumn('Equal', Equal(col('maxfar').cast(DoubleType()),col('maxfar_new')))\
  .withColumn('Less', Less(col('maxfar').cast(DoubleType()),col('maxfar_new')))\
  .agg(sum(col('Larger')).alias('decrease'),
      sum(col('Equal')).alias('no change'),
      sum(col('Less')).alias('increase')).show()

+--------+---------+--------+
|decrease|no change|increase|
+--------+---------+--------+
| 43285.0|  92399.0|   758.0|
+--------+---------+--------+



In [22]:
df.select('maxfar', 'maxfar_new')\
  .filter(col('pctunbuilt') == 0.5)\
  .withColumn('Larger', Larger(col('maxfar').cast(DoubleType()),col('maxfar_new')))\
  .withColumn('Equal', Equal(col('maxfar').cast(DoubleType()),col('maxfar_new')))\
  .withColumn('Less', Less(col('maxfar').cast(DoubleType()),col('maxfar_new')))\
  .agg(sum(col('Larger')).alias('decrease'),
      sum(col('Equal')).alias('no change'),
      sum(col('Less')).alias('increase')).show()

+--------+---------+--------+
|decrease|no change|increase|
+--------+---------+--------+
|  2821.0|   3655.0|    30.0|
+--------+---------+--------+



In [23]:
df.select('maxfar', 'maxfar_new')\
  .filter(col('pctunbuilt') < 0.5)\
  .withColumn('Larger', Larger(col('maxfar').cast(DoubleType()),col('maxfar_new')))\
  .withColumn('Equal', Equal(col('maxfar').cast(DoubleType()),col('maxfar_new')))\
  .withColumn('Less', Less(col('maxfar').cast(DoubleType()),col('maxfar_new')))\
  .agg(sum(col('Larger')).alias('decrease'),
      sum(col('Equal')).alias('no change'),
      sum(col('Less')).alias('increase')).show()

+--------+---------+--------+
|decrease|no change|increase|
+--------+---------+--------+
|268385.0| 259160.0|   602.0|
+--------+---------+--------+



In [115]:
df.filter(col('maxfar') < col('maxfar_new'))\
    .select('zonedist1', 'maxfar', 'maxfar_new').distinct().show(df.count(), False)

+---------+------+----------+
|zonedist1|maxfar|maxfar_new|
+---------+------+----------+
|R3-2     |0.6   |1.0       |
|M1-6/R9  |0.0   |7.52      |
|R7-1     |3.44  |4.0       |
|R3-2     |0.6   |2.0       |
|R5       |1.25  |3.0       |
|R8B      |4.0   |6.0       |
|C8-3     |2.0   |3.44      |
|R3A      |0.6   |1.0       |
|C8-2     |2.0   |3.0       |
|R7-1     |3.44  |5.0       |
|R5D      |2.0   |3.4       |
|R2A      |0.5   |2.0       |
|PARK     |0.0   |3.0       |
|PARK     |0.0   |0.75      |
|R5B      |1.35  |4.0       |
|M1-2     |2.0   |6.02      |
|R3X      |0.6   |2.0       |
|R6B      |2.0   |3.4       |
|M1-1     |1.0   |1.35      |
|R6B      |2.0   |4.2       |
|M1-2     |2.0   |2.43      |
|R3-1     |0.6   |1.0       |
|C8-1     |1.0   |2.0       |
|R5       |1.25  |2.0       |
|R3A      |0.6   |3.4       |
|M1-4     |2.0   |4.0       |
|R6       |2.43  |3.4       |
|R5B      |1.35  |3.4       |
|R2       |0.5   |1.0       |
|R3-2     |0.6   |3.0       |
|M2-1     

In [119]:
df.filter(col('maxfar') < col('maxfar_new'))\
    .groupBy('zonedist1')\
    .agg(count('zonedist1').alias('counts'))\
    .sort(col('counts').desc()).show(df.count(), False)

+---------+------+
|zonedist1|counts|
+---------+------+
|R8A      |591   |
|M1-1     |133   |
|R3-2     |79    |
|R6B      |71    |
|R3A      |66    |
|R5       |59    |
|PARK     |52    |
|R4       |51    |
|C8-1     |51    |
|R6       |43    |
|C8-2     |36    |
|R3X      |36    |
|M1-4/R8A |20    |
|R4A      |20    |
|R4B      |19    |
|M1-6/R9  |18    |
|R5B      |18    |
|R2       |13    |
|R7-1     |11    |
|C8-3     |11    |
|R6A      |10    |
|M1-2     |10    |
|R3-1     |8     |
|R4-1     |7     |
|M1-4/R7X |7     |
|M1-4/R7D |5     |
|R2A      |4     |
|M1-5     |4     |
|M1-4     |3     |
|C8-4     |2     |
|R7-2     |2     |
|R8B      |2     |
|M1-1/R6A |2     |
|M3-1     |2     |
|R8       |1     |
|M1-4D    |1     |
|R10      |1     |
|R7A      |1     |
|R5D      |1     |
|M2-1     |1     |
+---------+------+



In [110]:
df.select('residfar_new', 'commfar_new', 'pctunbuilt', 'pctunbuilt_new')\
  .filter(col('pctunbuilt') > col('pctunbuilt_new')).agg(count('pctunbuilt_new')).show()

+---------------------+
|count(pctunbuilt_new)|
+---------------------+
|               304494|
+---------------------+



In [122]:
df.filter(col('zonedist1').like('R%'))\
   .select('maxfar_new', 'maxfar', 'builtfar')\
   .agg(mean('maxfar_new'),mean('maxfar'),mean('builtfar')).show()

+------------------+------------------+------------------+
|   avg(maxfar_new)|       avg(maxfar)|     avg(builtfar)|
+------------------+------------------+------------------+
|1.1085407715997209|1.1666834392287233|0.8391015326831988|
+------------------+------------------+------------------+



In [97]:
# By Borough Comparison
df.groupBy('borough')\
    .agg(count(col('pctunbuilt_new') >= 0.5).alias('softsites_new'),
         count(col('pctunbuilt') >= 0.5).alias('softsites'))\
    .withColumn('diff', col('softsites_new')-col('softsites'))\
    .sort(col('diff').desc()).show()

+-------+-------------+---------+----+
|borough|softsites_new|softsites|diff|
+-------+-------------+---------+----+
|     SI|        95252|    95226|  26|
|     MN|        14265|    14246|  19|
|     QN|       268405|   268392|  13|
|     BK|       229191|   229183|   8|
|     BX|        64052|    64049|   3|
+-------+-------------+---------+----+



In [94]:
df.groupBy('cd')\
    .agg(count(col('pctunbuilt_new') >= 0.5).alias('softsites_new'),
         count(col('pctunbuilt') >= 0.5).alias('softsites'))\
    .withColumn('diff', col('softsites_new')-col('softsites'))\
    .sort(col('diff').desc()).show()

+---+-------------+---------+----+
| cd|softsites_new|softsites|diff|
+---+-------------+---------+----+
|503|        37489|    37467|  22|
|111|         1514|     1496|  18|
|302|         5126|     5122|   4|
|595|           18|       16|   2|
|307|        11232|    11230|   2|
|404|         9101|     9099|   2|
|305|        17627|    17625|   2|
|413|        37981|    37979|   2|
|484|           10|        8|   2|
|501|        31319|    31317|   2|
|411|        20874|    20872|   2|
|409|        18288|    18287|   1|
|208|         2589|     2588|   1|
|414|         9996|     9995|   1|
|408|        15945|    15944|   1|
|481|            6|        5|   1|
|110|         2056|     2055|   1|
|407|        28144|    28143|   1|
|103|          899|      898|   1|
|201|         2350|     2349|   1|
+---+-------------+---------+----+
only showing top 20 rows

