In [1]:
#Uses Shapely and Spark DataFrame UDFs to calculate acreage per field
from shapely.geometry import shape
from shapely.ops import transform
import pyspark.sql.functions as pys
from pyspark.sql.types import DoubleType, StringType, FloatType
from shapely.geometry.multipolygon import MultiPolygon
from shapely import wkt
from functools import partial
import pyproj
from shapely.ops import transform
from pyspark.sql.functions import udf

In [2]:
#Read Fields data here
# Glob matching the field JSON files. Kept in one named constant so the
# notebook can be pointed at another dataset by editing a single line.
# NOTE(review): this is an absolute, user-specific path - the notebook will
# not run on another machine until it is changed.
FIELDS_JSON_GLOB = "/Users/jay_lohokare/Desktop/data/fields_min/*.json"

# Spark infers the (nested) schema from the JSON documents themselves.
df = sqlContext.read.json(FIELDS_JSON_GLOB)
# Last expression of the cell: number of field records that were loaded.
df.count()

100

In [3]:
#Keep only the columns needed below - df is rebound to the narrowed frame
# The three nested paths are flattened into top-level columns named
# boundary, area and id (see the schema printed in the next cell).
# Uncomment the next line to inspect the schema before narrowing:
# df.printSchema()
df = df.select('field.boundary_map.boundary', 'field.boundary_map.area', 'field.id')


In [4]:
# Confirm the narrowed frame holds only the boundary, area and id columns.
df.printSchema()

root
 |-- boundary: string (nullable = true)
 |-- area: double (nullable = true)
 |-- id: string (nullable = true)



In [5]:
# Preview the selected columns, then report the total number of rows
# (still before any filtering on the boundary column).
df.show()
df.count()

+--------------------+-----------------+--------------------+
|            boundary|             area|                  id|
+--------------------+-----------------+--------------------+
|MULTIPOLYGON (((-...| 264.255320466716|0002c048-6458-42c...|
|MULTIPOLYGON (((-...|134.6697950943768|0008a146-6e9d-444...|
|MULTIPOLYGON (((-...|84.27592550818439|00037e5e-e622-46c...|
|MULTIPOLYGON (((-...|121.8597418598638|00016eb5-58c7-4c0...|
|MULTIPOLYGON (((-...|66.37186503375712|00049b91-a69c-4dc...|
|MULTIPOLYGON (((-...|33.44286858832247|00099882-327e-4ce...|
|MULTIPOLYGON (((-...|55.29469330580098|0005e9cd-c8cb-11e...|
|MULTIPOLYGON (((-...|178.8495719683028|00068bf1-71bb-46d...|
|MULTIPOLYGON (((-...|24.40396587516312|00080061-0a79-4dd...|
|MULTIPOLYGON (((-...| 60.2521407292888|00061b3c-613d-490...|
|MULTIPOLYGON (((-...|51.39521819118318|000844dd-18f0-11e...|
|MULTIPOLYGON (((-...|39.55388604634178|0003e2d4-886a-429...|
|MULTIPOLYGON (((-...|28.11350183609296|000803a0-a882-488...|
|MULTIPO

100

In [6]:
#Filtering out Fields without boundary data
# Two conditions are applied in sequence: the boundary must be present, and
# it must not be the WKT text for an empty multipolygon.
df = (
    df
    .filter("boundary is not NULL")
    .filter(pys.col("boundary") != 'MULTIPOLYGON EMPTY')
)
# Number of fields that still have a usable boundary.
df.count()


80

In [7]:
#Function to convert MultiPoly to Area in Acre
def getAreaOfMultiPoly(columnElement, sourceCrs='epsg:4326', targetCrs='epsg:26913'):
    """Return the area, in acres, of a boundary given as WKT text.

    The WKT is parsed with Shapely, re-projected from ``sourceCrs`` to
    ``targetCrs`` and the planar area of the projected geometry is divided
    by the number of square metres in an acre.

    Args:
        columnElement: WKT text of the boundary, e.g.
            ``MULTIPOLYGON (((...)))``. ``None`` is treated as "no boundary".
        sourceCrs: EPSG code of the input coordinates. Defaults to
            ``'epsg:4326'`` (WGS84 longitude/latitude).
        targetCrs: EPSG code of the projected system the area is measured
            in. Defaults to ``'epsg:26913'`` (NAD83 / UTM zone 13N).
            NOTE(review): the conversion assumes this system's unit is the
            metre, and a single UTM zone is only accurate near that zone -
            confirm it matches the geographic extent of the fields.

    Returns:
        float: the area in acres, or ``0.0`` when the boundary is missing or
        cannot be parsed or projected. The failure value is a float (not the
        int ``0``) so that it matches the ``DoubleType`` the Spark UDF built
        from this function declares.
    """
    # A null boundary has no area; do not try to parse the text "None".
    if columnElement is None:
        return 0.0

    # Rounded square metres per acre, unchanged from the original notebook so
    # the recorded outputs stay comparable.
    squareMetresPerAcre = 4046.86

    try:
        # str() gives the native string type on both Python 2 and Python 3
        # (the previous unicode() call exists only on Python 2).
        geometry = wkt.loads(str(columnElement))

        # NOTE(review): Proj(init=...) and pyproj.transform are the older
        # pyproj API; check the installed pyproj version before upgrading.
        project = partial(
            pyproj.transform,
            pyproj.Proj(init=sourceCrs),   # source coordinate system
            pyproj.Proj(init=targetCrs))   # destination coordinate system

        # The parsed geometry is projected as-is; re-wrapping it in
        # MultiPolygon added nothing for multipolygons and presumably made
        # plain POLYGON input fail and report an area of 0.
        projected = transform(project, geometry)
        return float(projected.area) / squareMetresPerAcre
    except Exception:
        # Deliberate best effort: one malformed boundary must not abort the
        # whole Spark job. Unlike a bare except, this lets KeyboardInterrupt
        # and SystemExit propagate.
        return 0.0

In [8]:
#Function to calculate % difference in area calculated and area in JSON
def calculateDifferencePercentageInArea(area, areaCalculated):
    """Return how far ``area`` is from ``areaCalculated``, in percent.

    The result is ``(area - areaCalculated) * 100 / areaCalculated`` and is
    signed: positive when ``area`` is the larger value.

    Only arithmetic operators are used, so the arguments may be plain
    numbers or any objects that overload those operators; this notebook
    calls it with Spark Column objects to build a column expression.

    Args:
        area: the area stored with the field.
        areaCalculated: the area computed from the boundary; the reference
            value the difference is expressed against.

    Returns:
        The percentage difference, of the same kind as the inputs (a float
        for plain numbers).

    Raises:
        ZeroDivisionError: for plain numbers when ``areaCalculated`` is 0.
            No guard is added here because a Python ``if`` cannot be applied
            to a Spark Column.
    """
    # The float literal forces true division even when both arguments are
    # ints on Python 2, where int / int would silently floor the result.
    return (area - areaCalculated) * 100.0 / areaCalculated
    

In [9]:
#Calculation functions are called on data here
# Spark UDF wrapping the WKT -> acres calculation; it yields a double column.
# The function is passed directly - a forwarding lambda adds nothing.
getAreaOfMultiPoly_udf = udf(getAreaOfMultiPoly, DoubleType())

# UDF form of the percentage difference. It still takes ONE argument holding
# the (area, areaCalculated) pair, but indexes the pair explicitly because
# the previous `lambda (x,y): ...` tuple-parameter syntax is a SyntaxError on
# Python 3. It is not used in this cell: the percentage column below is
# built from column arithmetic instead.
getAreaDifferencePercentage = udf(
    lambda pair: calculateDifferencePercentageInArea(float(pair[0]), float(pair[1])),
    DoubleType())

# Add the acreage computed from the boundary next to the stored acreage.
df = df.select(
    "boundary", "area", "id",
    getAreaOfMultiPoly_udf("boundary").alias("areaCalculated"))

# calculateDifferencePercentageInArea is called on Column objects here, so
# the result is a Spark column expression rather than a Python UDF call;
# abs() keeps only the magnitude of the difference.
df = df.select(
    "boundary", "area", "id", "areaCalculated",
    pys.abs(calculateDifferencePercentageInArea(df.area, df.areaCalculated)).alias("percentageDifference"))

In [10]:
# Preview the frame with the new areaCalculated and percentageDifference columns.
df.show()

+--------------------+-----------------+--------------------+------------------+--------------------+
|            boundary|             area|                  id|    areaCalculated|percentageDifference|
+--------------------+-----------------+--------------------+------------------+--------------------+
|MULTIPOLYGON (((-...| 264.255320466716|0002c048-6458-42c...|268.32633898152324|  1.5171893039868751|
|MULTIPOLYGON (((-...|134.6697950943768|0008a146-6e9d-444...|139.31984324452614|  3.3376782817561996|
|MULTIPOLYGON (((-...|84.27592550818439|00037e5e-e622-46c...|  91.7620761361809|   8.158218452780616|
|MULTIPOLYGON (((-...|121.8597418598638|00016eb5-58c7-4c0...|122.43132802502753|  0.4668626685539819|
|MULTIPOLYGON (((-...|66.37186503375712|00049b91-a69c-4dc...| 69.20161054494156|   4.089132447787066|
|MULTIPOLYGON (((-...|33.44286858832247|00099882-327e-4ce...|38.875056133319525|   13.97345260767654|
|MULTIPOLYGON (((-...|55.29469330580098|0005e9cd-c8cb-11e...| 57.53925610071283|  

In [11]:
#Get fields with % acres difference more than 10%, sort Descending
# Printed table: fields whose stored and computed acreage differ by at least
# 10 percent, largest difference first.
(
    df
    .filter("`percentageDifference` >= 10")
    .orderBy(pys.desc('percentageDifference'))
    .show()
)
# Last expression of the cell: the unfiltered frame with every column.
df.select('*')

+--------------------+-----------------+--------------------+------------------+--------------------+
|            boundary|             area|                  id|    areaCalculated|percentageDifference|
+--------------------+-----------------+--------------------+------------------+--------------------+
|MULTIPOLYGON (((-...|62.71070195529653|00065eb6-e80c-42d...| 75.41700947643952|  16.848065985846965|
|MULTIPOLYGON (((-...|3.885518367439746|00053764-779a-4e4...| 4.582765449471536|    15.2145487199698|
|MULTIPOLYGON (((-...|5.635780159205062|0006c04d-421b-497...| 6.641187698243527|  15.138971893602363|
|MULTIPOLYGON (((-...|68.85053831693533|00042f18-1a71-11e...| 80.77034276443136|  14.757649948645668|
|MULTIPOLYGON (((-...|3.745725217653795|00066dec-adc1-40f...| 4.388034833818856|  14.637751077424017|
|MULTIPOLYGON (((-...|9.678924411441011|000820cc-c6aa-11e...|11.337086184307019|  14.625996009109134|
|MULTIPOLYGON (((-...|2.872533651999968|00047884-019c-4f4...| 3.346020131628966|  

DataFrame[boundary: string, area: double, id: string, areaCalculated: double, percentageDifference: double]