In [9]:
#All imports go here
#pip install shapely
#pip install spark-df-profiling

from shapely.geometry import shape
from shapely.ops import transform
import pyspark.sql.functions as pys

from pyspark.sql.types import IntegerType, DoubleType, StringType
from shapely.geometry.multipolygon import MultiPolygon
from shapely.geometry import Point
from shapely import wkt

from pyspark.sql.functions import udf
import spark_df_profiling
      

In [2]:
# Load the field JSON documents, keeping only the boundary (WKT text) and the
# field id; the boundary column is renamed so it stays unambiguous after joins.
field = (
    sqlContext.read
    .json("Box Sync/Nutrien_Jay_Share/checkIfFieldContainsSampleGeoLocation/fields/*.json")
    .select('field.boundary_map.boundary', 'field.id')
    .withColumnRenamed('boundary', 'field_boundary')
)
field.show()

+--------------------+--------------------+
|      field_boundary|                  id|
+--------------------+--------------------+
|MULTIPOLYGON (((-...|7361a746-f379-11e...|
|MULTIPOLYGON (((-...|0e143638-241d-4f5...|
|MULTIPOLYGON (((-...|c5fdbde2-a4d0-447...|
|MULTIPOLYGON (((-...|a01838af-2607-11e...|
|MULTIPOLYGON (((-...|7efbd6d2-f379-11e...|
|MULTIPOLYGON (((-...|73ba46a1-f379-11e...|
|MULTIPOLYGON (((-...|5129bbd5-8a02-11e...|
+--------------------+--------------------+



In [3]:
# Load the sample-event documents (JSON content stored in *.txt files).
# `agrian.field` is exploded so each associated field id gets its own row,
# then only the field id, centroid (renamed) and event type are kept.
sample = (
    sqlContext.read
    .json("Box Sync/Nutrien_Jay_Share/checkIfFieldContainsSampleGeoLocation/sample_event/*.txt")
    .withColumn('field_id', pys.explode('sample_event.associations.`agrian.field`'))
    .select('field_id', 'sample_event.centroid', 'sample_event.sample_event_type')
    .withColumnRenamed('centroid', 'sample_centroid')
)
sample.show()


+--------------------+--------------------+-----------------+
|            field_id|     sample_centroid|sample_event_type|
+--------------------+--------------------+-----------------+
|c5fdbde2-a4d0-447...|POINT (-94.075156...|             soil|
|7361a746-f379-11e...|POINT (-86.967842...|             soil|
|73ba46a1-f379-11e...|POINT (-89.338422...|             soil|
|0e143638-241d-4f5...|POINT (-93.188290...|             soil|
|7efbd6d2-f379-11e...|POINT (-77.490143...|             soil|
|5129bbd5-8a02-11e...|POINT (-82.891887...|             soil|
|a01838af-2607-11e...|POINT (-88.540064...|             soil|
+--------------------+--------------------+-----------------+



In [4]:
# Pair every sample event with its field (inner join on the field id) and drop
# the field-side `id` column, which duplicates `field_id` after the join.
fieldAndSample = sample.join(
    field,
    on=(sample['field_id'] == field['id']),
    how='inner',
).drop('id')
fieldAndSample.show()

+--------------------+--------------------+-----------------+--------------------+
|            field_id|     sample_centroid|sample_event_type|      field_boundary|
+--------------------+--------------------+-----------------+--------------------+
|c5fdbde2-a4d0-447...|POINT (-94.075156...|             soil|MULTIPOLYGON (((-...|
|7361a746-f379-11e...|POINT (-86.967842...|             soil|MULTIPOLYGON (((-...|
|73ba46a1-f379-11e...|POINT (-89.338422...|             soil|MULTIPOLYGON (((-...|
|0e143638-241d-4f5...|POINT (-93.188290...|             soil|MULTIPOLYGON (((-...|
|7efbd6d2-f379-11e...|POINT (-77.490143...|             soil|MULTIPOLYGON (((-...|
|5129bbd5-8a02-11e...|POINT (-82.891887...|             soil|MULTIPOLYGON (((-...|
|a01838af-2607-11e...|POINT (-88.540064...|             soil|MULTIPOLYGON (((-...|
+--------------------+--------------------+-----------------+--------------------+



In [5]:
def findIfPointInMultiPoly(point, MultiPoly):
    """Report whether a WKT point lies inside a WKT multipolygon.

    Args:
        point: WKT text of the sample location, e.g. ``"POINT (x y)"``.
        MultiPoly: WKT text of the field boundary, e.g. ``"MULTIPOLYGON (...)"``.

    Returns:
        ``"1"`` if the multipolygon contains the point, ``"0"`` if it does not,
        ``"Data missing"`` if either input is ``None``, an empty string, or the
        corresponding ``... EMPTY`` WKT literal, and ``"Error"`` if parsing or
        the containment test raises.
    """
    # Identity comparison is the correct (and safe) way to test for None.
    if point is None or MultiPoly is None:
        return "Data missing"
    if point in ("", "POINT EMPTY") or MultiPoly in ("", "MULTIPOLYGON EMPTY"):
        return "Data missing"

    try:
        # The Python-2-only `unicode(...)` call is gone: on Python 3 it raised
        # NameError, which the old bare `except:` turned into "Error" for
        # every single row. `wkt.loads` accepts ordinary text directly.
        # Re-wrapping in MultiPolygon/Point is kept from the original;
        # presumably it rejects WKT of another geometry type -- TODO confirm.
        multiPoly = MultiPolygon(wkt.loads(str(MultiPoly)))
        samplePoint = Point(wkt.loads(str(point)))
        return "1" if multiPoly.contains(samplePoint) else "0"
    except Exception:
        # Deliberate best-effort: one malformed row must not fail the whole
        # Spark job. Catching Exception (not a bare except) still lets
        # KeyboardInterrupt / SystemExit propagate.
        return "Error"

# Spark UDF wrapper: takes (point WKT, multipolygon WKT) columns and yields the
# string status returned by findIfPointInMultiPoly.
findIfPointInMultiPolyUdf = udf(findIfPointInMultiPoly, StringType())



In [6]:
# Display the joined frame as it stands before the containment column is added.
fieldAndSample.show()

+--------------------+--------------------+-----------------+--------------------+
|            field_id|     sample_centroid|sample_event_type|      field_boundary|
+--------------------+--------------------+-----------------+--------------------+
|c5fdbde2-a4d0-447...|POINT (-94.075156...|             soil|MULTIPOLYGON (((-...|
|7361a746-f379-11e...|POINT (-86.967842...|             soil|MULTIPOLYGON (((-...|
|73ba46a1-f379-11e...|POINT (-89.338422...|             soil|MULTIPOLYGON (((-...|
|0e143638-241d-4f5...|POINT (-93.188290...|             soil|MULTIPOLYGON (((-...|
|7efbd6d2-f379-11e...|POINT (-77.490143...|             soil|MULTIPOLYGON (((-...|
|5129bbd5-8a02-11e...|POINT (-82.891887...|             soil|MULTIPOLYGON (((-...|
|a01838af-2607-11e...|POINT (-88.540064...|             soil|MULTIPOLYGON (((-...|
+--------------------+--------------------+-----------------+--------------------+



In [7]:
# Add the containment flag computed by the UDF from the sample centroid and the
# field boundary. `withColumn` adds the column or replaces an existing one of
# the same name, so re-running this cell does not append a duplicate
# `sampleInField` column the way `select('*', ...)` on the reassigned frame did.
fieldAndSample = fieldAndSample.withColumn(
    'sampleInField',
    findIfPointInMultiPolyUdf(fieldAndSample.sample_centroid, fieldAndSample.field_boundary),
)


In [8]:
# Display the joined frame including the new `sampleInField` column.
fieldAndSample.show()

+--------------------+--------------------+-----------------+--------------------+-------------+
|            field_id|     sample_centroid|sample_event_type|      field_boundary|sampleInField|
+--------------------+--------------------+-----------------+--------------------+-------------+
|c5fdbde2-a4d0-447...|POINT (-94.075156...|             soil|MULTIPOLYGON (((-...|            1|
|7361a746-f379-11e...|POINT (-86.967842...|             soil|MULTIPOLYGON (((-...|            1|
|73ba46a1-f379-11e...|POINT (-89.338422...|             soil|MULTIPOLYGON (((-...|            1|
|0e143638-241d-4f5...|POINT (-93.188290...|             soil|MULTIPOLYGON (((-...|            1|
|7efbd6d2-f379-11e...|POINT (-77.490143...|             soil|MULTIPOLYGON (((-...|            1|
|5129bbd5-8a02-11e...|POINT (-82.891887...|             soil|MULTIPOLYGON (((-...|            1|
|a01838af-2607-11e...|POINT (-88.540064...|             soil|MULTIPOLYGON (((-...|            0|
+--------------------+--------

In [10]:
# Build a spark_df_profiling ProfileReport over the joined frame.
report = spark_df_profiling.ProfileReport(fieldAndSample)

In [11]:
# Bare expression as the last line of the cell so the notebook renders the report.
report

0,1
Number of variables,5
Number of observations,7
Total Missing (%),0.0%
Total size in memory,0.0 B
Average record size in memory,0.0 B

0,1
Numeric,0
Categorical,1
Date,0
Text (Unique),3
Rejected,1

First 3 values
MULTIPOLYGON (((-93.18610719970256 42.20380540...
MULTIPOLYGON (((-94.08008537356005 42.84875892...
MULTIPOLYGON (((-86.9643916666667 35.306851666...

Last 3 values
MULTIPOLYGON (((-88.5367432375211 38.540527726...
MULTIPOLYGON (((-89.33385700000002 41.73826299...
MULTIPOLYGON (((-82.8945571385563 39.316404024...

0,1
1,MULTIPOLYGON (((-93.18610719970256 42.20380540...
2,MULTIPOLYGON (((-94.08008537356005 42.84875892...
3,MULTIPOLYGON (((-86.9643916666667 35.306851666...
4,MULTIPOLYGON (((-77.4903561858575 35.118930218...
5,MULTIPOLYGON (((-88.5367432375211 38.540527726...
6,MULTIPOLYGON (((-89.33385700000002 41.73826299...
7,MULTIPOLYGON (((-82.8945571385563 39.316404024...

First 3 values
5129bbd5-8a02-11e4-ae2b-3176bd53680f
7efbd6d2-f379-11e3-ae2b-3176bd53680f
c5fdbde2-a4d0-4478-87be-b59a38a3120d

Last 3 values
0e143638-241d-4f5d-b786-90ac81354836
7361a746-f379-11e3-ae2b-3176bd53680f
a01838af-2607-11e5-a33e-a889b4eb7caa

0,1
1,5129bbd5-8a02-11e4-ae2b-3176bd53680f
2,7efbd6d2-f379-11e3-ae2b-3176bd53680f
3,c5fdbde2-a4d0-4478-87be-b59a38a3120d
4,73ba46a1-f379-11e3-ae2b-3176bd53680f
5,0e143638-241d-4f5d-b786-90ac81354836
6,7361a746-f379-11e3-ae2b-3176bd53680f
7,a01838af-2607-11e5-a33e-a889b4eb7caa

0,1
Distinct count,2
Unique (%),28.6%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
1,6
0,1

Value,Count,Frequency (%),Unnamed: 3
1,6,85.7%,
0,1,14.3%,

First 3 values
POINT (-89.33842242194574 41.738472130318065)
POINT (-94.0751569 42.8524378)
POINT (-77.49014349999999 35.1168925)

Last 3 values
POINT (-86.96784249999999 35.3126025)
POINT (-88.54006488 38.54255189)
POINT (-82.89188741000001 39.31833855000001)

0,1
1,POINT (-89.33842242194574 41.738472130318065)
2,POINT (-94.0751569 42.8524378)
3,POINT (-77.49014349999999 35.1168925)
4,POINT (-93.18829016768389 42.20617441415104)
5,POINT (-86.96784249999999 35.3126025)
6,POINT (-88.54006488 38.54255189)
7,POINT (-82.89188741000001 39.31833855000001)

0,1
Constant value,soil

Unnamed: 0,field_id,sample_centroid,sample_event_type,field_boundary,sampleInField
0,c5fdbde2-a4d0-4478-87be-b59a38a3120d,POINT (-94.0751569 42.8524378),soil,MULTIPOLYGON (((-94.08008537356005 42.84875892...,1
1,7361a746-f379-11e3-ae2b-3176bd53680f,POINT (-86.96784249999999 35.3126025),soil,MULTIPOLYGON (((-86.9643916666667 35.306851666...,1
2,73ba46a1-f379-11e3-ae2b-3176bd53680f,POINT (-89.33842242194574 41.738472130318065),soil,MULTIPOLYGON (((-89.33385700000002 41.73826299...,1
3,0e143638-241d-4f5d-b786-90ac81354836,POINT (-93.18829016768389 42.20617441415104),soil,MULTIPOLYGON (((-93.18610719970256 42.20380540...,1
4,7efbd6d2-f379-11e3-ae2b-3176bd53680f,POINT (-77.49014349999999 35.1168925),soil,MULTIPOLYGON (((-77.4903561858575 35.118930218...,1
