In [0]:
#All imports go here
from shapely.geometry import shape
from shapely.ops import transform
from shapely.geometry import MultiPolygon
from shapely import wkt
from shapely.ops import transform

from functools import partial
import pyproj

import pyspark.sql.functions as pys
from pyspark.sql.types import DoubleType, StringType, FloatType, IntegerType, BooleanType
from pyspark.sql.functions import udf
from pyspark.sql.functions import sum
from pyspark.sql.functions import col

# import spark_df_profiling
      

In [0]:
# Input locations: every source is a glob over JSON files below `root`.
root = ""
fieldDir = "{0}field/*.json".format(root)
growerDir = "{0}grower/*.json".format(root)
countriesDir = "{0}country/*.json".format(root)
sampleDir = "{0}sample_event/*.json".format(root)

In [0]:
# Load the field records and keep the id, the boundary map's area and
# boundary, and the grower/farm associations (renamed to grower_id / farm_id).
fieldMain = (
    sqlContext.read.json(fieldDir)
    .select(
        'field.id',
        'field.boundary_map.area',
        'field.boundary_map.boundary',
        'field.associations.`agrian.grower`',
        'field.associations.`agrian.farm`',
    )
    .withColumnRenamed('agrian.farm', 'farm_id')
    .withColumnRenamed('agrian.grower', 'grower_id')
)

In [0]:
# Print the schema of the columns selected from the field records.
fieldMain.printSchema()

In [0]:
#Function to convert MultiPoly to Area in Acre
def getAreaOfMultiPoly(columnElement):
    """Return the area, in acres, of a boundary given as WKT text.

    The WKT is parsed into a MultiPolygon, reprojected from EPSG:4326 to
    EPSG:26913, and the projected area is divided by 4046.86 (the number of
    square metres in an acre) to express it in acres.

    NOTE(review): one fixed destination projection (EPSG:26913) is used for
    every field; presumably all fields lie inside that zone -- confirm,
    because areas far outside it will be distorted.

    Args:
        columnElement: WKT text of the field boundary.

    Returns:
        float: the area in acres, or 0.0 when the value cannot be parsed or
        projected. The fallback is a float on purpose: this function backs a
        DoubleType UDF, and Spark does not coerce a Python int result to a
        double (an int would surface as null).
    """
    try:
        # str() works on both Python 2 and 3; the previous unicode(...) call
        # raised NameError on Python 3 and made every row fall back to zero.
        geometry = MultiPolygon(wkt.loads(str(columnElement)))

        project = partial(
            pyproj.transform,
            pyproj.Proj(init='epsg:4326'),   # source coordinate system
            pyproj.Proj(init='epsg:26913'))  # destination coordinate system

        return transform(project, geometry).area / 4046.86
    except Exception:
        # Best effort: a malformed boundary must not fail the Spark job.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate.
        return 0.0
    
#Check if field contains non empty boundary data
def isBoundaryPresent(columnElement):
    """Tell whether a boundary value holds actual geometry text.

    Args:
        columnElement: the boundary value (expected to be WKT text); may be
            None or empty.

    Returns:
        bool: False for a missing/falsy value, for text that is empty or only
        whitespace, and for the literal "MULTIPOLYGON EMPTY"; True otherwise.
        False is also returned when the value cannot be converted to text.
    """
    if not columnElement:
        return False
    try:
        # str() instead of the Python 2-only unicode(): under Python 3 the old
        # call raised NameError, which the bare except turned into False for
        # every row.
        text = str(columnElement).strip()
    except Exception:
        # e.g. non-ASCII unicode under Python 2 cannot be converted by str().
        return False
    # Surrounding whitespace is ignored so that any blank string (not only ""
    # and " ") counts as "no boundary".
    return text not in ("", "MULTIPOLYGON EMPTY")
    
    
#Function to calculate % difference in area calculated and area in JSON
def calculateDifferencePercentageInArea(area, areaCalculated):
    """Return how far `area` is from `areaCalculated`, as a percentage.

    Computes (area - areaCalculated) * 100 / areaCalculated.

    Args:
        area: the area value taken from the JSON record.
        areaCalculated: the area computed from the boundary (the reference).

    Returns:
        float: the signed percentage difference, or 0.0 when it cannot be
        computed (zero reference area, or non-numeric/None operands). The
        fallback is a float so that a DoubleType UDF built on this function
        does not yield null for it.
    """
    try:
        # 100.0 forces true division even for int operands on Python 2.
        return (area - areaCalculated) * 100.0 / areaCalculated
    except (ArithmeticError, TypeError, ValueError):
        return 0.0

# Spark UDF wrappers around the helper functions defined in this cell.
getAreaOfMultiPoly_udf = udf(getAreaOfMultiPoly, DoubleType())


def _areaDifferenceFromColumns(*values):
    """Adapt UDF arguments for calculateDifferencePercentageInArea.

    Accepts either two separate values (area, areaCalculated) or a single
    pair/struct holding both. The single-pair form is what the previous
    `lambda (x,y): ...` accepted; that tuple-parameter syntax exists only in
    Python 2 and rejected a two-column call.
    """
    if len(values) == 1:
        values = values[0]
    area, areaCalculated = values
    return calculateDifferencePercentageInArea(float(area), float(areaCalculated))


getAreaDifferencePercentage = udf(_areaDifferenceFromColumns, DoubleType())
checkIfBoundaryPresent = udf(isBoundaryPresent, BooleanType())



In [0]:
# Flag every field with the result of the boundary-presence UDF.
fieldMain = fieldMain.withColumn(
    'isBoundaryPresent',
    checkIfBoundaryPresent(fieldMain['boundary']),
)

In [0]:
# Turn the grower and farm association columns into one row per element:
# first per grower, then per farm.
fieldMain = (
    fieldMain
    .withColumn("grower_id", pys.explode(pys.col("grower_id")))
    .withColumn("farm_id", pys.explode(pys.col("farm_id")))
)

In [0]:
# Build a lookup from grower id to the country's alpha3 value.
countries = sqlContext.read.json(countriesDir).select('country.id', 'country.alpha3')

growers = (
    sqlContext.read.json(growerDir)
    .select('grower.id', 'grower.address.country_id')
    .withColumnRenamed('id', 'grower_id_temp')
)

# Inner join on the country id, then keep only grower_id_temp and the
# alpha3 value (renamed to `country`).
growerToCountryMap = (
    growers
    .join(countries, growers['country_id'] == countries['id'])
    .drop('country_id', 'id')
    .withColumnRenamed('alpha3', 'country')
)
growerToCountryMap.show()
#Free the memory!
growers = countries = None


In [0]:
# Attach the grower's country to each field; the left outer join keeps fields
# whose grower has no entry in the lookup.
fieldMain = fieldMain.join(
    growerToCountryMap,
    fieldMain['grower_id'] == growerToCountryMap['grower_id_temp'],
    'left_outer',
).drop('grower_id_temp')
growerToCountryMap = None

In [0]:
# Count the sample events recorded for each field.
sample = sqlContext.read.json(sampleDir)
# toDF() renames columns by position, so the new names must follow the select
# order: event id, associated field, event type. (They were previously
# swapped, which grouped by the event id and later joined it against field ids.)
sample = sample.select(
    'sample_event.id',
    'sample_event.associations.`agrian.field`',
    'sample_event.sample_event_type',
).toDF('sample_event_id', 'field_id_temp', 'event_type')
# NOTE(review): the field-side associations in this notebook are exploded
# before use, so the `agrian.field` association is presumably an array as
# well -- confirm. It is exploded only when the loaded schema says so.
if dict(sample.dtypes)['field_id_temp'].startswith('array'):
    sample = sample.withColumn('field_id_temp', pys.explode(sample['field_id_temp']))
sample = sample.groupBy('field_id_temp').agg(pys.count("event_type").alias("sample_event_count"))


In [0]:
# Add the per-field sample event count; the left outer join keeps fields that
# have no matching row in `sample`.
fieldMain = fieldMain.join(
    sample,
    fieldMain['id'] == sample['field_id_temp'],
    'left_outer',
).drop('field_id_temp')
fieldMain.printSchema()


In [0]:
# Write the summary as JSON from a single partition. save() takes an output
# directory, not a glob: the target is "fieldSummary.json", the directory the
# notebook reads back with "fieldSummary.json/*.json". The previous target
# "fieldSummary/*.json" never matched that read path.
fieldMain.coalesce(1).write.format("json").save("fieldSummary.json")

In [0]:
# Load the JSON files found under the fieldSummary.json directory.
processedJson = sqlContext.read.format("json").load("fieldSummary.json/*.json")

In [0]:
# Preview the first rows of the reloaded summary.
processedJson.show()

In [0]:
# Number of summary rows that have an area value.
processedJson.where(processedJson['area'].isNotNull()).count()


In [0]:
# Number of summary rows whose area value is missing.
processedJson.where(processedJson['area'].isNull()).count()

In [0]:
def getDigitVal(value):
    """Return `value` when it is numeric text, otherwise the string "0".

    The previous check `value.isdigit` referenced the method without calling
    it, so it was always truthy and non-numeric text passed through
    unchanged. The value is validated by parsing it as a float rather than
    with `isdigit()`, because the result is later cast to float and
    `isdigit()` would reject decimal values such as "12.5".

    Args:
        value: the raw area value; may be None or empty.

    Returns:
        `value` unchanged when it parses as a number, else "0".
    """
    if not value:
        return "0"

    try:
        float(value)
    except (TypeError, ValueError):
        return "0"

    return value

# Wrap getDigitVal as a string-returning UDF and use it to rewrite `area`.
is_digit_udf = udf(getDigitVal, StringType())

processedJson2 = processedJson.withColumn('area', is_digit_udf(processedJson.area))
processedJson2.show()

In [0]:
# Keep rows that have an area and add `area2`, the area cast to float.
total = (
    processedJson2
    .where(processedJson2['area'].isNotNull())
    .withColumn("area2", processedJson2['area'].cast('float'))
)
total.select("area2").count()

In [0]:
# Narrow the frame to the numeric area column and show its summary statistics.
total = total.select(total['area2'])
total.describe().show()

In [0]:
# Split the areas into size buckets. Apart from the first two, each bucket is
# a half-open range (lower, upper].
# NOTE(review): the "<= 100" bucket has no lower bound, so it also contains
# the rows counted in totalError (area2 == 0) -- confirm that is intended.
totalError = total.filter("area2 == 0")
totalSmall100 = total.filter("area2 <= 100")
totalSmall400 = total.filter("area2 > 100 and area2 <= 400")
totalSmall800 = total.filter("area2 > 400 and area2 <= 800")
totalSmall1200 = total.filter("area2 > 800 and area2 <= 1200")
totalBig1200 = total.filter("area2 > 1200 and area2 <= 2000")
totalBig2000 = total.filter("area2 > 2000")

In [0]:
# Print a heading and the summary statistics for every size bucket.
# print(...) with a single string argument behaves the same on Python 2 and
# Python 3; the former `print "..."` statements are a SyntaxError on Python 3.
for bucketLabel, bucketFrame in (
    ("Error processing", totalError),
    ("Less than 100", totalSmall100),
    ("Between 100 and 400", totalSmall400),
    ("Between 400 and 800", totalSmall800),
    ("Between 800 and 1200", totalSmall1200),
    ("Between 1200 and 2000", totalBig1200),
    ("Above 2000", totalBig2000),
):
    print(bucketLabel)
    bucketFrame.describe().show()