In [1]:
from math import *

def OSGB36toWGS84(E,N):
 #E, N are the British national grid coordinates - eastings and northings
    a, b = 6377563.396, 6356256.909     #The Airy 180 semi-major and semi-minor axes used for OSGB36 (m)
    F0 = 0.9996012717                   #scale factor on the central meridian
    lat0 = 49*pi/180                    #Latitude of true origin (radians)
    lon0 = -2*pi/180                    #Longtitude of true origin and central meridian (radians)
    N0, E0 = -100000, 400000            #Northing & easting of true origin (m)
    e2 = 1 - (b*b)/(a*a)                #eccentricity squared
    n = (a-b)/(a+b)

    #Initialise the iterative variables
    lat,M = lat0, 0
    while N-N0-M >= 0.00001: #Accurate to 0.01mm
        lat = (N-N0-M)/(a*F0) + lat;
        M1 = (1 + n + (5./4)*n**2 + (5./4)*n**3) * (lat-lat0)
        M2 = (3*n + 3*n**2 + (21./8)*n**3) * sin(lat-lat0) * cos(lat+lat0)
        M3 = ((15./8)*n**2 + (15./8)*n**3) * sin(2*(lat-lat0)) * cos(2*(lat+lat0))
        M4 = (35./24)*n**3 * sin(3*(lat-lat0)) * cos(3*(lat+lat0))
        #meridional arc
        M = b * F0 * (M1 - M2 + M3 - M4)

    #transverse radius of curvature
    nu = a*F0/sqrt(1-e2*sin(lat)**2)

    #meridional radius of curvature
    rho = a*F0*(1-e2)*(1-e2*sin(lat)**2)**(-1.5)
    eta2 = nu/rho-1

    secLat = 1./cos(lat)
    VII = tan(lat)/(2*rho*nu)
    VIII = tan(lat)/(24*rho*nu**3)*(5+3*tan(lat)**2+eta2-9*tan(lat)**2*eta2)
    IX = tan(lat)/(720*rho*nu**5)*(61+90*tan(lat)**2+45*tan(lat)**4)
    X = secLat/nu
    XI = secLat/(6*nu**3)*(nu/rho+2*tan(lat)**2)
    XII = secLat/(120*nu**5)*(5+28*tan(lat)**2+24*tan(lat)**4)
    XIIA = secLat/(5040*nu**7)*(61+662*tan(lat)**2+1320*tan(lat)**4+720*tan(lat)**6)
    dE = E-E0

    #These are on the wrong ellipsoid currently: Airy1830. (Denoted by _1)
    lat_1 = lat - VII*dE**2 + VIII*dE**4 - IX*dE**6
    print (lat_1)
    lon_1 = lon0 + X*dE - XI*dE**3 + XII*dE**5 - XIIA*dE**7

    #Want to convert to the GRS80 ellipsoid. 
    #First convert to cartesian from spherical polar coordinates
    H = 0 #Third spherical coord. 
    x_1 = (nu/F0 + H)*cos(lat_1)*cos(lon_1)
    y_1 = (nu/F0+ H)*cos(lat_1)*sin(lon_1)
    z_1 = ((1-e2)*nu/F0 +H)*sin(lat_1)

    #Perform Helmert transform (to go between Airy 1830 (_1) and GRS80 (_2))
    s = -20.4894*10**-6 #The scale factor -1
    tx, ty, tz = 446.448, -125.157, + 542.060 #The translations along x,y,z axes respectively
    rxs,rys,rzs = 0.1502,  0.2470,  0.8421  #The rotations along x,y,z respectively, in seconds
    rx, ry, rz = rxs*pi/(180*3600.), rys*pi/(180*3600.), rzs*pi/(180*3600.) #In radians
    x_2 = tx + (1+s)*x_1 + (-rz)*y_1 + (ry)*z_1
    y_2 = ty + (rz)*x_1  + (1+s)*y_1 + (-rx)*z_1
    z_2 = tz + (-ry)*x_1 + (rx)*y_1 +  (1+s)*z_1

    #Back to spherical polar coordinates from cartesian
    #Need some of the characteristics of the new ellipsoid    
    a_2, b_2 =6378137.000, 6356752.3141 #The GSR80 semi-major and semi-minor axes used for WGS84(m)
    e2_2 = 1- (b_2*b_2)/(a_2*a_2)   #The eccentricity of the GRS80 ellipsoid
    p = sqrt(x_2**2 + y_2**2)

    #Lat is obtained by an iterative proceedure:   
    lat = atan2(z_2,(p*(1-e2_2))) #Initial value
    print ("Lat before iteration", lat)
    latold = 2*pi
    while abs(lat - latold)>10**-16: 
        lat, latold = latold, lat
        nu_2 = a_2/sqrt(1-e2_2*sin(latold)**2)
        lat = atan2(z_2+e2_2*nu_2*sin(latold), p)


    #Lon and height are then pretty easy
    lon = atan2(y_2,x_2)
    H = p/cos(lat) - nu_2


    #Convert to degrees
    lat = lat*180/pi
    lon = lon*180/pi

    #Job's a good'n. 
    return lat, lon

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
4,,pyspark,idle,,,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [2]:
print (OSGB36toWGS84(516276, 173141))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

0.8978814472874397
Lat before iteration 0.8978904491757984
(51.44533171139635, -0.32824830611195455)

In [3]:
import sys

sys.argv.append('--JOB_NAME')
sys.argv.append('address-cleaning')

sys.argv.append('--source_catalog_database')
sys.argv.append('env-enforcement-raw-zone')

sys.argv.append('--source_catalog_table')
sys.argv.append('noiseworks_case')

sys.argv.append('--cleaned_repairs_s3_bucket_target')
sys.argv.append('s3://dataplatform-stg-refined-zone/housing-repairs/repairs-avonline/cleaned')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.sql.functions import rank, col, trim, when, max, trim
import pyspark.sql.functions as F
from pyspark.sql.functions import udf
from pyspark.sql.types import *
from awsglue.dynamicframe import DynamicFrame

#TODO get these functions from helpers

def get_glue_env_var(key, default="none"):
    if f'--{key}' in sys.argv:
        return getResolvedOptions(sys.argv, [key])[key]
    else:
        return default

def getLatestPartitions(dfa):
   dfa = dfa.where(col('import_year') == dfa.select(max('import_year')).first()[0])
   dfa = dfa.where(col('import_month') == dfa.select(max('import_month')).first()[0])
   dfa = dfa.where(col('import_day') == dfa.select(max('import_day')).first()[0])
   return dfa

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
args = getResolvedOptions(sys.argv, ['JOB_NAME'])

source_catalog_database = get_glue_env_var('source_catalog_database', '')
source_catalog_table    = get_glue_env_var('source_catalog_table', '')
cleaned_repairs_s3_bucket_target = get_glue_env_var('cleaned_repairs_s3_bucket_target', '')


sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
logger = glueContext.get_logger()
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

logger.info('Fetch Source Data')

source_data = glueContext.create_dynamic_frame.from_catalog(
    name_space=source_catalog_database,
    table_name=source_catalog_table,
#     push_down_predicate="import_date==max(import_date)"
) 



FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

An error was encountered:
name 'df' is not defined
Traceback (most recent call last):
NameError: name 'df' is not defined



In [16]:
df = df.withColumn('easting',df['easting'].cast("double"))
df = df.withColumn('northing',df['northing'].cast("double"))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
df=source_data.toDF()
df.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---+--------------------+-------------+-----+-------------------+------------+-------+--------+------+--------------------+---------+---------+------+-----------+--------------------+-----------------+-----------+------------+----------+-----------+
| id|             created|created_by_id| kind|         kind_other|        uprn|easting|northing|radius|      location_cache|     ward|    where|estate|assigned_id|     import_datetime| import_timestamp|import_year|import_month|import_day|import_date|
+---+--------------------+-------------+-----+-------------------+------------+-------+--------+------+--------------------+---------+---------+------+-----------+--------------------+-----------------+-----------+------------+----------+-----------+
|680|2021-10-21T08:40:...|             |   tv|                   |            | 532461|  186439|   180|180m around a poi...|E05009369|residence|     y|       1925|2022-02-08 02:01:...|1644285718.432042|       2022|          02|        08|   202202

In [17]:
udfConvert = udf(OSGB36toWGS84, ArrayType(DoubleType()))
df = df.withColumn('latlon', udfConvert(df['easting'], df['northing']))
df = df.withColumn('lat', df['latlon'].getItem(0))
df = df.withColumn('lon', df['latlon'].getItem(1))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [18]:
df.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- id: string (nullable = true)
 |-- created: string (nullable = true)
 |-- created_by_id: string (nullable = true)
 |-- kind: string (nullable = true)
 |-- kind_other: string (nullable = true)
 |-- uprn: string (nullable = true)
 |-- easting: double (nullable = true)
 |-- northing: double (nullable = true)
 |-- radius: string (nullable = true)
 |-- location_cache: string (nullable = true)
 |-- ward: string (nullable = true)
 |-- where: string (nullable = true)
 |-- estate: string (nullable = true)
 |-- assigned_id: string (nullable = true)
 |-- import_datetime: timestamp (nullable = true)
 |-- import_timestamp: string (nullable = true)
 |-- import_year: string (nullable = true)
 |-- import_month: string (nullable = true)
 |-- import_day: string (nullable = true)
 |-- import_date: string (nullable = true)
 |-- latlon: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- lat: double (nullable = true)
 |-- lon: double (nullable = true)

In [19]:
df.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---+--------------------+-------------+-----+-------------------+------------+--------+--------+------+--------------------+---------+---------+------+-----------+--------------------+-----------------+-----------+------------+----------+-----------+--------------------+------------------+--------------------+
| id|             created|created_by_id| kind|         kind_other|        uprn| easting|northing|radius|      location_cache|     ward|    where|estate|assigned_id|     import_datetime| import_timestamp|import_year|import_month|import_day|import_date|              latlon|               lat|                 lon|
+---+--------------------+-------------+-----+-------------------+------------+--------+--------+------+--------------------+---------+---------+------+-----------+--------------------+-----------------+-----------+------------+----------+-----------+--------------------+------------------+--------------------+
|680|2021-10-21T08:40:...|             |   tv|               