In [1]:
from urllib.request import Request, urlopen
import pandas as pd

In [2]:
# in some cases a pattern like the following can be used to download data into DBFS and read it with Spark

# import urllib.request
# urllib.request.urlretrieve("https://address.com/data.csv.gz","/tmp/data.csv.gz")
# dbutils.fs.mv("file:/tmp/data.csv.gz","dbfs:/data/data.csv.gz")
# df = spark.read.format('csv').load("dbfs:/data/data.csv.gz")

# but if it gives a 403 error we can fall back to importing via pandas

In [3]:
# formulate a request with a browser-like User-Agent header
req = Request("https://simplemaps.com/static/data/country-cities/gb/gb.csv")
req.add_header(
  'User-Agent', 
  'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0'
)

# use the request to populate a pandas dataframe; the context manager closes
# the HTTP response once it has been read, and the timeout stops the cell
# from hanging indefinitely if the server never answers
with urlopen(req, timeout=30) as content:
  pandas_df = pd.read_csv(content)

# convert to a spark dataframe
df = spark.createDataFrame(pandas_df)

# rows without coordinates cannot be placed on a map, so drop them.
# some towns in the data do not have a population stated:
# let's assume they are smaller towns and fill population in as 5000.
# the fill is restricted to the population columns so that lat/lng are
# never overwritten with 5000.
df = (
  df
  .dropna(subset=["lat", "lng"])
  .fillna(5000, subset=["population", "population_proper"])
)

In [4]:
# show the Spark dataframe as a table to sanity-check the loaded data
# (`display` is not imported here; presumably the notebook runtime's built-in helper)
display(df)

city,lat,lng,country,iso2,admin,capital,population,population_proper
London,51.514248,-0.093145,United Kingdom,GB,,primary,8567000.0,7421209.0
Birmingham,52.466667,-1.916667,United Kingdom,GB,Birmingham,admin,2285000.0,984333.0
Manchester,53.5,-2.216667,United Kingdom,GB,Manchester,admin,2230000.0,395515.0
Leeds,53.8,-1.583333,United Kingdom,GB,Leeds,admin,1529000.0,455123.0
Sheffield,53.36666700000001,-1.5,United Kingdom,GB,Sheffield,admin,1292900.0,552700.0
Glasgow,55.833333,-4.25,United Kingdom,GB,Glasgow City,admin,1160000.0,610268.0
Newcastle upon Tyne,54.98805600000001,-1.619444,United Kingdom,GB,Newcastle upon Tyne,admin,882000.0,192382.0
Nottingham,52.966667,-1.166667,United Kingdom,GB,Nottingham,admin,825600.0,305700.0
Liverpool,53.416667,-3.0,United Kingdom,GB,Liverpool,admin,811000.0,468945.0
Southend-on-Sea,51.533333,0.7,United Kingdom,GB,Southend-on-Sea,admin,618386.0,173600.0


In [5]:
from pyspark.sql.functions import col
 
# Bring latitude, longitude and population (scaled down by 1000) back to the
# driver, one Row per city.
heat_rows = df.select(
  col("lat"), col("lng"), col("population") / 1000
).collect()

# Render every row as a "[lat, lng, weight]" literal and join the literals
# with ",\n" so the result can be pasted inside a JavaScript array.
point_template = "[{}, {}, {}]"
data = ",\n".join(
  point_template.format(lat, lng, weight)
  for lat, lng, weight in heat_rows
)

In [6]:
# Render a Leaflet map with a heat layer. `data` is spliced into the script
# as the contents of the JavaScript array handed to L.heatLayer, so it must
# be a comma-separated list of "[lat, lng, intensity]" literals.
# The tile URL and the attribution link use https so they are not treated as
# mixed content when the notebook page itself is served over https.
displayHTML("""
<html>
<head>
 <link rel="stylesheet" href="https://unpkg.com/leaflet@1.3.1/dist/leaflet.css"
   integrity="sha512-Rksm5RenBEKSKFjgI3a41vrjkw4EVPlJ3+OiI65vTjIdo9brlAacEuKOiQ5OFh7cOI1bkDwLqdLw3Zg0cRJAAQ=="
   crossorigin=""/>
 <script src="https://unpkg.com/leaflet@1.3.1/dist/leaflet.js"
   integrity="sha512-/Nsx9X4HebavoBvEBuyp3I7od5tA0UzAxs+j83KgC8PU0kgB4XiK4Lfe4y4cgBtaRJQEIFCW+oC506aPT2L1zw=="
   crossorigin=""></script>
 <script src="https://cdnjs.cloudflare.com/ajax/libs/leaflet.heat/0.2.0/leaflet-heat.js"></script>
</head>
<body>
  <div id="uk_map_id" style="width:768px; height:1024px"></div>
  <script>
    var uk_map = L.map('uk_map_id').setView([55,-5], 6);
    var tiles = L.tileLayer('https://tile.openstreetmap.org/{z}/{x}/{y}.png', {
      attribution: '&copy; <a href="https://www.openstreetmap.org/copyright">OpenStreetMap</a> contributors',
    }).addTo(uk_map);
    var heat = L.heatLayer([""" + data + """], {radius: 30}).addTo(uk_map);
  </script>
</body>
</html>
""")