# Map Data Prototyping

## Setup

In [None]:
import io
import json
import os
import time

In [None]:
import pandas
import geocoder
import folium
import folium.plugins
from tqdm.auto import tqdm
import requests
import boto3
import numpy

In [None]:
# Register tqdm with pandas so the `progress_apply` calls used later in this
# notebook are available and show a progress bar.
tqdm.pandas()

## Reading Data

In [None]:
# Load the notebook configuration (bucket and object names are read from it
# below). The context manager closes the file handle once the JSON is parsed,
# instead of leaving it open until garbage collection.
with open("../creds/config.json", encoding="utf-8") as config_file:
    config = json.load(config_file)

In [None]:
# Key and bucket of the input data set, as named in the config file.
object_name = config["data_object_name"]
bucket_name = config["data_bucket_name"]
# Human-readable s3:// form of the same location.
s3_url = f"s3://{bucket_name}/{object_name}"

In [None]:
# Create an S3 client and fetch the input object named by the config values.
# The same client is reused at the end of the notebook to upload the map.
s3 = boto3.client("s3")
s3_obj = s3.get_object(Bucket=bucket_name, Key=object_name)

In [None]:
# Pull the whole object body into memory, then let pandas parse it as Parquet
# from an in-memory buffer.
parquet_bytes = s3_obj['Body'].read()
data_df = pandas.read_parquet(io.BytesIO(parquet_bytes))

In [None]:
# Inspect which columns the loaded data set provides.
data_df.columns

## Geocoding

Doing this on a per-location basis, as this is more efficient than geocoding every row of the data directly.

In [None]:
# How often each primary location string occurs.
data_df["Master 1 Location"].value_counts()

In [None]:
# How often each override location string occurs.
data_df["Master 1 Location Override"].value_counts()

In [None]:
# Every distinct place string appearing in either location column; each one
# is geocoded once below rather than once per row.
locations = set(data_df["Master 1 Location"].unique()).union(
    data_df["Master 1 Location Override"].unique()
)

Using two free-to-access geocoding services, namely OSM and ArcGIS (isn't ESRI nice?).

In [None]:
# Bounding box used to reject geocoding hits outside the area of interest.
# Layout is [[min_x, min_y], [max_x, max_y]]: lookup_loc compares a result's
# x against bounds[0][0]..bounds[1][0] and its y against
# bounds[0][1]..bounds[1][1] (presumably longitude/latitude in degrees --
# confirm against the geocoder docs).
# The commented-out line is a wider alternative box in the same layout.
#za_geo_bounds = ((16.3449768409, -34.8191663551), (32.830120477, -22.0913127581))
cape_geo_bounds = [[16, -35.077], [30, -30]]

In [None]:
def lookup_loc(gc, gc_session, loc_string, bounds, delay=0.1, timeout=10):
    """Geocode ``loc_string`` and return the hit only if it lies inside ``bounds``.

    Args:
        gc: Geocoding callable, invoked as
            ``gc(loc_string, session=..., timeout=...)``. Its result must
            expose ``ok``, ``x`` and ``y`` attributes.
        gc_session: Session object handed through to ``gc``.
        loc_string: Place name to look up.
        bounds: ``[[min_x, min_y], [max_x, max_y]]`` box; both edges are
            inclusive.
        delay: Seconds to pause before the request, as a simple throttle.
            Pass ``0`` to disable it.
        timeout: Timeout value forwarded to ``gc``.

    Returns:
        The geocoder result when the lookup succeeded and the point falls
        inside ``bounds``; otherwise ``None``.
    """
    if delay:
        # Be polite to the free geocoding services.
        time.sleep(delay)

    new_loc = gc(loc_string, session=gc_session, timeout=timeout)

    # Check `ok` first: x/y are only compared once the lookup has succeeded.
    if not new_loc.ok:
        return None

    (min_x, min_y), (max_x, max_y) = bounds
    if min_x <= new_loc.x <= max_x and min_y <= new_loc.y <= max_y:
        return new_loc

    return None

In [None]:
# Geocode every distinct place name. ArcGIS is tried first, then OSM; for each
# service the name is tried verbatim, then title-cased, then with
# ", South Africa" appended. The first hit accepted by lookup_loc wins.
location_lookup = {}

with requests.Session() as osm_session, requests.Session() as ag_session:
    geocoders = (
        (geocoder.arcgis, ag_session),
        (geocoder.osm, osm_session),
    )

    for loc in tqdm(locations, desc="Place Lookup"):
        # Missing values can surface as None or NaN; only real strings can be
        # geocoded (and only strings have .strip()).
        if not isinstance(loc, str):
            continue

        loc_string = loc.strip()
        # Skip blanks and the literal "None" placeholder.
        if loc_string in {"", "None"}:
            continue

        # Query variants, in the order they should be attempted.
        candidates = [loc_string]
        if loc_string != loc_string.title():
            candidates.append(loc_string.title())
        if "South Africa" not in loc_string:
            candidates.append(f"{loc_string}, South Africa")

        # Lazy generator: no further requests are made once a hit is found.
        attempts = (
            lookup_loc(gc, session, candidate, cape_geo_bounds)
            for gc, session in geocoders
            for candidate in candidates
        )
        new_loc = next((hit for hit in attempts if hit is not None), None)

        if new_loc is None:
            print(f"lookup for '{loc}' failed...")
        else:
            # Keyed by the raw (unstripped) value so that lookups using the
            # DataFrame's own strings match.
            location_lookup[loc] = new_loc

In [None]:
# Translate each primary location string into its geocoded result; strings
# without an entry in location_lookup become None.
primary_names = data_df["Master 1 Location"]
data_df["Master1Location"] = primary_names.progress_apply(
    lambda place: location_lookup.get(place)
)

In [None]:
# Geocoded result for each override string (None when there is no entry).
override_locations = data_df["Master 1 Location Override"].progress_apply(
    lambda place: location_lookup.get(place)
)

# An override, where one resolved, replaces the primary location; rows
# without a resolved override keep what they already had.
has_override = override_locations.notna()
data_df.Master1Location = override_locations.where(
    has_override, data_df.Master1Location
)

Check to see how we're doing:

In [None]:
# Share of rows that ended up with a geocoded location, as a percentage.
located = data_df.Master1Location.notna()
f"{located.sum() / located.shape[0]:.2%}"

## Map Time

### Map Setup

In [None]:
# Base map with no default tiles (layers are added explicitly below).
# The initial zoom keyword in folium is `zoom_start`; the previous
# `starting_zoom` was not a recognised parameter, so the map opened at
# folium's default zoom instead of 4.
m = folium.Map(
    location=[-32, 24],
    tiles=None,
    attr="Me!",
    zoom_start=4,
    min_zoom=4,
    #max_zoom=10,
)

Used MapTiler software to convert a scan of a 19th Century Cape map into tile server files.

Use [this site](https://epsg.io/transform#s_srs=4326&t_srs=3857&x=16.0000000&y=-35.0770000) to convert the map bounds between coordinate systems.

In [None]:
# Add the historical map as a tile layer, served from the S3-hosted
# {z}/{x}/{y}.png tile set. The commented-out path is a local copy of the
# tiles that can be used instead.
folium.TileLayer(
    #"file:///home/gordon/workspace/cfs-map/resources/cfs-map-tiles4/{z}/{x}/{y}.png",
    'https://cfs-map-output.s3-eu-west-1.amazonaws.com/cfs-map-tiles/{z}/{x}/{y}.png', 
    attr="Me!",
    name='19th Century Cape',
    min_zoom=4,
    #max_zoom=8,
    #max_native_zoom=8
).add_to(m)

In [None]:
# Add a second tile layer named "Modern Map", using the 'CartoDB positron'
# tile set.
folium.TileLayer(
    'CartoDB positron',
    name='Modern Map',
    #min_zoom=8
).add_to(m)

### Individual Markers

Going to use marker clusters for now...

In [None]:
# markers = data_df[
#     data_df.Master1Location.notna() & (data_df.Gender == "M")
# ].progress_apply(
#     lambda row: (
#         folium.Marker(
#             location=[row.Master1Location.y, 
#                       row.Master1Location.x],
#             popup=f"{row.Name} ({row['Master 1 Location']})",
#             icon=folium.Icon(icon='male', prefix='fa', color='blue')
            
#         )
#     ),
#     axis=1
# )
# print(markers.shape)

# markers = pandas.concat([markers, data_df[
#     data_df.Master1Location.notna() & (data_df.Gender == "F")
# ].progress_apply(
#     lambda row: (
#         folium.Marker(
#             location=[row.Master1Location.y, 
#                       row.Master1Location.x],
#             popup=f"{row.Name} ({row['Master 1 Location']})",
#             icon=folium.Icon(icon='female', prefix='fa', color='red'),
#         )
#     ),
#     axis=1
# )])
# print(markers.shape)

In [None]:
# for marker in markers:
#     marker.add_to(m)

### Marker Clusters

In [None]:
# Icon name per normalised (lower-cased, stripped) profession. The names are
# used with the 'fa' icon prefix when markers are built; professions not
# listed here fall back to a default chosen at lookup time.
job_icon_lookup = {
    "farmworker": "leaf",
    "farmer": "leaf",
    "domestic": "home",
    "servant": "home",
    # The misspelling is kept in case the data contains it; the correct
    # spelling is mapped too so it no longer falls through to the default.
    "sheperd": "leaf",
    "shepherd": "leaf",
    "baker's apprentice": "shopping-basket",
    "farm labourer": "leaf",
    "groomsman": "leaf",
    "store servant": "shopping-basket",
    "domestic and gardener": "home",
    "gardener": "leaf",
    "farm servant": "leaf",
    "stableboy": "leaf",
    "house servant": "home",
    "blacksmith's apprentice": "shopping-basket",
    "bakers apprentice": "shopping-basket",
}

In [None]:
# Pick a marker icon for each row from its first profession. Only real
# strings are normalised and looked up; missing values (None, NaN) and any
# other non-string entry get the default "male" icon instead of raising on
# `.lower()`.
data_df["Position1Icon"] = data_df["Profession 1"].progress_apply(
    lambda job: job_icon_lookup.get(
        job.lower().strip() if isinstance(job, str) else None,
        "male",
    )
)

In [None]:
# Marker colour per normalised (lower-cased, stripped) gender code.
# get_clusters falls back to "beige" for any value not listed here.
colour_lookup = {
    "m": "blue",
    "f": "red",
}

In [None]:
def get_clusters(data_df, marker_options=None):
    """Build a ``folium.plugins.MarkerCluster`` with one marker per row.

    Args:
        data_df: DataFrame of rows to plot. Every row needs a
            ``Master1Location`` object exposing ``x`` and ``y``, a
            ``Position1Icon`` value, a ``Gender`` value and the popup columns
            listed in ``popup_columns`` below.
        marker_options: Optional dict of extra keyword arguments forwarded to
            ``MarkerCluster`` (for example ``name`` or ``show``).

    Returns:
        A ``MarkerCluster`` built from parallel lists of locations, popups
        and icons.
    """
    popup_columns = (
        'Name', 'Gender', 'DOB', 'Arrival Date', 'Ship',
        'Master 1 Name', 'Master 1 occupation', 'Master 1 Location',
        'Profession 1', 'Orphan', 'Any living relatives'
    )

    def popup_for(row):
        # One "<strong>Column</strong>: value" line per populated column.
        # Missing values (None/NaN/NaT) and empty strings are left out.
        # NOTE(review): values are interpolated into the HTML unescaped
        # (parse_html=False) -- fine for trusted data only; confirm.
        lines = [
            f"<strong>{col.title()}</strong>: {row[col]}"
            for col in popup_columns
            if not pandas.isna(row[col]) and row[col] != ""
        ]
        return folium.map.Popup(
            html="<br>".join(lines),
            parse_html=False,
            max_width=200,
        )

    def icon_for(row):
        # Only string genders are normalised; anything else (None, NaN, ...)
        # takes the "beige" fallback rather than failing on `.lower()`.
        gender = row.Gender
        gender_key = gender.lower().strip() if isinstance(gender, str) else None
        return folium.Icon(
            icon=row.Position1Icon,
            prefix='fa',
            color=colour_lookup.get(gender_key, "beige"),
        )

    locations = []
    popups = []
    icons = []
    # Single pass over the rows keeps the three lists aligned.
    for _, row in data_df.iterrows():
        # Markers are placed as [y, x] taken from the geocoded location.
        locations.append([row.Master1Location.y, row.Master1Location.x])
        popups.append(popup_for(row))
        icons.append(icon_for(row))

    return folium.plugins.MarkerCluster(
        locations, popups, icons,
        **(marker_options or {})
    )

### Organising Clusters into Years 

In [None]:
# Distinct arrival years as integers. Values that cannot be parsed as numbers
# are coerced to NaN and dropped.
numeric_years = pandas.to_numeric(
    data_df['Arrival Date'].unique(), errors='coerce'
)
year_set = pandas.Series(numeric_years).dropna().astype(int)

In [None]:
def get_year_clusters(cumulative=False, include_missing=False):
    """Build one marker cluster per arrival year found in ``year_set``.

    Only rows with a geocoded ``Master1Location`` are included.

    Args:
        cumulative: When true, each year's cluster also contains all rows
            from earlier years (``<=``); otherwise only rows from exactly
            that year.
        include_missing: When true, append a final ``"?"`` cluster holding
            rows whose arrival date is not one of the years in ``year_set``.

    Returns:
        A list of ``(label, cluster)`` tuples in ascending year order, where
        ``label`` is the year as a string (or ``"?"``).
    """
    # Both masks are loop-invariant, so compute them once up front.
    arrival_years = pandas.to_numeric(data_df["Arrival Date"], errors='coerce')
    has_location = data_df.Master1Location.notna()

    clusters = []
    for year in sorted(year_set.values):
        year_filter = arrival_years <= year if cumulative else arrival_years == year
        label = str(year)
        clusters.append(
            (label, get_clusters(data_df[has_location & year_filter], {"name": label}))
        )

    if include_missing:
        # Unparseable dates were coerced to NaN above, so they land here too.
        unknown_year = ~arrival_years.isin(year_set)
        clusters.append(
            ("?", get_clusters(data_df[has_location & unknown_year], {"name": "?"}))
        )

    return clusters

#### All Entries

In [None]:
# One cluster holding every geocoded row, registered as an overlay that is
# hidden until it is switched on.
located_rows = data_df[data_df.Master1Location.notna()]
all_entries_options = dict(name="All Entries", overlay=True, show=False)
all_entries = get_clusters(located_rows, all_entries_options)
all_entries.add_to(m)

#### Year Entries

In [None]:
# Add a separate cluster per arrival year (plus the "?" bucket), each hidden
# by default.
for _label, year_cluster in get_year_clusters(include_missing=True):
    year_cluster.show = False
    year_cluster.add_to(m)

#### Story Map

In [None]:
# Build a timeline layer from the cumulative per-year clusters and add it to
# the map.
# NOTE(review): `StoryMap` does not appear to be part of upstream
# folium.plugins -- presumably this relies on a patched/forked folium; confirm.
story_map = folium.plugins.StoryMap(get_year_clusters(cumulative=True), pan_zoom=6, 
                                    name="Timeline", overlay=True, control=True)
story_map.add_to(m)

### Image Overlay

Using a tile server instead.

In [None]:
#img_path = os.path.join("../dist/", "SA_18Century_Cropped.png")

In [None]:
#correction = (-0.085, 0.115)

In [None]:
# img = folium.raster_layers.ImageOverlay(
#     name='19th Century Map',
#     image=img_path,
#     bounds=[[-35.077 + correction[0], 16 + correction[1]], 
#             [[-30 - correction[0], 30 - correction[1]]]],
#     interactive=True,
#     cross_origin=False,
#     zindex=1,
# )

# img.add_to(m)

### Output

In [None]:
# Add the layer switcher, shown expanded rather than collapsed. This is done
# after the tile layers and clusters above have been added to the map.
folium.LayerControl(collapsed=False).add_to(m)

In [None]:
# Local path the rendered HTML map is written to before being uploaded.
output_path = '../dist/map_test_with_zoom_with_img.html'

In [None]:
# Write the map out as an HTML file at output_path.
m.save(output_path)

In [None]:
# Re-read the configuration so this output section can be run on its own.
# The context manager closes the file handle once the JSON is parsed.
with open("../creds/config.json", encoding="utf-8") as config_file:
    config = json.load(config_file)

In [None]:
# Key and bucket for the published map, as named in the config file. Note
# that this rebinds object_name / bucket_name, which earlier pointed at the
# input data.
object_name = config["output_object_name"]
bucket_name = config["output_bucket_name"]
# Human-readable s3:// form of the same location.
s3_url = f"s3://{bucket_name}/{object_name}"

In [None]:
# Upload the rendered map as a publicly readable HTML object. The file is
# opened in a context manager so the handle is closed after the upload
# instead of being leaked.
with open(output_path, 'rb') as html_file:
    upload_response = s3.put_object(
        ACL='public-read',
        Body=html_file,
        Bucket=bucket_name,
        Key=object_name,
        ContentType='text/html',
    )

# Keep the response as the cell's displayed value, as before.
upload_response

In [None]:
# Show the finished map inline as the cell output.
m