In [20]:
from pathlib import Path
import pandas as pd
from lxml import etree

In [9]:
# Relative path of the KML file this notebook parses.
PATH_TO_DATASET = Path("datasets") / "Atlanta_Child_Murders.kml"

In [24]:
# Prefix map so XPath expressions can address KML 2.2 elements as "kml:<tag>".
ns = dict(kml='http://www.opengis.net/kml/2.2')

# Parse the KML document; keep both the tree and its root element available.
tree = etree.parse(PATH_TO_DATASET)
root = tree.getroot()


In [28]:

# Column order of the exported table (matches the keys of each record below).
_COLUMNS = ["Person", "Status", "Longitude", "Latitude", "Place"]

# (keyword, tag) pairs checked in order against the lower-cased folder name;
# the first keyword found wins.
_PLACE_KEYWORDS = (
    ("home", "Home address"),
    ("last seen", "Last seen"),
    ("found", "Body found"),
    ("school", "School"),
)


def _first_text(node, path, default=None):
    """Return the first text node matched by ``path`` under ``node``.

    Falls back to ``default`` when nothing matches, instead of raising
    ``IndexError`` on an element that has no (or an empty) child.
    """
    matches = node.xpath(path, namespaces=ns)
    return matches[0] if matches else default


def _place_tag(folder_name):
    """Map a folder name to a short "Place" tag.

    Returns the tag of the first keyword in ``_PLACE_KEYWORDS`` contained in
    the lower-cased name, or the folder name itself when no keyword matches.
    """
    lowered = folder_name.lower()
    for keyword, tag in _PLACE_KEYWORDS:
        if keyword in lowered:
            return tag
    return folder_name


def _lon_lat(coords_text):
    """Extract ``(lon, lat)`` floats from the result of a coordinates XPath query.

    ``coords_text`` is the list of text nodes returned by the query. Only the
    first two comma-separated fields of the first node are read. Returns
    ``(None, None)`` when the list is empty or the fields cannot be parsed,
    so a malformed placemark is recorded without coordinates rather than
    aborting the whole export.
    """
    if not coords_text:
        return None, None
    parts = coords_text[0].strip().split(',')
    try:
        return float(parts[0]), float(parts[1])
    except (IndexError, ValueError):
        return None, None


data = []

# Every Folder in the document; the folder name supplies the context
# (status and place type) for the placemarks directly inside it.
folders = root.xpath('//kml:Folder', namespaces=ns)

for folder in folders:
    # An unnamed folder is labelled "Unknown" instead of raising IndexError.
    folder_name = _first_text(folder, './kml:name/text()', default="Unknown")

    status = "Suspect" if "Wayne Williams" in folder_name else "Victim"
    place_tag = _place_tag(folder_name)

    # Only placemarks that are direct children of this folder.
    for pm in folder.xpath('./kml:Placemark', namespaces=ns):
        pm_name = _first_text(pm, './kml:name/text()')

        # In the suspect folder the placemark name is not used as the person;
        # every row there is attributed to Wayne Williams.
        person_name = "Wayne Williams" if status == "Suspect" else pm_name

        lon, lat = _lon_lat(pm.xpath('.//kml:coordinates/text()', namespaces=ns))

        data.append({
            "Person": person_name,
            "Status": status,
            "Longitude": lon,
            "Latitude": lat,
            "Place": place_tag
        })

# Passing explicit columns keeps the frame well-formed (and sortable) even
# when no placemark was found and ``data`` is empty.
df = pd.DataFrame(data, columns=_COLUMNS).sort_values(by=['Status', 'Person'])
df.to_csv('datasets/atlanta_child_murders_data.csv', index=False)