<a href="https://colab.research.google.com/github/ManasChandan/ai_traffic_routing/blob/AIBRPROD---3-Consolidation-of-External-Datasets/AIBRPROD_19_Road_Width_KML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# This notebook will parse the KML file that has the projected and the built road width for the entire Bangalore city.
# It will parse the KML file, keep the relevant details and use them in the algorithm.

In [2]:
# Necessary Imports

import geopandas as gpd
import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET

In [3]:
def kml_to_geodataframe_with_extended_data(kml_file):
    """
    Read a KML file and combine each placemark's ExtendedData attributes
    with the geometries GeoPandas parses from the same file.

    The attributes are collected by walking the XML directly, and the
    geometries come from ``gpd.read_file``. The two tables are then joined
    positionally (row i of the attributes with row i of the geometries).

    Args:
        kml_file (str): Path to the KML file.

    Returns:
        geopandas.GeoDataFrame: One row per placemark. The SimpleData
            attributes come first (as text, exactly as stored in the KML),
            followed by the columns GeoPandas reads from the file, including
            the geometry column. If an attribute name collides with the
            geometry column name (so the merge renames it with a suffix),
            the merged table is returned as a plain DataFrame instead.

    Raises:
        ValueError: If the number of placemarks found in the XML differs
            from the number of features GeoPandas reads from the file.
    """
    # Every KML 2.2 tag lives in this namespace; keep it in one place.
    kml_ns = '{http://www.opengis.net/kml/2.2}'

    # Read the KML file as XML
    root = ET.parse(kml_file).getroot()

    # Collect one attribute dict per Placemark, in document order
    data = []
    for placemark in root.findall(f'.//{kml_ns}Placemark'):
        properties = {}

        for extended_data in placemark.findall(f'{kml_ns}ExtendedData'):
            schema_data = extended_data.find(f'{kml_ns}SchemaData')
            if schema_data is None:
                # ExtendedData without a SchemaData child carries no
                # SimpleData fields; skip it instead of crashing on None.
                continue

            for simple_data in schema_data.findall(f'{kml_ns}SimpleData'):
                properties[simple_data.get('name')] = simple_data.text

        # A placemark without attributes still gets an (empty) row so the
        # positional alignment with the geometry table is preserved.
        data.append(properties)

    # Create a DataFrame containing properties
    associated_data = pd.DataFrame(data)

    # Read geometries from the KML file using GeoPandas
    road_width_geometry = gpd.read_file(kml_file)

    # The join below is positional, so a row-count mismatch would silently
    # attach attributes to the wrong geometry; fail loudly instead.
    if len(associated_data) != len(road_width_geometry):
        raise ValueError("DataFrames have different row counts. Check KML structure.")

    # Merge on the index: row i of the attributes with row i of the geometries
    final_geom_data = pd.merge(associated_data, road_width_geometry, left_index=True, right_index=True)

    # pd.merge with a plain DataFrame on the left yields a plain DataFrame;
    # re-wrap it so the result is the GeoDataFrame the docstring promises.
    geometry_column = road_width_geometry.geometry.name
    if geometry_column in final_geom_data.columns:
        final_geom_data = gpd.GeoDataFrame(final_geom_data, geometry=geometry_column)

    return final_geom_data

In [4]:
# The dataset: attributes and geometry parsed from the Bangalore road-width KML
road_width = kml_to_geodataframe_with_extended_data(kml_file=r"blr_roadwidth.kml")

## KML File Analysis

**Placemark:**

- Represents a road or path.
- Contains style, extended data, and geometry.

**ExtendedData:**

- Stores additional information about the road.
- Schema: "roadwidth"
- Attributes:
  - `OBJECTID`: Unique identifier for the feature.
  - `RR_CD`: Road code or identifier.
  - `RR_TP_HIER`: Road type hierarchy.
  - `RR_WIDTH_P`: Planned road width.
  - `RR_TP_TYPE`: Road type.
  - `SHAPE_LENG`: Length of the shape.
  - `RR_width_B`: Built road width.
  - `Shape_Le_1`: Length of the shape (Length of first line segment).
  - `Shape_Le_2`: Length of the shape (Length of the second line segment, else same as above).

**MultiGeometry:**

- Contains two LineStrings representing different segments of the road.
- Certain IDs contain 2 line segments as well, separated from each other at crossings or turnings.

**LineStrings:**

- Define the geographic path of the road using latitude and longitude coordinates.

In [5]:
# Preview the first rows to decide which columns are relevant for us
road_width.head(n=3)

Unnamed: 0,OBJECTID,OBJECTID_1,OBJECTID_2,RR_CD,RR_TP_HIER,RR_WIDTH_P,RR_TP_TYPE,SHAPE_LENG,RR_width_B,Shape_Le_1,Shape_Le_2,SHAPE.STLength(),Name,Description,geometry
0,1,1,1,556,MI,15,C,829.20525621,8,3457.51005989,3457.51005989,3457.51004728507,,,"MULTILINESTRING ((77.47175 12.9608, 77.47134 1..."
1,2,2,3,542,PU,36,C,284.79648966,18,284.79648966,657.60369284,657.603669678116,,,"MULTILINESTRING ((77.64562 13.0508, 77.64562 1..."
2,3,3,4,7,MA,30,C,278.6688142,15,278.6688142,278.6688142,278.668835025454,,,"MULTILINESTRING ((77.51943 12.87659, 77.51938 ..."


Description for the RR_TP_HIER and RR_TP_TYPE

| Code | Description                       |
|------|-----------------------------------|
| MA   | Major roads or highways           |
| MI   | Minor roads or arterial roads     |
| PU   | Primary roads or collector roads  |
| OR   | Other roads                       |
| IR   | Industrial roads                  |
| CR   | City roads or urban roads         |
| PR   | Private roads                     |
| W    | Wide roads or expressways         |
| C    | Collector roads or feeder roads   |


In [6]:
# Based on the collected description we don't need the RR_TP_TYPE, since the HIER is much more descriptive.

# The columns RR_WIDTH_P and RR_width_B can both be useful, since RR_WIDTH_P can be used as a less important preference
# and RR_width_B with higher preference.

In [7]:
# The KML attributes are read as text, so cast both width columns to float
# before aggregating or comparing them.
road_width["RR_WIDTH_P"] = road_width["RR_WIDTH_P"].astype(float)
road_width["RR_width_B"] = road_width["RR_width_B"].astype(float)

# Mean built width per road hierarchy, smallest first
(
    road_width
    .groupby("RR_TP_HIER")["RR_width_B"]
    .mean()
    .sort_values()
)

Unnamed: 0_level_0,RR_width_B
RR_TP_HIER,Unnamed: 1_level_1
MI,6.281454
MA,10.950232
CR,13.530612
IR,21.868217
PU,21.939439
OR,22.0
PR,50.0


In [8]:
# Proportion of rows in each road hierarchy class
road_width.loc[:, "RR_TP_HIER"].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
RR_TP_HIER,Unnamed: 1_level_1
MA,0.453955
MI,0.382391
PU,0.115113
OR,0.021172
IR,0.016654
CR,0.010543
PR,0.000172


In [9]:
# The proportions look plausible: major roads are the most common and
# private roads the least. Private roads may sit inside a society or housing
# area, which could explain their large built width.

# Fraction of rows whose planned width does not exceed the built width
len(road_width.loc[road_width["RR_WIDTH_P"] <= road_width["RR_width_B"]]) / len(road_width)

0.0

In [10]:
# A ratio of 0.0 above means no road has a built width that reaches its
# planned width, i.e. every road was built narrower than planned.

# Keep only the columns used downstream, then persist the result.
road_width = road_width.loc[
    :,
    [
        'RR_TP_HIER',
        'RR_WIDTH_P',
        'RR_TP_TYPE',
        'SHAPE_LENG',
        'RR_width_B',
        'Shape_Le_1',
        'Shape_Le_2',
        'geometry',
    ],
]

road_width.to_pickle("bg_road_width_data.pkl")