In [None]:
#Preprocessing of BBR and damage data

# This notebook is a collection of the scripts that extract and manipulate the BBR data into two datasets:
# one with all BBR building points in Denmark and one with all damaged-building points in Denmark.

In [None]:
import json
import os

import geopandas as gpd
import ijson
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from geopandas.tools import sjoin
from shapely.geometry import Point

In [None]:
# Extract the relevant attributes from the BygningList by streaming and parsing the collected BBR JSON file.

# The output directory is called 'Processed_withID' (and later 'Batched_withID_onlyonce') because an initial
# extraction had mistakenly left out 'id_lokalId', which is needed to keep track of individual buildings.

# Data paths
# large_file_path: the complete BBR JSON export that is streamed below.
# output_directory: folder that receives the data_chunk_<n>.json files. It ends with a path separator
# because the file names are appended to it by plain string concatenation.
large_file_path = "C:\\Users\\IM\\Desktop\\THESIS\\Data\\BBR\\BBR TOTAL\\BBR_total_20230130123400.json"
output_directory = "C:\\Users\\IM\\Desktop\\THESIS\\Data\\BBR\\Processed_withID\\"

def extract_attributes(item):
    """Return the BygningList fields of interest from one building record.

    Parameters
    ----------
    item : mapping
        One parsed entry of ``BygningList``; only ``item.get`` is used.

    Returns
    -------
    dict
        The selected keys in a fixed order. Keys missing from ``item`` map to ``None``.
    """
    wanted_keys = (
        'forretningshændelse',
        'byg021BygningensAnvendelse',
        'byg026Opførelsesår',
        'byg027OmTilbygningsår',
        'byg032YdervæggensMateriale',
        'byg033Tagdækningsmateriale',
        'byg054AntalEtager',
        'byg056Varmeinstallation',
        'byg404Koordinat',
        'byg406Koordinatsystem',
        'id_lokalId',
    )
    record = {}
    for key in wanted_keys:
        # .get() yields None for absent keys, so every record has the same shape
        record[key] = item.get(key)
    return record

# Parameters for splitting data
chunk_size = 100000  # Number of buildings per file
current_chunk = 0    # Index of the next chunk file == number of chunk files written so far
current_data = []    # Records collected for the chunk currently being built

# Make sure the target folder exists before the first chunk is written
os.makedirs(output_directory, exist_ok=True)

# Stream the JSON file so the full export never has to be held in memory
with open(large_file_path, 'r', encoding="utf-8") as file:
    bygning_list = ijson.items(file, 'BygningList.item')
    for item in bygning_list:
        current_data.append(extract_attributes(item))

        # Write the chunk to disk as soon as it holds chunk_size records
        if len(current_data) == chunk_size:
            with open(output_directory + f"data_chunk_{current_chunk}.json", 'w', encoding="utf-8") as output_file:
                json.dump(current_data, output_file)
            current_data = []  # Start a fresh chunk
            current_chunk += 1

# Save the remaining records that did not fill a whole chunk
if current_data:
    with open(output_directory + f"data_chunk_{current_chunk}.json", 'w', encoding="utf-8") as output_file:
        json.dump(current_data, output_file)
    current_data = []
    current_chunk += 1

# current_chunk is incremented after every write, so it is the exact number of files produced
# (the previous "current_chunk + 1" over-counted when the record count was a multiple of chunk_size).
print(f"Data processed into {current_chunk} chunks.")

In [None]:
# Merge the BygningList chunk files into larger batch files (25 chunks per batch)

# Folder that holds the data_chunk_<n>.json files
input_directory = "C:\\Users\\IM\\Desktop\\THESIS\\Data\\BBR\\Processed_withID\\"

# Number of chunks in each batch
chunks_per_batch = 25

# Chunk files data_chunk_0 ... data_chunk_398 are read
total_chunks = 399

for batch in range(0, total_chunks, chunks_per_batch):
    # The final batch may contain fewer than chunks_per_batch chunks
    batch_end = min(batch + chunks_per_batch, total_chunks)

    # Concatenate the records of every chunk belonging to this batch
    batch_data = []
    for index in range(batch, batch_end):
        with open(input_directory + f"data_chunk_{index}.json", 'r', encoding="utf-8") as input_file:
            batch_data.extend(json.load(input_file))

    # Write the batch to its own JSON file
    output_path = f"C:\\Users\\IM\\Desktop\\THESIS\\Data\\BBR\\Batched_withID\\batch_{batch//chunks_per_batch}.json"
    with open(output_path, 'w', encoding="utf-8") as output_file:
        json.dump(batch_data, output_file)

    print(f"Saved batch {batch//chunks_per_batch} to {output_path}")


In [None]:
# Inspect the first BygningList batch to confirm that the data looks as expected

# Show every column when a DataFrame is printed
pd.set_option('display.max_columns', None)

# Read the batch and turn its list of records into a DataFrame
first_batch_path = 'C:\\Users\\IM\\Desktop\\THESIS\\Data\\BBR\\Batched_withID_onlyonce\\batch_0.json'
with open(first_batch_path, 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)
bygning_df = pd.DataFrame(data)

# Print the table
print(bygning_df)

In [None]:
#The cells above extracted the BygningList; the following cells repeat the same procedure for the EtageList

In [None]:
# Extract the relevant attributes from the EtageList by streaming and parsing the collected BBR JSON file.
# The EtageList is used for its basement information.
# The data is saved in many small chunk files.

# large_file_path: the complete BBR JSON export that is streamed below.
# output_directory: folder that receives the data_chunk_<n>.json files. It ends with a path separator
# because the file names are appended to it by plain string concatenation.
large_file_path = "C:\\Users\\IM\\Desktop\\THESIS\\Data\\BBR\\BBR TOTAL\\BBR_total_20230130123400.json"
output_directory = "C:\\Users\\IM\\Desktop\\THESIS\\Data\\BBR\\EtageList\\Chunks\\"  # Store processed files here

def extract_attributes(item):
    """Return the EtageList fields of interest from one record.

    Note: this redefines ``extract_attributes`` from the BygningList cell, so
    after this cell runs the name refers to the EtageList version.

    Parameters
    ----------
    item : mapping
        One parsed entry of ``EtageList``; only ``item.get`` is used.

    Returns
    -------
    dict
        The floor designation, the floor area and the ``bygning`` reference,
        in that order. Keys missing from ``item`` map to ``None``.
    """
    wanted_keys = (
        'eta006BygningensEtagebetegnelse',
        'eta020SamletArealAfEtage',
        'bygning',
    )
    record = {}
    for key in wanted_keys:
        record[key] = item.get(key)
    return record

# Parameters for splitting data
chunk_size = 100000  # Number of EtageList records per file
current_chunk = 0    # Index of the next chunk file == number of chunk files written so far
current_data = []    # Records collected for the chunk currently being built

# Make sure the target folder exists before the first chunk is written
os.makedirs(output_directory, exist_ok=True)

# Stream the JSON file so the full export never has to be held in memory
with open(large_file_path, 'r', encoding="utf-8") as file:
    # The variable keeps its historical name; it iterates over EtageList items here
    bygning_list = ijson.items(file, 'EtageList.item')
    for item in bygning_list:
        current_data.append(extract_attributes(item))

        # Write the chunk to disk as soon as it holds chunk_size records
        if len(current_data) == chunk_size:
            with open(output_directory + f"data_chunk_{current_chunk}.json", 'w', encoding="utf-8") as output_file:
                json.dump(current_data, output_file)
            current_data = []  # Start a fresh chunk
            current_chunk += 1

# Save the remaining records that did not fill a whole chunk
if current_data:
    with open(output_directory + f"data_chunk_{current_chunk}.json", 'w', encoding="utf-8") as output_file:
        json.dump(current_data, output_file)
    current_data = []
    current_chunk += 1

# current_chunk is incremented after every write, so it is the exact number of files produced
# (the previous "current_chunk + 1" over-counted when the record count was a multiple of chunk_size).
print(f"Data processed into {current_chunk} chunks.")

In [None]:
# Merge the EtageList chunk files into larger batch files (25 chunks per batch)

# Directory where the chunks are located
input_directory = "C:\\Users\\IM\\Desktop\\THESIS\\Data\\BBR\\EtageList\\Chunks\\"

# Number of chunks in each batch
chunks_per_batch = 25

# Chunk files data_chunk_0 ... data_chunk_730 are read.
# The same bound must be used for the outer and the inner loop: the inner loop used to be capped
# at 399 (left over from the BygningList cell), so chunks 399-730 were never read and the later
# batches were written as empty lists.
total_chunks = 731

# Iterate through each chunk and append the data to a batch list
for batch in range(0, total_chunks, chunks_per_batch):
    batch_data = []
    # The final batch may contain fewer than chunks_per_batch chunks
    for index in range(batch, min(batch + chunks_per_batch, total_chunks)):
        with open(input_directory + f"data_chunk_{index}.json", 'r', encoding="utf-8") as input_file:
            data = json.load(input_file)
            batch_data.extend(data)

    # Save each batch to a separate JSON file
    # NOTE(review): batches are written to 'EtageList_2\Batches' while the following cells read
    # from 'EtageList\Batches' - confirm which folder is intended.
    output_path = f"C:\\Users\\IM\\Desktop\\THESIS\\Data\\BBR\\EtageList_2\\Batches\\batch_{batch//chunks_per_batch}.json"
    with open(output_path, 'w', encoding="utf-8") as output_file:
        json.dump(batch_data, output_file)

    print(f"Saved batch {batch//chunks_per_batch} to {output_path}")

In [None]:
# Inspect the first EtageList batch to make sure it contains the expected data

# Show every column when a DataFrame is printed
pd.set_option('display.max_columns', None)

# Read the batch and turn its list of records into a DataFrame
first_batch_path = 'C:\\Users\\IM\\Desktop\\THESIS\\Data\\BBR\\EtageList\\Batches\\batch_0.json'
with open(first_batch_path, 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)
bygning_df = pd.DataFrame(data)

# Print the table
print(bygning_df)

# Summary statistics of the first batch (default describe)
print("Data Description:")
default_summary = bygning_df.describe()
print(default_summary)

# Summary restricted to object (string) columns
object_summary = bygning_df.describe(include=['object'])
print(object_summary)

# Number of missing values per column
print("\nMissing Values:")
missing_per_column = bygning_df.isnull().sum()
print(missing_per_column)

# Number of distinct values in every object-typed column
object_columns = [name for name in bygning_df.columns if bygning_df[name].dtype == 'object']
for column in object_columns:
    print(f"\nUnique values in {column}: {bygning_df[column].nunique()}")

In [None]:
# In the first batch, 37259 buildings had a basement (out of 250.000).
# It was decided to keep exploring the basement as a binary variable.
# Frequency of every floor designation in the batch loaded above
etage_designation_counts = bygning_df['eta006BygningensEtagebetegnelse'].value_counts()
print(etage_designation_counts)


In [None]:
# The data extracted from the two lists is merged in this cell.
# The EtageList data is reduced by 'filter_rows' to one row per building, which gives a binary basement
# variable (whether a building has a basement) and the basement area for buildings that have one.

# Read every BygningList batch file and stack them into one DataFrame
bygninglist_data_dir = "C:\\Users\\IM\\Desktop\\THESIS\\Data\\BBR\\Batched_withID"
bygninglist_files = [name for name in os.listdir(bygninglist_data_dir) if name.endswith('.json')]
bygninglist_data = [
    pd.read_json(os.path.join(bygninglist_data_dir, name))
    for name in bygninglist_files
]
all_bygninglist_data = pd.concat(bygninglist_data, ignore_index=True)

# Read every EtageList batch file and stack them into one DataFrame
etagelist_data_dir = "C:\\Users\\IM\\Desktop\\THESIS\\Data\\BBR\\EtageList\\Batches"
etagelist_files = [name for name in os.listdir(etagelist_data_dir) if name.endswith('.json')]
etagelist_data = [
    pd.read_json(os.path.join(etagelist_data_dir, name))
    for name in etagelist_files
]
all_etagelist_data = pd.concat(etagelist_data, ignore_index=True)

# Filter rows
def filter_rows(group):
    """Reduce the floor rows of one building to a single row describing its basement.

    Parameters
    ----------
    group : pandas.DataFrame
        All EtageList rows that share one ``bygning`` value. Must contain the
        columns ``eta006BygningensEtagebetegnelse`` and ``eta020SamletArealAfEtage``.

    Returns
    -------
    pandas.Series
        The first row whose floor designation is ``'kl'`` if there is one.
        Otherwise the first row of the group with the floor designation and
        the floor area set to ``None``.
    """
    kl_rows = group[group['eta006BygningensEtagebetegnelse'] == 'kl']
    if not kl_rows.empty:
        return kl_rows.iloc[0]

    # No 'kl' row: blank the two floor fields on a copy of the first row.
    # The group itself is left untouched, because functions passed to groupby.apply
    # should not modify the data they are given.
    first_row = group.iloc[0].copy()
    first_row['eta006BygningensEtagebetegnelse'] = None
    first_row['eta020SamletArealAfEtage'] = None
    return first_row

# Collapse the EtageList to one row per building
etage_groups = all_etagelist_data.groupby('bygning')
filtered_etagelist_df = etage_groups.apply(filter_rows).reset_index(drop=True)

# Left-join the basement information onto the buildings (building id == EtageList 'bygning' reference)
merged_df = all_bygninglist_data.merge(
    filtered_etagelist_df,
    how='left',
    left_on='id_lokalId',
    right_on='bygning',
)

# Write the merged dataset as one JSON record per line
output_path = "C:\\Users\\IM\\Desktop\\THESIS\\Data\\BBR\\Final\\Denmark_BBR.json"
merged_df.to_json(output_path, orient='records', lines=True)
print(f"Saved merged data to {output_path}")

# Summary statistics of the merged dataset
print("Data Description:")
merged_summary = merged_df.describe()
print(merged_summary)

# Number of missing values per column
print("\nMissing Values:")
merged_missing = merged_df.isnull().sum()
print(merged_missing)

In [None]:
# The Denmark_BBR.json file is explored and prepared in this cell:
# - the point coordinates are extracted from the attribute byg404Koordinat, so the data can be handled as a
#   GeoDataFrame with x and y columns;
# - the construction year (byg026Opførelsesår) is cleaned of unrealistic values. Many buildings used as sheds,
#   garages etc. had a construction year of 1000. The oldest building in Denmark is from 1425;
# - the number of floors is cleaned in the same way. The tallest building in Denmark has 30 floors.

# Load the merged BBR dataset
# NOTE(review): the merge cell writes this file with to_json(orient='records', lines=True), i.e. as JSON Lines
# rather than GeoJSON - confirm that gpd.read_file parses it as intended.
denmark_bbr_path = "C:\\Users\\IM\\Desktop\\THESIS\\Data\\BBR\\Final\\Denmark_BBR.json"
denmark_bbr = gpd.read_file(denmark_bbr_path)

# Print the first rows to check that the file was loaded correctly
print(denmark_bbr.head())

def extract_coordinates(point_str):
    """Parse a ``"POINT(x y)"`` string into a pair of floats.

    Parameters
    ----------
    point_str : str
        Text of the form ``"POINT(<x> <y>)"``.

    Returns
    -------
    tuple
        ``(x, y)`` as floats, or ``(None, None)`` when the value is not a string
        (e.g. ``None`` or NaN), does not hold exactly two whitespace-separated
        tokens, or the tokens are not numeric.
    """
    try:
        # Strip the "POINT(" prefix and the closing ")" and split the rest on whitespace
        x, y = point_str.replace("POINT(", "").replace(")", "").split()
        return float(x), float(y)
    except (AttributeError, TypeError, ValueError):
        # Only malformed or missing values are treated as "no coordinate". A bare
        # ``except`` would also swallow KeyboardInterrupt and SystemExit.
        return None, None

# Extract x and y coordinates (None where the coordinate string could not be parsed)
denmark_bbr['x'], denmark_bbr['y'] = zip(*denmark_bbr['byg404Koordinat'].map(extract_coordinates))

# Convert the DataFrame to a GeoDataFrame using the x and y columns as geometry
gdf = gpd.GeoDataFrame(denmark_bbr, geometry=gpd.points_from_xy(denmark_bbr['x'], denmark_bbr['y']))

# Set the CRS to EPSG:25832
gdf.crs = "EPSG:25832"

# Display the first few rows to ensure the geometry has been set correctly
print(gdf.head())

# Limits of the values that are accepted as realistic
OLDEST_BUILDING_YEAR = 1425  # the oldest building in Denmark is from 1425, so 1425 itself is valid
FIRST_INVALID_YEAR = 2023    # years from 2023 onwards are treated as invalid
MAX_FLOORS = 30              # the tallest building in Denmark has 30 floors, so 30 itself is valid

# Handle unrealistic construction year values.
# The lower bound is exclusive ("< 1425") so that it matches the stated valid range and the
# rule used for the extension/renovation year below.
mask_invalid_years1 = (gdf['byg026Opførelsesår'] < OLDEST_BUILDING_YEAR) | (gdf['byg026Opførelsesår'] >= FIRST_INVALID_YEAR)
gdf.loc[mask_invalid_years1, 'byg026Opførelsesår'] = np.nan

mask_invalid_years2 = (gdf['byg027OmTilbygningsår'] < OLDEST_BUILDING_YEAR) | (gdf['byg027OmTilbygningsår'] >= FIRST_INVALID_YEAR)
gdf.loc[mask_invalid_years2, 'byg027OmTilbygningsår'] = np.nan

# Handle unrealistic values for 'byg054AntalEtager'
# A reasonable range is from 0 to 30 floors, both inclusive (the upper test used to be ">= 30",
# which also discarded buildings with exactly 30 floors).
mask_invalid_floors = (gdf['byg054AntalEtager'] < 0) | (gdf['byg054AntalEtager'] > MAX_FLOORS)
gdf.loc[mask_invalid_floors, 'byg054AntalEtager'] = np.nan

In [None]:
# Further exploration: missing values and duplicate 'id_lokalId' values.
# Many buildings turned out to be duplicated.

# Size of the dataset
num_rows = len(gdf)
print(f"Number of rows in the dataset: {num_rows}")

# First and last 10 rows
print("\nFirst 10 rows:\n")
print(gdf.head(10))
print("\nLast 10 rows:\n")
print(gdf.tail(10))

# Missing values per column, as a count and as a percentage of all rows
missing = gdf.isnull().sum()
percentage_missing = (missing / num_rows) * 100
missing_data = pd.DataFrame({
    'Number of Missing Values': missing,
    'Percentage (%)': percentage_missing,
})
print("\nMissing Values:\n")
print(missing_data)

# Compare the number of distinct ids with the number of rows to detect duplicates
unique_ids = gdf['id_lokalId'].nunique()
if unique_ids != num_rows:
    print(f"\nThere are {num_rows - unique_ids} 'id_lokalId' values that are not unique.")
else:
    print("\nEvery 'id_lokalId' is unique.")

In [None]:
# Every duplicated id_lokalId referred to identical rows, so only the first occurrence of each
# id_lokalId is kept. The resulting GeoDataFrame is saved as GeoJSON.

# Where the de-duplicated dataset is written
output_path_unique = "C:\\Users\\IM\\Desktop\\THESIS\\Data\\BBR\\Final\\Denmark_BBR_unique.geojson"

# Keep one row per building id
unique_gdf = gdf.drop_duplicates(subset='id_lokalId', keep='first')

# Report how many rows remain
print(f"Number of unique rows based on 'id_lokalId': {len(unique_gdf)}")

# Write the filtered dataset to the new file
unique_gdf.to_file(output_path_unique, driver="GeoJSON")
print(f"Saved unique data to {output_path_unique}")

In [None]:
# This cell matches every damage case to the nearest suitable BBR building point.
#
# The damage case data from Geo's archive is not necessarily located exactly on a building, but when
# exploring the damage points it usually makes sense which building was damaged, because the point
# often lies right next to it.
# In many cases, however, the nearest building is a shed, garage etc. and is unlikely to be the damaged
# building of interest. Therefore a 50 m buffer is created around each damage point to collect the
# nearby buildings, and if the nearest building in the buffer is a garage, carport, shed, greenhouse,
# canopy or conservatory, the next nearest building is chosen instead (until it is not one of these
# building uses).

# Building uses that are skipped ('910': 'Garage', '920': 'Carport', '930': 'Shed', '940': 'Greenhouse',
# '950': 'Detached canopy', '960': 'Detached conservatory')
excluded_uses = ['910', '920', '930', '940', '950', '960']

# Buildings selected for the damage cases are collected here
closest_buildings = []

# De-duplicated BBR building points and their spatial index
gdf = gpd.read_file("C:\\Users\\IM\\Desktop\\THESIS\\Data\\BBR\\Final\\Denmark_BBR_unique.geojson")
sindex = gdf.sindex

# Damage cases, reprojected to EPSG:25832 and given a buffer of 50 around each point
damage_cases = gpd.read_file("C:\\Users\\IM\\Desktop\\THESIS\\Data\\Skadesager\\Damage_DK.shp").to_crs('EPSG:25832')
damage_cases['buffer'] = damage_cases.buffer(50)

# Loop over every damage point and pick the nearest building inside its buffer that does not have
# an excluded use. At most one building is appended per damage case; damage cases without a
# suitable building inside the buffer contribute nothing.

# The use codes are compared numerically so that the exclusion works whether the column holds
# strings ('910'), integers (910) or floats (910.0). Comparing a numeric column against the string
# list would never match, and the excluded building types would silently be kept.
excluded_use_codes = {int(code) for code in excluded_uses}

for index, row in damage_cases.iterrows():
    damage_buffer = row['buffer']

    # Use the spatial index to find the buildings inside the bounding box of the buffer
    possible_matches_index = list(sindex.intersection(damage_buffer.bounds))
    possible_matches = gdf.iloc[possible_matches_index]

    # Keep only the candidates that really intersect the buffer
    true_matches = possible_matches[possible_matches.intersects(damage_buffer)].copy()
    if true_matches.empty:
        continue

    # Order the candidates by their distance to the damage point
    true_matches['dist'] = true_matches.distance(row['geometry'])
    true_matches = true_matches.sort_values(by='dist')

    # Drop the excluded uses; missing or non-numeric codes become NaN and are therefore kept
    use_codes = pd.to_numeric(true_matches['byg021BygningensAnvendelse'], errors='coerce')
    eligible_matches = true_matches[~use_codes.isin(excluded_use_codes)]

    # The first remaining row is the nearest building with an accepted use
    if not eligible_matches.empty:
        closest_buildings.append(eligible_matches.iloc[0])

# Build a GeoDataFrame from the selected building rows, restricted to the columns of gdf
closest_buildings_gdf = gpd.GeoDataFrame(closest_buildings, columns=gdf.columns, crs="EPSG:25832")

# Find the columns that contain at least one bytes value
byte_cols = []
for col in closest_buildings_gdf.columns:
    contains_bytes = closest_buildings_gdf[col].apply(lambda x: isinstance(x, bytes)).any()
    if contains_bytes:
        byte_cols.append(col)

# Decode bytes values to text in those columns; all other values are left as they are
for col in byte_cols:
    closest_buildings_gdf[col] = closest_buildings_gdf[col].apply(
        lambda x: x.decode('utf-8') if isinstance(x, bytes) else x
    )

# Write the result as GeoJSON
damage_output_path = "C:\\Users\\IM\\Desktop\\THESIS\\Data\\BBR\\Final\\Damage_final_BBR.geojson"
closest_buildings_gdf.to_file(damage_output_path, driver='GeoJSON')

# Print the result
print(closest_buildings_gdf)