# EDA
This notebook creates the EDA tables from the Appendix as well as the bar charts showing the distribution of polluted soil samples across datasets and risk zones. The tables illustrate the distribution of polluted and non-polluted samples within and outside the industrial zones for each dataset, and further break down the samples by the three risk categories.

In [None]:
# Imports
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Read the "polluted_points" layer of each municipality's GeoPackage.
# The generator yields the frames in the same order as the target names.
data_zaanstad, data_oosterhout = (
    gpd.read_file(gpkg_path, layer="polluted_points")
    for gpkg_path in (
        "../Data/dataset_zaanstad.gpkg",
        "../Data/dataset_oosterhout.gpkg",
    )
)

## Tables

In [None]:
# Column holding the inside/outside-industrial-zone flag. The pivot below
# selects the columns named True and False, so the flag is expected to be boolean.
columnName = 'industry'

# Row order of the final tables (BKK classes).
desired_order = ['AW_2000', 'Wonen', 'Industrie']


def _count_samples(data, column):
    """Count samples per (TOETS_WBB, BKK) pair, split by the values of *column*.

    The values of *column* become the columns of the result (via ``unstack``),
    ``TOETS_WBB`` and ``BKK`` become regular columns, and rows whose ``BKK``
    is ``'Onbekend'`` are dropped. Combinations that do not occur stay NaN.
    """
    counts = data.groupby(['TOETS_WBB', 'BKK', column]).size().unstack().reset_index()
    return counts[counts['BKK'] != 'Onbekend']


def _risk_table(counts, order):
    """Pivot the output of ``_count_samples`` into a BKK x (zone, TOETS_WBB) table.

    Each (BKK, TOETS_WBB) pair occurs in at most one row of *counts*, so the
    default aggregation of ``pivot_table`` simply passes the count through.
    The True/False column labels are renamed to "Binnen"/"Buiten" and the
    rows are put in *order*.
    """
    table = pd.pivot_table(counts, values=[True, False], index=['BKK'], columns=['TOETS_WBB'])
    table = table.rename(columns={False: "Buiten", True: "Binnen"})
    return table.reindex(order)


# Same pipeline for both municipalities.
counts_zaanstad = _count_samples(data_zaanstad, columnName)
zaanstad = _risk_table(counts_zaanstad, desired_order)

counts_oosterhout = _count_samples(data_oosterhout, columnName)
oosterhout = _risk_table(counts_oosterhout, desired_order)

In [None]:
# Bare expression as the only statement of the cell: shows the Zaanstad table built above.
zaanstad

In [None]:
# Bare expression as the only statement of the cell: shows the Oosterhout table built above.
oosterhout

## Barcharts

In [None]:
# Count the occurrences of >I and <I in TOETS_WBB for each dataset.
# value_counts() orders its result by frequency, so the bar order (and any
# colour list applied by position) would depend on which class is more common
# in a given dataset. Sorting by label gives every subplot the same bar order.
zaanstad_counts = data_zaanstad['TOETS_WBB'].value_counts().sort_index()
oosterhout_counts = data_oosterhout['TOETS_WBB'].value_counts().sort_index()

# Combine the datasets for the third plot
combined_data = pd.concat([data_zaanstad, data_oosterhout])
combined_counts = combined_data['TOETS_WBB'].value_counts().sort_index()

# Colour is looked up per class label rather than by position, so '<I' is
# always green and '>I' always red; any other label falls back to gray.
class_colors = {'<I': 'green', '>I': 'red'}

# Set the font size
plt.rcParams.update({'font.size': 16})

# Create the bar charts: one subplot per dataset plus the combined data
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5), sharey=False)

panels = [
    ('Zaanstad', zaanstad_counts),
    ('Oosterhout', oosterhout_counts),
    ('Combined', combined_counts),
]
for ax, (title, counts) in zip(axes, panels):
    bar_colors = [class_colors.get(label, 'gray') for label in counts.index]
    ax.bar(counts.index, counts.values, color=bar_colors)
    ax.set_title(title)
    ax.set_xlabel('Sample Classes')
    ax.set_ylabel('Count')

# Display the plot
plt.tight_layout()
plt.show()

In [None]:
# Mapping from the BKK class labels in the data to the risk labels shown on the plots
bkk_mapping = {
    'AW_2000': 'Low risk',
    'Industrie': 'High risk',
    'Wonen': 'Medium risk'
}


def filter_map_and_get_counts(data: pd.DataFrame) -> pd.DataFrame:
    """Count samples per risk category and TOETS_WBB class.

    Rows whose ``BKK`` is ``'Onbekend'`` are dropped, the remaining ``BKK``
    values are translated through ``bkk_mapping``, and the rows are counted
    per (BKK, TOETS_WBB) pair.

    Args:
        data: Frame with at least the columns ``BKK`` and ``TOETS_WBB``.
            It is not modified.

    Returns:
        A frame indexed by 'Low risk', 'Medium risk', 'High risk' (in that
        order) with one column per TOETS_WBB value present in the filtered
        data. Combinations or whole risk categories without samples are 0.
    """
    known = data[data['BKK'] != 'Onbekend']
    # Non-inplace replace: the filtered frame is derived from `data`, and an
    # in-place edit on it is the chained-assignment pattern pandas warns about.
    mapped = known.replace({"BKK": bkk_mapping})
    counts = mapped.groupby(['BKK', 'TOETS_WBB']).size().unstack(fill_value=0)
    # fill_value keeps a risk category with no samples at 0 instead of NaN.
    return counts.reindex(['Low risk', 'Medium risk', 'High risk'], fill_value=0)

# Get filtered and mapped counts for each dataset
zaanstad_counts = filter_map_and_get_counts(data_zaanstad)
oosterhout_counts = filter_map_and_get_counts(data_oosterhout)

# Combine the datasets for the third plot
combined_data = pd.concat([data_zaanstad, data_oosterhout])

# Filtered and relabelled combined rows. Built with a non-inplace replace,
# because an in-place edit on a frame obtained by boolean filtering is the
# chained-assignment pattern pandas warns about.
combined_filtered_data = combined_data[combined_data['BKK'] != 'Onbekend'].replace({"BKK": bkk_mapping})

# Use the same helper as for the individual datasets instead of repeating its steps.
combined_counts = filter_map_and_get_counts(combined_data)

# Larger global font for this wide figure
plt.rcParams['font.size'] = 20

# One row of three panels, each with its own y-axis
fig, axes = plt.subplots(1, 3, figsize=(20, 8), sharey=False)

def plot_bars(ax, counts, title, *, bar_width=0.35, opacity=0.8):
    """Draw grouped '<I' / '>I' bars for every row of *counts* on *ax*.

    Args:
        ax: Axes-like object to draw on (uses ``bar``, ``set_ylabel``,
            ``set_title``, ``set_xticks``, ``set_xticklabels``, ``legend``).
        counts: Frame whose index supplies the x tick labels and whose
            columns ``'<I'`` and ``'>I'`` supply the bar heights. A class
            column that is absent is drawn as zero-height bars.
        title: Title of the subplot.
        bar_width: Width of a single bar; the '>I' bars are shifted right
            by this amount.
        opacity: Alpha value applied to both bar series.
    """
    categories = counts.index
    left_positions = list(range(len(categories)))
    right_positions = [pos + bar_width for pos in left_positions]

    def class_heights(label):
        # A dataset without any sample of a class has no such column.
        if label in counts.columns:
            return counts[label]
        return [0] * len(left_positions)

    ax.bar(left_positions, class_heights('<I'), bar_width, alpha=opacity, color='g', label='<I')
    ax.bar(right_positions, class_heights('>I'), bar_width, alpha=opacity, color='r', label='>I')

    ax.set_ylabel('Count')
    ax.set_title(title)
    # Centre each tick between the two bars of its group.
    ax.set_xticks([pos + bar_width / 2 for pos in left_positions])
    ax.set_xticklabels(categories, rotation=45)
    ax.legend()

# Draw one panel per dataset, left to right: Zaanstad, Oosterhout, Combined
for panel_ax, panel_counts, panel_title in zip(
    axes,
    (zaanstad_counts, oosterhout_counts, combined_counts),
    ('Zaanstad', 'Oosterhout', 'Combined'),
):
    plot_bars(panel_ax, panel_counts, panel_title)

# Tighten the layout and render the figure
plt.tight_layout()
plt.show()