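# Description
- Histograms of the sampled patches: class presence, boolean descriptors and point counts per class.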

In [1]:
from pathlib import Path
import plotly.express as px
import geopandas as gpd
import numpy as np

# Sampling output to visualize (GeoPackage) and its file name.
sampling_gpkg = Path("../outputs/LiPaCConnector/TripleSampler-LiPaCConnector-extract.gpkg")
basename = sampling_gpkg.name

# Per-class point-count columns.
nb_points_cols = [
    "nb_points_total",
    "nb_points_sol",
    "nb_points_bati",
    "nb_points_vegetation_basse",
    "nb_points_vegetation_moyenne",
    "nb_points_vegetation_haute",
    "nb_points_pont",
    "nb_points_eau",
    "nb_points_sursol_perenne",
    "nb_points_non_classes",
]

# Keep only the descriptor and point-count columns. The explicit copy makes `df` an independent
# frame, so adding columns to it afterwards cannot raise pandas' SettingWithCopyWarning.
df = gpd.read_file(sampling_gpkg)[
    [
        "id",
        "dalle_id",
        "date_d_insertion",
        "date_de_mise_a_jour",
        "numero_de_colonne",
        "numero_de_ligne",
        "date_du_premier_point",
        "date_du_dernier_point",
        "denivele",
        "altitude",
        "presence_de_surfaces_d_eau",
        "presence_de_pylones",
        "presence_d_autoroutes",
        "is_test_set",
    ]
    + nb_points_cols
].copy()

# TODO: define this as the column name
# Human-readable split label: "Test" where `is_test_set` is truthy, "Train" otherwise.
df["Split"] = np.where(df["is_test_set"], "Test", "Train")

ERROR 1: PROJ: proj_create_from_database: Open of /home/CGaydon/anaconda3/envs/pacasam/share/proj failed


In [2]:
def make_class_histogram(df, nb_points_cols):
    """Build a stacked bar chart of how many rows have a non-zero count for each class, per split.

    Args:
        df: table holding a "Split" column and every column listed in `nb_points_cols`.
            It is not modified.
        nb_points_cols: names of the point-count columns; the "nb_points_" prefix is
            stripped to obtain the class labels shown in the chart.

    Returns:
        The figure returned by `px.bar`. The underlying count table is also printed.
    """
    class_names = [col.replace("nb_points_", "") for col in nb_points_cols]

    # True where at least one point of the class is present in the row.
    presence = df[nb_points_cols].gt(0).rename(columns=dict(zip(nb_points_cols, class_names)))

    # Number of rows with the class present, per split; classes as rows, splits as columns,
    # most frequent class (in the "Train" split) first.
    per_split = presence.groupby(df["Split"]).sum()
    counts = per_split.T.sort_values(by="Train", ascending=False)
    print(counts)

    return px.bar(
        counts,
        color="Split",
        barmode="stack",
        text_auto=True,
        title="Nombres de patches avec classe présente.",
    )


# Bar chart of the number of patches containing each class, per split.
fig_class_hist = make_class_histogram(df=df, nb_points_cols=nb_points_cols)

Split               Test  Train
total                499   4501
sol                  499   4495
vegetation_basse     465   4214
vegetation_moyenne   462   4171
vegetation_haute     450   4122
non_classes          207   1996
bati                 191   1788
eau                   43    337
pont                  13    130
sursol_perenne        19    125


In [3]:
from typing import List


def make_boolean_descriptor_histogram(df, bool_descriptors_cols: List[str]):
    """Build a horizontal bar chart of the per-split sum of each descriptor column.

    Args:
        df: table holding a "Split" column and every column in `bool_descriptors_cols`.
            It is not modified.
        bool_descriptors_cols: columns to sum within each split (for boolean columns the
            sum is the number of rows where the flag is set).

    Returns:
        The figure returned by `px.bar`. The underlying table is also printed.
    """
    per_split = df.groupby("Split")[bool_descriptors_cols].sum()

    # Descriptors as rows, splits as columns, smallest "Train" total first.
    totals = per_split.T.sort_values(by="Train")
    print(totals)

    bar_options = {
        "color": "Split",
        "barmode": "relative",
        "text_auto": True,
        "title": "Nombres de patches",
        "orientation": "h",
    }
    return px.bar(totals, **bar_options)


# Derived boolean descriptors, added to `df` as new columns.
# NOTE(review): the "heq" suffix suggests ">=", but both comparisons are strict (">") — confirm
# which one is intended.
df["denivele_heq_45"] = df["denivele"].gt(45)
df["bati_heq_500"] = df["nb_points_bati"].gt(500)

fig_bool_desc = make_boolean_descriptor_histogram(
    df,
    bool_descriptors_cols=["presence_d_autoroutes", "denivele_heq_45", "bati_heq_500"],
)

Split                  Test  Train
presence_d_autoroutes     0      0
denivele_heq_45          57    479
bati_heq_500            183   1675


In [4]:
def make_class_histograms(df, nb_points_cols, output_path=None):
    """Build one point-count histogram per class column, colored by split.

    Zero counts are turned into NaN before plotting. This mostly concerns rare classes and
    keeps each distribution interpretable. The replacement is done on a copy, so the
    caller's `df` is left untouched.

    Args:
        df: table holding a "Split" column and every column in `nb_points_cols`.
        nb_points_cols: names of the point-count columns to plot, one figure each.
        output_path: currently unused; kept so existing calls stay valid.

    Returns:
        List of figures returned by `px.histogram`, in the order of `nb_points_cols`.
    """
    # Work on a copy: replacing zeros in place would silently alter the caller's frame.
    plot_df = df.copy()
    plot_df[nb_points_cols] = plot_df[nb_points_cols].replace({0: np.nan})

    figs = []
    for c in nb_points_cols:
        class_name = c.replace("nb_points_", "")
        fig = px.histogram(
            plot_df,
            x=c,
            color="Split",
            marginal="box",  # or violin, rug
            hover_data=plot_df.columns,
            opacity=0.5,
            labels={c: f"Nombre de points {class_name} (valeurs nulles ignorées)"},
            barmode="overlay",
            title=f"Histogramme du nombres de points : {class_name}",
        )
        figs.append(fig)
    return figs


# One point-count histogram per class column; `output_path` is left at its default (None).
fig_class_hist_nb_points = make_class_histograms(df, nb_points_cols)

In [5]:
import plotly.express as px
from sklearn.preprocessing import QuantileTransformer


def make_scatter_matrix_classes(df, nb_points_cols, norm=None, hide_zeros=True):
    """Build a scatter matrix of the per-class point counts, colored by split.

    Args:
        df: table holding a "Split" column and every column in `nb_points_cols`.
            It is not modified.
        nb_points_cols: list of point-count column names used as matrix dimensions.
        norm: None to plot raw counts, "Standardization" to center and scale each column
            by its own mean and standard deviation, or "Quantilization" to map each
            column through a `QuantileTransformer`.
        hide_zeros: when True, zero counts are excluded from the normalization statistics
            and written back as 0 afterwards.

    Returns:
        The figure returned by `px.scatter_matrix`, with the diagonal hidden.

    Raises:
        ValueError: if `norm` is not one of the supported values.
    """
    supported_norms = (None, "Standardization", "Quantilization")
    if norm not in supported_norms:
        raise ValueError(f"Unknown norm {norm!r}; expected one of {supported_norms}.")

    df_norm = df.copy()

    if hide_zeros:
        # Only the count columns are masked; other columns keep their zeros.
        df_norm[nb_points_cols] = df_norm[nb_points_cols].replace(0, np.nan)

    if norm == "Standardization":
        # Mean and std both come from the same (possibly zero-masked) values, so the
        # centering and the scaling describe the same population.
        counts = df_norm[nb_points_cols]
        df_norm[nb_points_cols] = (counts - counts.mean()) / counts.std()
    elif norm == "Quantilization":
        # Quantilization enables to fully explore each X vs Y relationship.
        qt = QuantileTransformer(n_quantiles=50, random_state=0, subsample=100_000)
        df_norm[nb_points_cols] = qt.fit_transform(df_norm[nb_points_cols].values)

    if hide_zeros:
        # Put zeros back.
        df_norm[nb_points_cols] = df_norm[nb_points_cols].fillna(0)

    fig = px.scatter_matrix(
        df_norm,
        dimensions=nb_points_cols,
        color="Split",
        symbol="Split",
        opacity=0.9,
        # Shorter axis labels: drop the "nb_points_" prefix and abbreviate "vegetation".
        labels={col: col.replace("nb_points_", "").replace("vegetation", "veg") for col in df.columns},
        width=1500,
        height=1500,
        title="Nombres de points" + (f" ({norm})" if norm else "") + (" (zéros ignorés)" if hide_zeros else ""),
    )
    fig.update_traces(diagonal_visible=False)

    return fig


# Same scatter matrix under three settings: raw counts, standardized, quantile-transformed.
fig_scatter_matrix, fig_scatter_matrix_standard, fig_scatter_matrix_quantile = [
    make_scatter_matrix_classes(df, nb_points_cols, norm=norm_name)
    for norm_name in (None, "Standardization", "Quantilization")
]

In [6]:
import plotly.graph_objects as go  # or plotly.express as px

import dash
from dash import dcc, html
from base64 import b64encode
import io

# NOTE(review): nothing is written to `buffer` before it is read, so `html_bytes` is empty and
# `encoded` is the base64 encoding of an empty string; the layout below uses neither.
buffer = io.StringIO()
html_bytes = buffer.getvalue().encode()
encoded = b64encode(html_bytes).decode()

# Single-page Dash app: title, source file, then every figure built above, in order.
app = dash.Dash()
app.layout = html.Div(
    [
        html.H1(children="Dataviz - pacasam sampling"),
        html.Div(f"Fichier visualizé : {sampling_gpkg}"),
        dcc.Graph(figure=fig_class_hist),
        dcc.Graph(figure=fig_bool_desc),
        html.Div("Histogrammes des nombres de points"),
        *[dcc.Graph(figure=fig) for fig in fig_class_hist_nb_points],
        html.Div("Matrices - avec différentes normalization (cf. diversity sampling)"),
        *[
            dcc.Graph(figure=fig)
            for fig in (fig_scatter_matrix, fig_scatter_matrix_standard, fig_scatter_matrix_quantile)
        ],
    ]
)

# Turn off reloader if inside Jupyter
app.run_server(debug=True, use_reloader=False)

Dash is running on http://127.0.0.1:8050/

 * Serving Flask app '__main__'
 * Debug mode: on


In [None]:
# `dash.Dash` has no `save_html` method. To keep a static copy of a chart, export the Plotly
# figure itself, e.g. `fig_class_hist.write_html("class_hist.html")`.

AttributeError: 'Dash' object has no attribute 'save_html'