In [23]:
# Imports
import time
from typing import Literal
from pydantic import Field
import pandas as pd
from pydantic import BaseModel
from sqlalchemy.orm import sessionmaker, relationship, joinedload
from db.mariadb_connector import engine as mariadb_engine

from datetime import datetime
from zoneinfo import ZoneInfo

from models.sdg_prediction import SDGPrediction
from models.publications.publication import Publication

from settings.settings import ExplainerSettings
from utils.env_loader import load_env, get_env_variable



In [24]:
from utils.logger import logger
# Set up the project logger writing to "exploration.log".
# NOTE(review): the result is bound to the name `logging`, which is also the name
# of the stdlib module — confirm what `logger(...)` returns before relying on it.
logging = logger("exploration.log")

Configuring logger at: logs/exploration.log
Log file path: logs/exploration.log
Created Logger: <module 'logging' from '/Users/nicolas/miniconda3/envs/p3.10.14-mt-igcl/lib/python3.10/logging/__init__.py'>


In [25]:
# Session factory bound to the MariaDB engine; calling Session() opens a new session.
Session = sessionmaker(bind=mariadb_engine)

In [26]:
import models.sdg_prediction
import models.sdg_label_summary

In [27]:
import os

# Directory (relative to the current working directory) that receives every
# histogram PDF written by this notebook. exist_ok=True means re-running this
# cell does not fail when the directory is already there.
output_dir = "sdg_histograms"
os.makedirs(output_dir, exist_ok=True)

In [28]:
# Values matched against SDGPrediction.prediction_model when loading predictions.
AURORA_MODEL_STRING = "Aurora"
DVDBLK_MODEL_STRING = "Dvdblk"
DVDBLK_SOFTMAX_MODEL_STRING = "Dvdblk_Softmax"

# Models to load and plot.
# NOTE(review): this rebinds the name `models`, which the earlier
# `import models.sdg_prediction` / `import models.sdg_label_summary` statements
# bound to the `models` package; after this cell `models` is a plain list.
models = [AURORA_MODEL_STRING, DVDBLK_MODEL_STRING, DVDBLK_SOFTMAX_MODEL_STRING]
# Single-model variant kept for quick experiments:
#models = [AURORA_MODEL_STRING]

class Prediction(BaseModel):
    """Container for one model's predictions: raw rows plus a DataFrame view."""
    # Name of the model the predictions belong to.
    name: str
    # Raw prediction objects as supplied by the caller (element type is not constrained).
    predictions: list
    # Tabular view of the predictions; None unless a DataFrame is passed in.
    # NOTE(review): the annotation is pd.DataFrame although the default is None.
    df: pd.DataFrame = Field(default=None)
    class Config:  # Allow arbitrary types like pd.DataFrame
        arbitrary_types_allowed = True

In [29]:
# One Prediction entry per loaded model; populated by the loading cell that follows.
model_predictions: list[Prediction] = []

In [30]:
# Load the stored predictions of every configured model and keep them both as
# ORM rows and as a DataFrame with one column per SDG (sdg1 .. sdg17).

# Start from an empty list: this cell only appends, so without the reset a
# second run would add every model again and each later plotting cell would
# process the same model twice.
model_predictions.clear()

sdg_columns = [f"sdg{i}" for i in range(1, 18)]

with Session() as session:
    for model in models:
        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by wall-clock adjustments.
        start = time.perf_counter()
        predictions = (
            session.query(SDGPrediction)
            .filter(SDGPrediction.prediction_model == model)
            .all()
        )
        end = time.perf_counter()
        print(f"  Found {len(predictions)} predictions for {model} model. Took {end - start} seconds.")

        # One record per prediction, holding only the 17 SDG score attributes.
        predictions_data = [
            {column: getattr(p, column) for column in sdg_columns}
            for p in predictions
        ]

        # Build the DataFrame once from the list of records and register the model.
        df = pd.DataFrame(predictions_data)
        model_predictions.append(Prediction(name=model, predictions=predictions, df=df))



  Found 116959 predictions for Aurora model. Took 2.820990800857544 seconds.
  Found 116959 predictions for Dvdblk model. Took 3.222998857498169 seconds.
  Found 116959 predictions for Dvdblk_Softmax model. Took 3.313624143600464 seconds.


In [31]:
# Hex color per SDG index (1-17); the plotting cells pass SDG_COLORS[i] as the
# histogram color for column "sdg{i}".
SDG_COLORS = {
    1: "#E5243B", 2: "#DDA63A", 3: "#4C9F38", 4: "#C5192D", 5: "#FF3A21",
    6: "#26BDE2", 7: "#FCC30B", 8: "#A21942", 9: "#FD6925", 10: "#DD1367",
    11: "#FD9D24", 12: "#BF8B2E", 13: "#3F7E44", 14: "#0A97D9", 15: "#56C02B",
    16: "#00689D", 17: "#19486A"
}

In [32]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

In [33]:
# One standalone PDF per (model, SDG) pair
for prediction in model_predictions:
    model_name, df = prediction.name, prediction.df
    print(f"Plotting and saving histograms for model: {model_name}")
    for i in range(1, 18):  # SDG indices 1..17
        column_name = f"sdg{i}"
        if column_name not in df.columns:
            continue  # no data for this SDG, nothing to draw

        # Explicit figure/axes pair instead of the implicit pyplot state
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.histplot(df[column_name], kde=True, color=SDG_COLORS[i], bins=10, ax=ax)
        ax.set_title(f"Histogram for SDG {i} ({model_name} model)", fontsize=16)
        ax.set_xlabel(f"SDG {i} Values", fontsize=14)
        ax.set_ylabel("Frequency", fontsize=14)
        ax.grid(axis="y", linestyle="--", alpha=0.7)

        # Write the figure to its own PDF, then release it
        pdf_filename = os.path.join(output_dir, f"{model_name}_SDG{i}.pdf")
        fig.savefig(pdf_filename, format="pdf", bbox_inches="tight")
        plt.close(fig)

Plotting and saving histograms for model: Aurora
Plotting and saving histograms for model: Dvdblk
Plotting and saving histograms for model: Dvdblk_Softmax


In [34]:
from matplotlib.backends.backend_pdf import PdfPages

In [35]:
# Per model: one multi-page PDF, one page per SDG histogram
for prediction in model_predictions:
    model_name, df = prediction.name, prediction.df
    print(f"Plotting and saving histograms for model: {model_name} in a single PDF")

    pdf_filename = os.path.join(output_dir, f"{model_name}_All_SDG_Histograms.pdf")

    # PdfPages collects every saved figure as a page of the same file
    with PdfPages(pdf_filename) as pdf:
        for i in range(1, 18):  # SDG indices 1..17
            column_name = f"sdg{i}"
            if column_name not in df.columns:
                continue  # skip SDGs without a column

            fig, ax = plt.subplots(figsize=(8, 6))
            sns.histplot(df[column_name], kde=True, color=SDG_COLORS[i], bins=10, ax=ax)
            ax.set_title(f"Histogram for SDG {i} ({model_name} model)", fontsize=16)
            ax.set_xlabel(f"SDG {i} Values", fontsize=14)
            ax.set_ylabel("Frequency", fontsize=14)
            ax.grid(axis="y", linestyle="--", alpha=0.7)

            pdf.savefig(fig)  # append this figure as a new page
            plt.close(fig)  # release the figure's memory

    print(f"Saved all histograms for model {model_name} in {pdf_filename}")

Plotting and saving histograms for model: Aurora in a single PDF
Saved all histograms for model Aurora in sdg_histograms/Aurora_All_SDG_Histograms.pdf
Plotting and saving histograms for model: Dvdblk in a single PDF
Saved all histograms for model Dvdblk in sdg_histograms/Dvdblk_All_SDG_Histograms.pdf
Plotting and saving histograms for model: Dvdblk_Softmax in a single PDF
Saved all histograms for model Dvdblk_Softmax in sdg_histograms/Dvdblk_Softmax_All_SDG_Histograms.pdf


In [36]:
# Per model: all 17 histograms on one page, arranged in a 5x4 grid
for prediction in model_predictions:
    model_name, df = prediction.name, prediction.df
    print(f"Plotting and saving histograms for model: {model_name} using subplots in a single PDF")

    pdf_filename = os.path.join(output_dir, f"{model_name}_All_SDG_Histograms_Subplots.pdf")

    with PdfPages(pdf_filename) as pdf:
        fig, axes = plt.subplots(5, 4, figsize=(20, 15))
        axes = axes.flatten()  # 1-D view of the grid: slot k holds SDG k+1

        # Pair the first 17 slots with SDG indices 1..17
        for i, ax in enumerate(axes[:17], start=1):
            column_name = f"sdg{i}"
            if column_name not in df.columns:
                continue
            sns.histplot(df[column_name], kde=True, color=SDG_COLORS[i], bins=10, ax=ax)
            ax.set_title(f"SDG {i} ({model_name})", fontsize=10)
            ax.set_xlabel(f"SDG {i} Values", fontsize=8)
            ax.set_ylabel("Frequency", fontsize=8)
            ax.grid(axis="y", linestyle="--", alpha=0.7)

        # The grid has 20 slots but only 17 are used; drop the trailing ones
        for unused_ax in axes[17:]:
            fig.delaxes(unused_ax)

        fig.tight_layout()

        # Single page containing the whole grid
        pdf.savefig(fig)
        plt.close(fig)

    print(f"Saved all histograms for model {model_name} in {pdf_filename}")

Plotting and saving histograms for model: Aurora using subplots in a single PDF
Saved all histograms for model Aurora in sdg_histograms/Aurora_All_SDG_Histograms_Subplots.pdf
Plotting and saving histograms for model: Dvdblk using subplots in a single PDF
Saved all histograms for model Dvdblk in sdg_histograms/Dvdblk_All_SDG_Histograms_Subplots.pdf
Plotting and saving histograms for model: Dvdblk_Softmax using subplots in a single PDF
Saved all histograms for model Dvdblk_Softmax in sdg_histograms/Dvdblk_Softmax_All_SDG_Histograms_Subplots.pdf


In [37]:
# Define the rescaling function
def rescale_sdg_value(value):
    """Piecewise-linear rescaling of a score around the 0.85 cut-off.

    Values below 0.85 are multiplied by 0.5 / 0.85. Values at or above 0.85
    follow the line with slope 0.5 / 0.15 that passes through (1, 1), which
    puts 0.85 at 0.5 (up to floating-point rounding).

    Args:
        value: Numeric score to rescale.

    Returns:
        The rescaled score.
    """
    if value < 0.85:
        # Lower segment: straight line through the origin
        return (0.5 / 0.85) * value

    # Upper segment: steeper line anchored at (1, 1)
    slope = 0.5 / 0.15
    return slope * value + (1 - slope)

In [38]:
# Aurora only: add rescaled_sdg* columns, then write one PDF per rescaled SDG
for prediction in model_predictions:
    model_name = prediction.name
    if model_name != "Aurora":
        continue  # every other model is left untouched by this cell

    df = prediction.df

    # Derive a rescaled_<column> for each SDG column that is present
    for i in range(1, 18):
        column_name = f"sdg{i}"
        if column_name in df.columns:
            df[f"rescaled_{column_name}"] = df[column_name].apply(rescale_sdg_value)

    print(f"Plotting and saving histograms for rescaled columns of model: {model_name}")
    for i in range(1, 18):
        rescaled_column_name = f"rescaled_sdg{i}"
        if rescaled_column_name not in df.columns:
            continue

        fig, ax = plt.subplots(figsize=(8, 6))
        sns.histplot(df[rescaled_column_name], kde=True, color=SDG_COLORS[i], bins=10, ax=ax)
        ax.set_title(f"Histogram for Rescaled SDG {i} ({model_name} model)", fontsize=16)
        ax.set_xlabel(f"Rescaled SDG {i} Values", fontsize=14)
        ax.set_ylabel("Frequency", fontsize=14)
        ax.grid(axis="y", linestyle="--", alpha=0.7)

        # Write the figure to its own PDF, then release it
        pdf_filename = os.path.join(output_dir, f"{model_name}_Rescaled_SDG{i}.pdf")
        fig.savefig(pdf_filename, format="pdf", bbox_inches="tight")
        plt.close(fig)

Plotting and saving histograms for rescaled columns of model: Aurora


In [39]:
# Aurora only: (re)compute rescaled_sdg* columns and plot them on a single page
for prediction in model_predictions:
    model_name = prediction.name
    if model_name != "Aurora":
        continue  # every other model is left untouched by this cell

    df = prediction.df

    # Derive a rescaled_<column> for each SDG column that is present
    for i in range(1, 18):
        column_name = f"sdg{i}"
        if column_name in df.columns:
            df[f"rescaled_{column_name}"] = df[column_name].apply(rescale_sdg_value)

    print(f"Plotting and saving histograms for rescaled columns of model: {model_name} using subplots in a single PDF")

    pdf_filename = os.path.join(output_dir, f"{model_name}_All_Rescaled_SDG_Histograms_Subplots.pdf")

    with PdfPages(pdf_filename) as pdf:
        fig, axes = plt.subplots(5, 4, figsize=(20, 15))
        axes = axes.flatten()  # 1-D view of the grid: slot k holds SDG k+1

        for i, ax in enumerate(axes[:17], start=1):
            rescaled_column_name = f"rescaled_sdg{i}"
            if rescaled_column_name not in df.columns:
                continue
            sns.histplot(df[rescaled_column_name], kde=True, color=SDG_COLORS[i], bins=10, ax=ax)
            ax.set_title(f"Rescaled SDG {i} ({model_name})", fontsize=10)
            ax.set_xlabel(f"Rescaled SDG {i} Values", fontsize=8)
            ax.set_ylabel("Frequency", fontsize=8)
            ax.grid(axis="y", linestyle="--", alpha=0.7)

        # The grid has 20 slots but only 17 are used; drop the trailing ones
        for unused_ax in axes[17:]:
            fig.delaxes(unused_ax)

        fig.tight_layout()

        # Single page containing the whole grid
        pdf.savefig(fig)
        plt.close(fig)

    print(f"Saved all rescaled histograms for model {model_name} in {pdf_filename}")

Plotting and saving histograms for rescaled columns of model: Aurora using subplots in a single PDF
Saved all rescaled histograms for model Aurora in sdg_histograms/Aurora_All_Rescaled_SDG_Histograms_Subplots.pdf


In [40]:
def softmax_scaling(row):
    """Scale all 17 SDG scores of a row by 1 / (top SDG 1-16 score + sdg17 score).

    After scaling, the largest score among sdg1..sdg16 and the sdg17 score
    add up to 1; every other score is multiplied by the same factor.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with keys "sdg1".."sdg17".

    Returns:
        dict mapping "sdg1".."sdg17" to the scaled scores, or to 0 for every
        key when the two reference scores sum to zero.
    """
    all_columns = [f"sdg{i}" for i in range(1, 18)]
    candidate_columns, zero_class = all_columns[:-1], all_columns[-1]

    # Reference total: best candidate score plus the sdg17 score
    winner_value = row[row[candidate_columns].idxmax()]
    total = winner_value + row[zero_class]

    if total == 0:
        # Nothing to normalise against; avoid dividing by zero
        return dict.fromkeys(all_columns, 0)

    scale = 1 / total
    return {col: row[col] * scale for col in all_columns}

# Dvdblk_Softmax only: scale every row, store the result, one PDF per SDG
for prediction in model_predictions:
    model_name = prediction.name
    if model_name != "Dvdblk_Softmax":
        continue  # every other model is left untouched by this cell

    df = prediction.df

    # Row-wise scaling; the returned dicts are expanded into columns
    scaled_columns = df.apply(softmax_scaling, axis=1, result_type="expand")

    # Store each scaled column next to the raw scores
    for col, scaled_series in scaled_columns.items():
        df[f"softmax_scaled_{col}"] = scaled_series

    print(f"Plotting and saving histograms for softmax scaled columns of model: {model_name}")
    for i in range(1, 18):  # SDG indices 1..17
        scaled_column_name = f"softmax_scaled_sdg{i}"
        if scaled_column_name not in df.columns:
            continue

        fig, ax = plt.subplots(figsize=(8, 6))
        sns.histplot(df[scaled_column_name], kde=True, color=SDG_COLORS[i], bins=10, ax=ax)
        ax.set_title(f"Histogram for Softmax Scaled SDG (All) {i} ({model_name} model)", fontsize=16)
        ax.set_xlabel(f"Softmax Scaled SDG {i} Values", fontsize=14)
        ax.set_ylabel("Frequency", fontsize=14)
        ax.grid(axis="y", linestyle="--", alpha=0.7)

        # Write the figure to its own PDF, then release it
        pdf_filename = os.path.join(output_dir, f"{model_name}_Softmax_Scaled_All_SDG{i}.pdf")
        fig.savefig(pdf_filename, format="pdf", bbox_inches="tight")
        plt.close(fig)


Plotting and saving histograms for softmax scaled columns of model: Dvdblk_Softmax


In [41]:
from matplotlib.backends.backend_pdf import PdfPages

def softmax_scaling(row):
    """Divide every SDG score of a row by (max of sdg1..sdg16) + sdg17.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with keys "sdg1".."sdg17".

    Returns:
        dict with the keys "sdg1".."sdg17" in order. Values are the scores
        multiplied by 1 / total, or 0 for every key when total is zero.
    """
    sdg_columns = [f"sdg{i}" for i in range(1, 18)]

    # Highest-scoring column among the first 16 SDGs
    best_candidate = row[sdg_columns[:16]].idxmax()
    total = row[best_candidate] + row["sdg17"]

    if total == 0:  # guard against division by zero
        return {col: 0 for col in sdg_columns}

    scale = 1 / total
    scaled_values = {}
    for col in sdg_columns:
        scaled_values[col] = row[col] * scale
    return scaled_values

# Dvdblk_Softmax only: scale every row and plot all scaled SDGs on a single page
for prediction in model_predictions:
    model_name = prediction.name
    if model_name != "Dvdblk_Softmax":
        continue  # every other model is left untouched by this cell

    df = prediction.df

    # Row-wise scaling; the returned dicts are expanded into columns
    scaled_columns = df.apply(softmax_scaling, axis=1, result_type="expand")

    # Store each scaled column next to the raw scores
    for col, scaled_series in scaled_columns.items():
        df[f"softmax_scaled_{col}"] = scaled_series

    print(f"Plotting and saving histograms for softmax scaled columns of model: {model_name} using subplots in a single PDF")

    pdf_filename = os.path.join(output_dir, f"{model_name}_Softmax_Scaled_All_SDG_Histograms_Subplots.pdf")

    with PdfPages(pdf_filename) as pdf:
        fig, axes = plt.subplots(5, 4, figsize=(20, 15))
        axes = axes.flatten()  # 1-D view of the grid: slot k holds SDG k+1

        for i, ax in enumerate(axes[:17], start=1):
            scaled_column_name = f"softmax_scaled_sdg{i}"
            if scaled_column_name not in df.columns:
                continue
            sns.histplot(df[scaled_column_name], kde=True, color=SDG_COLORS[i], bins=10, ax=ax)
            ax.set_title(f"Softmax Scaled SDG {i} ({model_name})", fontsize=10)
            ax.set_xlabel(f"Softmax Scaled SDG {i} Values", fontsize=8)
            ax.set_ylabel("Frequency", fontsize=8)
            ax.grid(axis="y", linestyle="--", alpha=0.7)

        # The grid has 20 slots but only 17 are used; drop the trailing ones
        for unused_ax in axes[17:]:
            fig.delaxes(unused_ax)

        fig.tight_layout()

        # Single page containing the whole grid
        pdf.savefig(fig)
        plt.close(fig)

    print(f"Saved all softmax scaled histograms for model {model_name} in {pdf_filename}")


Plotting and saving histograms for softmax scaled columns of model: Dvdblk_Softmax using subplots in a single PDF
Saved all softmax scaled histograms for model Dvdblk_Softmax in sdg_histograms/Dvdblk_Softmax_Softmax_Scaled_All_SDG_Histograms_Subplots.pdf


In [42]:
def partial_softmax_scaling(row):
    """Rescale only the top SDG 1-16 score and the sdg17 score so they sum to 1.

    All other SDG scores are returned unchanged.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with keys "sdg1".."sdg17".

    Returns:
        dict mapping "sdg1".."sdg17" to the scores. The winner column and
        "sdg17" are multiplied by 1 / (winner + sdg17); when that sum is
        zero the original values are returned untouched.
    """
    candidate_columns = [f"sdg{i}" for i in range(1, 17)]
    zero_class = "sdg17"

    # Best-scoring column among SDG 1-16 and the reference total
    winner_column = row[candidate_columns].idxmax()
    total = row[winner_column] + row[zero_class]

    # Start from an unscaled copy of all 17 scores
    scaled_values = {col: row[col] for col in [*candidate_columns, zero_class]}
    if total == 0:
        return scaled_values  # no division by zero; values stay as they are

    scale = 1 / total
    for col in (winner_column, zero_class):
        scaled_values[col] *= scale
    return scaled_values

# Dvdblk_Softmax only: apply the winner/null ("partial") scaling to every row
# and write one histogram PDF per SDG.
for prediction in model_predictions:
    model_name = prediction.name

    if model_name == "Dvdblk_Softmax":
        df = prediction.df

        # Use partial_softmax_scaling here. This cell produces the
        # "Winner-Null" plots, so applying the full softmax_scaling would
        # render the all-column scaling under the Winner-Null titles and
        # file names.
        scaled_columns = df.apply(partial_softmax_scaling, axis=1, result_type="expand")

        # Store the scaled values next to the raw scores (existing
        # softmax_scaled_* columns are overwritten).
        for col in scaled_columns.columns:
            df[f"softmax_scaled_{col}"] = scaled_columns[col]

        print(f"Plotting and saving histograms for softmax scaled columns of model: {model_name}")
        for i in range(1, 18):  # SDGs 1 through 17
            scaled_column_name = f"softmax_scaled_sdg{i}"
            if scaled_column_name in df.columns:
                # Plot the histogram for scaled values
                plt.figure(figsize=(8, 6))
                sns.histplot(df[scaled_column_name], kde=True, color=SDG_COLORS[i], bins=10)
                plt.title(f"Histogram for Softmax Scaled SDG (Winner-Null) {i} ({model_name} model)", fontsize=16)
                plt.xlabel(f"Softmax Scaled SDG {i} Values", fontsize=14)
                plt.ylabel("Frequency", fontsize=14)
                plt.grid(axis="y", linestyle="--", alpha=0.7)

                # Save the plot as PDF
                pdf_filename = os.path.join(output_dir, f"{model_name}_Softmax_Scaled_Winner_Null_SDG{i}.pdf")
                plt.savefig(pdf_filename, format="pdf", bbox_inches="tight")
                plt.close()  # Close the plot to free memory

Plotting and saving histograms for softmax scaled columns of model: Dvdblk_Softmax


In [43]:
from matplotlib.backends.backend_pdf import PdfPages

def partial_softmax_scaling(row):
    """Normalise the winning SDG 1-16 score against the sdg17 score.

    Only two entries change: the highest score among sdg1..sdg16 and the
    sdg17 score are both multiplied by 1 / (winner + sdg17). The remaining
    scores are passed through as they are.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with keys "sdg1".."sdg17".

    Returns:
        dict with the keys "sdg1".."sdg17" in order; identical to the input
        scores when winner + sdg17 equals zero.
    """
    sdg_columns = [f"sdg{i}" for i in range(1, 18)]

    # Winner is searched among the first 16 SDGs only
    winner_column = row[sdg_columns[:16]].idxmax()
    winner_value = row[winner_column]
    zero_value = row["sdg17"]
    total = winner_value + zero_value

    scaled_values = {col: row[col] for col in sdg_columns}
    if total != 0:  # leave everything untouched rather than divide by zero
        scale = 1 / total
        scaled_values.update({
            winner_column: winner_value * scale,
            "sdg17": zero_value * scale,
        })
    return scaled_values

# Dvdblk_Softmax only: partial (winner/null) scaling, all SDGs on a single page
for prediction in model_predictions:
    model_name = prediction.name
    if model_name != "Dvdblk_Softmax":
        continue  # every other model is left untouched by this cell

    df = prediction.df

    # Row-wise partial scaling; the returned dicts are expanded into columns
    scaled_columns = df.apply(partial_softmax_scaling, axis=1, result_type="expand")

    # Store each scaled column under the softmax_scaled_ prefix
    for col, scaled_series in scaled_columns.items():
        df[f"softmax_scaled_{col}"] = scaled_series

    print(f"Plotting and saving histograms for partial softmax scaled columns of model: {model_name} using subplots in a single PDF")

    pdf_filename = os.path.join(output_dir, f"{model_name}_Partial_Softmax_Scaled_Winner_Null_SDG_Histograms_Subplots.pdf")

    with PdfPages(pdf_filename) as pdf:
        fig, axes = plt.subplots(5, 4, figsize=(20, 15))
        axes = axes.flatten()  # 1-D view of the grid: slot k holds SDG k+1

        for i, ax in enumerate(axes[:17], start=1):
            scaled_column_name = f"softmax_scaled_sdg{i}"
            if scaled_column_name not in df.columns:
                continue
            sns.histplot(df[scaled_column_name], kde=True, color=SDG_COLORS[i], bins=10, ax=ax)
            ax.set_title(f"Partial Softmax Scaled SDG {i} ({model_name})", fontsize=10)
            ax.set_xlabel(f"Scaled SDG {i} Values", fontsize=8)
            ax.set_ylabel("Frequency", fontsize=8)
            ax.grid(axis="y", linestyle="--", alpha=0.7)

        # The grid has 20 slots but only 17 are used; drop the trailing ones
        for unused_ax in axes[17:]:
            fig.delaxes(unused_ax)

        fig.tight_layout()

        # Single page containing the whole grid
        pdf.savefig(fig)
        plt.close(fig)

    print(f"Saved all partial softmax scaled histograms for model {model_name} in {pdf_filename}")


Plotting and saving histograms for partial softmax scaled columns of model: Dvdblk_Softmax using subplots in a single PDF
Saved all partial softmax scaled histograms for model Dvdblk_Softmax in sdg_histograms/Dvdblk_Softmax_Partial_Softmax_Scaled_Winner_Null_SDG_Histograms_Subplots.pdf
