In [74]:
from scipy.cluster.hierarchy import dendrogram, linkage
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
import plotly.graph_objs as go
import seaborn as sns
# Turn off pyplot's interactive mode and clear the current pyplot figure
# before any plotting cell runs.
# NOTE(review): plt.clf() acts on the *current* figure; when no figure is
# open pyplot creates an empty one first -- confirm that is intended.
plt.ioff()
plt.clf()

In [75]:
# Execute the preprocessing notebook. `country_pca` and `df_final`, which the
# cells below rely on, are not defined in the cells shown here and are
# presumably created by this run -- TODO confirm.
%run 'Lab_preprocess.ipynb'

In [142]:
def plot1(Z, p=5):
    """
    Generate a level-truncated dendrogram plot from a linkage matrix.

    SciPy draws the dendrogram onto a fresh axes (which also sets up the
    leaf ticks), then every link is redrawn on top of it in the notebook's
    three-colour palette, cycling through the palette in the order in which
    SciPy reports the links.

    Parameters
    ----------
    Z : array_like
        The hierarchical clustering encoded as a linkage matrix.
    p : int, optional
        Value passed to ``dendrogram(..., truncate_mode='level', p=p)``,
        i.e. the maximum number of levels displayed. Defaults to 5, the
        value that used to be hard-coded.

    Returns
    -------
    Figure
        A matplotlib Figure object containing the dendrogram plot.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    dn = dendrogram(Z, truncate_mode='level', p=p, ax=ax)

    # Overdraw each link; the colour depends only on the link's position in
    # SciPy's output, not on cluster membership.
    colors = ['#205e55', '#8e9b8c', '#7cb79d']
    for i, (x, d) in enumerate(zip(dn['icoord'], dn['dcoord'])):
        ax.plot(x, d, color=colors[i % len(colors)])

    ax.set_ylabel(r"$\Delta$")
    # Use the axes/figure objects directly so the title and layout cannot
    # land on some other figure that happens to be pyplot's current one.
    ax.set_title(f'Condensed Dendrogram with at most {p} levels')
    fig.tight_layout()
    return fig

In [147]:
def dendro_plot():
    """
    Generate the full (untruncated) dendrogram of the clustered data.

    Computes a Ward linkage (with optimal leaf ordering) of the
    module-level ``country_pca`` data, lets SciPy draw the dendrogram and
    then redraws every link in the notebook's three-colour palette,
    cycling through the palette in the order SciPy reports the links.

    Returns
    -------
    Figure
        A matplotlib Figure object containing the dendrogram plot.

    """
    Z = linkage(country_pca, method='ward', optimal_ordering=True)
    fig, ax = plt.subplots(figsize=(10, 6))
    dn = dendrogram(Z, ax=ax)

    # Overdraw each link; the colour depends only on the link's position in
    # SciPy's output, not on cluster membership.
    colors = ['#205e55', '#8e9b8c', '#7cb79d']
    for i, (x, d) in enumerate(zip(dn['icoord'], dn['dcoord'])):
        ax.plot(x, d, color=colors[i % len(colors)])

    ax.set_ylabel(r"$h$")
    # The title has to exist *before* tight_layout runs; otherwise the
    # margins are computed without it and the title can be cut off.
    ax.set_title('Original Dendrogram')
    fig.tight_layout()
    return fig

In [146]:
# Build the untruncated dendrogram figure and keep a handle to it; the
# trailing `;` suppresses the cell's output.
dendro_original = dendro_plot();

In [119]:
# Ward linkage (with optimal leaf ordering) of `country_pca`, followed by the
# level-truncated dendrogram figure built from that linkage matrix.
Z = linkage(country_pca, method='ward', optimal_ordering=True)
dendrogram_levelled = plot1(Z);

In [81]:
def plotting(x, y, z, labels, title='K-Medoids: Scatter Plot'):
    """
    Generate a 3D scatter plot of data points coloured by cluster label.

    Parameters
    ----------
    x : array_like
        The x-coordinates of the data points.
    y : array_like
        The y-coordinates of the data points.
    z : array_like
        The z-coordinates of the data points.
    labels : array_like of int
        Cluster label for each data point. Each label selects a colour from
        a three-colour palette; labels beyond the palette wrap around
        instead of raising ``IndexError``.
    title : str, optional
        Text for the layout title. Defaults to the previously hard-coded
        'K-Medoids: Scatter Plot'; pass a different title when the labels
        come from another clustering method.

    Returns
    -------
    Figure
        A Plotly Figure object containing the 3D scatter plot.

    """
    custom_colors = ['#205e55', '#8e9b8c', '#7cb79d']
    n_colors = len(custom_colors)

    # One explicit colour string per point. Because the colours are given
    # explicitly, no colorscale is set on the marker (a colorscale only
    # applies to numerical colour arrays).
    color_mapped = [custom_colors[label % n_colors] for label in labels]

    trace_data = go.Scatter3d(
        x=x,
        y=y,
        z=z,
        mode='markers',
        marker=dict(
            size=8,
            color=color_mapped,
            opacity=0.8
        )
    )

    # NOTE(review): the top margin is 0, which leaves no reserved room for
    # the title -- confirm the title renders as intended.
    layout = go.Layout(
        margin=dict(l=0, r=0, b=0, t=0),  # tight layout
        title=title,
        scene=dict(
            xaxis=dict(title='PCA 1'),
            yaxis=dict(title='PCA 2'),
            zaxis=dict(title='PCA 3')
        )
    )

    return go.Figure(data=[trace_data], layout=layout)

In [113]:
# Ward-linkage agglomerative clustering of `country_pca`. With
# n_clusters=None the number of clusters is not fixed in advance: it follows
# from cutting the merge tree at distance_threshold.
# NOTE(review): 400 is an unexplained magic number -- presumably read off the
# dendrogram; confirm and consider naming it as a constant.
agg = AgglomerativeClustering(n_clusters=None, linkage="ward",
                              distance_threshold=400)
y_predict_country_com = agg.fit_predict(country_pca)


def comp_scatter(y_predict_country_com):
    """
    Generate a 2D scatter plot of the clustered data, coloured by cluster.

    The first two columns of the module-level ``country_pca`` DataFrame are
    plotted against each other.

    Parameters
    ----------
    y_predict_country_com : array_like of int
        Cluster label for each row of ``country_pca``. Each label selects a
        colour from a three-colour palette; labels beyond the palette wrap
        around instead of raising ``IndexError``, since the number of
        clusters is not guaranteed to be two.

    Returns
    -------
    Figure
        A matplotlib Figure object containing the scatter plot.

    """
    # Labels 0 and 1 keep their original colours; the third colour is the
    # one the other plots in this notebook use for a third cluster.
    custom_colors = ['#205e55', '#8e9b8c', '#7cb79d']
    n_colors = len(custom_colors)
    color_mapped = [custom_colors[label % n_colors]
                    for label in y_predict_country_com]

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(country_pca.iloc[:, 0], country_pca.iloc[:, 1], c=color_mapped)

    # Axis names follow the 3D scatter plot of the same columns.
    ax.set_xlabel('PCA 1')
    ax.set_ylabel('PCA 2')
    ax.set_title('Ward Linkage')
    fig.tight_layout()
    return fig

In [101]:
# 2D scatter figure of the clustering result, one colour per cluster label.
ward = comp_scatter(y_predict_country_com)

In [103]:
# 3D scatter figure of the first three columns of `country_pca`, coloured by
# the same cluster labels.
comp_3d = plotting(country_pca.iloc[:, 0], country_pca.iloc[:, 1],
                   country_pca.iloc[:, 2], y_predict_country_com)

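Per-cluster feature summaries ("For Complete" in the original notes; the labels used below come from the Ward-linkage clustering above)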

In [105]:
# Attach the cluster labels to `df_final` (sets its 'label' column in place),
# then keep every column from the third one onward for the per-cluster plots.
# NOTE(review): the first two columns are presumably identifiers rather than
# features -- confirm against the preprocessing notebook.
df_final['label'] = y_predict_country_com
df_graph = df_final.iloc[:, 2:]

In [150]:
# Compute the min, max and mean of each column, grouped by the cluster label.
# The resulting frames are indexed by label, with one column per feature.
grouped = df_graph.groupby('label')
min_values = grouped.min()
max_values = grouped.max()
mean_values = grouped.mean()


def comp_range_plot():
    """
    Generate range plots for different features by labels.

    One subplot is drawn per feature column of the module-level
    ``df_graph`` (every column except the last, which is treated as the
    label column). In each subplot every cluster label gets a horizontal
    bar spanning that cluster's minimum to maximum, with a dashed vertical
    line marking the cluster mean. The values are read from the
    module-level ``min_values``, ``max_values`` and ``mean_values`` frames,
    and the set of labels is taken from the index of ``min_values`` rather
    than being fixed to 0 and 1.

    The grid is a fixed 5 x 5; panels that receive no feature are hidden.

    Returns
    -------
    Figure
        A matplotlib Figure object with the range plots.

    """
    fig, axes = plt.subplots(5, 5, figsize=(15, 2 * 5))
    axes = axes.flatten()  # Flatten the array of axes

    feature_columns = df_graph.columns[:-1]  # Exclude the label column
    labels = min_values.index.tolist()  # Cluster labels present in the data
    colors = ['#205e55', '#8e9b8c', '#7cb79d']  # Cycled if more labels
    bar_height = 0.5  # The height of the bars

    for i, col in enumerate(feature_columns):
        ax = axes[i]
        for j, label in enumerate(labels):
            low = min_values.loc[label, col]
            high = max_values.loc[label, col]
            # Plotting the range as a horizontal bar
            ax.barh(
                y=label,
                width=high - low,
                left=low,
                height=bar_height,
                color=colors[j % len(colors)],
                edgecolor='black',
                label=f'Label {label}' if i % (len(labels) * 2) == 0 else ""
            )
            # Plotting the mean as a dashed line across the bar's height
            mean_value = mean_values.loc[label, col]
            ax.plot(
                [mean_value, mean_value],
                [label - bar_height / 2, label + bar_height / 2],
                color='black',
                linestyle='--',
                linewidth=2,
                label=f'Mean for Label {label}' if i == 0 else ""
            )

        ax.set_title(f'Range for {col}')
        ax.set_yticks(labels)
        ax.set_yticklabels(labels)
        ax.set_ylim(min(labels) - bar_height, max(labels) + bar_height)

    # Hide the grid cells that did not receive a feature so the figure does
    # not end with empty axes frames.
    for unused_ax in axes[len(feature_columns):]:
        unused_ax.set_visible(False)

    fig.tight_layout()  # Adjust the layout
    return fig

In [110]:
# Build the per-feature range figure; the trailing `;` suppresses the cell's
# output.
ward_range = comp_range_plot();

In [148]:
def comp_box_plot():
    """
    Generate box plots for different features by labels.

    One subplot is drawn per feature column of the module-level
    ``df_graph`` (every column except the last, which is treated as the
    label column), showing that feature's distribution for each value of
    the 'label' column. The grid is a fixed 5 x 5; panels that receive no
    feature are hidden.

    The figure is displayed with ``plt.show()`` and also returned, so the
    caller can keep a handle to it as with the other plotting helpers.

    Returns
    -------
    Figure
        A matplotlib Figure object with the box plots.

    """
    fig, axes = plt.subplots(nrows=5, ncols=5, figsize=(15, 2*5))

    axes = axes.flatten()  # Flatten the 2D array of axes for easy iteration

    feature_columns = df_graph.columns[:-1]  # Exclude the label column

    colors = ['#205e55', '#8e9b8c']
    # Loop through each feature column to create a box plot
    for i, col in enumerate(feature_columns):
        sns.boxplot(x='label', y=col, data=df_graph, ax=axes[i],
                    palette=colors,
                    hue='label', legend=False)
        axes[i].set_title(f'Values of {col}')
        axes[i].set_xlabel('Label')
        axes[i].set_ylabel('Value')

    # Hide the grid cells that did not receive a feature so the figure does
    # not end with empty axes frames.
    for unused_ax in axes[len(feature_columns):]:
        unused_ax.set_visible(False)

    plt.tight_layout()

    # Display the plot. Note that plt.show() displays every open pyplot
    # figure, not only this one.
    plt.show()
    return fig