In [38]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import OPTICS, cluster_optics_dbscan
from sklearn.cluster import DBSCAN
from scipy.spatial.distance import euclidean
import plotly.graph_objs as go
import seaborn as sns
plt.ioff();

In [19]:
# Run the preprocessing notebook before anything else. Later cells use
# `country_pca` and `df_final` without defining them, so they are presumably
# created by that notebook -- TODO confirm.
%run 'Lab_preprocess.ipynb'

In [23]:
# First two columns of `country_pca` (presumably the two leading principal
# components -- TODO confirm). db_dist() reads this for its k-distance plots.
test = country_pca.iloc[:, 0:2]

In [24]:
def get_kdist(k, data):
    """
    Compute every point's distance to its k-th nearest neighbour.

    For each row of ``data`` the Euclidean distances to all *other* rows are
    ranked, and the k-th smallest one is kept. The resulting curve (sorted
    from largest to smallest) is the usual "k-distance plot" input.

    Parameters
    ----------
    k : int
        1-based rank of the neighbour to use (``k=1`` is the nearest
        neighbour). Must satisfy ``1 <= k <= len(data) - 1``.
    data : array_like
        A 2D array-like (NumPy array, nested list, DataFrame, ...) where each
        row is a data point and each column a feature. A flat 1D sequence is
        treated as one-dimensional points.

    Returns
    -------
    list
        One k-th-neighbour distance per point, sorted in descending order.
        An empty ``data`` yields an empty list.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or ``data`` has more than two dimensions.
    IndexError
        If ``k`` exceeds the number of other points available.

    """
    # k <= 0 used to index the sorted distances from the end and silently
    # return the wrong neighbour, so reject it explicitly.
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    points = np.asarray(data, dtype=float)
    if points.ndim == 1:
        points = points.reshape(-1, 1)
    elif points.ndim != 2:
        raise ValueError(
            f"data must be 2-dimensional, got {points.ndim} dimensions"
        )

    n_points = points.shape[0]
    if n_points == 0:
        return []
    if k > n_points - 1:
        raise IndexError(
            f"k={k} is out of range: each point has only "
            f"{n_points - 1} other point(s)"
        )

    # Full pairwise distance matrix in one vectorised step. This allocates an
    # (n, n, n_features) temporary, which is fine for notebook-sized data.
    deltas = points[:, np.newaxis, :] - points[np.newaxis, :, :]
    pairwise = np.sqrt((deltas ** 2).sum(axis=-1))

    # A point is not its own neighbour: push the diagonal out of the ranking.
    # Duplicated points (off-diagonal zeros) still count as neighbours.
    np.fill_diagonal(pairwise, np.inf)

    # Partial sort is enough: only the k-th smallest value per row is needed.
    kth_distances = np.partition(pairwise, k - 1, axis=1)[:, k - 1]
    return sorted(kth_distances.tolist(), reverse=True)

In [46]:
def db_dist(data=None, k_values=range(2, 7)):
    """
    Plot the sorted k-distance curve for several values of k.

    One subplot is drawn per k, stacked vertically, and the figure is shown
    with ``plt.show()``.

    Parameters
    ----------
    data : array_like, optional
        2D data whose rows are points. Defaults to the notebook-level
        ``test`` frame, which is what the function always used before.
    k_values : iterable of int, optional
        The k values to plot. Defaults to 2 through 6.

    Raises
    ------
    ValueError
        If ``k_values`` is empty.

    """
    if data is None:
        data = test.values
    # Plain ndarray so that get_kdist can index rows positionally.
    points = np.asarray(data)

    k_values = list(k_values)
    if not k_values:
        raise ValueError("k_values must contain at least one k")

    # squeeze=False keeps `axs` two-dimensional even for a single k.
    fig, axs = plt.subplots(len(k_values), 1,
                            figsize=(10, 3 * len(k_values)), squeeze=False)

    for ax, k in zip(axs[:, 0], k_values):
        # Calculate the distances for each k
        distances = get_kdist(k, points)

        # Plot the distances in a subplot
        ax.plot(distances, color='#205e55')
        ax.set_title(f'k = {k}')
        ax.set_xlabel('Point')
        ax.set_ylabel('Distance')

    fig.tight_layout()  # Adjust the layout
    plt.show()

In [48]:
# Fit DBSCAN on every column of `country_pca` and keep one label per row
# (scikit-learn marks noise samples with the label -1).
# NOTE(review): eps=38 and min_samples=11 are hard-coded; presumably they were
# read off the k-distance plots from db_dist() -- TODO confirm. Those plots use
# only the first two columns (`test`), whereas this fit uses all columns.
dbscan = DBSCAN(eps=38, min_samples=11)
cluster_labels = dbscan.fit_predict(country_pca)


def db_scatter():
    """
    Generate a scatter plot of clustered data with custom color mapping.

    Plots the first two columns of the notebook-level ``country_pca`` and
    colors each point by its entry in the notebook-level ``cluster_labels``.

    Returns
    -------
    Figure
        A matplotlib Figure object containing the scatter plot.

    """
    # Negative labels (noise) get their own color; real clusters cycle through
    # the cluster palette. The previous `custom_colors[label]` lookup only
    # worked for noise through negative indexing: label 1 would have shared
    # the noise color and label 2 would have raised IndexError.
    noise_color = '#8e9b8c'
    cluster_colors = ['#205e55', '#c27c3b', '#3b6ea5', '#8b3a62']

    def color_for(label):
        if label < 0:
            return noise_color
        return cluster_colors[label % len(cluster_colors)]

    fig, ax = plt.subplots(figsize=(10, 6))
    # Create a list of colors for each label
    color_mapped = [color_for(label) for label in cluster_labels]

    ax.scatter(country_pca.iloc[:, 0], country_pca.iloc[:, 1],
               c=color_mapped)
    # Same axis names that plotting() uses for these two columns.
    ax.set_xlabel('PCA 1')
    ax.set_ylabel('PCA 2')
    fig.tight_layout()
    return fig

In [42]:
# Build the 2-D DBSCAN scatter figure and keep a handle to it.
dbscatter = db_scatter()

In [11]:
def plotting(x, y, z, labels, title='DBSCAN: Scatter Plot'):
    """
    Generate a 3D scatter plot of data points with custom color mapping.

    Parameters
    ----------
    x : array_like
        The x-coordinates of the data points.
    y : array_like
        The y-coordinates of the data points.
    z : array_like
        The z-coordinates of the data points.
    labels : array_like
        Integer cluster label for each data point. Negative labels are
        treated as noise.
    title : str, optional
        Figure title. The old hard-coded title said "K-Medoids", although
        this notebook passes DBSCAN labels, so the default now names DBSCAN.

    Returns
    -------
    Figure
        A Plotly Figure object containing the 3D scatter plot.

    """
    # Noise is drawn in black; real clusters cycle through the palette. The
    # previous `custom_colors[label]` lookup reached the noise color only via
    # negative indexing, colored label 1 like noise, and raised IndexError
    # for label 2 and above.
    noise_color = 'black'
    cluster_colors = ['#205e55', '#c27c3b', '#3b6ea5', '#8b3a62']

    # Create a list of colors for each label
    color_mapped = [
        noise_color if label < 0
        else cluster_colors[label % len(cluster_colors)]
        for label in labels
    ]

    trace_data = go.Scatter3d(
        x=x,
        y=y,
        z=z,
        mode='markers',
        marker=dict(
            size=8,
            # Explicit color strings, one per point. No colorscale is set:
            # a colorscale only applies when `color` is a numerical array.
            color=color_mapped,
            opacity=0.8
        )
    )

    layout = go.Layout(
        margin=dict(l=0, r=0, b=0, t=0),  # tight layout
        title=title,
        scene=dict(
            xaxis=dict(title='PCA 1'),
            yaxis=dict(title='PCA 2'),
            zaxis=dict(title='PCA 3')
        )
    )

    # The caller decides when to render the figure.
    return go.Figure(data=[trace_data], layout=layout)

In [12]:
# 3-D scatter of the first three columns of `country_pca`, colored by the
# DBSCAN labels in `cluster_labels`.
db_3d = plotting(country_pca.iloc[:, 0], country_pca.iloc[:, 1],
                 country_pca.iloc[:, 2], cluster_labels)

In [13]:
# Attach each row's DBSCAN label to `df_final`. NOTE: this mutates `df_final`
# in place; the column is (re)written every time the cell runs.
df_final['label'] = cluster_labels
# Keep the columns from position 2 onward; the first two are presumably
# identifiers rather than features -- TODO confirm. The plotting functions
# below assume 'label' is the last column (`df_graph.columns[:-1]`), which
# holds as long as `df_final` had no 'label' column before this cell.
df_graph = df_final.iloc[:, 2:]

In [50]:
# Per-label summary statistics of every other column in `df_graph`, indexed
# by label. db_range_plot() reads these three frames as globals.
grouped = df_graph.groupby('label')
min_values = grouped.min()
max_values = grouped.max()
mean_values = grouped.mean()


def db_range_plot():
    """
    Generate range plots for different features by labels.

    For every feature column of the notebook-level ``df_graph`` one subplot
    is drawn. Each label gets a horizontal bar spanning the feature's
    minimum to maximum, with a dashed vertical tick at the mean. The values
    come from the notebook-level ``min_values``, ``max_values`` and
    ``mean_values`` frames.

    Returns
    -------
    Figure
        A matplotlib Figure object with the range plots.

    """
    feature_columns = df_graph.columns[:-1]  # Exclude the label column

    # Take the labels from the grouped statistics rather than hard-coding
    # [-1, 0], so any number of clusters (with or without noise) works.
    labels = list(mean_values.index)
    # Colors are assigned by position in the label list and cycle if there
    # are more labels than colors.
    # NOTE(review): with labels [-1, 0] this gives '#205e55' to label -1,
    # while db_scatter() gives '#205e55' to label 0 -- confirm which mapping
    # is intended.
    colors = ['#205e55', '#8e9b8c']
    bar_height = 0.5  # The height of the bars

    # Size the grid to the number of features: 5 per row, 2 inches per row.
    n_cols = 5
    n_rows = max(1, -(-len(feature_columns) // n_cols))  # ceiling division
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 2 * n_rows))
    axes = axes.flatten()  # Flatten the array of axes

    # Loop through each feature column to create a range bar
    for i, col in enumerate(feature_columns):
        ax = axes[i]
        for j, label in enumerate(labels):
            low = min_values.loc[label, col]
            high = max_values.loc[label, col]
            # Plotting the range as a horizontal bar
            ax.barh(
                y=label,
                width=high - low,
                left=low,
                height=bar_height,
                color=colors[j % len(colors)],
                edgecolor='black',
                label=f'Label {label}' if i % (len(labels) * 2) == 0 else ""
            )
            # Plotting the mean as a short dashed tick inside the bar
            mean_value = mean_values.loc[label, col]
            ax.plot(
                [mean_value, mean_value],  # X start and end of the line
                [label - bar_height / 2, label + bar_height / 2],
                color='black',  # Color of the mean line
                linestyle='--',  # Style of the line
                linewidth=2,  # Width of the line
                label=f'Mean for Label {label}' if i == 0 else ""
            )

        ax.set_title(f'Range for {col}')
        ax.set_yticks(labels)
        ax.set_yticklabels(labels)
        ax.set_ylim(min(labels) - bar_height, max(labels) + bar_height)

    # Drop grid cells that did not receive a feature.
    for unused in axes[len(feature_columns):]:
        fig.delaxes(unused)

    fig.tight_layout()  # Adjust the layout
    return fig

In [44]:
# Build the per-feature range figure and keep a handle to it.
db_range = db_range_plot()

In [15]:
def db_box_plot():
    """
    Display box plots for different features by labels.

    One box plot is drawn per feature column of the notebook-level
    ``df_graph``, grouped by its 'label' column, three plots per row. The
    figure is shown with ``plt.show()``.

    Returns
    -------
    None
        The figure is displayed rather than returned.

    """
    # Assuming the last column of 'df_graph' is 'label'
    feature_columns = df_graph.columns[:-1]  # Exclude the label column

    # Size the grid to the number of features. The per-row height keeps the
    # original 15 x 39 inch figure when there are 9 rows.
    n_cols = 3
    n_rows = max(1, -(-len(feature_columns) // n_cols))  # ceiling division
    fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols,
                             figsize=(15, n_rows * 13 / 3))

    axes = axes.flatten()  # Flatten the 2D array of axes for easy iteration

    colors = ['#205e55', '#8e9b8c']
    # Loop through each feature column to create a box plot
    for ax, col in zip(axes, feature_columns):
        sns.boxplot(x='label', y=col, data=df_graph, ax=ax,
                    palette=colors,
                    hue='label', legend=False)
        ax.set_title(f'Boxplot of {col}')
        ax.set_xlabel('Label')
        ax.set_ylabel('Value')

    # Remove every grid cell that did not receive a feature. The earlier
    # check tested the column count including 'label' and always removed
    # exactly two axes, which is only right for some feature counts.
    for unused in axes[len(feature_columns):]:
        fig.delaxes(unused)

    # Adjust the layout
    fig.tight_layout()

    # Display the plot
    plt.show()