In [None]:
import math
import pandas as pd
from tkinter import Tk, filedialog
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from sklearn.cluster import KMeans
from pyclustering.cluster.kmedoids import kmedoids #PAM
from sklearn.cluster import DBSCAN
from sklearn.neighbors import KernelDensity
from sklearn.neighbors import NearestNeighbors

#from your_wavecluster_library import WaveCluster  # Replace with the actual import
import numpy as np
import pywt
import networkx as nx
from scipy.cluster.hierarchy import linkage, fcluster # For assign_labels()


from sklearn.metrics.pairwise import euclidean_distances # for CURE
from sklearn.preprocessing import StandardScaler
#from cure import cure  # You may need to install a library that implements CURE algorithm

from sklearn.neighbors import kneighbors_graph
from sklearn.cluster import AgglomerativeClustering

#Cluster Evaluation
from sklearn.metrics import silhouette_score
from sklearn.utils import resample
from sklearn.model_selection import cross_val_score
from sklearn.utils import check_random_state

#RS
import random
from numpy import genfromtxt
import copy
import timeit
from scipy.spatial import ConvexHull, distance
import collections

#Feature Selection
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_selection import SelectKBest, f_classif
#from sklearn.feature_selection import 

from sklearn_extra.cluster import KMedoids
from sklearn.metrics.pairwise import manhattan_distances

#Filter Method
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cdist

#mRMR
#from skfeature.function.information_theoretical_based import MRMR
#from pymrmr import mRMR
from sklearn.feature_selection import mutual_info_classif

#S_Dbw
from sklearn.metrics import silhouette_score, davies_bouldin_score

<b>imports</b>

In [None]:
def Read_DataFrame(file_path):
    """
    Load an Excel workbook from disk into a pandas DataFrame.

    This is a best-effort loader: any exception raised while reading is
    printed (prefixed with "Error:") instead of being propagated.

    Parameters:
    file_path (str): Location of the Excel file to load.

    Returns:
    pandas.DataFrame or None: The loaded data, or None when reading failed.
    """
    try:
        loaded = pd.read_excel(file_path)
    except Exception as e:
        # Report the problem and signal failure to the caller with None.
        print("Error:", e)
        return None
    return loaded

In [None]:
def choose_excel_file():
    """
    Open a file dialog to choose an Excel file.

    A hidden Tk root window is created only to host the dialog and is
    destroyed afterwards, so no stray Tk instance is left alive in the kernel
    (even if the dialog call raises).

    Returns:
    - str: Path to the selected Excel file (an empty value if the dialog was cancelled).
    """
    root = Tk()
    root.withdraw()  # Hide the main window

    try:
        file_path = filedialog.askopenfilename(
            title="Select Excel file",
            # One pattern per tuple entry: a single "*.xlsx;*.xls" string is
            # only understood by the native Windows dialog.
            filetypes=[("Excel files", ("*.xlsx", "*.xls"))],
        )
    finally:
        # Release the hidden root window whether or not a file was chosen.
        root.destroy()

    return file_path

In [None]:
# Ask the user for a workbook, then load it. Read_DataFrame signals failure by returning None.
file_path = choose_excel_file()
dataframe = Read_DataFrame(file_path)

if dataframe is None:
    print("Failed to create DataFrame.")
else:
    print("DataFrame created successfully.")
    print(dataframe.head())  # Preview of the first few rows

In [None]:
# Keep an untouched copy (still containing the TC_ID column) so the IDs can be looked up by row index later.
TC_ID_df = dataframe.copy()

# Drop TC_ID before preprocessing (presumably an identifier rather than a feature — confirm).
# NOTE: not idempotent — re-running this cell raises KeyError because the column is already gone.
dataframe = dataframe.drop(columns=['TC_ID'])

<b>Preprocessing</b>

In [None]:
def preprocess_data(dataframe):
    """
    Preprocess the DataFrame by label-encoding its object-dtype columns.

    The input is left untouched: encoding is applied to a copy, so re-running
    a cell that calls this function always starts from the same data and the
    returned frame is never an alias of the argument.

    Parameters:
    - dataframe (pandas.DataFrame): Input DataFrame.

    Returns:
    - pandas.DataFrame: New DataFrame in which every column whose dtype is
      'object' has been replaced by int64 label codes; other columns are
      copied unchanged.
    """
    processed = dataframe.copy()
    for column in processed.columns:
        if processed[column].dtype == 'object':
            # A fresh encoder per column: each column gets its own code space.
            encoder = LabelEncoder()
            processed[column] = encoder.fit_transform(processed[column]).astype('int64')

    return processed

In [None]:
def fill_na_with_mean(dataframe):
    """
    Fill missing values with the mean of their column, then cast everything to int64.

    Parameters:
    - dataframe: pandas DataFrame

    Returns:
    - New DataFrame in which each NaN was replaced by its column mean and all
      values were converted to int64 (the cast drops fractional parts, so a
      mean of 2.5 is stored as 2)
    """
    column_means = dataframe.mean()
    filled = dataframe.fillna(column_means)
    return filled.astype('int64')

In [None]:
dataframe = preprocess_data(dataframe)
dataframe = fill_na_with_mean(dataframe)

In [None]:
def remove_single_value_columns(df):
    """
    Drop every column that holds exactly one distinct value.

    Distinct values are counted with DataFrame.nunique(), whose default
    ignores NaN.

    Parameters:
    - df: pandas DataFrame

    Returns:
    - New DataFrame without the constant columns (the input is not modified)
    """
    distinct_counts = df.nunique()
    constant_columns = distinct_counts[distinct_counts == 1].index
    return df.drop(columns=constant_columns)

In [None]:
Non_Single_value_df = remove_single_value_columns(dataframe)

In [None]:
Non_Single_value_df.columns

In [None]:
Non_Single_value_df

In [None]:
# Reset the index and add it as a column
df_reset = Non_Single_value_df.reset_index()

In [None]:
df_reset

<b>PCA</b>

In [None]:
def apply_pca(dataframe, num_components=None):
    """
    Apply Principal Component Analysis (PCA) to the input DataFrame.

    Parameters:
    - dataframe (pd.DataFrame): Input DataFrame (all columns are used as features).
    - num_components (int or None): Number of components to keep, passed to
      PCA(n_components=...). If None, PCA keeps all components.
      The default is None rather than `dataframe.shape[1]`: a default
      expression is evaluated once, when the function is defined, so the old
      default silently depended on whatever global `dataframe` existed then.

    Returns:
    - tuple:
        * pd.DataFrame: PCA scores, one column per component named 'PC1', 'PC2', ...
        * numpy.ndarray: explained variances (eigenvalues) in descending order
        * sklearn.decomposition.PCA: the fitted model
    """

    # Extract features (X)
    X = dataframe.values

    # Apply PCA
    pca = PCA(n_components=num_components)
    pca_result = pca.fit_transform(X)

    # scikit-learn returns components ordered by decreasing explained variance,
    # so column j of pca_result is component j+1. Name the columns from the
    # actual result width so num_components=None works too.
    pca_columns = [f'PC{i + 1}' for i in range(pca_result.shape[1])]
    pca_dataframe = pd.DataFrame(data=pca_result, columns=pca_columns)

    # Return the eigenvalues as a new array, explicitly in descending order.
    sorted_eigenvalues = np.sort(pca.explained_variance_)[::-1]

    return pca_dataframe, sorted_eigenvalues, pca


In [None]:
Non_Single_value_df.shape[1]

In [None]:
# Accumulators: one Series (n, fitted model, total explained-variance ratio) per
# model in pca_list, and one per-feature weight table per model in feature_weight_list.
pca_list = list()
feature_weight_list = list()

# Fit a range of PCA models, one for every possible number of components
# (1 .. number of columns of Non_Single_value_df)

for n in range(1, Non_Single_value_df.shape[1] + 1):
    
    # Create and fit the model
    PCAmod = PCA(n_components=n)
    PCAmod.fit(Non_Single_value_df)
    
    # Store the model and the summed explained-variance ratio of its n components
    pca_list.append(pd.Series({'n':n, 'model':PCAmod,
                               'var': PCAmod.explained_variance_ratio_.sum()}))
    
    # Feature importance heuristic: sum of absolute loadings of each feature
    # across the n components, normalised so the weights add up to 1
    abs_feature_values = np.abs(PCAmod.components_).sum(axis=0)
    feature_weight_list.append(pd.DataFrame({'n':n, 
                                             'features': Non_Single_value_df.columns,
                                             'values':abs_feature_values/abs_feature_values.sum()}))
    
# One row per n (the Series become columns in concat, hence the transpose)
pca_df = pd.concat(pca_list, axis=1).T.set_index('n')
pca_df

In [None]:
# Stack the per-n weight tables and reshape them to one row per n and one column per feature.
features_df = (pd.concat(feature_weight_list)
               .pivot(index='n', columns='features', values='values')) # pivot only reshapes; nothing is summed

features_df

In [None]:
sns.set_context('talk')
ax = pca_df['var'].plot(kind='bar')

ax.set(xlabel='Number of dimensions',
       ylabel='Percent explained variance',
       title='Explained Variance vs Dimensions');

In [None]:
ax = features_df.plot(kind='bar', figsize=(13,8))
ax.legend(loc='upper right')
ax.set(xlabel='Number of dimensions',
       ylabel='Relative importance',
       title='Feature importance vs Dimensions');

In [None]:
Non_Single_value_df

In [None]:
# Remove the 'Time.WC' column.
# NOTE: not idempotent — re-running this cell raises KeyError because the column is already gone.
Non_Single_value_df = Non_Single_value_df.drop(columns=['Time.WC'])

In [None]:
Pdf = preprocess_data(Non_Single_value_df)
pca_result_df, eigenvalues, pca_model  = apply_pca(Pdf,Pdf.shape[1])

In [None]:
print(eigenvalues)

In [None]:
print(pca_model)

In [None]:
pca_result_df

In [None]:
pca_result_df_3 = pca_result_df.iloc[:,0:3]
pca_result_df_3

In [None]:
pca_result_df = pca_result_df_3

<b>Going Back to Original</b>

In [None]:
#Original
Non_Single_value_df

In [None]:
# pca_result_df may hold only the leading components while pca_model was fitted
# with more. Zero-pad the dropped components so inverse_transform receives the
# width it was fitted with; this reconstructs from the kept components only.
_kept_scores = pca_result_df.values
_padded_scores = np.zeros((_kept_scores.shape[0], pca_model.n_components_))
_padded_scores[:, :_kept_scores.shape[1]] = _kept_scores
df_Origin = pca_model.inverse_transform(_padded_scores)

In [None]:
pd.DataFrame(df_Origin)

<b>WaveCluster</b>

In [None]:
def calculate_distance(dataframe, centroids, cluster_labels):
    """
    Compute the Euclidean distance from each row to the centroid of its cluster.

    Parameters:
    - dataframe (pandas.DataFrame): One data point per row.
    - centroids: Sequence indexed by cluster label; centroids[label] is the
      centroid of that cluster.
    - cluster_labels: Iterable with one label per row, in row order.

    Returns:
    - pandas.Series named 'Distance to Centroid' with one entry per row.
      Rows without a matching label (labels shorter than the frame) keep 0.
    """
    distances = np.zeros(len(dataframe))

    labelled_rows = zip(cluster_labels, dataframe.iterrows())
    for position, (label, (_, row)) in enumerate(labelled_rows):
        offset = row.values - centroids[label]
        distances[position] = np.linalg.norm(offset)

    return pd.Series(distances, name='Distance to Centroid')

In [None]:
len(pca_result_df)

In [None]:
'''In the second step, discrete wavelet transform will be applied on the quantized feature space. Applying wavelet transform on the units Mj results in a new feature space and
hence new units Tk. Given the set of units Tk, WaveCluster detects the connected components in the transformed feature space. Each connected component is a set of units Tk and is considered
as a cluster. Corresponding to each resolution r of wavelet transform, there would be a set of clusters Cr , where usually at the coarser resolutions, number of clusters is less.
In the experiments, we applied wavelet transform three times and tried Haar, Daubechies, Cohen-Daubechies-Feauveau ((4,2) and (2,2)) transforms [Vai93, SN96, URB97].
Average subbands (feature spaces) give approximations of the original feature space at different scales, which help in finding clusters at different levels of details. For example, as
shown in Figure 5, for a 2-dimensional feature space, the subbands LL show the clusters at different scales. We use the algorithm in [Hor88] to find the connected components in
the 2-dimensional feature space (image). The same concept can be generalized for higher dimensions. Figure 12 in Section 5, shows the clusters that WaveCluster found at each scale
in different colors.'''

def apply_wavelet_transform(data, levels):
    """
    Run a multi-level discrete wavelet decomposition and flatten the result.

    Parameters:
    - data: Array passed straight to pywt.wavedec (which, with no axis
      argument, uses its default axis — NOTE(review): confirm this is the
      intended axis for 2-D input).
    - levels (int): Decomposition level forwarded as `level`.

    Returns:
    - numpy.ndarray: 1-D array holding every coefficient array returned by
      wavedec, each flattened and concatenated in the order wavedec returned them.
    """
    # Choose a wavelet and apply the wavelet transform
    wavelet = 'db1'  # Replace with the desired wavelet
    coefficient_arrays = pywt.wavedec(data, wavelet, level=levels)

    # Flatten each coefficient array, then join them into one 1-D array
    flat_parts = []
    for band in coefficient_arrays:
        flat_parts.append(band.flatten())

    return np.concatenate(flat_parts)

In [None]:
def find_clusters(wavelet_transformed_data, threshold):
    """
    Binarise the transformed data: 1 where a value is strictly greater than
    `threshold`, 0 elsewhere.

    This is a plain threshold, not a connected-component search.
    """
    above_threshold = wavelet_transformed_data > threshold
    return above_threshold.astype(int)

In [None]:
def assign_labels(clusters):
    """
    Map each value in `clusters` to the rank of that value among the sorted
    distinct values, giving contiguous labels 0..k-1.

    The distinct values are printed as a side effect.
    """
    distinct_values = np.unique(clusters)
    print(distinct_values)
    # distinct_values is sorted, so the insertion position of a value is its rank.
    return np.searchsorted(distinct_values, clusters)

In [None]:
def make_lookup_table(labels):
    """
    Build a dict mapping every distinct label to the array of positions
    (along the first axis) at which that label occurs.
    """
    lookup_table = {}
    for label in np.unique(labels):
        positions = np.where(labels == label)[0]
        lookup_table[label] = positions
    return lookup_table

In [None]:
def map_objects_to_clusters(data, lookup_table):
    """
    Collect the rows of `data` belonging to each label.

    Positions in the lookup table that are not smaller than len(data) are
    ignored, so a label may end up with an empty selection.

    Returns:
    - dict: label -> data rows selected by that label's valid positions
    """
    row_count = len(data)
    return {
        label: data[positions[positions < row_count]]
        for label, positions in lookup_table.items()
    }

In [None]:
def wave_cluster(data, quantization_factor=0.1, wavelet_levels=3, threshold=0.5, num_clusters=3):
    """
    Simplified WaveCluster-style pipeline: quantize, wavelet-transform,
    threshold, label, and group the rows of `data`.

    Parameters:
    - data (numpy.ndarray): One object per row.
    - quantization_factor (float): Cell size used to quantize the feature space.
    - wavelet_levels (int): Decomposition level passed to apply_wavelet_transform.
    - threshold (float): Cut-off passed to find_clusters.
    - num_clusters (int): Unused; kept so existing calls keep working.

    Returns:
    - tuple:
        * dict: label -> rows of `data` in that cluster (non-empty clusters only)
        * list: one label per row of `data`, in row order
        * list: cluster centers, where cluster_centers[label] is the mean of
          the rows carrying that label

    Notes:
    - The per-row label is read directly from the label array (position i ->
      row i), which is the same membership map_objects_to_clusters uses. The
      previous row-equality scan was quadratic and skipped rows it could not
      match (e.g. rows containing NaN), leaving the label list shorter than
      the data.
    - Labels that own no row are dropped and the remaining labels are
      renumbered 0..m-1 so that centers, labels and dict keys always agree.
    - NOTE(review): labels are derived from positions in the flattened
      coefficient array; whether position i corresponds to row i depends on
      the shape apply_wavelet_transform produces — confirm for your data.
    """
    # Step 1: Quantize feature space and assign objects to units
    quantized_data = np.floor(data / quantization_factor)

    # Step 2: Apply wavelet transform on the feature space
    wavelet_transformed_data = apply_wavelet_transform(quantized_data, wavelet_levels)

    # Step 3: Find clusters in the transformed feature space
    clusters = find_clusters(wavelet_transformed_data, threshold)

    # Step 4: Assign labels to the units
    labels = assign_labels(clusters)

    # Step 5: Make the lookup table
    lookup_table = make_lookup_table(labels)

    # Step 6: Map the objects to the clusters
    clustered_objects = map_objects_to_clusters(data, lookup_table)

    # Step 7: Label of each row = label stored at the row's own position
    wave_cluster_labels = list(np.asarray(labels).ravel()[:len(data)])

    # Step 8: Drop clusters that own no row and renumber the rest contiguously
    occupied = [label for label, members in clustered_objects.items() if len(members) > 0]
    if len(occupied) != len(clustered_objects):
        renumber = {old: new for new, old in enumerate(occupied)}
        clustered_objects = {renumber[old]: clustered_objects[old] for old in occupied}
        wave_cluster_labels = [renumber[label] for label in wave_cluster_labels]

    # Step 9: Center of each cluster, in label order
    cluster_centers = [np.mean(members, axis=0) for members in clustered_objects.values()]

    return clustered_objects, wave_cluster_labels, cluster_centers

In [None]:
# Apply the wave_cluster function to the DataFrame
result, wave_cluster_labels, cluster_centers = wave_cluster(pca_result_df.values)

# Display the clustered objects
for label, objects in result.items():
    print(f'Cluster {label}:')
    for obj in objects:
        print(obj)
    print('\n')

In [None]:
'''
import matplotlib.pyplot as plt
import pywt
import numpy as np
import random
from math import *

def scale_01_data(rawData):
    # normalize the raw dataset
    dim = rawData.shape[1]  
    # the rawData has at least 2 raw, 1 for signal 1 for label
    minList = [np.amin(rawData[:,x]) for x in range(0, dim)]
    maxList = [np.amax(rawData[:,x])+0.001 for x in range(0, dim)] 
    # add the [0] and [1] because there is a 'row of label', and 0.001 to avoid 1
    toZero = rawData - np.array(minList)
    normData = toZero / (np.array(maxList) - np.array(minList))
    return(normData)

def map2ScaleDomain(dataset, scale=128):
    # map the dataset into scale domain for wavelet transform
    if scale <= 0 or not(isinstance(scale, int)):
        raise ValueError('scale must be a positive integer')
    dim = dataset.shape[1]
    length = dataset.shape[0]
    sd_data = {}
    for i in range(0, length):
        num = 0
        for j in reversed(range(0, dim)):     # start from the most weighted dimension
            num += (dataset[i,j]//(1/scale))*pow(scale, j)  # let the numbering start from '0'!
        num = int(num)
        if sd_data.get(num, 'N/A')=='N/A':
            sd_data[num] = 1
        else:
            sd_data[num] += 1
    return sd_data

def ndWT(data, dim, scale, wave):
    # calculate 1 order n dimensional wavelet transform with numbered grids
    wavelets = {'db1':[0.707, 0.707], 'bior1.3':[-0.09, 0.09, 0.707, 0.707, 0.09, -0.09], \
                'db2':[-0.13, 0.224, 0.836, 0.483]}
    lowFreq = {}
    convolutionLen = len(wavelets.get(wave))-1
    lineLen = ceil(scale/2) + ceil((convolutionLen-2)/2)
    for inDim in range(0, dim):
        for key in data.keys():
            coordinate = [] # coordinate start from 0
            tempkey = key
            for i in range(0, dim):
                # get the coordinate for a numbered grid
                if i <= dim-inDim-1:
                    coordinate.append(tempkey//pow(scale, (dim-1-i)))
                    tempkey = tempkey%pow(scale, (dim-1-i))
                else:
                    coordinate.append(tempkey//pow(lineLen, (dim-1-i)))
                    tempkey = tempkey%pow(lineLen, (dim-1-i))
            coordinate.reverse()
            startCoord = ceil((coordinate[inDim]+1)/2)-1    # to calculate ndwt, signal should start from 1, temporarily convert
            startNum = 0    # numbered label for next level of data
            for i in range(0, dim):
                if i <= inDim:
                    if i == inDim:
                        startNum += startCoord*pow(lineLen, i)
                    else:
                        startNum += coordinate[i]*pow(lineLen, i)
                else:
                    startNum += coordinate[i]*pow(scale, i)
            wavelet = wavelets.get(wave)   # for convolution
            for i in range(0, convolutionLen//2+1):  
                if startCoord+i >= lineLen: # coordinate start from 0 
                    break
                if lowFreq.get(int(startNum+pow(lineLen, inDim)*i), 'N/A') == 'N/A':
                    lowFreq[int(startNum+pow(lineLen, inDim)*i)] = \
                            data[key]*wavelet[int((startCoord+1+i)*2-(coordinate[inDim]+1))]
                else:
                    lowFreq[int(startNum+pow(lineLen, inDim)*i)] += \
                            data[key]*wavelet[int((startCoord+1+i)*2-(coordinate[inDim]+1))]
        data = lowFreq
        lowFreq = {}
    return data

class node():
    def __init__(self, key=0, value=0):
        self.key = key
        self.value = value
        self.process = False
        self.cluster = None

    def around(self, scale=1, dim=1):
        aroundNodeKey = []
        coordinate = []
        for inDim in range(0, dim):
            # we can't afford diagonal searching
            dimCoord = self.key // pow(scale, inDim)
            if dimCoord == 0:
                aroundNodeKey.append(self.key + pow(scale, inDim))
            elif dimCoord == scale-1:
                aroundNodeKey.append(self.key - pow(scale, inDim))
            else:
                aroundNodeKey.append(self.key + pow(scale, inDim))
                aroundNodeKey.append(self.key - pow(scale, inDim))
        return aroundNodeKey

def bfs(equal_pair, maxQueue):
    if equal_pair == []:
        return equal_pair
    group = {x:[] for x in range(1, maxQueue)}
    result = []
    for x, y in equal_pair:
        group[x].append(y)
        group[y].append(x)
    for i in range(1, maxQueue):
        if i in group:
            if group[i] == []:
                del group[i]
            else:
                queue = [i]
                for j in queue:
                    if j in group:
                        queue += group[j]
                        del group[j]
                record = list(set(queue))
                record.sort()
                result.append(record)
    return result

def build_key_cluster(nodes, equal_list, cutMiniCluster):
    cluster_key = {}
    for point in nodes.values():
        flag = 0
        for cluster in equal_list:
            if point.cluster in cluster:
                point.cluster = cluster[0]
                if cluster_key.get(cluster[0], 'N/A') == 'N/A':
                    cluster_key[cluster[0]] = [point]
                    flag = 1
                else:
                    cluster_key[cluster[0]].append(point)
                    flag = 1
                break
        if flag == 0:
            if cluster_key.get(point.cluster, 'N/A') == 'N/A':
                cluster_key[point.cluster] = [point]
            else:
                cluster_key[point.cluster].append(point)
    count = 1
    result = {}
    for cluster in cluster_key.keys():
        if len(cluster_key[cluster]) == 1:
            if cluster_key[cluster][0].value < cutMiniCluster:
                continue
        for p in cluster_key[cluster]:
            result[p.key] = count
        count += 1
    return result

def clustering(data, scale, dim, cutMiniCluster):
    equal_pair = []
    cluster_flag = 1
    for point in data.values():
        point.process = True
        for around in point.around(scale, dim):
            if not (data.get(around, 'N/A') == 'N/A'):
                around = data.get(around)
                if around.cluster is not None:
                    if point.cluster is None:
                        point.cluster = around.cluster
                    elif point.cluster != around.cluster:
                        mincluster = min(point.cluster, around.cluster)
                        maxcluster = max(point.cluster, around.cluster)
                        equal_pair += [(mincluster, maxcluster)]
        if point.cluster is None:
            point.cluster = cluster_flag
            cluster_flag += 1

    equal_pair = set(equal_pair)
    equal_list = bfs(equal_pair, cluster_flag)
    result = build_key_cluster(data, equal_list, cutMiniCluster)
    return result

def thresholding(data, threshold, scale, dim):
    nodes = {}
    result = {}
    startNode = node(0)
    avg = 0
    for key, value in data.items():
        if value >= threshold:
            nodes[key] = node(key, value)
            avg += value
            if value > startNode.value:
                startNode = node(key, value)
    cutMiniCluster = avg / len(nodes)
    clusters = clustering(nodes, scale, dim, cutMiniCluster)
    return clusters

def findThreshold(data, threshold):
    value = list(data.values())
    value.sort(reverse=True)
    x = [i for i in range(1, len(value) + 1)]
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.scatter(x, value)
    ax.axhline(y=threshold, xmin=0, xmax=1, color='r')
    plt.show()

def markData(normData, cluster, scale):
    dim = normData.shape[1]
    tags = []
    for point in range(0, normData.shape[0]):
        number = 0
        for inDim in range(0, dim):
            number += (normData[point, inDim] // (1 / scale)) * pow(scale, inDim)
        if cluster.get(int(number), 'N/A') == 'N/A':
            tags.append(0)
        else:
            tags.append(cluster.get(int(number)))
    return tags

def waveCluster(data, scale=50, wavelet='db2', threshold=0.5, plot=False):
    waveletlen = {'db1': 0, 'db2': 1, 'bior1.3': 2}
    normData = scale_01_data(data)
    dim = normData.shape[1]
    dataDic = map2ScaleDomain(normData, scale)
    dwtResult = ndWT(dataDic, dim, scale, wavelet)
    if plot: findThreshold(dwtResult, threshold)
    lineLen = scale // 2 + waveletlen.get(wavelet)
    result = thresholding(dwtResult, threshold, lineLen, dim)
    tags = markData(normData, result, lineLen)
    
    # Finding cluster centers
    clustered_objects = {}
    for key, value in result.items():
        if value not in clustered_objects:
            clustered_objects[value] = []
        clustered_objects[value].append(normData[key])

    cluster_centers = [np.mean(cluster, axis=0) for cluster in clustered_objects.values()]
    
    return clustered_objects, tags, cluster_centers

def draw2Darray(x, y, tag):
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    color = tag / np.amax(tag)
    rgb = plt.get_cmap('jet')(color)
    ax.scatter(x, y, color=rgb)
    plt.show()

def heatmap2D(data, lineLen):
    intensity = np.zeros((lineLen, lineLen))
    x = list(range(0, lineLen))
    y = x
    for key in data.keys():
        xIn = key % (lineLen) 
        yIn = key // (lineLen) 
        intensity[int(xIn), int(yIn)] = data.get(key)
    x, y = np.meshgrid(x, y)
    plt.pcolormesh(x, y, intensity.T)
    plt.colorbar()
    plt.show()
'''

In [None]:
def calculate_wcvr(data, labels, centers):
    """
    Calculate the Within-Cluster Variance Ratio (WCVR) for ICPS index.

    For every label value c in 0..k-1 (k = number of distinct labels) that
    occurs in `labels` and has a center, the mean squared Euclidean distance
    of the cluster's points to centers[c] is computed; the sum of these is
    divided by k.

    Parameters:
    - data: pandas DataFrame (or 2-D array), input data, one point per row
    - labels: array-like (list, numpy array or pandas Series), cluster label
      of each data point, matched to rows by position
    - centers: sequence indexed by label, cluster centers

    Returns:
    - wcvr: float, Within-Cluster Variance Ratio (NaN when `labels` is empty)

    Notes:
    - Labels are assumed to be the integers 0..k-1; other label values are
      not counted in the numerator.
    """
    points = np.asarray(data)
    # Work on plain arrays: `x in series` tests the Series *index*, not its
    # values, and comparing a list with == does not broadcast.
    label_array = np.asarray(labels)

    num_clusters = len(np.unique(label_array))
    if num_clusters == 0:
        return float('nan')

    total_wcv = 0
    for cluster_label in range(num_clusters):
        if cluster_label >= len(centers):
            continue
        member_mask = label_array == cluster_label
        if not member_mask.any():
            continue

        cluster_points = points[member_mask]
        within_cluster_variance = np.mean(np.sum((cluster_points - centers[cluster_label]) ** 2, axis=1))
        total_wcv += within_cluster_variance

    wcvr = total_wcv / num_clusters

    return wcvr

In [None]:
def calculate_ICPS_index(data, labels, centers):
    """
    Calculate the ICPS index for clustering validation.

    The index is the mean of three terms: the Davies-Bouldin score,
    one minus the silhouette score, and the within-cluster variance ratio
    from calculate_wcvr.

    Parameters:
    - data: pandas DataFrame, input data
    - labels: array-like, cluster labels assigned to each data point
    - centers: numpy array, cluster centers

    Returns:
    - ICPS_index: float, ICPS index value; NaN when the scikit-learn scores
      cannot be computed for these labels (a message is printed)
    """
    data_array = data.values  # Convert DataFrame to numpy array
    try:
        silhouette_avg = silhouette_score(data_array, labels)
        db_index = davies_bouldin_score(data_array, labels)
    except ValueError:
        # Both scores reject labelings they cannot evaluate (e.g. a single
        # cluster). Stop here instead of falling through with undefined values.
        print("Only 1 cluster -> ICPS is not possible")
        return float('nan')

    wcvr = calculate_wcvr(data, labels, centers)

    ICPS_index = (db_index + (1 - silhouette_avg) + wcvr) / 3

    return ICPS_index

In [None]:
# Distinct cluster labels: one mask and one ICPS value per cluster, not per data point
unique_wave_labels = sorted(set(wave_cluster_labels))

# Initialize an empty list to store boolean arrays for each cluster
cluster_masks = []

# Create a one-vs-rest boolean membership array for each cluster
for label in unique_wave_labels:
    cluster_mask = np.array([l == label for l in wave_cluster_labels])
    cluster_masks.append(cluster_mask)

# Calculate ICPS Index for each cluster
# NOTE(review): the mask is used as a two-valued labeling (False/True), so
# calculate_wcvr pairs it with cluster_centers[0] and cluster_centers[1]
# regardless of which cluster the mask describes — confirm this is intended.
for label, cluster_mask in zip(unique_wave_labels, cluster_masks):
    ICPS_index = calculate_ICPS_index(pca_result_df, cluster_mask, cluster_centers)
    print(f"ICPS Index for Cluster {label}: {ICPS_index}")

In [None]:
set_wave_cluster_labels = set(wave_cluster_labels)
set_wave_cluster_labels

In [None]:
cluster_centers

In [None]:
# Convert each centroid array to a plain list and cast each element to a Python float
arr_list = [list(map(float, arr)) for arr in cluster_centers]
arr_list

In [None]:
np.array(list(wave_cluster_labels))

In [None]:
ICPS_index = calculate_ICPS_index(pca_result_df, np.array(wave_cluster_labels), arr_list)
print("ICPS Index:", ICPS_index)

<b>Get Original Data</b>

In [None]:
def get_original_data(pca_result_df, centroids, pca_model):
    """
    Map PCA scores and cluster centroids back to the original feature space.

    `pca_result_df` and `centroids` may contain only the leading components of
    a model fitted with more components. The missing trailing components are
    filled with zeros before calling inverse_transform, i.e. the
    reconstruction uses only the components that were kept.

    Parameters:
    - pca_result_df (pandas.DataFrame): DataFrame containing PCA results
      (leading components, in model order).
    - centroids (array-like): Centroid of each cluster in the same PCA space,
      one centroid per row (a single 1-D centroid is accepted too).
    - pca_model (sklearn.decomposition.PCA): Fitted PCA model.

    Returns:
    - tuple of pandas.DataFrame: (reconstructed data, reconstructed centroids).
      Columns are the model's `feature_names_in_` when it has them; otherwise
      the columns of `pca_result_df` when the widths match, else a default
      integer range.

    Raises:
    - ValueError: if more components are supplied than the model was fitted with.
    """
    n_kept = pca_result_df.shape[1]
    n_fitted = pca_model.n_components_

    def _to_original_space(scores):
        # Zero-pad the supplied leading components up to the fitted width.
        scores = np.asarray(scores, dtype=float)
        if scores.size == 0:
            scores = scores.reshape(0, n_kept)
        elif scores.ndim == 1:
            scores = scores.reshape(1, -1)
        if scores.shape[1] > n_fitted:
            raise ValueError(
                f"got {scores.shape[1]} components but the PCA model has only {n_fitted}"
            )
        padded = np.zeros((scores.shape[0], n_fitted))
        padded[:, :scores.shape[1]] = scores
        return np.asarray(pca_model.inverse_transform(padded))

    # Invert PCA transformation to get original data and centroids
    original_data = _to_original_space(pca_result_df.values)
    original_centroids = _to_original_space(centroids)

    # Choose column labels that match the reconstructed width
    n_features = original_data.shape[1]
    feature_names = getattr(pca_model, 'feature_names_in_', None)
    if feature_names is not None and len(feature_names) == n_features:
        columns = list(feature_names)
    elif n_features == n_kept:
        columns = pca_result_df.columns
    else:
        columns = None

    # Convert the arrays back to DataFrames
    original_data_df = pd.DataFrame(data=original_data, columns=columns)
    centroids_df = pd.DataFrame(data=original_centroids, columns=columns)

    return original_data_df, centroids_df

#pca_model = pca_model
# Example usage
original_data_df, original_centroids_df = get_original_data(pca_result_df, arr_list, pca_model)

In [None]:
original_data_df

In [None]:
original_centroids_df

In [None]:
zip(wave_cluster_labels, pca_result_df.iterrows())

In [None]:
len(wave_cluster_labels)

In [None]:
# Zip 'Cluster Labels' with pca_result_df.iterrows()
zipped_results = zip(wave_cluster_labels, pca_result_df.iterrows())
# Counter of printed pairs. Deliberately not called `sum`: a global with that
# name would shadow the builtin for the rest of the kernel session.
row_count = 0

# Display the results
for cluster_label, (index, row) in zipped_results:
    row_count += 1
    print(f'Cluster Label: {cluster_label}, Index: {index}, Row Values: {row.values}')


In [None]:
# Pair each cluster label with the (index, row) tuples from pca_result_df.iterrows().
# zip stops at the shorter input, so extra labels or extra rows are silently ignored.
zipped_results = zip(wave_cluster_labels, pca_result_df.iterrows())

# Collect one record per pair: the label, the row's index and the row's values as an array
data_list = []
for cluster_label, (index, row) in zipped_results:
    data_list.append({'Cluster Label': cluster_label, 'Index': index, 'Row Values': row.values})

# Create a DataFrame from the list
Checking_df = pd.DataFrame(data_list)

# Remove duplicate rows based on 'Cluster Label' and 'Index' (modifies Checking_df in place)
Checking_df.drop_duplicates(subset=['Cluster Label', 'Index'], inplace=True)


In [None]:
print("DataFrame with Removed Duplicates:")
Checking_df

In [None]:
len(Checking_df['Row Values'][0])

In [None]:
# Build the final result table from the PCA scores (no PCA is fitted in this cell)

# Calculate the distance between data points and their cluster centroids in the PCA space
distance_df = calculate_distance(pca_result_df, arr_list, pd.Series(wave_cluster_labels, name='Cluster Labels'))

# Combine the PCA result, cluster labels, and distance column-wise (aligned on the index)
Final_result_df = pd.concat([pca_result_df, pd.Series(wave_cluster_labels, name='Cluster Labels'), distance_df], axis=1)

In [None]:
Final_result_df

In [None]:
# For every cluster, find the index of the row closest to the cluster's centroid
min_distance_indices = Final_result_df.groupby('Cluster Labels')['Distance to Centroid'].idxmin()

# Pull those rows out and expose their former index as a regular column named 'TC'
min_distance_rows = (
    Final_result_df.loc[min_distance_indices]
    .reset_index()
    .rename(columns={'index': 'TC'})
)


In [None]:
kd = pd.DataFrame(min_distance_rows)
kd

In [None]:
kd['TC']

In [None]:
selected_rows = Non_Single_value_df.iloc[kd['TC']]

print(selected_rows)


In [None]:
selected_rows_pd = pd.DataFrame(selected_rows.reset_index(drop=True))
selected_rows_pd_Explicit_Minimum_to_centroids = pd.DataFrame(selected_rows)

In [None]:
selected_rows_pd_Explicit_Minimum_to_centroids

<b>Selected Rows</b>

In [None]:
selected_rows_pd

In [None]:
pca_model

In [None]:
# Row labels of the selected test cases. The frame keeps the original row index
# of Non_Single_value_df; its first *column* is a feature, not a row number, so
# it must not be used to address rows.
selected_rows_indices = selected_rows_pd_Explicit_Minimum_to_centroids.index.tolist()
selected_pdfs = Pdf.loc[selected_rows_indices]

In [None]:
selected_pdfs

In [None]:
original_data_df

In [None]:
import os

# Set this to the target .xlsx path before running the cell
excel_file_path = ''
# Ensure the directory exists, create it if necessary. A bare file name has an
# empty dirname, and os.makedirs('') raises FileNotFoundError, so only create
# a directory when the path actually contains one.
output_directory = os.path.dirname(excel_file_path)
if output_directory:
    os.makedirs(output_directory, exist_ok=True)

# Save the selected rows as a single-sheet Excel file
with pd.ExcelWriter(excel_file_path, engine='xlsxwriter') as writer:
    # Save the DataFrame to the first sheet (Sheet1)
    selected_rows_pd.to_excel(writer, sheet_name='Sheet1', index=False)

In [None]:
Final_result_df

<b>Plotting using First 3 columns in PCA Dataframe, Cluster label for each PCA and Distance to Centroid</b>

In [None]:
pd.Series(wave_cluster_labels).unique()

In [None]:
# Create a 3D scatter plot
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

# Distinct labels in ascending order. The centroid list is assumed to be in
# ascending label order as well (one centroid per non-empty cluster), so the
# two are paired by position below.
plot_labels = sorted(Final_result_df['Cluster Labels'].unique())

# Assign unique colors to clusters (pyplot.get_cmap; matplotlib.cm.get_cmap was removed in Matplotlib 3.9)
colors = plt.get_cmap('viridis', len(plot_labels))

# Define unique marker styles for each cluster (reused cyclically beyond 8 clusters)
marker_styles = ['o', 's', 'D', '^', 'v', 'p', '*', 'h']

# Scatter plot for each cluster
for position, cluster_label in enumerate(plot_labels):
    cluster_data = Final_result_df[Final_result_df['Cluster Labels'] == cluster_label]
    ax.scatter(cluster_data['PC1'], cluster_data['PC2'], cluster_data['PC3'], label=f'Cluster {cluster_label}', c=[colors(position)], marker=marker_styles[position % len(marker_styles)])

# Plot centroids, using the same color as the points of their cluster
for i, (cluster_label, centroid) in enumerate(zip(plot_labels, np.array(arr_list))):
    ax.scatter(centroid[0], centroid[1], centroid[2], marker='x', s=200, label=f'Centroid {cluster_label}', c=[colors(i)])

cluster_numbers = len(plot_labels)  # Number of clusters

# Set labels
ax.set_xlabel('PC1', labelpad=20)
ax.set_ylabel('PC2', labelpad=20)
ax.set_zlabel('PC3', labelpad=20)  # Adjust the labelpad to move the label away from the axis
ax.set_title('3D Scatter Plot of Clusters and Centroids')
# Move legend to top left and make it smaller
ax.legend(loc='upper left', bbox_to_anchor=(0, 1), prop={'size': 8})

# Save the figure only when an output path has been configured
path_to_image = ''
if path_to_image:
    plt.savefig(path_to_image)

# Show the plot
plt.show()

In [None]:
# Create a 1D scatter plot
fig, ax = plt.subplots(figsize=(10, 8))

centroid_array = np.array(arr_list)
pc1_values = pca_result_df['PC1'].values

# Distance of each data point from each centroid along PC1 only. Subtracting a
# full centroid from the (n, 1) PC1 column would broadcast to all centroid
# coordinates. The loop variable is not called `distance`, to avoid overwriting
# the scipy.spatial.distance import.
distances = []
for centroid in centroid_array:
    pc1_distance = np.abs(pc1_values - centroid[0])
    distances.append(pc1_distance)

# Assign colors based on the closest centroid
colors = np.argmin(distances, axis=0)

# Scatter plot with colored data points
scatter = ax.scatter(pca_result_df['PC1'], np.zeros_like(pca_result_df['PC1']), c=colors, cmap='viridis')

# Plot centroids (single labelled artist so the legend has an entry)
ax.scatter(centroid_array[:, 0], np.zeros(len(centroid_array)), marker='x', s=100, color='black', label='Centroids')

# Set labels
ax.set_xlabel('PC1')
ax.set_title('1D Scatter Plot of PC1 with Colored Data Points')
ax.legend()

# Add colorbar
cbar = plt.colorbar(scatter, ax=ax)
cbar.set_label('Cluster')

# Save the plot as an image only when an output path has been configured
Path_to_Image = ''
if Path_to_Image:
    plt.savefig(Path_to_Image)

# Show the plot
plt.show()

<b> Save to Excel</b>

In [None]:
import os

# Set this to the target .xlsx path before running the cell
excel_file_path = ''
# Ensure the directory exists, create it if necessary. A bare file name has an
# empty dirname, and os.makedirs('') raises FileNotFoundError, so only create
# a directory when the path actually contains one.
output_directory = os.path.dirname(excel_file_path)
if output_directory:
    os.makedirs(output_directory, exist_ok=True)

# Save the Pandas DataFrames as one Excel file with four sheets
with pd.ExcelWriter(excel_file_path, engine='xlsxwriter') as writer:
    # Sheet 1: the preprocessed data that went into PCA
    Non_Single_value_df.to_excel(writer, sheet_name='Original_Data', index=False)

    # Sheet 2: the data reconstructed from the PCA scores
    original_data_df.to_excel(writer, sheet_name='Original_data_back_from_PCA', index=False, startrow=0)

    # Sheet 3: the rows closest to each cluster centroid
    selected_rows_pd_Explicit_Minimum_to_centroids.to_excel(writer, sheet_name='TCs_With_min_Dist_to_Centroids', index=False, startrow=0)

    # Sheet 4: the centroids reconstructed from PCA space
    original_centroids_df.to_excel(writer, sheet_name='Centroids_back_from_PCA', index=False, startrow=0)

In [None]:
Non_Single_value_df.columns

In [None]:
Non_Single_value_df

In [None]:
original_data_df

In [None]:
Original_selected_rows_pd_Explicit_Minimum_to_centroids = selected_rows_pd_Explicit_Minimum_to_centroids.copy()

In [None]:
selected_rows_pd_Explicit_Minimum_to_centroids

In [None]:
original_centroids_df

In [None]:
# Map the row index of every selected test case to its TC_ID value.
# Only indices present in the selected-rows frame are kept; the dict is later
# used to rename the columns of the transposed selection.
new_TC_ID_columns_dict = {i: TC_ID_df.at[i, 'TC_ID'] for i in TC_ID_df.index if i in selected_rows_pd_Explicit_Minimum_to_centroids.index}

# Display the dictionary
new_TC_ID_columns_dict

In [None]:
Trpose = Original_selected_rows_pd_Explicit_Minimum_to_centroids.transpose().copy()
Trpose

In [None]:
# Rename columns
Trpose = Trpose.rename(columns=new_TC_ID_columns_dict)
Trpose