In [None]:
import math
import pandas as pd
from tkinter import Tk, filedialog
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from sklearn.cluster import KMeans
from pyclustering.cluster.kmedoids import kmedoids #PAM
from sklearn.cluster import DBSCAN
from sklearn.neighbors import KernelDensity
from sklearn.neighbors import NearestNeighbors

#from your_wavecluster_library import WaveCluster  # Replace with the actual import
import numpy as np
import pywt
import networkx as nx
from scipy.cluster.hierarchy import linkage, fcluster # For assign_labels()


from sklearn.metrics.pairwise import euclidean_distances # for CURE
from sklearn.preprocessing import StandardScaler
#from cure import cure  # You may need to install a library that implements CURE algorithm

from sklearn.neighbors import kneighbors_graph
from sklearn.cluster import AgglomerativeClustering

#Cluster Evaluation
from sklearn.metrics import silhouette_score
from sklearn.utils import resample
from sklearn.model_selection import cross_val_score
from sklearn.utils import check_random_state

#RS
import random
from numpy import genfromtxt
import copy
import timeit
from scipy.spatial import ConvexHull, distance
import collections

#Feature Selection
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_selection import SelectKBest, f_classif
#from sklearn.feature_selection import 

from sklearn_extra.cluster import KMedoids
from sklearn.metrics.pairwise import manhattan_distances

#Filter Method
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cdist

#mRMR
#from skfeature.function.information_theoretical_based import MRMR
#from pymrmr import mRMR
from sklearn.feature_selection import mutual_info_classif

#S_Dbw
from sklearn.metrics import silhouette_score, davies_bouldin_score

In [None]:
def Read_DataFrame(file_path):
    """
    Load an Excel workbook into a pandas DataFrame.

    Parameters:
    file_path (str): Location of the Excel file to load.

    Returns:
    pandas.DataFrame: The parsed sheet, or None when reading fails
    (the error is printed instead of being raised).
    """
    try:
        return pd.read_excel(file_path)
    except Exception as e:
        # Best-effort loader: report the problem and fall through to None
        # so the caller can branch on the result.
        print("Error:", e)
    return None

In [None]:
def choose_excel_file():
    """
    Open a file dialog to choose an Excel file.

    A hidden Tk root window is created only to host the dialog and is
    destroyed again before returning, so repeated calls do not leave
    orphaned Tk interpreters behind.

    Returns:
    - str: Path to the selected Excel file (empty if the dialog is cancelled).
    """
    root = Tk()
    root.withdraw()  # Hide the main window

    try:
        file_path = filedialog.askopenfilename(
            title="Select Excel file",
            # Tk expects a whitespace-separated list of patterns; a ';'-joined
            # string is a single pattern that only the Windows native dialog
            # happens to split.
            filetypes=[("Excel files", "*.xlsx *.xls")],
        )
    finally:
        # Always release the hidden root, even if the dialog raises.
        root.destroy()

    return file_path

In [None]:
# Ask the user for a workbook, then try to load it.
file_path = choose_excel_file()
dataframe = Read_DataFrame(file_path)

# Read_DataFrame signals failure by returning None instead of raising.
if dataframe is None:
    print("Failed to create DataFrame.")
else:
    print("DataFrame created successfully.")
    print(dataframe.head())  # Preview the first few rows

In [None]:
# Keep a full snapshot of the loaded data (still including TC_ID) ...
TC_ID_df = dataframe.copy()

# ... and continue with a working frame that no longer has the TC_ID column.
dataframe = dataframe.drop(['TC_ID'], axis=1)

In [None]:
def preprocess_data(dataframe):
    """
    Preprocess the DataFrame by label-encoding its text-like columns.

    Every column whose dtype is object or a pandas string dtype is replaced
    by int64 codes produced by LabelEncoder. All other columns are passed
    through unchanged. The input frame is not modified; the encoding is
    applied to a copy.

    Parameters:
    - dataframe (pandas.DataFrame): Input DataFrame.

    Returns:
    - pandas.DataFrame: New DataFrame with the text-like columns encoded
      as int64.
    """
    # Work on a copy so the caller's frame (and any cell that displayed it)
    # is not silently rewritten.
    encoded = dataframe.copy()

    for column in encoded.columns:
        dtype = encoded[column].dtype
        # Plain `dtype == 'object'` misses pandas string dtypes, which would
        # then reach the numeric steps un-encoded.
        if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype):
            # fit_transform refits from scratch, so a fresh encoder per
            # column is equivalent to reusing one and keeps columns independent.
            encoded[column] = LabelEncoder().fit_transform(encoded[column]).astype('int64')

    return encoded

In [None]:
def fill_na_with_mean(dataframe):
    """
    Impute missing entries with the per-column mean and cast to int64.

    Parameters:
    - dataframe: pandas DataFrame

    Returns:
    - New DataFrame in which NaN values are replaced by their column mean
      and every column is converted to int64.

    NOTE(review): the int64 cast drops the fractional part of float values
    (including the imputed means) - confirm this is intended.
    """
    column_means = dataframe.mean()
    imputed = dataframe.fillna(value=column_means)
    return imputed.astype('int64')

In [None]:
# Encode the categorical columns first, then impute what is still missing.
dataframe = fill_na_with_mean(preprocess_data(dataframe))

In [None]:
def remove_single_value_columns(df):
    """
    Drop every column that holds exactly one distinct value.

    Parameters:
    - df: pandas DataFrame

    Returns:
    - New DataFrame without the constant columns (the input is left as is)
    """
    # nunique() yields one count per column; keep the labels whose count is 1
    distinct_counts = df.nunique()
    constant_columns = distinct_counts[distinct_counts == 1].index

    return df.drop(columns=constant_columns)

In [None]:
# Drop the columns that hold only one unique value across all rows.
dataframe = remove_single_value_columns(dataframe)

In [None]:
# Exclude 'Time.WC' from the working frame. errors='ignore' keeps this cell
# from raising KeyError when the column is already gone - e.g. it was removed
# by remove_single_value_columns() because it was constant, or this cell is
# simply re-run.
dataframe = dataframe.drop(columns=['Time.WC'], errors='ignore')

In [None]:
# Apply PCA: fit a 3-component model on the current frame and replace
# `dataframe` with the transformed data.
# NOTE: fit_transform returns an array (a NumPy array by default), so
# `dataframe` is no longer a DataFrame until a later cell wraps it again.
# Re-running this cell on its own refits PCA on whatever `dataframe` holds then.
pca = PCA(n_components=3)
dataframe = pca.fit_transform(dataframe)

In [None]:
# Get eigenvalues (variance explained by each fitted component) and the order
# that ranks them from largest to smallest. A stable sort on the negated
# values keeps tied components in their original order, so the ranking is the
# identity for scikit-learn's already-descending output.
eigenvalues = pca.explained_variance_
print(eigenvalues)
indices = np.argsort(-eigenvalues, kind='stable')

# One label per fitted component (derived from the model, not hard-coded).
pca_columns = [f'PC{i + 1}' for i in range(len(eigenvalues))]

# Reorder the score columns themselves by eigenvalue rank and label them in
# that order, so 'PC1' always names the highest-variance component. Permuting
# only the labels would attach the wrong name to a column whenever the
# ranking is not the identity.
ordered_pca_columns = list(pca_columns)
dataframe = pd.DataFrame(data=np.asarray(dataframe)[:, indices], columns=ordered_pca_columns)

In [None]:
# Show the current contents of `dataframe` as the cell output.
dataframe

In [None]:
# Summary statistics: describe() output for the columns of `dataframe`,
# kept in `summary_stats` and shown as the cell output.
summary_stats = dataframe.describe()
summary_stats

In [None]:
# `plt` (matplotlib.pyplot) and `sns` (seaborn) come from the import cell at
# the top of the notebook; they are not re-imported here.

# Histograms: one panel per column, 10 bins each
dataframe.hist(bins=10, figsize=(10, 5))
plt.suptitle('Histograms of Signals')
plt.show()

# Box plots: one subplot per column, laid out in a single row
dataframe.plot(kind='box', subplots=True, layout=(1, len(dataframe.columns)), figsize=(10, 5), title='Box Plots of Signals')
plt.show()

In [None]:
# Pairwise correlation between the columns (DataFrame.corr defaults)
correlation_matrix = dataframe.corr()
print(correlation_matrix)

# Draw the matrix as an annotated heatmap and title the Axes it was drawn on
heatmap_ax = sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
heatmap_ax.set_title('Correlation Matrix')
plt.show()

In [None]:
from scipy.stats import shapiro

# Shapiro-Wilk normality test, one column at a time.
# H0 = "the column is normally distributed"; it is rejected unless p > 0.05.
for column in dataframe.columns:
    stat, p = shapiro(dataframe[column])
    if not (p > 0.05):
        print(f'{column} does not look Gaussian (reject H0)')
    else:
        print(f'{column} looks Gaussian (fail to reject H0)')

In [None]:
from scipy.stats import zscore

# Z-score rule: a row is an outlier if any column lies more than
# 3 standard deviations from that column's mean.
z_scores = zscore(dataframe)
abs_z_scores = abs(z_scores)
beyond_three_sigma = abs_z_scores > 3
outliers = beyond_three_sigma.any(axis=1)
print(f'Outliers detected: {dataframe[outliers]}')

# IQR rule: a row is an outlier if any column falls outside
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] for that column.
Q1, Q3 = dataframe.quantile(0.25), dataframe.quantile(0.75)
IQR = Q3 - Q1
below_lower_fence = dataframe.lt(Q1 - 1.5 * IQR)
above_upper_fence = dataframe.gt(Q3 + 1.5 * IQR)
outliers_iqr = (below_lower_fence | above_upper_fence).any(axis=1)
print(f'Outliers detected using IQR: {dataframe[outliers_iqr]}')

In [None]:
# Show the rows selected by the `outliers_iqr` boolean mask as the cell output.
dataframe[outliers_iqr]

In [None]:
# Count the missing entries in each column
missing_values = dataframe.isna().sum()
print(missing_values)

# Example imputation: replace missing entries with the column mean.
# The result goes to `df_filled`; `dataframe` itself is left unchanged.
column_means = dataframe.mean()
df_filled = dataframe.fillna(column_means)
