This Python script asks the user, via a GUI file dialog, to select a .csv file, reads the data, and then plots and prints the processed (clustered) results.

In [1]:
import pandas as pd
import numpy as np
import networkx as nx
from collections import defaultdict
import matplotlib.pyplot as plt
from matplotlib.figure import Figure
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
from tkinter import Tk, filedialog, Button, Text, Scrollbar, Frame
import matplotlib.cm as cm
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster import hierarchy

def _set_text(widget, content):
    """Replace the whole content of a read-only Text widget with *content*."""
    # The widgets are kept disabled between updates, so enable them briefly.
    widget.config(state='normal')
    widget.delete(1.0, 'end')
    widget.insert('end', content)
    widget.config(state='disabled')


def process_csv():
    """Load a CSV chosen by the user, cluster its timegaps and refresh the GUI.

    Button callback. Reads the module-level widgets ``right_frame``,
    ``item_list_text``, ``timegap_text``, ``dataset_info_text`` and
    ``clustered_data_text``. Does nothing when the file dialog is cancelled,
    and reports (instead of crashing) when no cluster count can be chosen.
    """
    file_path = filedialog.askopenfilename(filetypes=[("CSV files", "*.csv")])
    if not file_path:
        # The dialog yields an empty value on cancel; read_csv would fail on it.
        return

    df = pd.read_csv(file_path, header=None)
    items_in_csv = df.iloc[:, 0].unique()
    timegap_dict, total_shoppers = calculate_timegap(df)

    item_list = sorted({item for pair in timegap_dict for item in pair})
    total_unique_items = len(item_list)

    # Alternative selector: k_optimal = scree_plot(timegap_dict)
    k_optimal = silhoutte_score(timegap_dict)
    if k_optimal is None:
        # No k could be evaluated; clustering below would fail on None.
        _set_text(
            dataset_info_text,
            f"Total Unique Items: {total_unique_items}\n\n"
            f"Total Shoppers: {total_shoppers}\n\n"
            "Not enough item pairs to determine clusters.\n",
        )
        return

    cluster_labels, linkage_matrix = perform_hierarchical_clustering(timegap_dict, num_clusters=k_optimal)
    plot_dendrogram(linkage_matrix)

    # Drop the plots of a previous run before embedding the new ones.
    for widget in right_frame.winfo_children():
        widget.destroy()

    items_graph(timegap_dict, items_in_csv)
    clustered_graph_frame = Frame(right_frame)
    clustered_graph_frame.pack(side='left', padx=10, pady=10, fill='both', expand=True)
    clustered_items_graph(timegap_dict, items_in_csv, cluster_labels, clustered_graph_frame)

    # NOTE(review): cluster_labels appears to hold one label per timegap value
    # (item pair), not per item. Indexing it by item position raises IndexError
    # when there are more items than pairs -- confirm the intended mapping.
    item_cluster_dict = {
        item: cluster_labels[i] + 1  # cluster numbers are shown starting from 1
        for i, item in enumerate(item_list)
    }

    cluster_counts = {cluster: 0 for cluster in range(1, k_optimal + 1)}
    for cluster in item_cluster_dict.values():
        cluster_counts[cluster] += 1

    # --- item list panel ---
    item_lines = ["List of Unique Items:\n"]
    for i, item in enumerate(item_list, start=1):
        item_lines.append(f"{i}. {item} - Cluster: {item_cluster_dict[item]}\n")
    _set_text(item_list_text, "".join(item_lines))

    # --- timegap panel ---
    timegap_lines = ["Normalized Timegaps:\n"]
    for i, (pair, timegaps) in enumerate(timegap_dict.items(), start=1):
        avg = sum(timegaps) / len(timegaps)
        item_x, item_x_plus_1 = pair
        timegap_lines.append(f"Pair {i}: {avg:.2f} (Items: {item_x} -> {item_x_plus_1})\n")
    _set_text(timegap_text, "".join(timegap_lines))

    # --- dataset summary panel ---
    info_lines = [
        f"Total Unique Items: {total_unique_items}\n\n",
        f"Total Shoppers: {total_shoppers}\n\n",
    ]
    for cluster, count in cluster_counts.items():
        info_lines.append(f"Cluster {cluster} - Items: {count}\n")
    _set_text(dataset_info_text, "".join(info_lines))

    # --- items grouped by cluster ---
    # sorted() makes the cluster order deterministic; a bare set has no order.
    clustered_lines = ["Clustered Items:\n"]
    for cluster in sorted(set(item_cluster_dict.values())):
        clustered_lines.append(f"Cluster {cluster}:\n")
        members = [item for item in item_list if item_cluster_dict[item] == cluster]
        for i, item in enumerate(members, start=1):
            clustered_lines.append(f"{i}. {item} - Cluster: {item_cluster_dict[item]}\n")
        clustered_lines.append("\n")
    _set_text(clustered_data_text, "".join(clustered_lines))

def calculate_timegap(df, threshold=0.5):
    """Compute a weighted average time gap for every pair of items.

    Rows are read positionally: column 0 is the item, column 1 its value
    (compared with 0 and converted with ``float``) and column 2 a status.
    Rows whose status is neither numeric nor ``'Good'`` are ignored. A row
    whose value equals 0 starts a new shopper session; every other accepted
    row is appended to the session in progress.

    Args:
        df: DataFrame with at least three columns, as described above.
        threshold: Minimum ratio of a pair's average weighted distance to its
            largest weight for the pair to be kept.

    Returns:
        ``(timegap_dict, total_shoppers)``. ``timegap_dict`` maps an
        ``(item1, item2)`` tuple (names sorted, pairs in sorted key order) to
        a one-element list holding the average weighted distance.
        ``total_shoppers`` is the number of session-starting rows. An empty
        frame yields an empty mapping and 0.
    """
    # Sessions are kept in a list, not keyed by their first item: keyed
    # storage silently overwrote an earlier shopper whenever two sessions
    # started with the same item, although both shoppers were counted.
    sessions = []
    current_session = None
    total_shoppers = 0

    for row in range(len(df)):
        item_x = df.iloc[row, 0]
        value_x = df.iloc[row, 1]
        status_x = df.iloc[row, 2]

        accepted = (
            isinstance(status_x, (int, float))
            or str(status_x).isdigit()
            or status_x == 'Good'
        )
        if not accepted:
            continue

        if value_x == 0:
            total_shoppers += 1
            current_session = [(item_x, value_x)]
            sessions.append(current_session)
        else:
            if current_session is None:
                # Rows seen before any session start form their own group.
                current_session = []
                sessions.append(current_session)
            current_session.append((item_x, value_x))

    # The pairwise pass runs once, after all rows are read. It used to be
    # nested in the row loop and was recomputed from scratch for every row.
    distances = defaultdict(list)
    for visits in sessions:
        for i, (item1_name, item1_data) in enumerate(visits):
            for j in range(i + 1, len(visits)):
                item2_name, item2_data = visits[j]
                distances_key = "<>".join(sorted([item1_name, item2_name]))

                # Items further apart within the session contribute less.
                weight = 1 / (j - i + 1)
                distance = abs(float(item1_data) - float(item2_data)) * weight
                distances[distances_key].append((distance, i, j))

    pruned_distances = {}
    for key, entries in distances.items():
        weighted_distances = [distance for distance, _, _ in entries]
        max_weight = max(1 / (j - i + 1) for _, i, j in entries)
        average_distance = sum(weighted_distances) / len(weighted_distances)

        if average_distance / max_weight >= threshold:
            pruned_distances[key] = average_distance

    timegap_dict = defaultdict(list)
    for key in sorted(pruned_distances):
        item1, item2 = key.split('<>')
        timegap_dict[(item1, item2)] = [pruned_distances[key]]

    return timegap_dict, total_shoppers

def silhoutte_score(timegap_dict):
    """Choose the cluster count with the highest silhouette score.

    The timegap values are clustered hierarchically once; the resulting tree
    is then cut at increasing ``k`` and each cut is scored with
    ``silhouette_score``. The search stops after five evaluated ``k`` values
    in a row that scored below the best seen so far, or at the upper bound.
    A plot of the scores is shown when at least one ``k`` was evaluated.

    Args:
        timegap_dict: Mapping of item pair -> list of timegap values.

    Returns:
        The best ``k``, or ``None`` when there are too few values (fewer than
        three) for any ``k`` to be scored.
    """
    values = [value for timegaps in timegap_dict.values() for value in timegaps]
    samples = np.array(values).reshape(-1, 1)

    # silhouette_score only accepts 2 <= k <= n_samples - 1, so the bound must
    # stay below the sample count (k == n_samples raises ValueError).
    max_clusters = min(len(values) - 1, 100)
    if max_clusters < 2:
        return None

    # The search normally starts at 20; smaller datasets start at 2 so that
    # they still get a result instead of None.
    k_start = 20 if max_clusters >= 20 else 2
    patience = 5
    remaining = patience

    best_score = float('-inf')
    k_optimal = None
    silhouette_scores = []
    k_values = []

    # The linkage does not depend on k, so build it once outside the loop.
    _, linkage_matrix = perform_hierarchical_clustering(timegap_dict, k_start)

    for k in range(k_start, max_clusters + 1):
        cluster_labels = assign_clusters(linkage_matrix, k)
        silhouette_avg = silhouette_score(samples, cluster_labels)
        silhouette_scores.append(silhouette_avg)
        k_values.append(k)

        if silhouette_avg > best_score:
            best_score = silhouette_avg
            k_optimal = k
            remaining = patience
        elif silhouette_avg < best_score:
            # Ties with the best score neither reset nor consume patience.
            remaining -= 1
            if remaining == 0:
                break

    if k_optimal is not None:
        plt.figure(figsize=(8, 6))
        plt.plot(k_values, silhouette_scores, marker='o')
        plt.xlabel('Number of Clusters')
        plt.ylabel('Silhouette Score')
        plt.title('Silhouette Scores for Different Numbers of Clusters')
        plt.grid()
        plt.show()

    return k_optimal

def scree_plot(timegap_dict):
    """Pick a cluster count from the elbow of the merge-distance curve.

    The timegap values are clustered once with Ward linkage. For each
    candidate ``k`` (2 up to 29, limited by the number of values) the curve
    holds the distance of the merge that would reduce ``k`` clusters to
    ``k - 1``. The chosen ``k`` is where the second difference of that curve
    is largest. The curve and the chosen point are plotted.

    Args:
        timegap_dict: Mapping of item pair -> list of timegap values.

    Returns:
        The selected number of clusters as an ``int`` (2 when fewer than
        three candidates exist).

    Raises:
        ValueError: If fewer than two timegap values are available.
    """
    timestamps = [value for timegaps in timegap_dict.values() for value in timegaps]
    distances = np.array(timestamps).reshape(-1, 1)
    n_samples = len(distances)
    if n_samples < 2:
        raise ValueError("scree_plot needs at least two timegap values")

    # A single fit is enough: the full merge tree holds every merge distance.
    # (Previously the model was refitted per k and the whole distances_ array
    # was stored each time, so every plotted point was the same mean.)
    hierarchical = AgglomerativeClustering(
        n_clusters=2,
        linkage='ward',
        compute_full_tree=True,
        compute_distances=True,
    ).fit(distances)
    merge_distances = hierarchical.distances_

    k_values = np.arange(2, min(30, n_samples + 1))
    # The merge taking k clusters down to k - 1 is the (k - 1)-th from the end.
    y_values = [merge_distances[-(k - 1)] for k in k_values]

    if len(y_values) >= 3:
        curvature = np.diff(y_values, 2)
        # curvature[i] spans k = i + 2 .. i + 4; the formula selects k = i + 2.
        k_optimal = int(np.argmax(curvature)) + 2
    else:
        # Too few points for a second difference; fall back to the minimum.
        k_optimal = 2

    plt.figure(figsize=(8, 6))
    plt.title('Scree Plot')
    plt.xlabel('No. of Clusters (k)')
    plt.ylabel('Distance between clusters')
    plt.plot(k_values, y_values, marker='o')
    plt.scatter(k_optimal, y_values[k_optimal - 2], c='red', label=f'Optimal k ({k_optimal})', marker='x')
    plt.legend()
    plt.show()

    return k_optimal


def perform_hierarchical_clustering(timegap_dict, num_clusters):
    """Cluster the timegap values with Ward-linkage hierarchical clustering.

    All timegap values in ``timegap_dict`` are collected into a single
    one-column array, in dictionary order, and clustered.

    Args:
        timegap_dict: Mapping of item pair -> list of timegap values.
        num_clusters: Number of clusters for ``AgglomerativeClustering``.

    Returns:
        ``(cluster_labels, linkage_matrix)``: one label per timegap value, and
        the SciPy Ward linkage matrix built from the same array.
    """
    # The former item-index bookkeeping was never used for the result, and it
    # raised IndexError whenever there were more distinct items than pairs,
    # because it indexed the per-value labels with per-item indices.
    timestamps = []
    for timegaps in timegap_dict.values():
        timestamps.extend(timegaps)

    distances = np.array(timestamps).reshape(-1, 1)

    hierarchical = AgglomerativeClustering(n_clusters=num_clusters, linkage='ward')
    cluster_labels = hierarchical.fit_predict(distances)
    linkage_matrix = hierarchy.linkage(distances, 'ward')

    return cluster_labels, linkage_matrix

def assign_clusters(linkage_matrix, num_clusters):
    """Cut the linkage tree into ``num_clusters`` groups.

    Returns the cluster assignment from ``hierarchy.cut_tree`` flattened to a
    one-dimensional array.
    """
    return hierarchy.cut_tree(linkage_matrix, n_clusters=num_clusters).ravel()

def plot_dendrogram(linkage_matrix):
    """Draw the dendrogram for ``linkage_matrix`` and show it with pyplot."""
    hierarchy.dendrogram(linkage_matrix)

    # Label the axes the dendrogram was drawn on (pyplot's current axes).
    axes = plt.gca()
    axes.set_xlabel('Data points')
    axes.set_ylabel('Distance')
    axes.set_title('Hierarchical Clustering Dendrogram')

    plt.show()

def items_graph(timegap_dict, items_in_csv):
    """Draw the item graph and embed it at the top of ``right_frame``.

    Every pair in ``timegap_dict`` with a non-zero average timegap becomes an
    edge weighted by that average. Items left without any edge are removed
    before drawing.

    Args:
        timegap_dict: Mapping of ``(item, item)`` pair -> list of timegaps.
        items_in_csv: Iterable of item names used as the initial node set.
    """
    G = nx.Graph()
    G.add_nodes_from(items_in_csv)

    for (item_x, item_x_plus_1), timegaps in timegap_dict.items():
        avg = sum(timegaps) / len(timegaps)

        if avg != 0 or G.has_edge(item_x, item_x_plus_1):
            G.add_edge(item_x, item_x_plus_1, weight=avg)

    isolated_nodes = [node for node in G.nodes() if G.degree[node] == 0]
    G.remove_nodes_from(isolated_nodes)

    # Fixed seed keeps the layout identical between runs.
    pos = nx.spring_layout(G, k=1.5, iterations=500, seed=42)

    # Use a standalone Figure for the embedded canvas. A pyplot-managed figure
    # stays registered after its widget is destroyed, so it accumulates on
    # every run and reappears on later plt.show() calls.
    fig = Figure()
    ax = fig.add_subplot(111)

    nx.draw(G, pos, with_labels=False, node_color='red', node_size=6, font_size=1, width=0.1, alpha=1, ax=ax)
    canvas = FigureCanvasTkAgg(fig, master=right_frame)
    canvas.draw()
    canvas.get_tk_widget().pack(side='top', fill='both', expand=1)

def clustered_items_graph(timegap_dict, items_in_csv, cluster_labels, frame):
    """Draw the item graph coloured by cluster and embed it in ``frame``.

    The graph is built as in ``items_graph``: pairs with a non-zero average
    timegap become edges, and items without edges are dropped.

    Args:
        timegap_dict: Mapping of ``(item, item)`` pair -> list of timegaps.
        items_in_csv: Iterable of item names used as the initial node set.
        cluster_labels: Sequence of integer cluster labels.
        frame: Tk container that receives the canvas.
    """
    G = nx.Graph()
    G.add_nodes_from(items_in_csv)

    for (item_x, item_x_plus_1), timegaps in timegap_dict.items():
        avg = sum(timegaps) / len(timegaps)

        if avg != 0 or G.has_edge(item_x, item_x_plus_1):
            G.add_edge(item_x, item_x_plus_1, weight=avg)

    isolated_nodes = [node for node in G.nodes() if G.degree[node] == 0]
    G.remove_nodes_from(isolated_nodes)

    # Fixed seed keeps the layout identical between runs.
    pos = nx.spring_layout(G, k=1.5, iterations=500, seed=42)

    # Standalone Figure: a pyplot-managed one would stay registered after the
    # widget is destroyed and pile up across runs.
    fig = Figure()
    ax = fig.add_subplot(111)

    # Pair labels with the sorted items of timegap_dict, the same mapping
    # process_csv uses for its text panels, so colours and report agree.
    # NOTE(review): a node lacking a label (fewer labels than items) still
    # raises KeyError -- confirm how labels are meant to map onto items.
    item_list = sorted({item for pair in timegap_dict for item in pair})
    node_to_cluster = dict(zip(item_list, cluster_labels))

    # With a single cluster the largest label is 0; avoid dividing by zero.
    max_label = max(cluster_labels, default=0)
    scale = max_label if max_label > 0 else 1

    colormap = cm.rainbow
    node_colors = [colormap(node_to_cluster[node] / scale) for node in G.nodes()]

    nx.draw(G, pos, with_labels=False, node_color=node_colors, node_size=6, font_size=1, width=0.1, alpha=1, ax=ax)
    canvas = FigureCanvasTkAgg(fig, master=frame)
    canvas.draw()
    canvas.get_tk_widget().pack(side='top', fill='both', expand=1)


# Create GUI
root = Tk()
root.title("Graph Visualization")

# Main layout: text panels in the left column, plots in the right column.
main_frame = Frame(root)
main_frame.pack(fill='both', expand=True)

left_frame = Frame(main_frame)
left_frame.pack(side='left', padx=10, pady=10)


def _make_text_panel(parent, heading):
    """Build a read-only Text widget with a visible vertical scrollbar.

    Text and scrollbar share their own frame, stacked from the top of
    *parent*, so the scrollbar can sit to the right of its text. The text
    starts out containing *heading* and is left disabled.

    Returns:
        ``(text_widget, scrollbar)``.
    """
    panel = Frame(parent)
    panel.pack(side='top', fill='both', expand=True)

    text = Text(panel, height=15, width=80)
    scrollbar = Scrollbar(panel, command=text.yview)
    text['yscrollcommand'] = scrollbar.set

    # The scrollbars used to be created and wired up but never packed, so
    # they were invisible. Pack the bar first so it keeps its width.
    scrollbar.pack(side='right', fill='y')
    text.pack(side='left', fill='both', expand=True)

    text.insert('end', heading)
    text.config(state='disabled')
    return text, scrollbar


# Text panels of the left column, top to bottom.
item_list_text, item_list_scrollbar = _make_text_panel(left_frame, "List of Items:\n")
timegap_text, timegap_scrollbar = _make_text_panel(left_frame, "Normalized Timegaps:\n")
dataset_info_text, dataset_info_scrollbar = _make_text_panel(left_frame, "Dataset Information:\n")
clustered_data_text, clustered_data_scrollbar = _make_text_panel(left_frame, "Clustered Data:\n")

# Button that runs the whole pipeline; placed below the text panels.
process_button = Button(left_frame, text="Process CSV and Visualize Graph", command=process_csv)
# Close every pyplot figure that is still open before the GUI starts.
plt.close('all')
process_button.pack(side='top', padx=10, pady=10)

# Right column: process_csv embeds the graph canvases here.
right_frame = Frame(main_frame)
right_frame.pack(side='right', padx=10, pady=10, fill='both', expand=True)

root.mainloop()


ModuleNotFoundError: No module named 'customtkinter'