In [7]:
import falkordb
import numpy as np
import pandas as pd

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.decomposition import PCA

import matplotlib.cm as cm
import matplotlib.pyplot as plt
import plotly.express as px

In [8]:
# silhouette score functions to optimize cluster nums

def sil_scores(X, n_cluster_start, n_cluster_end):
    """Score KMeans clusterings of ``X`` over a range of cluster counts.

    For every ``k`` from ``n_cluster_start`` to ``n_cluster_end`` (both
    inclusive) a KMeans model is fitted with a fixed seed and the mean
    silhouette score of the resulting labels is recorded.

    Args:
        X: Feature matrix passed to ``KMeans`` and ``silhouette_score``.
        n_cluster_start: Smallest number of clusters to try (inclusive).
        n_cluster_end: Largest number of clusters to try (inclusive).

    Returns:
        A list of ``(n_clusters, silhouette_score)`` tuples in increasing
        ``n_clusters`` order; empty when ``n_cluster_start > n_cluster_end``.

    Raises:
        ValueError: If the range is non-empty and starts below 2. The
            silhouette score needs at least two clusters, so fail before
            fitting anything instead of after a wasted KMeans run.
    """
    if n_cluster_start <= n_cluster_end and n_cluster_start < 2:
        raise ValueError(
            f"n_cluster_start must be >= 2 for silhouette scoring, got {n_cluster_start}"
        )

    scores = []
    for n_clusters in range(n_cluster_start, n_cluster_end + 1):
        # Fixed random_state keeps the partition (and the score) repeatable.
        cluster_labels = KMeans(n_clusters=n_clusters, random_state=42).fit_predict(X)
        scores.append((n_clusters, silhouette_score(X, cluster_labels)))
    return scores

def sil_scores_hist(X, n_cluster_start, n_cluster_end):
    """Show a bar chart of the mean silhouette score per cluster count.

    Args:
        X: Feature matrix forwarded to ``sil_scores``.
        n_cluster_start: Smallest number of clusters to try (inclusive).
        n_cluster_end: Largest number of clusters to try (inclusive).

    Returns:
        None. The chart is drawn on the current matplotlib figure and shown.
    """
    results = sil_scores(X, n_cluster_start, n_cluster_end)
    n_clusters = [k for k, _ in results]
    scores = [score for _, score in results]

    plt.bar(n_clusters, scores)
    # Cluster counts are integers; pin the ticks so no fractional values appear.
    plt.xticks(n_clusters)
    plt.xlabel("Number of clusters")
    plt.ylabel("Mean silhouette score")
    plt.title("KMeans silhouette score by cluster count")
    plt.grid(True, alpha=0.3)
    plt.show()

In [9]:
def sil_scores_graph(X, n_cluster_start, n_cluster_end):
    """Draw one silhouette plot per cluster count and print the mean score.

    For every ``n`` from ``n_cluster_start`` to ``n_cluster_end`` (both
    inclusive) a seeded KMeans model is fitted, ``n`` and the mean silhouette
    score are printed, and a figure is shown with one horizontal band of
    sorted per-sample silhouette values per cluster. A dashed red line marks
    the mean score.

    Args:
        X: Feature matrix passed to ``KMeans`` and the silhouette functions;
            must support ``len``.
        n_cluster_start: Smallest number of clusters to plot (inclusive).
        n_cluster_end: Largest number of clusters to plot (inclusive).

    Returns:
        None.

    Raises:
        ValueError: If the range is non-empty and starts below 2. The
            silhouette score needs at least two clusters, so fail before
            creating any figure.
    """
    if n_cluster_start <= n_cluster_end and n_cluster_start < 2:
        raise ValueError(
            f"n_cluster_start must be >= 2 for silhouette scoring, got {n_cluster_start}"
        )

    band_gap = 10  # blank rows separating consecutive cluster bands

    for n in range(n_cluster_start, n_cluster_end + 1):
        fig, ax1 = plt.subplots(1, 1)
        ax1.set_xlim([-0.1, 1])
        # Enough vertical room for every sample plus the gaps around each band.
        ax1.set_ylim([0, len(X) + (n + 1) * band_gap])

        cluster_labels = KMeans(n_clusters=n, random_state=42).fit_predict(X)

        # average silhouette score over all samples
        silhouette_avg = silhouette_score(X, cluster_labels)
        print(n, silhouette_avg)

        # silhouette score of each individual sample
        sample_silhouette_values = silhouette_samples(X, cluster_labels)

        y_lower = band_gap
        for i in range(n):
            # Sorted scores of the samples assigned to cluster i.
            cluster_values = np.sort(sample_silhouette_values[cluster_labels == i])
            cluster_size = cluster_values.shape[0]
            y_upper = y_lower + cluster_size

            color = cm.nipy_spectral(float(i) / n)
            ax1.fill_betweenx(
                np.arange(y_lower, y_upper),
                0,
                cluster_values,
                facecolor=color,
                edgecolor=color,
                alpha=0.7,
            )

            # Label the band with its cluster index at mid-height.
            ax1.text(-0.05, y_lower + 0.5 * cluster_size, str(i))
            y_lower = y_upper + band_gap

        ax1.set_title("n = " + str(n))
        ax1.set_xlabel("Silhouette coefficient values")
        ax1.set_ylabel("Cluster label")

        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
        ax1.set_yticks([])  # Clear the yaxis labels / ticks
        ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

        # Render each figure right after its printed score rather than
        # leaving every figure of the loop open until the cell finishes.
        plt.show()

In [10]:
def _padded_range(values, pad=0.1):
    """Return ``[lo, hi]`` axis limits pushed outward from the data by ``pad``.

    Each bound moves away from the data by ``pad * abs(bound)``. Multiplying
    the bound by ``1 + pad`` only does that when the minimum is negative and
    the maximum is positive; for an all-positive (or all-negative) column it
    moves one limit inward and clips points.
    """
    lo, hi = float(values.min()), float(values.max())
    return [lo - pad * abs(lo), hi + pad * abs(hi)]


def _scatter_clusters(plot_fn, data, clusters, n_dims, opacity, title):
    """Build a cluster-colored plotly scatter over the first ``n_dims`` axes and show it."""
    axes = "xyz"[:n_dims]
    columns = [f"Comp{i}" for i in range(1, n_dims + 1)]

    df = pd.DataFrame(data, columns=columns)
    df['Cluster'] = clusters
    # Categorical labels give discrete legend entries instead of a color bar.
    df['Cluster'] = df['Cluster'].astype('category')

    axis_columns = dict(zip(axes, columns))
    axis_ranges = {
        f"range_{axis}": _padded_range(df[col]) for axis, col in axis_columns.items()
    }

    fig = plot_fn(
        df,
        category_orders={'Cluster': sorted(df['Cluster'].unique())},
        color='Cluster',
        labels={col: f"Comp {i}" for i, col in enumerate(columns, start=1)},
        opacity=opacity,
        title=title,
        **axis_columns,
        **axis_ranges,
    )

    fig.update_layout(height=600, width=600)
    fig.show()


def scatter_2d(data, clusters, title=None):
    """Show a 2-D plotly scatter of ``data`` colored by cluster label.

    Args:
        data: Two-column array-like; columns become ``Comp1`` and ``Comp2``.
        clusters: One cluster label per row of ``data``.
        title: Optional figure title.

    Returns:
        None. The figure is displayed via ``fig.show()``.
    """
    _scatter_clusters(px.scatter, data, clusters, n_dims=2, opacity=0.6, title=title)


def scatter_3d(data, clusters, title=None):
    """Show a 3-D plotly scatter of ``data`` colored by cluster label.

    Args:
        data: Three-column array-like; columns become ``Comp1``..``Comp3``.
        clusters: One cluster label per row of ``data``.
        title: Optional figure title.

    Returns:
        None. The figure is displayed via ``fig.show()``.
    """
    _scatter_clusters(px.scatter_3d, data, clusters, n_dims=3, opacity=0.5, title=title)

In [11]:
# Connect to the local FalkorDB instance and open the "validation" graph.
client = falkordb.FalkorDB(
    host="localhost",
    port=6379,
)
graph = client.select_graph("validation")

In [12]:
# Fetch a handful of Intervention nodes to inspect what a node looks like.
# Swap the label for NODE or Concept to sample the other node types.
query = "MATCH (n:Intervention) RETURN n LIMIT 5"
res = graph.query(query)

In [13]:
# Bare expression: shows only the QueryResult repr; the rows are in `res.result_set`.
res

<falkordb.query_result.QueryResult at 0x118a494d0>

In [7]:
# Print id, labels and raw properties of every returned node to inspect the schema.
for record in res.result_set:
    node = record[0]   # the query returns a single column: the node itself
    print("Node ID:", node.id)
    print("Labels:", node.labels)
    print("Properties:", node.properties)

Node ID: 7
Labels: ['NODE', 'Intervention']
Properties: {'name': 'compute stochastic mapping and compose utility', 'type': 'intervention', 'description': 'Before planning, compute a stochastic mapping between old and new ontologies via bisimulation-KL optimisation and replace the utility with U∘ϕ.', 'aliases': ['ϕ translation procedure', 'utility translation algorithm'], 'intervention_lifecycle': 1, 'intervention_maturity': 1, 'paper_id': 'arxiv__arxiv_org_abs_1105_3821', 'method': 'model', 'embedding': [0.013494699262082577, 0.0062162685208022594, -0.011954950168728828, 0.01679687388241291, -0.02935032919049263, -0.03013230301439762, -0.031016329303383827, -0.013136434368789196, -0.029717201367020607, 0.055267345160245895, 0.06521584093570709, -0.021993368864059448, 0.007668252103030682, -0.02454192563891411, -0.004267838783562183, 0.04021041840314865, -0.048543475568294525, -0.021146994084119797, -0.026684382930397987, 0.020356114953756332, 0.03916151821613312, 0.020731745287775993, 

In [8]:
def extract_embeds(result_set, embed_key="embedding"):
    """Split graph nodes into a metadata frame and an embedding frame.

    Args:
        result_set: Iterable of records whose first column is a node exposing
            ``.id`` and ``.properties``.
        embed_key: Name of the node property that holds the embedding vector.

    Returns:
        ``(df_nodes, df_embeds)``:

        * ``df_nodes`` has one row per node with every property except the
          embedding, plus an ``id`` column.
        * ``df_embeds`` has an ``id`` column followed by ``vec_0 .. vec_{d-1}``,
          where ``d`` is the length of the first embedding.

        For an empty ``result_set`` both frames have no rows and
        ``df_embeds`` has only the ``id`` column.

    Raises:
        KeyError: If a node has no ``embed_key`` property.
    """
    rows = []
    ids = []
    embeddings = []

    for record in result_set:
        node = record[0]  # assuming first column is the node
        props = dict(node.properties)  # copy so the node itself is not mutated
        if embed_key not in props:
            raise KeyError(f"node {node.id} has no {embed_key!r} property")
        embeddings.append(props.pop(embed_key))  # keep the vector out of the metadata
        props["id"] = node.id
        ids.append(node.id)
        rows.append(props)

    # node info only
    df_nodes = pd.DataFrame(rows)

    if not embeddings:
        # Nothing matched: there is no first embedding to size the vec_* columns from.
        return df_nodes, pd.DataFrame(columns=["id"])

    # node id + expanded embeddings
    df_embeds = pd.DataFrame(
        embeddings,
        columns=[f"vec_{i}" for i in range(len(embeddings[0]))]
    )
    df_embeds.insert(0, "id", ids)

    return df_nodes, df_embeds

In [9]:
# Load every Concept node and split it into metadata and embedding frames.
query = "MATCH (n:Concept) RETURN n"
res = graph.query(query)
df_nodes, df_embeds = extract_embeds(res.result_set)

In [10]:
# Trial clustering: 15-cluster KMeans on the raw embedding vectors (id column removed).
kmeans = KMeans(n_clusters=15, random_state=42)
kmeans.fit(df_embeds.drop(columns=["id"]))
clusters = kmeans.predict(df_embeds.drop(columns=["id"]))

# Node metadata with the assigned cluster label appended; df_nodes is left untouched.
df_clusters = df_nodes.assign(cluster=clusters)

In [11]:
# Per-cluster digest: the distinct values of each metadata column plus the cluster size.
cluster_digest = (
    df_clusters
    .groupby('cluster')
    .agg({
        **{
            column: (lambda values: list(set(values)))
            for column in ('name', 'type', 'concept_category', 'paper_id', 'id')
        },
        'cluster': 'count',  # number of rows in each cluster
    })
    .rename(columns={'cluster': 'count'})
)

cluster_digest.to_csv('cluster_digest.csv')
cluster_digest.head(20)

Unnamed: 0_level_0,name,type,concept_category,paper_id,id,count
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,[active reinforcement learning model with quer...,[concept],"[Requirement, Finding, Opportunity, Risk, Outc...","[arxiv__arxiv_org_abs_1802_01744, arxiv__arxiv...","[647, 648, 649, 650, 651, 1034, 1035, 1036, 10...",47
1,[LSTM gating mechanisms enable learning long-r...,[concept],"[Model, Finding, Evidence, Opportunity, Risk, ...","[arxiv__arxiv_org_abs_1804_04241, arxiv__arxiv...","[3, 908, 912, 147, 148, 659, 662, 1171, 665, 6...",50
2,[fitness functions that imperfectly capture de...,[concept],"[Finding, Risk, Problem, Claim, Observation, T...","[arxiv__arxiv_org_abs_1804_03235, arxiv__arxiv...","[2, 261, 263, 264, 268, 396, 146, 791, 544, 16...",56
3,"[Definition of counterfactual fairness, Missin...",[concept],"[Model, Finding, Evidence, Opportunity, Risk, ...","[arxiv__arxiv_org_abs_1308_3778, arxiv__arxiv_...","[15, 527, 529, 530, 531, 21, 1050, 1051, 1054,...",78
4,[extreme verification burden for learning agen...,[concept],"[Requirement, Finding, Evidence, State, Risk, ...","[arxiv__arxiv_org_abs_1804_03235, arxiv__arxiv...","[1, 1153, 1158, 11, 14, 398, 17, 23, 24, 25, 2...",58
5,[delay-compensation parameter d for aligning f...,[concept],"[Model, Finding, Opportunity, Method, Problem,...","[arxiv__arxiv_org_abs_1707_05173, arxiv__arxiv...","[769, 764, 772, 901, 394, 783, 400, 401, 1046,...",35
6,"[Full source coverage during decoding, natural...",[concept],"[Finding, Opportunity, Method, Problem, Claim,...","[arxiv__arxiv_org_abs_1308_3778, arxiv__arxiv_...","[257, 770, 771, 1155, 519, 140, 142, 143, 663,...",57
7,[intractable posterior distributions in contin...,[concept],"[Finding, Evidence, Opportunity, Risk, Method,...","[arxiv__arxiv_org_abs_1308_3778, arxiv__arxiv_...","[636, 386, 4, 133, 6, 904, 905, 10, 906, 12, 9...",64
8,[improved representation of rare demographic t...,[concept],"[Requirement, Finding, Evidence, Opportunity, ...","[arxiv__arxiv_org_abs_1804_04241, arxiv__arxiv...","[514, 515, 516, 5, 518, 1027, 520, 1028, 522, ...",56
9,"[forecast calibration uncertainty, lack of fea...",[concept],"[Requirement, Finding, Evidence, Opportunity, ...","[arxiv__arxiv_org_abs_1804_03235, arxiv__arxiv...","[391, 524, 525, 526, 652, 528, 653, 782, 20, 5...",55


In [12]:
# Project the embeddings onto three principal components and color the points
# by the current KMeans labels. PCA is seeded like the KMeans calls so a re-run
# reproduces the same projection even when a randomized SVD solver is selected.
pca = PCA(n_components=3, random_state=42)
reduced_data = pca.fit_transform(df_embeds.drop(columns=["id"]))
scatter_3d(reduced_data, clusters, title="PCA (3 components), colored by KMeans cluster")

In [13]:
# `umap` (the umap-learn package) is used here but is not among the notebook's
# top-level imports, so a fresh kernel would fail on this cell without it.
import umap

# UMAP defaults for reference: n_neighbors=15, min_dist=0.1, metric="euclidean".
# random_state matches the seed used for KMeans so the layout is repeatable.
umap_3d = umap.UMAP(n_components=3, n_neighbors=20, min_dist=0.25, random_state=42)
reduced_data = umap_3d.fit_transform(df_embeds.drop(columns=["id"]))
scatter_3d(reduced_data, clusters, title="UMAP (3 components), colored by KMeans cluster")

In [14]:
# Parameter search: silhouette score for every (PCA dimensionality, cluster count) pair.
range_pca = [5,30]       # PCA components: 5 up to, but NOT including, 30 (plain range())
range_clusters = [2,5]   # cluster counts: 2..5, both ends included by sil_scores

# The feature matrix does not change between iterations; build it once.
embed_features = df_embeds.drop(columns=["id"])

results = []
for n_pca in range(range_pca[0], range_pca[1]):
    # Seeded so the ranking below is reproducible across kernel restarts.
    df_train_pca = PCA(n_components=n_pca, random_state=42).fit_transform(embed_features)

    for n_clusters, score in sil_scores(df_train_pca, range_clusters[0], range_clusters[1]):
        results.append((n_pca, n_clusters, round(float(score), 3)))

df_results = pd.DataFrame(results, columns=['n_pca', 'n_clusters', 'sil_score'])
df_results.sort_values(by=['sil_score'], ascending=False)

Unnamed: 0,n_pca,n_clusters,sil_score
0,5,2,0.636
4,6,2,0.606
8,7,2,0.581
12,8,2,0.561
16,9,2,0.544
...,...,...,...
75,23,5,0.119
87,26,5,0.114
83,25,5,0.113
95,28,5,0.109


In [15]:
# KMeans with the top-ranked configuration from the parameter search
# (5 PCA components, 2 clusters). PCA is seeded with the same value as KMeans
# so the projection being clustered is reproducible.
df_train_pca = PCA(n_components=5, random_state=42).fit_transform(df_embeds.drop(columns=["id"]))
kmeans = KMeans(n_clusters=2, random_state=42).fit(df_train_pca)
clusters = kmeans.predict(df_train_pca)

# Node metadata with the new cluster label attached.
df_clusters = df_nodes.copy()
df_clusters['cluster'] = clusters

In [16]:
# Per-cluster digest for the current clustering: distinct values of each
# metadata column plus the cluster size.
cluster_digest = (
    df_clusters
    .groupby('cluster')
    .agg({
        **{
            column: (lambda values: list(set(values)))
            for column in ('name', 'type', 'concept_category', 'paper_id', 'id')
        },
        'cluster': 'count',  # number of rows in each cluster
    })
    .rename(columns={'cluster': 'count'})
)

# Same output path as the earlier digest cell, so that file is overwritten.
cluster_digest.to_csv('cluster_digest.csv')
cluster_digest.head(20)

Unnamed: 0_level_0,name,type,concept_category,paper_id,id,count
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,[active reinforcement learning model with quer...,[concept],"[Requirement, Finding, Opportunity, Risk, Outc...","[arxiv__arxiv_org_abs_1802_01744, arxiv__arxiv...","[647, 648, 649, 650, 651, 1034, 1035, 1036, 10...",47
1,[improved representation of rare demographic t...,[concept],"[Model, Opportunity, Claim, Constraint, Observ...","[arxiv__arxiv_org_abs_1308_3778, arxiv__arxiv_...","[0, 1, 2, 3, 4, 5, 6, 10, 11, 12, 14, 15, 17, ...",787
