In [5]:
import plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import json

# Step 1: Load JSON data.
# The code below indexes each item with 'content' and 'name', so the file is
# expected to hold a list of objects carrying at least those two keys.
# An explicit encoding keeps the read independent of the platform default.
with open('sample_data.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Tunable parameters
num_clusters = 3   # Adjust based on your data
n_components = 3   # A 3D scatter plot needs exactly three coordinates
random_seed = 42   # Fixed seed so cluster assignments are reproducible

# KMeans needs at least `num_clusters` samples and PCA at least `n_components`;
# fail early with a readable message instead of a deep sklearn traceback.
if len(data) < max(num_clusters, n_components):
    raise ValueError(
        f"Need at least {max(num_clusters, n_components)} documents, got {len(data)}"
    )

# Step 2: Extract content and perform TF-IDF vectorization
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform([item['content'] for item in data])

# Step 3: Clustering
# `random_state` makes Restart & Run All produce the same labels every time;
# an explicit `n_init` keeps behaviour stable across scikit-learn versions
# (the default changed from 10 to 'auto').
km = KMeans(n_clusters=num_clusters, n_init=10, random_state=random_seed)
km.fit(tfidf_matrix)
clusters = km.labels_.tolist()

# Step 4: Dimensionality reduction for 3D visualization.
# The matrix is densified here, which is fine for small corpora only.
pca = PCA(n_components=n_components)
coords = pca.fit_transform(tfidf_matrix.toarray())

# Step 5: Create a 3D scatter plot.
# Cluster ids are categories, not magnitudes: passing them as strings makes
# Plotly use a discrete colour per cluster instead of a continuous colour bar.
cluster_labels = [str(label) for label in clusters]
fig = px.scatter_3d(
    x=coords[:, 0], y=coords[:, 1], z=coords[:, 2],
    color=cluster_labels,
    hover_name=[item['name'] for item in data],
    labels={'x': 'PC 1', 'y': 'PC 2', 'z': 'PC 3', 'color': 'Cluster'},
    title="3D Clustering Plot"
)

fig.show()

In [5]:
import json
from graphviz import Digraph

# Sample input for create_flow_diagram: a nested dict of
#   root label -> narrative name -> schema name -> count.
# NOTE(review): this rebinds `data`, which the clustering cell uses for the
# list loaded from sample_data.json; a distinct name would avoid hidden-state
# surprises when cells are re-run out of order.
data = {
    "List of files": {
        "Narrative 1": {"Schema1": 10, "Schema2": 10},
        "Narrative 2": {"Schema1": 10, "Schema2": 20},
        "Narrative 3": {"Schema1": 30, "Schema2": 10},
    }
}

def create_flow_diagram(data, filename='file_hierarchy', view=True,
                        output_format='png', penwidth_scale=10):
    """Render a left-to-right flow diagram of a root -> narrative -> schema hierarchy.

    Every link is drawn through a small intermediate node that shows the count
    for that link, and the edge thickness grows with the count.

    Args:
        data: Nested mapping ``{root: {narrative: {schema: count}}}``. The link
            from a root to a narrative is labelled with the sum of that
            narrative's schema counts.
        filename: Base name of the rendered output, passed to ``Digraph.render``.
        view: Whether to open the rendered file in the system viewer. Set to
            ``False`` on headless machines or when displaying the returned
            graph inline in a notebook.
        output_format: Graphviz output format (e.g. ``'png'``, ``'svg'``).
        penwidth_scale: Divisor converting a count into an edge pen width;
            the width never drops below 1.

    Returns:
        The populated ``Digraph``, so callers can display or post-process it.

    Raises:
        ValueError: If ``penwidth_scale`` is not positive.

    Note:
        Names are used directly as Graphviz node identifiers, so the same
        schema name under different narratives maps to one shared node.
        graphviz's ``edge()`` interprets ``:`` in a node name as a port
        separator, so names containing ``:`` are not supported here.
    """
    if penwidth_scale <= 0:
        raise ValueError("penwidth_scale must be positive")

    dot = Digraph(comment='File Hierarchy', format=output_format)
    dot.attr(rankdir='LR')  # Left to Right orientation
    dot.attr('node', shape='box', style='filled, rounded', color='lightblue2', fontname='Helvetica', fontsize='12')
    dot.attr('edge', arrowhead='none', color='grey', fontname='Helvetica', fontsize='10')

    # Function to create a unique ID for each connection node
    def create_connection_id(from_node, to_node):
        return f"connection_{from_node}_{to_node}"

    # Link source -> target through a count-labelled connector node.
    def add_counted_link(source, target, count):
        connection_id = create_connection_id(source, target)
        # Keep `1` as the first argument so ties resolve to the integer 1.
        penwidth = str(max(1, count / penwidth_scale))
        dot.node(connection_id, str(count), shape='rectangle', color='white', fontcolor='black')
        dot.edge(source, connection_id, penwidth=penwidth)
        dot.edge(connection_id, target, penwidth=penwidth)

    # Iterate over the data to add nodes and edges
    for key, narratives in data.items():
        dot.node(key, key, color='lightgrey')
        for narrative, schemas in narratives.items():
            add_counted_link(key, narrative, sum(schemas.values()))
            for schema, count in schemas.items():
                add_counted_link(narrative, schema, count)

    dot.render(filename, view=view)
    return dot

# Build the diagram from the sample `data` dict and render it to disk.
create_flow_diagram(data)