<a href="https://colab.research.google.com/github/MobinMithun/Claude-Project-to-n8n-Workflow/blob/main/n8n_json_parser_packages_public.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Parse the Unique JSONs

In [None]:
import os
import json
import glob
import time
import hashlib

# For running in Colab and mounting Google Drive
from google.colab import drive

# 1. MOUNT GOOGLE DRIVE
# Everything below reads from and writes to paths under this mount point.
drive.mount('/content/drive')

# --------------------------------------------------------------------
# 2. CONFIGURE PATHS
# --------------------------------------------------------------------
# Input folder: the workflow exports to scan.
source_folder_path = "/content/drive/MyDrive/n8n Workflows"

# Output folder: created on demand, receives both reference files.
output_folder_path = "/content/drive/MyDrive/n8n_Workflow_Extracts"
os.makedirs(output_folder_path, exist_ok=True)

mega_json_path = os.path.join(output_folder_path, "ALL_unique_nodes.json")
mega_text_path = os.path.join(output_folder_path, "ALL_unique_nodes.txt")

# Every direct entry of the source folder, whatever its extension;
# the parsing loop is responsible for filtering by extension.
all_files = glob.glob(os.path.join(source_folder_path, "*"))

print("=====================================================")
print("Looking for workflow files in:", source_folder_path)
print("=====================================================\n")
time.sleep(1)

if all_files:
    print(f"Found {len(all_files)} files total.\n")
else:
    print("No files found at all! Double-check your folder path.")

# --------------------------------------------------------------------
# 3. HOW WE DEFINE "UNIQUENESS"
# --------------------------------------------------------------------
def node_signature(node):
    """
    Return a hex MD5 fingerprint of a node's full JSON content.

    The node is serialised with sorted keys, so two nodes that hold the
    same data hash identically regardless of key order, while any change
    in structure or parameter values produces a different fingerprint.
    """
    canonical_bytes = json.dumps(node, sort_keys=True).encode("utf-8")
    hasher = hashlib.md5()
    hasher.update(canonical_bytes)
    return hasher.hexdigest()

# --------------------------------------------------------------------
# 4. DATA STRUCTURES
# --------------------------------------------------------------------
found_nodes = set()    # signatures of every node collected so far
unique_node_data = []  # the unique node dicts, in first-seen order

# --------------------------------------------------------------------
# 5. PARSE EACH FILE & EXTRACT UNIQUE NODES
# --------------------------------------------------------------------
# Counts files that parsed as JSON, including ones later skipped for
# having no usable "nodes" list.
valid_file_count = 0
for file_path in all_files:
    file_name = os.path.basename(file_path)
    extension = os.path.splitext(file_path)[1].lower()
    if extension not in (".json", ".txt"):
        print(f"Skipping (not .json or .txt): {file_name}")
        continue

    print(f"Attempting to parse '{file_name}'...")
    try:
        # "utf-8-sig" decodes ordinary UTF-8 and also strips a leading BOM;
        # with plain "utf-8" the BOM reaches json.loads, which rejects it.
        with open(file_path, "r", encoding="utf-8-sig") as f:
            raw_content = f.read().strip()
        data = json.loads(raw_content)  # parse as JSON
    except Exception as e:
        # Best effort: one unreadable or malformed file must not stop the scan.
        print(f"  -> Skipping file (parse error): {file_name}\n     Error: {e}")
        continue

    valid_file_count += 1

    if not isinstance(data, dict):
        print(f"  -> Skipping (parsed JSON not a dict): {file_name}")
        continue

    nodes = data.get("nodes", [])
    if not nodes:
        print(f"  -> Skipping (no 'nodes' key or empty) in: {file_name}")
        continue

    # Iterating a dict or string here would hash its keys/characters as if
    # they were nodes, so only a real list is accepted.
    if not isinstance(nodes, list):
        print(f"  -> Skipping ('nodes' is not a list) in: {file_name}")
        continue

    print(f"  -> '{file_name}' has {len(nodes)} node(s). Extracting unique signatures...")

    for node in nodes:
        # Non-dict entries are not node definitions; code that consumes the
        # output calls .get() on every entry.
        if not isinstance(node, dict):
            continue
        sig = node_signature(node)
        if sig not in found_nodes:
            found_nodes.add(sig)
            unique_node_data.append(node)

# --------------------------------------------------------------------
# 6. WRITE OUT THE UNIQUE NODES (FULL REFERENCE)
# --------------------------------------------------------------------
if unique_node_data:
    print(f"\nTotal unique node definitions found: {len(unique_node_data)}\n")

    # 6A. Human-readable text file: a numbered banner, then the node's
    #     pretty-printed JSON, then a blank line.
    with open(mega_text_path, "w", encoding="utf-8") as text_out:
        for i, node in enumerate(unique_node_data, start=1):
            text_out.write(
                f"================ Node #{i} ================\n"
                + json.dumps(node, indent=2)
                + "\n\n"
            )

    # 6B. Machine-readable file: all unique nodes as one JSON array.
    with open(mega_json_path, "w", encoding="utf-8") as json_out:
        json_out.write(json.dumps(unique_node_data, indent=2))

    print("Wrote all unique nodes to:")
    print(f" - {mega_text_path}")
    print(f" - {mega_json_path}")
else:
    # Nothing collected: no output files are written.
    print("\nNo unique nodes were found.")

# Final report, printed whether or not anything was written.
print("=====================================================")
print(f"Processing complete!")
print(f" - Valid text/JSON files processed: {valid_file_count}")
print(f" - Unique node definitions found: {len(unique_node_data)}")
print(f" - Output folder: {output_folder_path}")
print("=====================================================")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Looking for workflow files in: /content/drive/MyDrive/n8n Workflows

Found 294 files total.

Attempting to parse '🤖🧠 AI Agent Chatbot + LONG TERM Memory + Note Storage + Telegram.txt'...
  -> '🤖🧠 AI Agent Chatbot + LONG TERM Memory + Note Storage + Telegram.txt' has 21 node(s). Extracting unique signatures...
Attempting to parse 'Open Deep Research - AI-Powered Autonomous Research Workflow.txt'...
  -> 'Open Deep Research - AI-Powered Autonomous Research Workflow.txt' has 17 node(s). Extracting unique signatures...
Attempting to parse '🐋🤖 DeepSeek AI Agent + Telegram + LONG TERM Memory 🧠.txt'...
  -> '🐋🤖 DeepSeek AI Agent + Telegram + LONG TERM Memory 🧠.txt' has 23 node(s). Extracting unique signatures...
Attempting to parse 'Chat with Postgresql Database.txt'...
  -> 'Chat with Postgresql Database.txt' has 11 node(s). Extracting unique signatures...
Attempti

# Have an LLM Make a Cheat Sheet

In [None]:
!pip install openai

import os
import json
import math

# For running in Colab (uncomment if not already done):
# from google.colab import drive
# drive.mount('/content/drive')

# Do not store the API key in the notebook. Reuse OPENAI_API_KEY when the
# environment already provides a real value; otherwise prompt for it with
# getpass, which does not echo the input into the cell output.
import getpass

_existing_key = os.environ.get("OPENAI_API_KEY", "")
# "ENTER_API_KEY" is the old placeholder value; treat it as "not set" so a
# kernel that ran an earlier version of this cell still gets prompted.
if not _existing_key or _existing_key == "ENTER_API_KEY":
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

from openai import OpenAI
client = OpenAI()  # uses OPENAI_API_KEY from environment

# Where the big JSON file from Phase 1 lives
mega_json_path = "/content/drive/MyDrive/n8n_Workflow_Extracts/ALL_unique_nodes.json"

# We'll write the summarized cheat sheet here
cheatsheet_path = "/content/drive/MyDrive/n8n_Workflow_Extracts/ALL_unique_nodes_cheatsheet.txt"

# ---------------------------
# Step 1: Read the big JSON
# ---------------------------
with open(mega_json_path, "r", encoding="utf-8") as f:
    all_nodes = json.load(f)

print(f"Loaded {len(all_nodes)} total nodes from {mega_json_path}.")

# ---------------------------
# Step 2: Group by node["type"]
#     (Customize if you prefer resource-based grouping.)
# ---------------------------
from collections import defaultdict

# Maps each node type string to the list of node dicts of that type;
# nodes without a "type" key are grouped under "unknown_type".
nodes_by_type = defaultdict(list)
for nd in all_nodes:
    ntype = nd.get("type", "unknown_type")
    nodes_by_type[ntype].append(nd)

print(f"Found {len(nodes_by_type)} unique node types.\n")

# ---------------------------
# Step 3: Summarize each group
#   We'll do chunking if too large for a single call
#   We'll then merge the partial summaries.
# ---------------------------

def summarize_nodes_with_o3mini(node_list):
    """
    Summarize a subset of nodes with the o3-mini model.

    Serialises node_list to indented JSON, embeds it in a prompt asking
    for a concise cheat sheet, and sends it as a single user message via
    the module-level `client`.

    Args:
        node_list: JSON-serialisable collection of node definitions.

    Returns:
        The model's reply with surrounding whitespace stripped, or an
        empty string when the response carries no text content.
    """
    # The nodes are passed to the model verbatim as pretty-printed JSON.
    data_str = json.dumps(node_list, indent=2)

    prompt_text = f"""
You are an expert in n8n node JSON definitions.
Below is a subset of nodes (in JSON).
Please produce a short "cheat sheet" that explains:
- The general structure for these node(s)
- Key fields or parameters
- Notable differences among them
- A short example or best-practices tip.

Nodes:
{data_str}

Cheat sheet:
"""

    response = client.chat.completions.create(
        model="o3-mini",
        reasoning_effort="medium",
        messages=[{"role": "user", "content": prompt_text}]
    )

    # The message content is nullable in the API response, and the choices
    # list may be empty. Return "" in both cases so a single empty reply
    # cannot abort a long multi-call run with an AttributeError/IndexError.
    if not response.choices:
        return ""
    content = response.choices[0].message.content
    return content.strip() if content else ""

# Helper to chunk a group of nodes so we don't exceed context limits
def chunk_list(lst, chunk_size):
    """Yield successive chunks of at most chunk_size items from lst.

    Args:
        lst: A sliceable sequence (e.g. a list).
        chunk_size: Maximum number of items per chunk; must be >= 1.

    Yields:
        Consecutive slices of lst. The last one may be shorter, and an
        empty lst yields nothing.

    Raises:
        ValueError: If chunk_size is less than 1. Because this is a
            generator, the error surfaces when iteration starts.
    """
    # A negative step would make range() empty and silently drop every
    # item; reject non-positive sizes explicitly instead.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    for i in range(0, len(lst), chunk_size):
        yield lst[i:i + chunk_size]

# We'll define a rough chunk_size in # of nodes.
# Adjust if you need smaller to avoid token overload.
# If each node is large, reduce chunk_size.
CHUNK_SIZE = 10

all_summaries = []

# ---------------------------
# Step 4: Write the cheat sheet incrementally
# ---------------------------
# The run makes one API call per chunk and can take a long time. Each node
# type's summary is written and flushed as soon as it is ready, so a failed
# call or a dropped session keeps everything completed so far. On a full
# run the file content equals "\n".join(all_summaries).
with open(cheatsheet_path, "w", encoding="utf-8") as csf:
    for ntype, nodelist in nodes_by_type.items():
        print(f"Summarizing node type: {ntype} (count={len(nodelist)})")

        # Large groups are summarised CHUNK_SIZE nodes at a time.
        chunked_summaries = []
        for chunk_idx, chunk in enumerate(chunk_list(nodelist, CHUNK_SIZE), start=1):
            print(f"  - Summarizing chunk #{chunk_idx} with {len(chunk)} nodes...")
            partial_summary = summarize_nodes_with_o3mini(chunk)
            chunked_summaries.append(partial_summary)

        # Combine the partial summaries for this type
        combined_summary = f"CHEAT SHEET FOR NODE TYPE: {ntype}\n\n"
        combined_summary += "\n\n".join(chunked_summaries)
        combined_summary += "\n\n" + ("="*60) + "\n\n"

        # One newline between consecutive entries, none before the first.
        if all_summaries:
            csf.write("\n")
        csf.write(combined_summary)
        csf.flush()

        all_summaries.append(combined_summary)

print(f"\nDone! Wrote cheat sheet to: {cheatsheet_path}")


Loaded 5434 total nodes from /content/drive/MyDrive/n8n_Workflow_Extracts/ALL_unique_nodes.json.
Found 195 unique node types.

Summarizing node type: @n8n/n8n-nodes-langchain.chatTrigger (count=52)
  - Summarizing chunk #1 with 10 nodes...
  - Summarizing chunk #2 with 10 nodes...
  - Summarizing chunk #3 with 10 nodes...
  - Summarizing chunk #4 with 10 nodes...
  - Summarizing chunk #5 with 10 nodes...
  - Summarizing chunk #6 with 2 nodes...
Summarizing node type: n8n-nodes-base.stickyNote (count=1529)
  - Summarizing chunk #1 with 10 nodes...
  - Summarizing chunk #2 with 10 nodes...
  - Summarizing chunk #3 with 10 nodes...
  - Summarizing chunk #4 with 10 nodes...
  - Summarizing chunk #5 with 10 nodes...
  - Summarizing chunk #6 with 10 nodes...
  - Summarizing chunk #7 with 10 nodes...
  - Summarizing chunk #8 with 10 nodes...
  - Summarizing chunk #9 with 10 nodes...
  - Summarizing chunk #10 with 10 nodes...
  - Summarizing chunk #11 with 10 nodes...
  - Summarizing chunk #12