# Kaggle Competition Agentic Orchestration System Demo

This notebook demonstrates the construction and testing of an agentic orchestration system for Kaggle competition assistance, leveraging LangGraph, CrewAI, and AutoGen. The workflow includes user query preprocessing, agent definitions, orchestration logic, and end-to-end testing.

## 1. Install and Import Required Libraries

Install all necessary packages and import them for use throughout the notebook.

In [None]:
# Install core libraries
%pip install langchain langgraph crewai autogen farm-haystack[all] sentence-transformers nltk spellchecker

# Optional: for visualization and debugging
%pip install graphviz

# Imports
import os
import json
import threading
from typing import List, Dict, Any, Optional, TypedDict
from datetime import datetime

# LangChain, LangGraph, CrewAI, AutoGen
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langgraph.graph import StateGraph, START, END
from langgraph.checkpoint import MemoryCheckpointer
from crewai import Crew, Agent, Task, Process
import autogen
from autogen import GroupChat, GroupChatManager, UserProxyAgent, AssistantAgent

# NLP and preprocessing
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from spellchecker import SpellChecker
from sentence_transformers import SentenceTransformer, util

# Haystack for RAG
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import PreProcessor, EmbeddingRetriever, SentenceTransformersRanker

# Visualization
import graphviz

## 2. Define User Query Preprocessing Functions

Implement functions for cleaning, tokenizing, and extracting metadata from user queries, including spellchecking and intent detection.

In [None]:
import re

def basic_preprocess(query: str, remove_stopwords: bool = False) -> List[str]:
    """Lowercase *query*, strip punctuation, and split it into word tokens.

    Args:
        query: Raw user text.
        remove_stopwords: When true, tokens that appear in NLTK's English
            stopword list are dropped.

    Returns:
        The tokens produced by ``word_tokenize``, in their original order.
    """
    # Anything that is neither a word character nor whitespace is removed
    # before tokenizing.
    normalized = re.sub(r'[^\w\s]', '', query.lower())
    words = word_tokenize(normalized)
    if not remove_stopwords:
        return words
    english_stopwords = set(stopwords.words('english'))
    return [w for w in words if w not in english_stopwords]

def preprocess_kaggle_query(query: str, remove_stopwords: bool = False, spellcheck: bool = False) -> Dict[str, Any]:
    """Tokenize a user query and collect simple surface-level metadata about it.

    Args:
        query: Raw user text.
        remove_stopwords: Forwarded to ``basic_preprocess``.
        spellcheck: When true, each token is passed through
            ``SpellChecker.correction`` on a best-effort basis.

    Returns:
        A dict with the keys ``cleaned_query`` (tokens joined by spaces),
        ``original_query``, ``contains_code``, ``contains_url``,
        ``contains_number``, ``contains_question``, ``tokens`` and ``length``
        (number of tokens).
    """
    import warnings

    tokens = basic_preprocess(query, remove_stopwords=remove_stopwords)
    if spellcheck:
        try:
            spell = SpellChecker()
            # ``correction`` can return None when it has no candidate for a
            # word; keep the original token so the join below never sees None.
            tokens = [spell.correction(word) or word for word in tokens]
        except Exception as exc:
            # Spellchecking stays best-effort: fall back to the uncorrected
            # tokens, but surface the failure instead of hiding it.
            warnings.warn(f"Spellcheck skipped: {exc}", RuntimeWarning)

    # The flags below are computed on the raw query, not on the tokens.
    # Code heuristic: a code fence, "import ", "def ", "class " or any "=".
    contains_code = bool(re.search(r'```|import |def |class |\=', query))
    contains_url = bool(re.search(r'http[s]?://', query))
    contains_number = any(char.isdigit() for char in query)
    contains_question = "?" in query
    length = len(tokens)
    return {
        "cleaned_query": " ".join(tokens),
        "original_query": query,
        "contains_code": contains_code,
        "contains_url": contains_url,
        "contains_number": contains_number,
        "contains_question": contains_question,
        "tokens": tokens,
        "length": length,
    }

# Example intent detection (simple embedding-based)
class SimpleIntentDetector:
    """Embedding-based intent classifier over a small set of example phrases.

    Each intent in ``intent_map`` has a list of example phrases that are
    encoded once at construction time. ``classify`` encodes the input and
    picks the intent whose best-matching phrase has the highest cosine
    similarity.
    """

    def __init__(self):
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        self.intent_map = {
            "onboarding": [
                "just joined this competition",
                "first time",
                "beginner",
                "getting started"
            ],
            "progress": [
                "started exploring",
                "here is my code",
                "i tried",
                "my current approach"
            ],
            "confusion": [
                "not sure",
                "don’t understand",
                "explain this code"
            ]
        }
        # One tensor of phrase embeddings per intent, computed once up front.
        self.intent_embeddings = {
            intent: self.model.encode(phrases, convert_to_tensor=True)
            for intent, phrases in self.intent_map.items()
        }
        # Minimum similarity for a confident match; only enforced when
        # ``classify`` is called with ``apply_threshold=True``.
        self.threshold = 0.65

    def classify(self, cleaned_input: str, *, apply_threshold: bool = False):
        """Return ``(intent, score)`` for the best-matching intent.

        Args:
            cleaned_input: Text to classify.
            apply_threshold: When true, the intent is reported as ``None`` if
                the best score is below ``self.threshold``. Defaults to False,
                which always returns the top-scoring intent.

        Returns:
            A tuple of the intent name (or ``None``) and the best similarity
            score as a float.
        """
        input_embedding = self.model.encode(cleaned_input, convert_to_tensor=True)
        best_intent = None
        best_score = -1
        for intent, pattern_embeddings in self.intent_embeddings.items():
            # An intent's score is its single best-matching example phrase.
            scores = util.pytorch_cos_sim(input_embedding, pattern_embeddings)
            max_score = scores.max().item()
            if max_score > best_score:
                best_score = max_score
                best_intent = intent
        if apply_threshold and best_score < self.threshold:
            return None, best_score
        return best_intent, best_score

## 3. Sample User Queries and Preprocessing Tests

Create a list of sample user queries and demonstrate the preprocessing pipeline by printing tokenized and cleaned outputs.

In [None]:
sample_user_queries = [
    "Explain the evaluation metric used for this Kaggle competition",
    "What are the most efficient baseline model options given the structure of the data and the size of the dataset?",
    "Given my baseline model, what are the next steps I should take to iteratively improve my model and have a submission within two or three days?",
    "Where can I find the leaderboard?",
    "Can you summarize the competition timeline?",
    "Here is a screenshot of the discussion — what does it say?",
]

intent_detector = SimpleIntentDetector()

# Run every sample through preprocessing (stopword removal and spellcheck
# enabled) and intent detection, then print one report per query.
for i, query in enumerate(sample_user_queries):
    preprocessed = preprocess_kaggle_query(query, spellcheck=True, remove_stopwords=True)
    intent, score = intent_detector.classify(preprocessed["cleaned_query"])
    print(
        f"Query {i+1}: {query}",
        f"  Tokens: {preprocessed['tokens']}",
        f"  Cleaned: {preprocessed['cleaned_query']}",
        f"  Detected intent: {intent} (score: {score:.2f})",
        "-" * 60,
        sep="\n",
    )

## 4. Agent and Orchestrator Class Definitions

Define all agent classes (e.g., CompetitionSummaryAgent, NotebookExplainerAgent, etc.) and the orchestrator classes for routing and aggregation.

In [None]:
# Example: CompetitionSummaryAgent
class CompetitionSummaryAgent:
    """Agent that answers a query from a fixed set of competition details.

    The competition context (overview, data, evaluation, timeline) is
    formatted into a prompt together with the user's query and sent through
    an ``LLMChain``.
    """

    def __init__(self, llm=None, competition_context=None, haystack_rag_pipeline=None):
        """Store the collaborators and build the prompt chain.

        Args:
            llm: Chat model to use; a falsy value selects a default
                ``ChatOpenAI`` instance.
            competition_context: Mapping supplying the prompt variables other
                than ``query``; a falsy value selects built-in sample text.
            haystack_rag_pipeline: Stored on the instance as given.
        """
        if not llm:
            llm = ChatOpenAI(temperature=0.3, model_name="gpt-4")
        if not competition_context:
            competition_context = {
                "overview": "This is a sample Kaggle competition.",
                "data": "Sample data description.",
                "evaluation": "Evaluation metric: RMSE.",
                "timeline": "Competition runs from Jan to March."
            }

        self.llm = llm
        self.competition_context = competition_context
        self.haystack_rag_pipeline = haystack_rag_pipeline
        self.name = "CompetitionSummaryAgent"

        prompt_text = """
You're an expert Kaggle competition assistant. The user asked:
"{query}"

Based on the competition details below, generate a helpful summary that aligns with their intent.

--- OVERVIEW ---
{overview}

--- DATA ---
{data}

--- EVALUATION ---
{evaluation}

--- TIMELINE ---
{timeline}

Return a concise yet informative response tailored to the query.
"""
        self.prompt_template = PromptTemplate(
            input_variables=["query", "overview", "data", "evaluation", "timeline"],
            template=prompt_text,
        )
        self.chain = LLMChain(llm=self.llm, prompt=self.prompt_template)

    def run(self, structured_query: Dict[str, Any]) -> Dict[str, Any]:
        """Run the chain for one query and wrap the result.

        Args:
            structured_query: Dict whose ``cleaned_query`` entry (default
                ``""``) is used as the prompt's query.

        Returns:
            ``{"agent_name": ..., "response": ...}``. If the chain raises,
            the response is an error string instead of the exception
            propagating.
        """
        # Context entries are applied after the query, so a "query" key in
        # the context would take precedence.
        payload = {"query": structured_query.get("cleaned_query", "")}
        payload.update(self.competition_context)
        try:
            answer = self.chain.run(payload)
        except Exception as exc:
            answer = f"[Error in {self.name}] {str(exc)}"
        return {"agent_name": self.name, "response": answer}

# Define other agents similarly (NotebookExplainerAgent, CodeFeedbackAgent, etc.)
# For brevity, only CompetitionSummaryAgent is shown here.

## 5. LangGraph Workflow Construction

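Build the LangGraph state graph with nodes for preprocessing, routing, agent execution, and aggregation. In this demo the nodes are wired in a fixed linear sequence and the router always assigns the `overview` intent; conditional edges based on intent are the intended extension point.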

In [None]:
# Define state and nodes for LangGraph
class OrchestratorState(TypedDict, total=False):
    """State dictionary passed between the graph nodes defined in this notebook.

    ``total=False`` makes every key optional, so a run can start from a state
    that only contains ``original_query``.
    """

    # Raw user text; read by preprocessing_node.
    original_query: str
    # Built by router_node from cleaned_query and metadata; input to the agent.
    structured_query: Dict[str, Any]
    # Space-joined tokens, written by preprocessing_node.
    cleaned_query: str
    # Token list, written by preprocessing_node.
    tokens: List[str]
    # Full dict returned by preprocess_kaggle_query, written by preprocessing_node.
    metadata: Dict[str, Any]
    # Written by router_node.
    intent: Optional[str]
    # Agent result dicts; competition_summary_node appends to this list.
    agent_outputs: List[Dict[str, Any]]
    # Written by aggregation_node.
    final_response: Optional[str]

# Checkpointer passed to ``baseGraph.compile(checkpointer=memory)`` below.
# NOTE(review): confirm that `MemoryCheckpointer` is exported by the installed
# langgraph release — its in-memory saver appears to be documented as
# `MemorySaver` in `langgraph.checkpoint.memory`; TODO verify the import.
memory = MemoryCheckpointer()
# Constructed without arguments, so the agent uses its default ChatOpenAI
# model and its built-in sample competition context.
competition_summary_agent = CompetitionSummaryAgent()

def preprocessing_node(state: OrchestratorState) -> OrchestratorState:
    """Preprocess ``state["original_query"]`` and record the results on *state*.

    Writes ``cleaned_query``, ``tokens`` and ``metadata`` (the full
    preprocessing result), then returns the same state object. A missing
    ``original_query`` is treated as an empty string.
    """
    analysis = preprocess_kaggle_query(state.get("original_query", ""))
    state.update(
        cleaned_query=analysis["cleaned_query"],
        tokens=analysis.get("tokens", []),
        metadata=analysis,
    )
    return state

def router_node(state: OrchestratorState) -> OrchestratorState:
    """Tag the state with an intent and build the agent's structured query.

    For the demo every query is routed with the fixed intent ``"overview"``.
    ``structured_query`` carries the cleaned query and metadata already on
    the state, defaulting to ``""`` and ``{}`` when absent.
    """
    state["intent"] = "overview"
    state["structured_query"] = dict(
        cleaned_query=state.get("cleaned_query", ""),
        metadata=state.get("metadata", {}),
    )
    return state

def competition_summary_node(state: OrchestratorState) -> OrchestratorState:
    """Run the module-level summary agent and append its result to the state.

    The agent receives ``state["structured_query"]``; its result dict is
    appended to ``state["agent_outputs"]``, which is created when missing.
    """
    # Call the agent first so a failure leaves ``agent_outputs`` untouched.
    agent_result = competition_summary_agent.run(state["structured_query"])
    collected = state.setdefault("agent_outputs", [])
    collected.append(agent_result)
    return state

def aggregation_node(state: OrchestratorState) -> OrchestratorState:
    """Set ``final_response`` from the collected agent outputs.

    For the demo only the first agent's ``response`` is used. When there are
    no agent outputs, a fixed fallback message is stored instead.
    """
    collected = state.get("agent_outputs", [])
    state["final_response"] = (
        collected[0]["response"]
        if collected
        else "No relevant agent responses to aggregate."
    )
    return state

# Build the graph
baseGraph = StateGraph(OrchestratorState)

# Register the nodes in pipeline order.
for _node_name, _node_fn in (
    ("preprocessing", preprocessing_node),
    ("router", router_node),
    ("competition_summary", competition_summary_node),
    ("aggregation", aggregation_node),
):
    baseGraph.add_node(_node_name, _node_fn)

# Wire a single linear path: each stage feeds the next one.
_pipeline = (START, "preprocessing", "router", "competition_summary", "aggregation", END)
for _src, _dst in zip(_pipeline, _pipeline[1:]):
    baseGraph.add_edge(_src, _dst)

compiled_graph = baseGraph.compile(checkpointer=memory)

## 6. CrewAI and AutoGen Orchestrator Integration

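Set up CrewAI and AutoGen orchestrators, define group chats, and implement parallel/concurrent execution logic for multi-agent reasoning. The classes below are minimal stubs: the first delegates to a single agent and the second echoes the query, so group chats and parallel execution are not implemented yet.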

In [None]:
# For demonstration, we show a minimal CrewAI orchestrator setup
class MultiAgentReasoningOrchestrator:
    """Minimal orchestrator stub that delegates to the first registered agent."""

    def __init__(self):
        self.agents = [competition_summary_agent]  # Add more agents as needed

    def run(self, user_query: str, metadata: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """Run the first agent on *user_query* and return its result dict.

        Args:
            user_query: Text passed to the agent as ``cleaned_query``.
            metadata: Optional query metadata, forwarded under ``metadata``
                in the same shape ``router_node`` uses (``{}`` when omitted).

        Returns:
            Whatever the agent's ``run`` returns; for the agent in this
            notebook that is a dict with ``agent_name`` and ``response``.
        """
        # For demo, just run the first agent
        structured_query = {"cleaned_query": user_query, "metadata": metadata or {}}
        return self.agents[0].run(structured_query)

# Minimal AutoGen orchestrator stub
class AutoGenReasoningOrchestrator:
    """Placeholder orchestrator that echoes the query; it makes no AutoGen calls."""

    def __init__(self):
        pass

    def run(self, user_query: str, metadata: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """Return a stub result dict describing what would be processed.

        Args:
            user_query: Text echoed back in the response.
            metadata: Accepted for signature parity with the other
                orchestrator; not used by this stub.

        Returns:
            ``{"agent_name": "AutoGenStub", "response": ...}``.
        """
        # For demo, just echo the query
        return {"agent_name": "AutoGenStub", "response": f"AutoGen would process: {user_query}"}

## 7. Dispatching and Routing Logic

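Implement logic for selecting agents and execution backends (LangGraph, CrewAI, AutoGen) based on query structure, intent, and memory heuristics. The demo dispatcher below ignores its inputs and always selects LangGraph with `CompetitionSummaryAgent`.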

In [None]:
def dispatch_query(structured_query: Dict[str, Any], memory: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """Choose the agents and execution backend for a query.

    The demo implementation ignores both arguments and always returns the
    same decision.

    Args:
        structured_query: Preprocessed query; currently unused.
        memory: Optional conversation memory; currently unused.

    Returns:
        A new dict with ``selected_agents``, ``selected_backend`` and
        ``dispatch_reason``.
    """
    # For demo, always select LangGraph and CompetitionSummaryAgent
    return {
        "selected_agents": ["CompetitionSummaryAgent"],
        "selected_backend": "LangGraph",
        "dispatch_reason": "Demo: always use LangGraph for overview intent"
    }

## 8. Running and Visualizing the Orchestration Graph

Demonstrate how to invoke the compiled LangGraph workflow, visualize the graph, and show intermediate steps for debugging.

In [None]:
# Visualize the graph. The compiled graph exposes its structure through
# `get_graph()`; PNG rendering is optional, so fall back to printing the
# Mermaid source when the image cannot be produced.
graph_view = compiled_graph.get_graph()
try:
    png_bytes = graph_view.draw_mermaid_png()
except Exception as exc:
    print(f"Could not render graph image ({exc}); Mermaid source follows:")
    print(graph_view.draw_mermaid())
else:
    with open("agentic_orchestration_graph.png", "wb") as image_file:
        image_file.write(png_bytes)

# Run a sample query through the workflow
user_query = "Can you summarize the competition timeline and evaluation metric?"
initial_state = {"original_query": user_query}
# The graph was compiled with a checkpointer, so each run must name a thread.
run_config = {"configurable": {"thread_id": "orchestration-demo"}}
# stream_mode="values" yields the state after each step; the last item is
# the final state.
intermediate_steps = list(
    compiled_graph.stream(initial_state, config=run_config, stream_mode="values")
)
final_state = intermediate_steps[-1] if intermediate_steps else {}

print("Final Response:")
print(final_state.get("final_response", "[No response generated]"))

print("\nIntermediate Steps:")
for step in intermediate_steps:
    print(step)

## 9. Testing the Full Agentic System with Example Queries

Run end-to-end tests with example user queries, print the final responses, and optionally display reasoning traces or agent outputs.

In [None]:
test_queries = [
    "What is the main goal of this competition?",
    "How is the leaderboard calculated?",
    "What are the deadlines for submissions?"
]

for i, q in enumerate(test_queries):
    print(f"\n--- Test Query {i+1} ---")
    state = {"original_query": q}
    # The graph was compiled with a checkpointer, so `invoke` needs a
    # thread_id. A distinct thread per query keeps each run's checkpointed
    # state separate; otherwise `agent_outputs` from an earlier query would
    # carry over and aggregation_node would return that first, stale response.
    config = {"configurable": {"thread_id": f"test-query-{i+1}"}}
    result = compiled_graph.invoke(state, config=config)
    print("Response:", result.get("final_response", "[No response generated]"))