<a href="https://colab.research.google.com/github/Harshal292004/KGPDSH/blob/master/stormintegration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
from langchain.prompts.chat import ChatPromptTemplate
from langchain_groq import ChatGroq
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pydantic import BaseModel, Field, validator, PrivateAttr
from typing import List, Dict
from google.colab import userdata
from langchain.agents import initialize_agent, AgentType
from langchain.chains import LLMChain
import re, json
import uuid

class ScoreDetails(BaseModel):
    score: float = Field(description="Score of the aspect (0-10)")
    justification: str = Field(description="Explanation for the score")

    @validator('score')
    def validate_score(cls, v: float) -> float:
        if v < 0 or v > 10:
            raise ValueError('Score must be between 0 and 10')
        return round(v, 1)

# Evaluation Categories for Publishability Assessment
class MethodologyEvaluation(BaseModel):
    research_design: ScoreDetails = Field(description="Appropriateness and quality of research design")
    reproducibility: ScoreDetails = Field(description="Clarity and completeness of methods for reproduction")

class CoherenceEvaluation(BaseModel):
    logical_flow: ScoreDetails = Field(description="Logical flow and structure of the paper")
    clarity: ScoreDetails = Field(description="Clarity and effectiveness of writing")

class ValidityEvaluation(BaseModel):
    evidence_strength: ScoreDetails = Field(description="Strength and relevance of evidence provided")
    validation: ScoreDetails = Field(description="Validation of claims and findings")

class PublishabilityEvaluation(BaseModel):
    methodology: MethodologyEvaluation = Field(description="Detailed methodology assessment")
    coherence: CoherenceEvaluation = Field(description="Coherence and clarity assessment")
    validity: ValidityEvaluation = Field(description="Validity of claims and evidence")

class PaperEvaluation(BaseModel):
    paper_id: str = Field(description="Unique identifier for the paper")
    publishable: bool = Field(description="Whether the paper is deemed publishable")
    scores: PublishabilityEvaluation = Field(description="Scores for publishability evaluation")
    justification: str = Field(description="Overall justification for publishability decision")
    summary_of_paper: str = Field(description="Summary of the paper for retrieval and further analysis")

# Implementing STORM Architecture for Conference Selection
class ConferenceAgent:
    def __init__(self, name: str, chat_model: ChatGroq, target_conference: str , conference_themes: str, conference_context:str):
        self.name = name
        self.chat_model = chat_model
        self.target_conference = target_conference
        self.conference_themes = conference_themes
        self.conference_context = conference_context

    def evaluate_paper(self, paper_summary: str ) -> Dict:
        prompt = ChatPromptTemplate.from_messages([
            ('system',  f"""
            You are a representative for the conference {self.target_conference}. Strictly Focus Specially on the conference themes: {self.conference_themes}
            Use the following additional context to guide your evaluation:
            {self.conference_context}

            Evaluate the research paper based on:
            - Relevance to conference themes
            - Quality of methodology
            - Novelty of contribution
            Provide a score (0-10) and a justification for why the paper fits this conference.
            Respond is json format with the following keys
            - score: float
            - justification: str
            """),
            ('user', "Paper summary: {paper_summary}")
        ])
        chain = LLMChain(llm = self.chat_model, prompt=prompt)
        response = chain.run({"paper_summary": paper_summary})
        try:
            # Preprocess the response to clean up invalid characters
            cleaned_response = re.sub(r'[\x00-\x1F\x7F]', '', response)  # Remove control characters
            json_match = re.search(r'\{.*\}', cleaned_response, re.DOTALL)
            if json_match:
                json_response = json_match.group(0)
                response_dict = json.loads(json_response)
                return {
                    "score": float(response_dict["score"]),
                    "justification": response_dict["justification"],
                }
            else:
                raise ValueError(f"No JSON object found in the response: {response}")
        except json.JSONDecodeError as e:
            raise ValueError(f"Failed to parse response as JSON: {response}") from e

class STORMSystem:
    def __init__(self, agents: List[ConferenceAgent]):
        self.agents = agents

    def discuss_and_decide(self, paper_summary: str) -> Dict:
        evaluations = []
        for agent in self.agents:
            evaluation = agent.evaluate_paper(paper_summary)
            evaluations.append({
                "conference": agent.target_conference,
                "score": evaluation["score"],
                "justification": evaluation["justification"],
            })

        # Determine the best conference based on scores
        best_conference = max(evaluations, key=lambda x: x["score"])
        return {
            "best_conference": best_conference["conference"],
            "justification": best_conference["justification"],
        }

# Integrating Publishability Assessment
import re
import uuid
import json

class PublishabilityEvaluator:
    def __init__(self, chat_model: ChatGroq):
        self.chat_model = chat_model

    def evaluate_publishability(self, paper_text: str) -> dict:
            prompt = ChatPromptTemplate.from_messages([
                ("system", """
                You are an expert evaluator tasked with determining if a research paper is publishable.
                Evaluate the following criteria and provide scores (0-10) with justifications:
                - Methodology: Research design and reproducibility
                - Coherence: Logical flow and clarity
                - Validity: Evidence strength and validation of claims

                Based on the scores, decide if the paper is publishable and provide an overall justification.
                """),
                ("user", "Paper Text: {paper_text}")
            ])

            chain = LLMChain(llm=self.chat_model, prompt=prompt)
            response = chain.run({"paper_text": paper_text})
            cleaned_response = re.sub(r'[\x00-\x1F\x7F]', '', response)  # Remove control characters

            # Extract the scores and justifications using regex
            methodology_match = re.search(r"1\. Methodology.*?Score: (\d+)/10\s+(.*?)2\. Coherence", cleaned_response, re.DOTALL)
            coherence_match = re.search(r"2\. Coherence.*?Score: (\d+)/10\s+(.*?)3\. Validity", cleaned_response, re.DOTALL)
            validity_match = re.search(r"3\. Validity.*?Score: (\d+)/10\s+(.*?)Overall Score", cleaned_response, re.DOTALL)
            overall_match = re.search(r"Overall Score: ([\d.]+)/10.*?Publishability:\s*(.*?)\n", cleaned_response, re.DOTALL)

            if not (methodology_match and coherence_match and validity_match and overall_match):
                raise ValueError(f"Missing components in the response: {cleaned_response}")

            return {
                "methodology": {
                    "score": int(methodology_match.group(1)),
                    "justification": methodology_match.group(2).strip()
                },
                "coherence": {
                    "score": int(coherence_match.group(1)),
                    "justification": coherence_match.group(2).strip()
                },
                "validity": {
                    "score": int(validity_match.group(1)),
                    "justification": validity_match.group(2).strip()
                },
                "overall": {
                    "score": float(overall_match.group(1)),
                    "publishable": overall_match.group(2).strip()
                }
            }

# Loading and Chunking the Paper
def load_and_chunk_paper(file_path: str, chunk_size: int = 500, overlap: int = 50) -> str:
    """Load a research paper and split it into manageable chunks."""
    loader = PyPDFLoader(file_path)
    pages = loader.load()
    full_text = " ".join([page.page_content for page in pages])
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
    chunks = splitter.split_text(full_text)
    return " ".join(chunks)  # Combine chunks for processing

if __name__ == "__main__":
    chat_model = ChatGroq(groq_api_key=userdata.get("GROQ_API_KEY").strip(), model="llama3-8b-8192")

    # Load and chunk the research paper
    paper_path = "/content/drive/MyDrive/KDSH_2025_Dataset/Reference/Publishable/CVPR/R006.pdf"
    paper_text = load_and_chunk_paper(paper_path)

    # Publishability Evaluation
    publishability_evaluator = PublishabilityEvaluator(chat_model=chat_model)
    publishability_result = publishability_evaluator.evaluate_publishability(paper_text)

    if publishability_result.publishable:
        # Conference Selection
        agents = [
            ConferenceAgent(name="Agent CVPR", chat_model=chat_model, target_conference="CVPR", conference_themes = """
                        CVPR focuses on the field of computer vision, image processing, and pattern recognition. Key themes include:
                          Object detection and recognition.
                          Image segmentation and scene understanding.
                          Visual tracking and motion analysis.
                          3D vision, stereo vision, and depth estimation.
                          Deep learning for vision tasks (CNNs, Vision Transformers, etc.).
                          Applications in medical imaging, autonomous vehicles, and robotics.
                          Video processing and understanding.
                          Computational photography and imaging.
                          Low-level vision (denoising, super-resolution).
                          Vision-based augmented reality and virtual reality.
                        """,
                        conference_context = """
                        Examples of accepted papers:
                          - YOLO: Real-Time Object Detection
                              Summary: Proposes an efficient deep learning model for real-time object detection with state-of-the-art accuracy.
                          - SegNet: A Deep Convolutional Encoder-Decoder Architecture for Image Segmentation
                              Summary: Focuses on pixel-wise image segmentation using deep neural networks, with applications in autonomous systems.
                          - 3D Object Detection and Localization Using RGB and Depth Data
                              Summary: Combines RGB and depth information for accurate 3D object detection in indoor environments.
                        Evaluation Criteria:
                         - Relevance to core computer vision topics.
                         - Advances in methodology or application.
                         - Practical implications for industries like healthcare, robotics, and transportation.
                        """),
            ConferenceAgent(name="Agent NeurIPS", chat_model=chat_model, target_conference="NeurIPS"  ,conference_themes ="""
                        NeurIPS emphasizes machine learning, computational neuroscience, and AI-related topics. Key themes include:
                            Deep learning (e.g., architectures, generative models, optimization for deep networks, foundation models, LLMs)
                            Reinforcement learning and decision-making.
                            Probabilistic models and Bayesian learning.
                            Optimization techniques for machine learning.
                            Representation learning and embeddings.
                            AI for healthcare, climate science, and sustainability.
                            Neuroscience-inspired algorithms and theories.
                            Ethical AI, fairness, and explainability.
                        """,
                        conference_context = """
                        Examples of accepted papers:
                          - Attention Is All You Need
                             Summary: Introduces the transformer architecture, which revolutionized NLP and other domains by using self-attention mechanisms.
                          - Generative Adversarial Networks (GANs)
                             Summary: Proposes a novel framework for training generative models using adversarial networks.
                          - Adam: A Method for Stochastic Optimization
                             Summary: Proposes a new optimization algorithm that balances convergence speed and stability in deep learning.
                        Evaluation Criteria:
                          - Novelty and theoretical contributions.
                          - Experimental rigor and reproducibility.
                          - Broader implications for machine learning and interdisciplinary applications.
                        """),
            ConferenceAgent(name="Agent EMNLP", chat_model=chat_model, target_conference="EMNLP" ,conference_themes = """
                        EMNLP specializes in NLP and computational linguistics. Key themes include:
                          Machine translation and cross-lingual NLP.
                          Large language models and foundational models.
                          Sentiment analysis and opinion mining.
                          Dialogue systems and conversational AI.
                          Question answering and information retrieval.
                          Text summarization and abstraction.
                          Morphology, syntax, and semantics.
                          Multimodal NLP (text + image/audio fusion).
                          Ethical concerns in NLP (bias, toxicity detection).
                          Low-resource and multilingual NLP.
                        """,
                        conference_context = """
                        Examples of accepted papers:

Paper Title: Pre-trained Language Models for Text-to-Text Generation
Summary: This paper explores pre-trained models (e.g., T5) for a variety of text-to-text NLP tasks, showcasing improvements across summarization, question answering, and translation.

Paper Title: Adversarial Training for Robust Text Classification
Summary: Proposes an adversarial training framework that enhances the robustness of text classification models against noisy and adversarial inputs.

Paper Title: Knowledge-Enhanced Contextual Representations for Entity Linking
Summary: Combines external knowledge sources with contextual embeddings to improve entity linking in complex domains.

Paper Title: Multimodal Sentiment Analysis with Attention Mechanisms
Summary: Introduces a framework for combining visual and textual inputs to improve sentiment detection in videos.

Evaluation Criteria:
Relevance to NLP:

Papers should directly address core NLP tasks, methodologies, or applications.
Emphasis on empirical evaluation and innovation in language processing methods.
Quality of Methodology:

Rigorous experiments, proper baselines, and thorough ablation studies.
Use of diverse and large-scale datasets to validate results.
Novelty of Contribution:

New architectures, algorithms, or findings that advance the field.
Extensions of existing methods to novel tasks or domains.
Broader Impacts:

Ethical implications of the research (e.g., bias in models, data privacy).
Potential for cross-disciplinary applications (e.g., in healthcare, education, or social media).
Themes Breakdown with Examples:
Natural Language Understanding:

Semantic role labeling, coreference resolution, and discourse parsing.
Example: A New Framework for Semantic Parsing Using Pre-Trained Transformers.
Language Generation:

Dialogue systems, machine translation, and creative text generation.
Example: Controlled Text Generation with Discrete and Continuous Latent Variables.
Information Extraction:

Named entity recognition, relation extraction, and knowledge graph construction.
Example: Joint Entity and Relation Extraction with Transformer-Based Models.
Multimodal NLP:

Integrating vision, audio, and textual modalities.
Example: Aligning Text and Vision for Multimodal Machine Translation.
Social and Ethical Considerations:

Bias, fairness, interpretability, and environmental impacts of NLP models.
Example: Bias Mitigation in Pre-trained Models: A Data Augmentation Approach.

                        """),
            ConferenceAgent(name="Agent KDD", chat_model=chat_model, target_conference="KDD"  ,conference_themes = """
                        KDD centers on data mining, big data, and applied AI. Key themes include:
                            Scalable data mining algorithms.
                            Graph data and network analysis.
                            Temporal and sequential data mining.
                            Anomaly detection and predictive modeling.
                            Recommender systems and personalization.
                            Causal inference and counterfactual reasoning.
                            Applications in finance, e-commerce, and marketing.
                            Data visualization and interpretability.
                            Data ethics and privacy-preserving techniques.
                            AI and data-driven solutions for social good.
                        """,

                        conference_context="""
                        Examples of accepted papers:

Paper Title: Scalable Graph Neural Networks for Large-Scale Social Network Analysis
Summary: Introduces a scalable GNN framework that handles billion-scale graphs with high efficiency, showcasing applications in social network analysis.

Paper Title: Causal Discovery in High-Dimensional Data Using Deep Learning
Summary: Proposes a deep learning-based approach for identifying causal relationships in high-dimensional data.

Paper Title: Fair Representation Learning with Adversarial Networks
Summary: Develops a framework for learning fair data representations while minimizing demographic bias in downstream tasks.

Paper Title: Adaptive Online Learning for Real-Time Recommender Systems
Summary: Presents an adaptive online learning algorithm that updates recommender system models in real time based on user interactions.

Evaluation Criteria:
Relevance to Knowledge Discovery and Data Mining:

Papers must address core challenges in data mining, machine learning, or related applications.
Emphasis on practical applications and scalability to real-world data.
Methodological Rigor:

Strong theoretical foundations or innovative empirical methodologies.
Comprehensive experiments with real-world datasets and benchmarks.
Scalability and Efficiency:

Solutions must handle large-scale data effectively, both in computation and memory.
Novelty and Impact:

New techniques, algorithms, or applications that significantly advance the state-of-the-art.
Real-world relevance and potential societal or economic impact.
Broader Implications:

Ethical considerations, such as fairness, transparency, and responsible use of data.
Long-term applicability across industries and domains.
                        """),
        ]

        storm_system = STORMSystem(agents=agents)
        decision = storm_system.discuss_and_decide(paper_text)

        print(f"Paper ID: {publishability_result.paper_id}")
        print(f"Overall Publishability: {publishability_result.publishable}")
        print(f"Best Conference: {decision['best_conference']}")
        print(f"Conference Justification: {decision['justification']}")
    else:
        print(f"Paper ID: {publishability_result.paper_id}")
        print("The paper is not deemed publishable.")


<ipython-input-5-0cc3953f7e89>:17: PydanticDeprecatedSince20: Pydantic V1 style `@validator` validators are deprecated. You should migrate to Pydantic V2 style `@field_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  @validator('score')


ValueError: Missing some input keys: {'0, 1, . . . , N− 1', '1, 2, . . . , T', '0, 1, . . . , T− 1', '1, 2, . . . , N'}

In [None]:
pip install langchain_groq langgraph langchain_community pypdf


Collecting langchain_groq
  Downloading langchain_groq-0.2.2-py3-none-any.whl.metadata (3.0 kB)
Collecting langgraph
  Downloading langgraph-0.2.61-py3-none-any.whl.metadata (15 kB)
Collecting langchain_community
  Downloading langchain_community-0.3.14-py3-none-any.whl.metadata (2.9 kB)
Collecting pypdf
  Downloading pypdf-5.1.0-py3-none-any.whl.metadata (7.2 kB)
Collecting groq<1,>=0.4.1 (from langchain_groq)
  Downloading groq-0.13.1-py3-none-any.whl.metadata (14 kB)
Collecting langchain-core<0.4.0,>=0.3.27 (from langchain_groq)
  Downloading langchain_core-0.3.29-py3-none-any.whl.metadata (6.3 kB)
Collecting langgraph-checkpoint<3.0.0,>=2.0.4 (from langgraph)
  Downloading langgraph_checkpoint-2.0.9-py3-none-any.whl.metadata (4.6 kB)
Collecting langgraph-sdk<0.2.0,>=0.1.42 (from langgraph)
  Downloading langgraph_sdk-0.1.48-py3-none-any.whl.metadata (1.8 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.meta

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


[31mERROR: Could not find a version that satisfies the requirement python==3.12 (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for python==3.12[0m[31m
[0m