In [2]:
!pip install transformers
!pip install datasets
!pip install sentence_transformers
!pip install neo4j
import torch
from PIL import Image
from datasets import load_dataset
from transformers import AutoProcessor, LlavaForConditionalGeneration
from neo4j import GraphDatabase
from sentence_transformers import SentenceTransformer
import numpy as np
import json
from sklearn.model_selection import train_test_split


class MathVisionNeo4jRAG:
    """Neo4j-backed retrieval helper for math problems.

    Problems are stored as ``(:Problem)`` nodes carrying the question text,
    the solution text and a sentence embedding of each; keyword-detected
    concepts are stored as ``(:Concept)`` nodes linked through
    ``(:Problem)-[:HAS_CONCEPT]->(:Concept)``.  Retrieval runs a vector
    similarity query over the question embeddings.

    The instance owns a Neo4j driver; call :meth:`close` (or use the
    instance as a context manager) to release it.
    """

    # Name of the Neo4j vector index built over ``Problem.question_embedding``.
    VECTOR_INDEX_NAME = "problem_question_embedding"

    # Keywords looked up (case-insensitively) in solution texts.
    MATH_CONCEPTS = (
        'algebra', 'geometry', 'trigonometry',
        'calculus', 'linear algebra', 'statistics',
    )

    def __init__(self,
                 neo4j_uri: str = "bolt://localhost:7687",
                 neo4j_user: str = "neo4j",
                 neo4j_password: str = "password",
                 model_path: str = "llava-v1.5-7b"):
        """
        Initialize Neo4j-based Graph RAG for Math Vision

        Args:
            neo4j_uri: Bolt URI of the Neo4j server.
            neo4j_user: Neo4j user name.
            neo4j_password: Neo4j password.
            model_path: Accepted for backward compatibility only; it is not
                used. The LLaVA checkpoint is always ``self.model_id``.
        """
        # Neo4j connection
        self.driver = GraphDatabase.driver(neo4j_uri, auth=(neo4j_user, neo4j_password))

        # LLaVA model
        self.model_id = "llava-hf/llava-1.5-7b-hf"

        self.model = LlavaForConditionalGeneration.from_pretrained(
            self.model_id,
            torch_dtype=torch.float16,
            device_map='auto'
        )
        self.processor = AutoProcessor.from_pretrained(self.model_id)

        # Embedding model
        self.text_encoder = SentenceTransformer('all-MiniLM-L6-v2')

    def close(self) -> None:
        """Close the underlying Neo4j driver."""
        self.driver.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        self.close()

    def _ensure_vector_index(self, session, dimensions: int) -> None:
        """Create the question-embedding vector index if it is missing.

        Args:
            session: An open Neo4j session.
            dimensions: Length of the embedding vectors stored on the nodes.
        """
        # The dimension is interpolated as a validated int (index DDL options
        # are written as a literal map here); nothing user-supplied is inlined.
        session.run(f"""
            CREATE VECTOR INDEX {self.VECTOR_INDEX_NAME} IF NOT EXISTS
            FOR (p:Problem) ON (p.question_embedding)
            OPTIONS {{indexConfig: {{
                `vector.dimensions`: {int(dimensions)},
                `vector.similarity_function`: 'cosine'
            }}}}
        """)

    def create_math_problem_graph(self, dataset):
        """
        Populate Neo4j graph with math problems

        Each item becomes a ``Problem`` node whose id is the item's position
        in ``dataset`` (as a string), linked to one ``Concept`` node per
        keyword found in its solution.

        Args:
            dataset: Iterable of mappings with ``'question'`` and
                ``'solution'`` keys (e.g. a Hugging Face dataset split).
                A missing/null solution is treated as an empty string.
        """
        with self.driver.session() as session:
            # Create constraints and indexes
            session.run("""
                CREATE CONSTRAINT unique_problem IF NOT EXISTS
                FOR (p:Problem) REQUIRE p.id IS UNIQUE
            """)

            index_ready = False

            # Insert problems and concepts
            for idx, problem in enumerate(dataset):
                question_text = problem['question'] or ''
                solution_text = problem['solution'] or ''
                question_embedding = self.text_encoder.encode(question_text).tolist()
                solution_embedding = self.text_encoder.encode(solution_text).tolist()

                if not index_ready:
                    # The vector index queried by retrieve_similar_problems
                    # must exist; its dimension is taken from the data itself.
                    self._ensure_vector_index(session, len(question_embedding))
                    index_ready = True

                # Extract concepts (simplified)
                concepts = self._extract_concepts(solution_text)

                # One statement per problem.  The Problem node is MERGEd on
                # its id alone and the relationship is MERGEd between the two
                # already-bound nodes; merging the whole
                # (p {id})-[:HAS_CONCEPT]->(c) pattern with p unbound would
                # try to create a second Problem node with the same id.
                session.run("""
                    MERGE (p:Problem {id: $problem_id})
                    SET p.question = $question,
                        p.solution = $solution,
                        p.question_embedding = $question_embedding,
                        p.solution_embedding = $solution_embedding
                    WITH p
                    UNWIND $concepts AS concept
                    MERGE (c:Concept {name: concept})
                    MERGE (p)-[:HAS_CONCEPT]->(c)
                """, {
                    'problem_id': str(idx),
                    'question': question_text,
                    'solution': solution_text,
                    'question_embedding': question_embedding,
                    'solution_embedding': solution_embedding,
                    'concepts': concepts,
                })

            if index_ready:
                # Index population is asynchronous; wait so that queries
                # issued right after this call see the inserted nodes.
                session.run("CALL db.awaitIndexes()")

    def evaluate_on_test_set(self, test_dataset):
        """
        Evaluate the RAG system on test dataset

        NOTE: this is a placeholder metric. A problem counts as "correct"
        as soon as retrieval returns at least one neighbour; no solution
        comparison is performed.

        Args:
            test_dataset: Sized iterable of mappings with a ``'question'`` key.

        Returns:
            Dict with ``'accuracy'`` (fraction of problems with a non-empty
            retrieval result; ``0.0`` for an empty dataset) and
            ``'total_problems'``.
        """
        total_problems = len(test_dataset)
        correct_solutions = sum(
            1 for problem in test_dataset
            if self.retrieve_similar_problems(problem['question'])
        )

        # Guard against ZeroDivisionError on an empty split.
        accuracy = correct_solutions / total_problems if total_problems else 0.0
        return {
            'accuracy': accuracy,
            'total_problems': total_problems
        }

    def retrieve_similar_problems(self, query_text: str, k: int = 3):
        """
        Retrieve similar problems using vector similarity and graph structure

        Args:
            query_text: Question text to search for.
            k: Maximum number of neighbours to return; values below 1 yield
                an empty list.

        Returns:
            List of dicts with keys ``problem_id``, ``question``,
            ``solution``, ``concepts`` (possibly empty) and ``score``,
            ordered by descending score.
        """
        if k < 1:
            return []

        query_embedding = self.text_encoder.encode(query_text).tolist()

        with self.driver.session() as session:
            # OPTIONAL MATCH keeps neighbours that have no Concept attached;
            # COLLECT skips the resulting nulls, giving an empty list.
            results = session.run("""
                CALL db.index.vector.queryNodes($index_name, $k, $embedding)
                YIELD node, score
                OPTIONAL MATCH (node)-[:HAS_CONCEPT]->(c:Concept)
                RETURN node.id AS problem_id,
                       node.question AS question,
                       node.solution AS solution,
                       COLLECT(c.name) AS concepts,
                       score
                ORDER BY score DESC
                LIMIT $k
            """, {
                'index_name': self.VECTOR_INDEX_NAME,
                'k': k,
                'embedding': query_embedding,
            })

            # Materialise inside the session: records are not readable
            # once the session is closed.
            fields = ('problem_id', 'question', 'solution', 'concepts', 'score')
            return [{field: record[field] for field in fields} for record in results]

    def _extract_concepts(self, solution: str):
        """Extract mathematical concepts from solution

        Returns the entries of ``MATH_CONCEPTS`` that occur as a
        case-insensitive substring of ``solution``; an empty or ``None``
        solution yields an empty list.
        """
        if not solution:
            return []
        lowered = solution.lower()
        return [concept for concept in self.MATH_CONCEPTS if concept in lowered]

# Kept as top-level statements so that `ds`, `math_solver` and `results`
# stay available to later notebook cells.

# Load dataset (the hub repo provides 'test' and 'testmini' splits)
ds = load_dataset("MathLLMs/MathVision")

# Initialize solver
math_solver = MathVisionNeo4jRAG()

# Hold the evaluation problems out of the retrieval index.
# NOTE(review): 'testmini' looks like a sample of 'test' -- confirm against
# the dataset card. If it is, indexing all of 'test' lets every evaluation
# query retrieve itself. Filtering on 'id' is a no-op if the splits are
# disjoint. `input_columns` keeps the filter from touching the image columns.
eval_split = ds['testmini']
held_out_ids = set(eval_split['id'])
index_split = ds['test'].filter(
    lambda problem_id: problem_id not in held_out_ids,
    input_columns='id',
)

# Build graph from the non-held-out problems
math_solver.create_math_problem_graph(index_split)

# Evaluate on the held-out split
results = math_solver.evaluate_on_test_set(eval_split)

print("Evaluation Results:", results)



Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 480.6/480.6 kB 26.8 MB/s eta 0:00:00
Downloading dill-0.3.8-py3-none-any.whl (116 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 116.3/116.3 kB 10.4 MB/s eta 0:00:00
Downloading fsspec-2024.9.0-py3-none-any.whl

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/5.93k [00:00<?, ?B/s]

(…)-00000-of-00001-3532b8d3f1b4047a.parquet:   0%|          | 0.00/57.0M [00:00<?, ?B/s]

(…)-00000-of-00001-f8ff70fcb2f29b1d.parquet:   0%|          | 0.00/6.99M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/3040 [00:00<?, ? examples/s]

Generating testmini split:   0%|          | 0/304 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'question', 'options', 'image', 'decoded_image', 'answer', 'solution', 'level', 'subject'],
    num_rows: 3040
})


OSError: llava-v1.5-7b is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`