17 changes: 14 additions & 3 deletions docs/VALIDATOR.md
@@ -37,6 +37,7 @@ We recommend using Together.AI to run the Validator, as it simplifies setup and
#### Prerequisites:

- **Account on Together.AI**: [Sign up here](https://together.ai/).
- **Account on Hugging Face**: [Sign up here](https://huggingface.co/).
- **API Key**: Obtain from the Together.AI dashboard.
- **Python 3.10**
- **PM2 Process Manager**: For running and managing the Validator process. *OPTIONAL*
@@ -70,6 +71,7 @@ We recommend using Together.AI to run the Validator, as it simplifies setup and
4. **Set Up the `.env` File**
```bash
echo "TOGETHER_API_KEY=your_together_ai_api_key" > .env
echo "HF_TOKEN=your_hugging_face_token" >> .env
```

5. **Select a Model**
@@ -131,6 +133,9 @@ We recommend using Together.AI to run the Validator, as it simplifies setup and
- Ensure your `TOGETHER_API_KEY` is correctly set and sourced:
- Check the `.env` file: `cat .env`
- Verify the API key is loaded: `echo $TOGETHER_API_KEY`
- Ensure your `HF_TOKEN` is correctly set and sourced:
- Check the `.env` file: `cat .env`
- Verify the token is loaded: `echo $HF_TOKEN`
- The `--llm_client.base_url` should be `https://api.together.xyz/v1`.
- Match `--llm_client.model` with the **Model ID** from Together.AI.

@@ -177,7 +182,12 @@ This method involves self-hosting a vLLM server to run the Validator locally. It requ
```
*Adjust the model, port, and host as needed.*

5. **Run the Validator with Self-Hosted LLM**
5. **Set Up the `.env` File**
```bash
echo "HF_TOKEN=your_hugging_face_token" > .env
```

6. **Run the Validator with Self-Hosted LLM**
- **Activate Virtual Environment**:
```bash
. main/bin/activate
@@ -194,7 +204,7 @@ This method involves self-hosting a vLLM server to run the Validator locally. It requ
--logging.debug
```

6. **(Optional) Enable Public Access**
7. **(Optional) Enable Public Access**
```bash
--axon.port "your-public-open-port"
```
@@ -226,11 +236,12 @@ This method involves self-hosting a vLLM server to run the Validator locally. It requ

- **Common Issues**:
- **API Key Not Found**: Ensure `.env` is sourced and `TOGETHER_API_KEY` is set.
- **HF Token Not Found**: Ensure `.env` is sourced and `HF_TOKEN` is set.
- **Model ID Incorrect**: Verify the `--llm_client.model` matches the Together.AI Model ID.
- **Connection Errors**: Check internet connectivity and Together.AI service status.

- **Contact Support**: Reach out to the LogicNet support team for assistance.

---

Happy Validating!
Happy Validating!
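Why the new `HF_TOKEN` matters: the challenger pulls the gated ZebraLogicBench dataset from Hugging Face, so the token has to be visible in the environment when the Validator runs. A minimal sketch of that access pattern, assuming the `.env` file is loaded with `python-dotenv` (already pinned in `requirements.txt`):

```python
import os

from datasets import load_dataset
from dotenv import load_dotenv  # python-dotenv, from requirements.txt

load_dotenv()  # loads HF_TOKEN (and any other keys) from the .env file
hf_token = os.environ.get("HF_TOKEN")

# ZebraLogicBench is a gated dataset, so the token is required; the other
# sources used by the challenger (GSM8K, UltraInteract, MMLU-STEM, SAT Math)
# are public.
ds = load_dataset("allenai/ZebraLogicBench-private", "grid_mode", token=hf_token)
print(f"grid_mode test split has {len(ds['test']['puzzle'])} puzzles")
```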
9 changes: 8 additions & 1 deletion logicnet/utils/config.py
@@ -189,6 +189,13 @@ def add_args(cls, parser):
default="xyz",
)

parser.add_argument(
"--dataset_weight",
type=str,
help="The weight of the dataset",
default="40,10,10,10,10,10,10",
)

else:
parser.add_argument(
"--miner.category",
@@ -250,4 +257,4 @@ def config(cls):
bt.logging.add_args(parser)
bt.axon.add_args(parser)
cls.add_args(parser)
return bt.config(parser)
return bt.config(parser)
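The new `--dataset_weight` flag is a single comma-separated string of seven weights, one per data source, which the challenger splits into floats and passes to `random.choices`. A small sketch of that parsing and sampling logic (the function name is illustrative, not part of the repo):

```python
import random

# Data sources in the same order as the default "40,10,10,10,10,10,10" weights.
RESOURCES = [
    "mathgenerator", "zebralogicbench-grid", "zebralogicbench-mc",
    "ultrainteract", "gsm8k", "mmlustem", "satmath",
]
DEFAULT_WEIGHTS = [40, 10, 10, 10, 10, 10, 10]


def pick_resource(dataset_weight: str) -> str:
    """Parse the comma-separated weight string and sample one source by weight."""
    weights = [float(w) for w in dataset_weight.split(",")]
    if len(weights) != len(RESOURCES):
        # Mirrors the challenger's fallback when the flag is misconfigured.
        weights = DEFAULT_WEIGHTS
    return random.choices(RESOURCES, weights=weights, k=1)[0]


print(pick_resource("40,10,10,10,10,10,10"))  # most often "mathgenerator"
```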
156 changes: 131 additions & 25 deletions logicnet/validator/challenger/challenger.py
@@ -1,53 +1,153 @@
# Challenge for Synthetic Request
import os
import openai
import random
from logicnet.protocol import LogicSynapse
import bittensor as bt
from .human_noise import get_condition
from .math_generator.topics import TOPICS as topics
import mathgenerator
from datasets import load_dataset

DATASET_WEIGHT = [40,10,10,10,10,10,10]

class LogicChallenger:
def __init__(self, base_url: str, api_key: str, model: str):
def __init__(self, base_url: str, api_key: str, model: str, dataset_weight: list):
bt.logging.info(
f"Logic Challenger initialized with model: {model}, base_url: {base_url}"
f"Initializing Logic Challenger with model: {model}, base URL: {base_url}."
)
self.model = model
self.openai_client = openai.OpenAI(base_url=base_url, api_key=api_key)
self.dataset_weight = [float(weight) for weight in dataset_weight.split(',')]

def __call__(self, synapse: LogicSynapse) -> LogicSynapse:
self.get_challenge(synapse)
return synapse

def get_challenge(self, synapse: LogicSynapse):
logic_problem = self.get_atom_math_problem(synapse)
# Get an atom logic problem
atom_logic_question, atom_logic_answer = self.get_atom_logic_problem()
if atom_logic_question is None or atom_logic_answer is None:
bt.logging.error("Unable to retrieve atom logic problem. Retrying...")
atom_logic_question, atom_logic_answer = self.get_atom_logic_problem()

# Revise the problem
conditions: dict = get_condition()
revised_logic_question: str = self.get_revised_math_question(
logic_problem, conditions
revised_logic_question: str = self.get_revised_logic_question(
atom_logic_question, conditions
)

# Set the synapse with the atom problem
synapse.raw_logic_question = atom_logic_question
synapse.ground_truth_answer = str(atom_logic_answer).replace("$", "").strip()
synapse.logic_question = revised_logic_question

def get_atom_math_problem(self, synapse: LogicSynapse) -> str:
selected_topic = random.choice(topics)
subtopic = selected_topic["subtopic"]
topic = selected_topic["topic"]
bt.logging.debug(f"Using {mathgenerator.__name__} to generate math problem")
atom_problem, atom_answer = eval(f"mathgenerator.{topic}.{subtopic}()")
subtopic = subtopic.replace("_", " ").capitalize()
topic = topic.replace("_", " ").capitalize()
atom_problem = atom_problem.replace("$", "").strip()
atom_problem = f"Find the solution of this math problem:\n---\nTopic: {topic}, Subtopic: {subtopic}.\n{atom_problem}\n---\n"
bt.logging.debug(f"Generated atom math problem: {atom_problem}")
synapse.raw_logic_question = atom_problem
def get_atom_logic_problem(self) -> str:
resources = ['mathgenerator', 'zebralogicbench-grid', 'zebralogicbench-mc', 'ultrainteract', 'gsm8k', 'mmlustem', 'satmath']
if len(self.dataset_weight) == 7:
selected_resource = random.choices(resources, weights=self.dataset_weight, k=1)[0]
else:
bt.logging.warning("Invalid dataset weight configuration provided. Using default weights.")
selected_resource = random.choices(resources, weights=DATASET_WEIGHT, k=1)[0]

try:
# Select an atom question and answer from the Mathgenerator
if selected_resource == 'mathgenerator':
selected_topic = random.choice(topics)
subtopic = selected_topic["subtopic"]
topic = selected_topic["topic"]
atom_question, atom_answer = eval(f"mathgenerator.{topic}.{subtopic}()")
if atom_question is None or atom_answer is None:
raise ValueError("Failed to get atom logic problem")
bt.logging.debug("Generating math problem using Mathgenerator.")
subtopic = subtopic.replace("_", " ").capitalize()
topic = topic.replace("_", " ").capitalize()
atom_question = atom_question.replace("$", "").strip()
atom_question = f"Find the solution of this math problem:\n---\nTopic: {topic}, Subtopic: {subtopic}.\n{atom_question}\n---\n"

# Select an atom question and answer from the ZebraLogicBench grid_mode
elif selected_resource == 'zebralogicbench-grid':
ds_grid = load_dataset("allenai/ZebraLogicBench-private", "grid_mode", token=os.environ.get('HF_TOKEN'))
bt.logging.debug("Generating problem using ZebraLogicBench (grid mode).")
data_set_grid = ds_grid['test']
bt.logging.info(f"Loaded ZebraLogicBench (grid_mode) dataset with {len(data_set_grid['puzzle'])} entries")
random_index = random.randint(0, len(data_set_grid['puzzle']) - 1)
puzzle = data_set_grid['puzzle'][random_index]
answer = data_set_grid['solution'][random_index]
atom_question = f"Find the solution of this problem:\n---\n{puzzle}\n---\n"
atom_answer = answer

# Select an atom question and answer from the ZebraLogicBench mc_mode
elif selected_resource == 'zebralogicbench-mc':
ds_mc = load_dataset("allenai/ZebraLogicBench-private", "mc_mode", token=os.environ.get('HF_TOKEN'))
bt.logging.debug("Generating problem using ZebraLogicBench (multiple choice mode).")
data_set_mc = ds_mc['test']
bt.logging.info(f"Loaded ZebraLogicBench (mc_mode) dataset with {len(data_set_mc['puzzle'])} entries")
random_index = random.randint(0, len(data_set_mc['puzzle']) - 1)
puzzle = data_set_mc['puzzle'][random_index]
question = data_set_mc['question'][random_index]
answer = data_set_mc['answer'][random_index]
atom_question = f"Find the solution of this problem:\n---\n{puzzle}\n---\n{question}\n---\n"
atom_answer = answer

synapse.ground_truth_answer = str(atom_answer).replace("$", "").strip()
# Select an atom question and answer from the UltraInteract
elif selected_resource == 'ultrainteract':
ds = load_dataset("openbmb/UltraInteract_sft")
bt.logging.debug("Generating problem using UltraInteract dataset.")
data_set = ds['train']
bt.logging.info(f"Loaded UltraInteract dataset with {len(data_set['instruction'])} entries")
random_index = random.randint(0, len(data_set['instruction']) - 1)
instruction = data_set['instruction'][random_index]
response = data_set['response'][random_index]
atom_question = f"Find the solution of this instruction:\n---\n{instruction}\n---\n"
atom_answer = response

# Select an atom question and answer from the GSM8K
elif selected_resource == 'gsm8k':
ds = load_dataset("openai/gsm8k", "main")
bt.logging.debug("Generating problem using GSM8K dataset.")
data_set = ds['train']
bt.logging.info(f"Loaded GSM8K dataset with {len(data_set['question'])} entries")
random_index = random.randint(0, len(data_set['question']) - 1)
question = data_set['question'][random_index]
answer = data_set['answer'][random_index]
atom_question = f"Find the solution of this question:\n---\n{question}\n---\n"
atom_answer = answer

bt.logging.debug(f"Generated atom math answer: {atom_answer}")
# Select an atom question and answer from the MMLU-STEM
elif selected_resource == 'mmlustem':
ds = load_dataset("TIGER-Lab/MMLU-STEM")
bt.logging.debug("Generating problem using MMLU-STEM dataset.")
data_set = ds['test']
bt.logging.info(f"Loaded MMLU-STEM dataset with {len(data_set['question'])} entries")
random_index = random.randint(0, len(data_set['question']) - 1)
question = data_set['question'][random_index]
answer_id = data_set['answer'][random_index]
answer_choice = data_set['choices'][random_index]
atom_question = f"Find the solution of this question:\n---\n{question}\n---\n"
atom_answer = answer_choice[answer_id]

return atom_problem
# Select an atom question and answer from the SAT Math
elif selected_resource == 'satmath':
ds = load_dataset("mcaleste/sat_multiple_choice_math_may_23")
bt.logging.debug("Generating problem using SAT Math dataset.")
data_set = ds['train']
bt.logging.info(f"Loaded SAT Math dataset with {len(data_set['Question'])} entries")
random_index = random.randint(0, len(data_set['Question']) - 1)
question = data_set['Question'][random_index]
possible_answers = data_set['Possible Answers'][random_index]
answer_id = data_set['Answer'][random_index]
atom_question = f"Find the solution of this question:\n---\n{question}\n---\n"
atom_answer = self.get_answer_value(possible_answers, answer_id)

def get_revised_math_question(self, math_problem: str, conditions: dict) -> str:
except Exception as e:
bt.logging.error(f"Error accessing dataset {selected_resource}: {e}. Attempting to load an alternative dataset.")
# Retry with a different dataset
return self.get_atom_logic_problem()

return atom_question, atom_answer

def get_revised_logic_question(self, logic_question: str, conditions: dict) -> str:
# prompt = "Please paraphrase by adding word or expression to this question as if you were a {profile} who is {mood} and write in a {tone} tone. You can use incorrect grammar, typo or add more context! Don't add your solution! Just say the revised version, you don't need to be polite.".format(
# **conditions
# )
@@ -58,7 +158,6 @@ def get_revised_math_question(self, math_problem: str, conditions: dict) -> str:
"Do not include the solution or add unnecessary context."
).format(**conditions)

bt.logging.debug(f"Revising prompt: {prompt}")

# messages = [
# {
@@ -81,7 +180,7 @@ def get_revised_math_question(self, math_problem: str, conditions: dict) -> str:
"ensuring the question sounds natural and appropriate for that individual."
),
},
{"role": "assistant", "content": math_problem},
{"role": "assistant", "content": logic_question},
{"role": "user", "content": prompt},
]

@@ -93,6 +192,13 @@ def get_revised_math_question(self, math_problem: str, conditions: dict) -> str:
)

response = response.choices[0].message.content.strip()
bt.logging.debug(f"Generated revised math question: {response}")
return response



def get_answer_value(self, possible_answers, answer):
# Get the value of the answer from the possible answers
options = possible_answers.split()
for i, option in enumerate(options):
if option.startswith(answer + ")"):
return options[i + 1]
return None # Return None if the answer is not found
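A note on `get_answer_value`: it assumes the SAT Math `Possible Answers` field is whitespace-separated, e.g. `A) 5 B) 7 C) 9 D) 11`, so the value it returns is the single token that follows the matching letter (a multi-word answer would be truncated to its first token). A usage sketch with made-up strings, not actual dataset rows:

```python
def get_answer_value(possible_answers: str, answer: str):
    """Return the token following 'X)', or None if the letter is not present."""
    options = possible_answers.split()
    for i, option in enumerate(options):
        if option.startswith(answer + ")"):
            return options[i + 1]
    return None


assert get_answer_value("A) 5 B) 7 C) 9 D) 11", "C") == "9"
assert get_answer_value("A) 5 B) 7 C) 9 D) 11", "E") is None
```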
3 changes: 1 addition & 2 deletions logicnet/validator/miner_manager.py
@@ -128,7 +128,6 @@ def get_miner_uids(self, category: str):
"""
Get miner uids based on category, useful if subnet has multiple categories
"""
print(self.all_uids_info)
available_uids = [
int(uid)
for uid in self.all_uids_info.keys()
@@ -181,4 +180,4 @@ def get_model_specific_weights(self, category, normalize=True):
# Normalizing the tensor
if tensor_sum > 0:
model_specific_weights = model_specific_weights / tensor_sum
return model_specific_weights
return model_specific_weights
16 changes: 7 additions & 9 deletions neurons/validator/validator.py
@@ -23,6 +23,7 @@ def init_category(config=None):
config.llm_client.base_url,
config.llm_client.key,
config.llm_client.model,
config.dataset_weight,
),
"rewarder": LogicRewarder(
config.llm_client.base_url,
@@ -126,11 +127,6 @@ def forward(self):
# Update scores on chain
self.update_scores_on_chain()
self.save_state()
bt.logging.info(
"\033[1;32m✅ Loop completed, uids info:\n"
+ str(self.miner_manager.all_uids_info).replace("},", "},\n")
+ "\033[0m"
)
self.store_miner_infomation()

actual_time_taken = time.time() - loop_start
@@ -162,6 +158,7 @@ def async_query_and_reward(
continue
base_synapse = synapse.copy()
synapse = synapse.miner_synapse()
bt.logging.info(f"\033[1;34m🧠 Synapse to be sent to miners: {synapse}\033[0m")
axons = [self.metagraph.axons[int(uid)] for uid in uids]
bt.logging.debug(f"\033[1;34m🧠 Axon: {axons}\033[0m")
responses = dendrite.query(
@@ -170,9 +167,10 @@
deserialize=False,
timeout=self.categories[category]["timeout"],
)
bt.logging.debug(
f"\033[1;34m🧠 Miner response: {responses[0].logic_answer}\033[0m"
)
# for response, uid in zip(responses, uids):
# bt.logging.debug(
# f"\033[1;34m🧠 Miner response for {uid}: {response.logic_answer}\033[0m"
# )
reward_responses = [
response
for response, should_reward in zip(responses, should_rewards)
@@ -391,4 +389,4 @@ def convert_to_serializable(data):
with Validator() as validator:
while True:
bt.logging.info("\033[1;32m🟢 Validator running...\033[0m", time.time())
time.sleep(360)
time.sleep(360)
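With `config.dataset_weight` threaded through `init_category`, the challenger is now constructed with four arguments. A rough wiring sketch with placeholder values (the module path is inferred from the file layout; the credentials and model ID are stand-ins, and note that despite the `dataset_weight: list` annotation the constructor expects the comma-separated string, since it calls `.split(',')`):

```python
from logicnet.validator.challenger.challenger import LogicChallenger

challenger = LogicChallenger(
    base_url="https://api.together.xyz/v1",   # --llm_client.base_url
    api_key="your_together_ai_api_key",       # --llm_client.key
    model="your_together_model_id",           # --llm_client.model
    dataset_weight="40,10,10,10,10,10,10",    # --dataset_weight (string, not list)
)
```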
1 change: 1 addition & 0 deletions requirements.txt
@@ -11,4 +11,5 @@ sentence-transformers==3.0.1
python-dotenv==1.0.1
sympy
wandb
datasets
git+https://github.com/lukew3/mathgenerator.git