In [None]:
import requests
import os

# Read credentials from the environment so secrets are never saved inside the notebook.
TOGETHER_API_KEY = os.environ.get("TOGETHER_API_KEY")  # Together API key (sent as a bearer token)
HF_TOKEN = os.environ.get("HF_TOKEN")  # Hugging Face token (sent to the API as `hf_token`)

# Fail fast with a clear message instead of sending "Bearer None" and getting an opaque auth error.
if not TOGETHER_API_KEY:
    raise RuntimeError("TOGETHER_API_KEY is not set; export it before running this cell.")

headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {TOGETHER_API_KEY}"
}

# Describes the adapter to upload: target model name, Hugging Face source URL, type and description.
data = {
    "model_name": "test-lora-model-creation-8b",
    "model_source": "https://huggingface.co/mahmad1882/llama3-8b-instruct-verification-lora",
    "model_type": "adapter",
    "description": "test_lora_8B",
    "hf_token": HF_TOKEN
}

# A timeout keeps the cell from hanging indefinitely if the API is unreachable.
response = requests.post(
    "https://api.together.xyz/v0/models",
    headers=headers,
    json=data,
    timeout=60,
)

print("Status:", response.status_code)
try:
    print("Response:", response.json())
except ValueError:
    # Error responses are not guaranteed to be JSON; show the raw body instead of crashing.
    print("Response:", response.text)


Status: 200
Response: {'data': {'job_id': 'job-aaf23828-0950-4931-92cb-10fedc618c3d', 'model_name': 'muhammadahmad1/test-lora-model-creation-8b', 'model_id': 'endpoint-dc405e1c-9ee8-464a-9436-972e0c59d0c0', 'model_source': 'huggingface'}, 'message': 'Processing model weights. Job created.'}


In [18]:
import json
# Job identifier to poll; this value matches the `job_id` printed by the upload cell's response.
job_id = 'job-aaf23828-0950-4931-92cb-10fedc618c3d'

# Setup headers
headers = {
    "Authorization": f"Bearer {TOGETHER_API_KEY}"
}

# Send GET request (with a timeout so an unreachable API cannot hang the cell)
response = requests.get(
    f"https://api.together.xyz/v1/jobs/{job_id}",
    headers=headers,
    timeout=60,
)

# Surface HTTP failures explicitly rather than attempting to parse an error body as a job record.
if not response.ok:
    raise RuntimeError(f"Error fetching job status: {response.status_code}, {response.text}")

# Pretty print the JSON response
print(json.dumps(response.json(), indent=2))

{
  "type": "adapter_upload",
  "job_id": "job-aaf23828-0950-4931-92cb-10fedc618c3d",
  "status": "Complete",
  "status_updates": [
    {
      "status": "Queued",
      "message": "Job has been created",
      "timestamp": "2025-05-03T12:41:58Z"
    },
    {
      "status": "Running",
      "message": "Received job from queue, starting",
      "timestamp": "2025-05-03T12:42:21Z"
    },
    {
      "status": "Running",
      "message": "Adapter download in progress",
      "timestamp": "2025-05-03T12:42:21Z"
    },
    {
      "status": "Running",
      "message": "Adapter bf16 conversion in progress",
      "timestamp": "2025-05-03T12:42:24Z"
    },
    {
      "status": "Running",
      "message": "Adapter validation in progress",
      "timestamp": "2025-05-03T12:42:24Z"
    },
    {
      "status": "Running",
      "message": "Adapter upload in progress",
      "timestamp": "2025-05-03T12:42:24Z"
    },
    {
      "status": "Running",
      "message": "Model entity update in progr

In [None]:
import time
# Name of the model that the completion request below is addressed to.
MODEL_NAME_FOR_INFERENCE = "muhammadahmad1/test-lora-model-creation-8b"

# Headers for the inference request: bearer authentication plus a JSON content type.
headers = dict(
    [
        ("Authorization", "Bearer " + str(TOGETHER_API_KEY)),
        ("Content-Type", "application/json"),
    ]
)


# Sample quiz text submitted for evaluation.
# Written as adjacent string literals inside parentheses (one per quiz item) so the long text
# never has to be broken inside a single quoted literal, which would be a SyntaxError.
quiz = (
    "Quiz on Supervised Learning\n\n"
    "1. MCQ: What is the primary goal of supervised learning?\nA) Predict outcomes for new data\nB) Cluster data into groups\nC) Reduce data dimensionality\nD) Generate new data\nCorrect: A) Predict outcomes for new data. Reason: Supervised learning trains models on labeled data to predict outcomes for unseen inputs.\n\n"
    "2. MCQ: Which algorithm minimizes the cost function using gradient descent?\nA) K-Means\nB) Linear Regression\nC) DBSCAN\nD) Apriori\nCorrect: B) Linear Regression. Reason: Linear regression uses gradient descent to optimize the cost function for best-fit parameters.\n\n"
    "3. MCQ: What is overfitting in supervised learning?\nA) Model performs well on training but poorly on test data\nB) Model is too simple to capture patterns\nC) Model ignores noise in data\nD) Model predicts perfectly on all data\nCorrect: A) Model performs well on training but poorly on test data. Reason: Overfitting occurs when a model learns noise in training data, reducing generalization.\n\n"
    "4. True/False: A decision tree can only handle numerical data.\nFalse. Reason: Decision trees can handle both numerical and categorical data by splitting based on feature values.\n\n"
    "5. True/False: Supervised learning requires labeled data.\nTrue. Reason: Supervised learning relies on input-output pairs to train models.\n\n"
    "6. Short Answer: Explain the bias-variance tradeoff in 1–2 sentences.\nAnswer: The bias-variance tradeoff balances model complexity; high bias (simple models) may underfit, while high variance (complex models) may overfit, both affecting generalization.\n\n"
    "7. Short Answer: Compare batch and stochastic gradient descent in 1–2 sentences.\nAnswer: Batch gradient descent computes gradients over the entire dataset, which is stable but slow, while stochastic gradient descent uses one sample at a time, which is faster but noisier.\n\n"
    "8. MCQ: Which metric is used to evaluate classification models?\nA) Mean Squared Error\nB) Accuracy\nC) R-squared\nD) Mean Absolute Error\nCorrect: B) Accuracy. Reason: Accuracy measures the proportion of correct predictions in classification.\n\n"
    "9. Short Answer: Describe a real-world application of supervised learning in healthcare.\nAnswer: Supervised learning can be used to predict disease outcomes based on patient data, such as classifying whether a patient has a particular condition.\n\n"
    "10. Short Answer: Explain why precision and recall are important in imbalanced classification problems.\nAnswer: In imbalanced datasets, accuracy can be misleading; precision and recall provide better insights into model performance on minority classes."
)

# Feedback instruction for the reviewer; interpolated into criterion 6 ("Feedback Incorporation")
# of the prompt below.
feedback_prompt = "Feedback: Include questions on evaluation metrics and real-world applications. If the given feedback has NOT been FULLY incorporated, PENALIZE HARSHLY."
# Full evaluation prompt sent to the model. It is an f-string that:
#   - embeds `quiz` between the ===================== delimiter lines,
#   - embeds `feedback_prompt` under criterion 6,
#   - asks for a final one-line [[[REVIEW_SCHEME]]] summary; the doubled braces {{ }} render as
#     literal { } in the resulting text.
# NOTE(review): the REVIEW_SCHEME keys list 'overlap' before 'depth', whereas the numbered
# criteria list depth (4) before overlap (5) — confirm that any downstream parsing is keyed by
# name rather than by position.
prompt = f'''
 You are a meticulous and brutally honest educator tasked with dissecting and evaluating the quality of a student-designed quiz meant to assess theoretical understanding and practical application of topics taught in class.
 Your job is not to merely review but to interrogate every choice made in the quiz—question wording, content selection, cognitive depth, and structure—with a fine-toothed comb.
 Challenge every assumption. Be hyper-critical and assume nothing is adequate unless proven through rigor, clarity, and flawless execution.
 Expose ambiguity, shallowness, redundancy, poor alignment with learning objectives, and any missed opportunities—no matter how subtle.
 Even if the quiz seems serviceable, your goal is to highlight weaknesses in coverage, depth, construction, and learning value.
 Never default to leniency. **Demand perfection, especially in the absence of prior feedback.**
 The quiz content is below:
        =====================
        {quiz}
        =====================

        EVALUATE it STRICTLY based on the following criteria. Assign a score out of 10 for each and justify with detail.
        
        For each criterion, do the following:
        1. Give a score out of 10.
        2. Justify the score in detail.

        1. **Clarity and Relevance**:
          - Are the questions clearly worded and free from ambiguity?
          - Are they appropriate for the level of the course and relevant to topics taught?
          - Do they reflect the expected knowledge and skill level of students?

        2. **Coverage of Concepts**:
          - Does the quiz cover a diverse and representative set of concepts taught?
          - Are both theoretical and practical aspects of the topic included?
          - Does it balance breadth and depth appropriately?

        3. **Question Quality and Structure**:
          - Does the quiz contain the required 3-5 MCQs, 2-3 True/False statements, and 4-5 short/medium-length questions?
          - Are MCQs structured well with plausible distractors?
          - Are True/False statements precise and unambiguous?
          - Are short/medium questions open-ended enough to assess understanding, but focused enough to guide students?

        4. **Cognitive Depth and Usefulness**:
          - Do questions vary in difficulty and promote higher-order thinking (not just recall)?
          - Are there any case-based or real-world application questions?
          - Does it test understanding, analysis, and application?

        5. **Task Redundancy / Overlap**:
          - Tasks should be distinct and may be divided into subtasks if complex.
          - Avoid repetition and ensure flow and progression in learning.

        6. **Feedback Incorporation**:
          - {feedback_prompt}
          - If this is the first draft (i.e. no previous feedback), be EXTREMELY CRITICAL in other fields. ONLY give scores above 7 for criteria that are exceptionally well-executed with zero flaws. Assume perfection is expected on first draft to drive improvement.

         At the end, write the following in one line ... [[[REVIEW_SCHEME]]] = {{ 'clarity': CLARITY_SCORE, 'coverage': COVERAGE_SCORE, 'structure': STRUCTURE_SCORE, 'overlap': OVERLAP_SCORE, 'depth': DEPTH_SCORE, 'feedback': FEEDBACK_SCORE }}
'''

# Completion request body: target model, the evaluation prompt, and sampling limits.
payload = {
    "model": MODEL_NAME_FOR_INFERENCE,
    "prompt": prompt,
    "temperature": 0.8,
    "max_tokens": 1000
}

# A timeout keeps the cell from hanging indefinitely if the API is unreachable or stalls.
response = requests.post(
    "https://api.together.xyz/v1/completions",
    headers=headers,
    json=payload,
    timeout=120,
)

if response.status_code != 200:
    raise Exception(f"Error submitting job: {response.status_code}, {response.text}")

# `choices` may be missing, empty, or carry a null `text`; treat all of these as "no output"
# instead of failing with an IndexError/AttributeError.
choices = response.json().get("choices") or [{}]
completion_output = (choices[0].get("text") or "").strip()
if not completion_output:
    raise Exception("No output returned by the model.")

# Reaching this point guarantees a non-empty completion, so no further emptiness check is needed.
print("\n🧠 Model Response:")
print(completion_output)


🧠 Model Response:
```
1. Clarity and Relevance: 8/10. Questions are clear but one MCQ on gradient descent is too simplistic.
2. Coverage of Concepts: 9/10. Comprehensive but lacks questions on evaluation metrics or detailed applications.
3. Question Quality and Structure: 9/10. Structured well with one poorly worded MCQ on gradient descent.
4. Cognitive Depth and Usefulness: 8/10. Does not challenge deeply with overly basic new question.
5. Task Redundancy / Overlap: 9/10. Distinct but one new question is too shallow.
6. Feedback Incorporation: 5/10. Failed to include new question types despite detailed instructions.
[[[REVIEW_SCHEME]]] = { 'clarity':8,'coverage':9,'structure':9,'overlap':9,'depth':8,'feedback':5 } = 7.4/10. Reviewer is disappointed with lack of critical engagement with detailed feedback, particularly regarding question type and depth. Reviewer scored high for clarity and coverage due to the inclusion of application and theory questions but emphasized the need for dee