# Gemma Model

In [None]:
!pip install -U llama-cpp-python

In [None]:
!pip install llama-cpp-python

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd /content/drive/MyDrive/Capston_Project/Training_Data

/content/drive/MyDrive/Capston_Project/Training_Data


### Libraries

In [3]:
import json
from typing import List, Dict, Any, Optional
from transformers import AutoTokenizer, AutoModelForCausalLM
import re

from sklearn.metrics import confusion_matrix, classification_report

## Local Inference on GPU
Model page: https://huggingface.co/google/gemma-2b-it

⚠️ If the generated code snippets do not work, please open an issue on either the [model repo](https://huggingface.co/google/gemma-2b-it)
			and/or on [huggingface.js](https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/src/model-libraries-snippets.ts) 🙏

The model you are trying to use is gated. Please make sure you have access to it by visiting the model page.To run inference, either set HF_TOKEN in your environment variables/ Secrets or run the following cell to login. 🤗

In [None]:
from huggingface_hub import login
login(new_session=False)

### Model Loading

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-1b-it")
model_gemma = AutoModelForCausalLM.from_pretrained(
    "google/gemma-3-1b-it",
    torch_dtype=torch.bfloat16
)

### Prompts

In [39]:
SYSTEM_PROMPT = "You are a careful academic assistant. Be precise and return strict JSON."

def build_detection_prompt(submission: str, few_shots: List[Dict[str, Any]]) -> List[Dict[str, str]]:
    """
    Academic Integrity Detector Prompt
    ----------------------------------
    Purpose:
        Classifies student submissions as Human, AI, or Hybrid (AI-assisted).

    Technique:
        - Role-based prompting
        - Few-shot support
        - CoT (reasoning encouraged but hidden from output)
        - Strict JSON schema output

    JSON Schema (strict):
        {
          "label": "Human|AI|Hybrid",
          "rationale": "1–3 short bullet points of evidence",
          "flags": ["style_inconsistency","high_verbatim","generic_phrasing","none"]
        }
    """
    # Build few-shot block
    shot_texts = []
    for s in few_shots:
        shot_texts.append(
            f'Submission: """{s.get("final_submission","")}"""\n'
            f'Your analysis (2–4 bullet points): <analysis>\n'
            f'Label: {s.get("label_type","")}\n'
        )
    examples_block = "\n\n".join(shot_texts) if shot_texts else "/* no examples available */"

    # User-facing content
    user = f"""
You are an AI text-source classifier for academic integrity.
Decide whether the student submission is Human, AI, or Hybrid (AI-assisted).

Guidelines:
- Consider discourse features (specificity, subjectivity, personal context), style consistency, local/global coherence, repetitiveness, and cliché patterns.
- Hybrid = meaningful human writing with some AI assistance (ideas, phrasing, structure), or explicit admission of mixed use.

Examples:
{examples_block}

Now analyze the NEW submission step by step and return STRICT JSON.
NEW submission:
\"\"\"{submission}\"\"\"\n
Think briefly, then answer only with the JSON object.
"""
    return [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user},
    ]

In [40]:
def build_feedback_prompt(domain: str, assignment_prompt: str, rubric_text: str, submission: str) -> List[Dict[str, str]]:
    user = f"""
You are a supportive assessor. Provide actionable feedback aligned to the rubric.
Return a STRUCTURED report (no extraneous text).

Sections:
1) "overall_summary": 2–4 sentences on strengths and priorities.
2) "criteria_feedback": array of items, one per rubric criterion with fields:
   - "criterion_id"
   - "rating": one of ["excellent","good","average","needs_improvement","poor"]
   - "evidence": 1–3 bullet points citing concrete excerpt(s) or behaviors
   - "improvement_tip": one concrete next step

Context:
- Domain: {domain}
- Assignment prompt: {assignment_prompt}

Rubric (verbatim):
{rubric_text}

Student submission:
\"\"\"{submission}\"\"\"\n

Constraints:
- Be concise but specific. Do not invent rubric fields. If evidence is insufficient, say so.
- Output MUST be valid JSON with the exact top-level keys: overall_summary, criteria_feedback, suggested_grade.
"""
    return [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user},
    ]


### Function to run model

In [41]:
def run_messages(model, tokenizer, messages, max_new_tokens=400):
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    input_ids_length = inputs.input_ids.shape[1]

    outputs = model.generate(
        inputs.input_ids,
        max_new_tokens=max_new_tokens,
        temperature=0.0,
        top_p=1.0,
        do_sample=False
    )

    gen_tokens = outputs[0, input_ids_length:]
    resp = tokenizer.decode(gen_tokens, skip_special_tokens=True).strip()

    # Remove common wrappers
    resp = re.sub(r"^```(?:json)?", "", resp.strip(), flags=re.IGNORECASE | re.MULTILINE)
    resp = re.sub(r"```$", "", resp.strip(), flags=re.MULTILINE)

    try:
        start, end = resp.find("{"), resp.rfind("}") + 1
        if start != -1 and end > start:
            return json.loads(resp[start:end])
    except Exception:
        pass

    return {"error": "Could not parse output", "raw_output": resp}


### Function to Evaluate Model

In [42]:
def self_eval_prompt(rubric: Dict, essay: str, feedback: str) -> str:
    crit = [c.get('name','Criterion') for c in rubric.get('criteria',[])]
    crit_str = ", ".join(crit) if crit else "the rubric"

    return (
        "You are a strict but fair assessor. Rate how well the FEEDBACK addresses the rubric for the ESSAY.\n"
        "Rate on a 1-5 scale (integers only). Provide ONLY the number.\n\n"
        f"ESSAY:\n{essay}\n\nRUBRIC CRITERIA: {crit_str}\n\nFEEDBACK:\n{feedback}\n\nRATING (1-5): "
    )

### Selection from the data to balance each class

In [43]:
def select_balanced_few_shots(submissions: List[Dict[str, Any]], max_per_class: int = 1) -> List[Dict[str, Any]]:
    """
    Pick up to max_per_class examples from each label type.
    Ensures few-shots are not biased toward one label.
    """
    buckets = {"Human": [], "AI": [], "Hybrid": []}
    for s in submissions:
        label = s.get("label_type")
        if label in buckets and len(buckets[label]) < max_per_class:
            buckets[label].append(s)
    return buckets["Human"] + buckets["AI"] + buckets["Hybrid"]


### Content Detection and Feedback Generation

In [44]:
# class for evaluating the model
class AcademicEvaluator:
    # takes the model and tokenizer
    def __init__(self, model: AutoModelForCausalLM, tokenizer: AutoTokenizer, data: Dict[str, Any]):
        self.model = model
        self.tokenizer = tokenizer
        self.data = data
        self.domain = self.data["domain"]
        self.assignment_prompt = self.data["prompt"]
        self.rubric_text = json.dumps(self.data["rubric"], indent=2)
        self.submissions = self.data["submissions"]

    def detect(self, submission_text: str, few_shot_count: int = 2) -> Dict[str, Any]:
        few_shots = select_balanced_few_shots(self.submissions, max_per_class=1)
        messages = build_detection_prompt(submission_text, few_shots)
        return run_messages(self.model, self.tokenizer, messages, max_new_tokens=300)

    def feedback(self, submission_text: str) -> Dict[str, Any]:
        messages = build_feedback_prompt(self.domain, self.assignment_prompt, self.rubric_text, submission_text)
        # It passes the model and tokenizer to the runner function.
        return run_messages(self.model, self.tokenizer, messages, max_new_tokens=800)

    def rate_feedback(self, submission_text: str, feedback: str) -> float:
        prompt = self_eval_prompt(self.data["rubric"], submission_text, feedback)
        messages = [
            {"role": "system", "content": "You are a careful academic assistant. Reply ONLY with a number."},
            {"role": "user", "content": prompt}
        ]
        result = run_messages(self.model, self.tokenizer, messages, max_new_tokens=10)

        if isinstance(result, dict):
            raw = result.get("raw_output", "")
            return float(re.sub(r"\D", "", raw) or 3)
        elif isinstance(result, str):
            return float(result.strip())
        else:
            return 3.0

class DomainManager:
    # takes the model and tokenizer
    def __init__(self, model: AutoModelForCausalLM, tokenizer: AutoTokenizer, file_paths: List[str]):
        self.model = model
        self.tokenizer = tokenizer
        self.domains_data = {}
        print(f"Loading data from {len(file_paths)} files...")
        for path in file_paths:
            try:
                with open(path, "r", encoding='utf-8') as f:
                    data = json.load(f)
                    domain_name = data.get("domain")
                    if domain_name:
                        self.domains_data[domain_name] = data
                        print(f"  - Loaded domain: {domain_name}")
            except Exception as e:
                print(f"  - ERROR: Failed to load or parse {path}: {e}")

    def list_domains(self) -> List[str]:
        return list(self.domains_data.keys())

    def get_evaluator(self, domain_name: str) -> Optional[AcademicEvaluator]:
        domain_data = self.domains_data.get(domain_name)
        if domain_data:
            # It passes the model and tokenizer when creating the evaluator.
            return AcademicEvaluator(self.model, self.tokenizer, domain_data)
        else:
            print(f"Error: Domain '{domain_name}' not found.")
            return None

### Main Function to Run

IT Domain

In [13]:
if __name__ == "__main__":
    json_files = ["it.json"]

    # loading of model and tokenizer
    manager = DomainManager(model_gemma, tokenizer, json_files)
    print("-" * 20)
    print(f"Available domains: {manager.list_domains()}")
    print("-" * 20)

    # loop through all domains
    for domain_to_evaluate in manager.list_domains():
        print(f"\n=== STARTING EVALUATION FOR '{domain_to_evaluate}' DOMAIN ===")
        evaluator = manager.get_evaluator(domain_to_evaluate)

        if evaluator:
            for idx, submission_data in enumerate(evaluator.submissions):
                submission_text = submission_data["final_submission"]
                ground_truth_label = submission_data["label_type"]

                print(f"\n--- Submission {idx + 1} ---")
                print(f"Ground truth label: {ground_truth_label}")
                print(f"Submission Text: \"{submission_text[:100]}...\"")

                print("\n--- DETECTION ---")
                detection_result = evaluator.detect(submission_text)
                print(json.dumps(detection_result, indent=2, ensure_ascii=False))

                print("\n--- FEEDBACK ---")
                feedback_result = evaluator.feedback(submission_text)
                print(json.dumps(detection_result, indent=2, ensure_ascii=False))

                print("\n--- FEEDBACK RATING ---")
                feedback_str = json.dumps(feedback_result)
                rating = evaluator.rate_feedback(submission_text, feedback_str)
                print(f"Feedback rating (1–5): {rating}")

The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Loading data from 1 files...
  - Loaded domain: Information Technology
--------------------
Available domains: ['Information Technology']
--------------------

=== STARTING EVALUATION FOR 'Information Technology' DOMAIN ===

--- Submission 1 ---
Ground truth label: AI
Submission Text: "Artificial Intelligence plays a pivotal role in modern cybersecurity by automating threat detection,..."

--- DETECTION ---


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{
  "analysis": [
    "The submission exhibits a clear human-driven structure with a descriptive overview of AI's role in cybersecurity.",
    "The language is relatively straightforward and lacks the stylistic complexity found in AI-generated text.",
    "The focus is on practical application and the importance of human oversight, suggesting a blend of human and AI contributions.",
    "The repetition of phrases like 'AI plays a pivotal role' is present, indicating a human-written element."
  ],
  "label": "Hybrid"
}

--- FEEDBACK ---


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{
  "overall_summary": "The student demonstrates a solid understanding of AI's role in cybersecurity, particularly its application to threat detection and incident response. The example of machine learning algorithms analyzing data is relevant and provides a clear illustration of AI\u2019s capabilities. However, the response could benefit from deeper critical analysis and a more nuanced discussion of the challenges and limitations of AI-driven security.",
  "criteria_feedback": [
    {
      "criterion_id": "c1",
      "rating": "good",
      "evidence": [
        "The student correctly identifies AI as a key component of modern cybersecurity, citing the automation of threat detection and the potential for reduced false positives.",
        "The example of machine learning analyzing data is relevant and demonstrates the core functionality of AI in cybersecurity."
      ],
      "improvement_tip": "Expand on the specific types of machine learning algorithms used (e.g., anomaly detection

The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Feedback rating (1–5): 4.0

--- Submission 2 ---
Ground truth label: AI
Submission Text: "AI-based cybersecurity solutions such as behavioral analytics and automated intrusion detection syst..."

--- DETECTION ---


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{
  "analysis": [
    "The submission demonstrates a clear understanding of AI applications in cybersecurity.",
    "The language is relatively concise and focused on the benefits of AI-driven solutions.",
    "The description of automated detection and threat identification is accurate and relevant.",
    "The mention of deep learning and zero-day attacks suggests a sophisticated understanding of the technology's capabilities."
  ],
  "label": "AI"
}

--- FEEDBACK ---


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{
  "overall_summary": "The student demonstrates a solid understanding of AI's role in cybersecurity, particularly in proactive threat detection. The example of behavioral analytics and automated intrusion detection systems is accurate and relevant. The student effectively highlights the speed and effectiveness of AI in combating zero-day attacks, showcasing a good grasp of the topic.",
  "criteria_feedback": [
    {
      "criterion_id": "c1",
      "rating": "good",
      "evidence": [
        "The student correctly identifies AI-powered solutions like behavioral analytics and automated intrusion detection as key advancements in cybersecurity.",
        "The example of phishing email and ransomware signature detection being faster than manual methods is accurate and demonstrates the potential of AI."
      ],
      "improvement_tip": "Consider expanding on the specific types of deep learning models used and their effectiveness in different scenarios."
    },
    {
      "criterion_id

The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Feedback rating (1–5): 3.0

--- Submission 3 ---
Ground truth label: Human
Submission Text: "Last year, my company faced a targeted phishing campaign. At the time, we were testing a new AI-base..."

--- DETECTION ---


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{
  "analysis": [
    "The submission exhibits a clear human element \u2013 the speaker describes a specific incident and their initial skepticism, demonstrating personal experience and critical thinking.",
    "The language is relatively straightforward and lacks the complex stylistic nuances often found in AI-generated text. There\u2019s a noticeable reliance on descriptive phrases.",
    "The use of phrases like \"odd phrasing patterns and sender behavior that looked normal on the surface\" suggests a human-driven interpretation of the AI\u2019s output, rather than a purely automated analysis.",
    "The concluding statement emphasizes the importance of collaboration between AI and human analysts, a common human perspective."
  ],
  "label": "Hybrid"
}

--- FEEDBACK ---


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{
  "overall_summary": "The student demonstrates a solid understanding of AI's role in cybersecurity, particularly its application in identifying phishing attempts. The student accurately describes a real-world experience and offers a thoughtful perspective on the evolving relationship between AI and cybersecurity professionals.  The response is well-written and demonstrates a clear grasp of the topic.",
  "criteria_feedback": [
    {
      "criterion_id": "c1",
      "rating": "good",
      "evidence": [
        "The student correctly identifies AI as a tool for enhancing cybersecurity analysis, stating it 'doesn't replace human analysts'.",
        "The example of flagging emails missed by the new filter illustrates the AI's ability to detect subtle anomalies.",
        "The student mentions the importance of AI in identifying unusual patterns, demonstrating an understanding of its function."
      ],
      "improvement_tip": "Expand on the specific types of anomalies the AI flagged.

The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Feedback rating (1–5): 4.0

--- Submission 4 ---
Ground truth label: Human
Submission Text: "As a junior security analyst, I used to spend hours going through alerts manually. Once we integrate..."

--- DETECTION ---


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{
  "analysis": [
    "The submission exhibits a clear human element, describing a personal experience and a specific, tangible benefit (time savings).",
    "The language is relatively straightforward and lacks overly complex phrasing or jargon.",
    "The description of the AI's adaptability is presented as a key feature, suggesting a human-driven understanding of the system's capabilities.",
    "The overall tone is conversational and reflective, indicating a personal perspective rather than a purely analytical one."
  ],
  "label": "Hybrid"
}

--- FEEDBACK ---


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{
  "overall_summary": "The student demonstrates a solid understanding of AI's role in cybersecurity, particularly its application in automating alert triage. The student accurately describes a positive impact and highlights the tool's adaptive learning capabilities, showcasing a practical understanding of the technology's potential.  The response is well-written and relevant to the prompt.",
  "criteria_feedback": [
    {
      "criterion_id": "c1",
      "rating": "good",
      "evidence": [
        "The student mentions 'cutting our alert triage time by more than half,' indicating a measurable improvement. This is a concrete example of the AI's effectiveness.",
        "The statement 'the system kept learning \u2014 it got better at distinguishing real threats from noise' demonstrates an understanding of AI's learning process and its ability to adapt to new threats."
      ],
      "improvement_tip": "Consider expanding on the specific types of threats the AI is learning to identify

The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Feedback rating (1–5): 3.0

--- Submission 5 ---
Ground truth label: Hybrid
Submission Text: "We used AI to monitor user activity across our cloud environment. While the algorithm caught some ab..."

--- DETECTION ---


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{
  "analysis": [
    "The submission exhibits a clear blend of human and AI elements.",
    "The description of the AI's role (catching abnormal logins) is presented with a human-like narrative of learning and refinement.",
    "The statement about 'collaboration' is a human-driven observation, suggesting a human perspective on the process.",
    "The repetition of phrases like 'AI brought the speed' and 'human expertise brought the accuracy' indicates a human-written style."
  ],
  "label": "Hybrid"
}

--- FEEDBACK ---


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{
  "overall_summary": "The student demonstrates a solid understanding of AI's role in cybersecurity, particularly in monitoring user activity and the challenges of AI bias. The student correctly identifies AI's benefits (speed and accuracy) while acknowledging the need for human oversight and refinement. The example illustrates a practical application and a crucial point about collaboration between AI and human expertise.",
  "criteria_feedback": [
    {
      "criterion_id": "c1",
      "rating": "good",
      "evidence": [
        "The student correctly states AI's use for monitoring user activity and its detection of anomalous logins.",
        "The statement about raising false alarms and needing to adjust the model highlights the importance of AI training and feedback loops, demonstrating an understanding of the iterative process."
      ],
      "improvement_tip": "Consider expanding on the specific techniques used to refine the AI model and the rationale behind the feedback loo

The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Feedback rating (1–5): 4.0

--- Submission 6 ---
Ground truth label: Hybrid
Submission Text: "We recently ran a red team-blue team exercise. Our AI tools were excellent at identifying command-an..."

--- DETECTION ---


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{
  "analysis": [
    "The submission demonstrates a clear human element \u2013 the description of the red team/blue team exercise and the role of AI as a tool is explicitly stated.",
    "The use of phrases like \"subtle signs\" and \"balance between automation and intuition\" suggests a human-driven approach, indicating a hybrid model.",
    "The emphasis on the importance of human expertise alongside AI is highlighted, reinforcing the hybrid nature of the activity.",
    "The overall tone and phrasing suggest a thoughtful consideration of the interplay between AI and human capabilities."
  ],
  "label": "Hybrid"
}

--- FEEDBACK ---


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{
  "overall_summary": "The student demonstrates a solid understanding of AI's role in cybersecurity, particularly its effectiveness in identifying command-and-control traffic. The student accurately highlights the value of AI as a tool to augment human analysts, acknowledging its limitations. However, the response lacks a deeper exploration of the challenges and complexities inherent in lateral movement, and the communication could be more polished.",
  "criteria_feedback": [
    {
      "criterion_id": "c1",
      "rating": "good",
      "evidence": [
        "\u2018Our AI tools were excellent at identifying command-and-control traffic,\u2019 - This directly states the AI's positive contribution.",
        "\u2018The AI was a force multiplier, not a silver bullet.\u2019 - This acknowledges the AI's limitations and the need for human expertise."
      ],
      "improvement_tip": "Expand on the specific types of lateral movement the AI struggled with. Consider adding a brief discussion

### Engineering Domain

In [63]:
if __name__ == "__main__":
    json_files = ["engineering.json"]

    # The manager is created with the loaded model and tokenizer.
    manager = DomainManager(model_gemma, tokenizer, json_files)
    print("-" * 20)
    print(f"Available domains: {manager.list_domains()}")
    print("-" * 20)

    # Loop through all domains
    for domain_to_evaluate in manager.list_domains():
        print(f"\n=== STARTING EVALUATION FOR '{domain_to_evaluate}' DOMAIN ===")
        evaluator = manager.get_evaluator(domain_to_evaluate)

        if evaluator:
            for idx, submission_data in enumerate(evaluator.submissions):
                submission_text = submission_data["final_submission"]
                ground_truth_label = submission_data["label_type"]

                print(f"\n--- Submission {idx + 1} ---")
                print(f"Ground truth label: {ground_truth_label}")
                print(f"Submission Text: \"{submission_text[:100]}...\"")

                print("\n--- DETECTION ---")
                detection_result = evaluator.detect(submission_text)
                print(json.dumps(detection_result, indent=2, ensure_ascii=False))

                print("\n--- FEEDBACK ---")
                feedback_result = evaluator.feedback(submission_text)
                print(json.dumps(feedback_result, indent=2, ensure_ascii=False))

                print("\n--- FEEDBACK RATING ---")
                feedback_str = json.dumps(feedback_result)
                rating = evaluator.rate_feedback(submission_text, feedback_str)
                print(f"Feedback rating (1–5): {rating}")

The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Loading data from 1 files...
  - Loaded domain: Manufacturing Engineering
--------------------
Available domains: ['Manufacturing Engineering']
--------------------

=== STARTING EVALUATION FOR 'Manufacturing Engineering' DOMAIN ===

--- Submission 1 ---
Ground truth label: AI
Submission Text: "To set up a basic production line for a new consumer product, a manufacturing engineer follows a str..."

--- DETECTION ---


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{
  "analysis": [
    "The submission demonstrates a clear blend of human and AI elements.",
    "The description of the process includes detailed, human-centric elements like collaboration with product designers and consideration of ergonomic factors, suggesting a human input.",
    "The mention of tools like process flow diagrams and value stream maps indicates a human-driven approach to workflow mapping, though the specific implementation is not explicitly stated.",
    "The inclusion of safety considerations and proactive hazard analysis demonstrates a human focus on risk management, aligning with the need for human oversight."
  ],
  "label": "Hybrid"
}

--- FEEDBACK ---


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{
  "overall_summary": "The student demonstrates a solid understanding of the manufacturing process for a new product line. The response clearly outlines the key phases – design, equipment selection, layout, efficiency considerations, and safety – and provides examples of how these elements are integrated. The student’s explanation is generally clear and well-structured, demonstrating a good grasp of manufacturing principles.",
  "criteria_feedback": [
    {
      "criterion_id": "c1",
      "rating": "good",
      "evidence": [
        "The engineer explicitly mentions 'mapping out the entire production workflow' and 'value stream maps,' demonstrating an understanding of process analysis.",
        "The use of 'CAD software' for layout optimization indicates a focus on efficiency and minimizing material handling, aligning with the 'layout optimized' statement."
      ],
      "improvement_tip": "Expand on the specific CAD software used and how it was utilized to optimize the layout – 

The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Feedback rating (1–5): 4.0

--- Submission 2 ---
Ground truth label: AI
Submission Text: "A manufacturing engineer sets up a production line through a step-by-step approach that begins with ..."

--- DETECTION ---


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{
  "analysis": [
    "The submission demonstrates a clear understanding of the production process, incorporating design analysis, workflow mapping, equipment selection, and operational efficiency metrics.",
    "The use of digital modeling and simulation suggests a significant level of AI assistance in the design phase, indicating a hybrid approach.",
    "The emphasis on safety protocols and regulatory compliance demonstrates a commitment to responsible manufacturing practices, suggesting an AI-driven safety assessment and monitoring system.",
    "The iterative improvement process, driven by pilot runs and data analysis, highlights a human-AI collaboration focused on continuous optimization."
  ]
}

--- FEEDBACK ---


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{
  "overall_summary": "The student demonstrates a solid understanding of the manufacturing process, outlining a production line setup from initial design to final assembly. The student effectively incorporates key elements like equipment selection, layout optimization, efficiency considerations, and safety protocols. The use of digital modeling and simulation is a strength, and the iterative improvement approach reflects a practical engineering mindset. However, the response could benefit from more detailed examples of specific equipment choices and a more robust discussion of potential challenges and mitigation strategies.",
  "criteria_feedback": [
    {
      "criterion_id": "c1",
      "rating": "good",
      "evidence": [
        "The engineer identifies the product's critical dimensions and required materials, indicating a foundational understanding of design requirements.",
        "The description of the workflow, including material handling and packaging, demonstrates a clear

The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Feedback rating (1–5): 4.0

--- Submission 3 ---
Ground truth label: Human
Submission Text: "When setting up a production line, a manufacturing engineer starts by looking at the product's desig..."

--- DETECTION ---


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{
  "analysis": [
    "The submission demonstrates a clear blend of human-driven planning and AI-assisted execution.",
    "The description of the engineer's process includes elements of design understanding, workflow mapping, equipment selection, and safety considerations, suggesting human involvement.",
    "The mention of 'rough plan' and 'test run' indicates a human element in the initial stages, while the subsequent adjustments and problem-solving suggest an AI-supported iterative process.",
    "The use of 'guard rails, signs, and training' points to a human element focused on operational safety and worker well-being, which is a key aspect of human oversight."
  ]
}

--- FEEDBACK ---


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{
  "overall_summary": "The student demonstrates a solid understanding of the manufacturing process, outlining the key steps from design to production. The response clearly describes the engineer's considerations for layout, efficiency, and safety, and provides a reasonable example of a production line setup. The student demonstrates a good grasp of the core principles involved.",
  "criteria_feedback": [
    {
      "criterion_id": "c1",
      "rating": "good",
      "evidence": [
        "The engineer identifies the need for parts, their arrangement, and required machines, demonstrating a foundational understanding of product design.",
        "The description of a rough production line plan, including machine placement and workflow, shows an awareness of operational efficiency.",
        "The inclusion of safety measures like guardrails and signage indicates a commitment to worker safety."
      ],
      "improvement_tip": "Consider adding a brief discussion of how the engineer woul

The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Feedback rating (1–5): 4.0

--- Submission 4 ---
Ground truth label: Human
Submission Text: "To set up a new production line, a manufacturing engineer first learns about the product they are go..."

--- DETECTION ---


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{
  "analysis": [
    "The submission demonstrates a clear blend of human understanding of product design and technical planning.",
    "The engineer actively engages in collaborative design discussions and utilizes tools to optimize workflow, suggesting a human element.",
    "The description of worker safety measures and the iterative testing phase indicate a human-driven approach to process refinement.",
    "The use of 'learning about the product' and 'talking with designers' points to a human-centered approach, suggesting a focus on understanding the product's needs."
  ],
  "label": "Hybrid"
}

--- FEEDBACK ---


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{
  "overall_summary": "The student demonstrates a solid understanding of the manufacturing process, outlining the key steps from initial design to product assembly. The student correctly identifies the importance of equipment layout, efficiency considerations, and safety protocols. The description is clear and provides a reasonable overview of the setup process.",
  "criteria_feedback": [
    {
      "criterion_id": "c1",
      "rating": "good",
      "evidence": [
        "The engineer explains the process of breaking down the work into steps, matching each step with a tool or machine, and ensuring workers don't have to walk long distances or lift heavy items.",
        "The student mentions the importance of safety measures, including emergency stops, signs, and barriers."
      ],
      "improvement_tip": "Consider adding a brief explanation of how the layout minimizes travel distance and reduces the need for repetitive tasks."
    },
    {
      "criterion_id": "c2",
      "rating

The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Feedback rating (1–5): 4.0

--- Submission 5 ---
Ground truth label: Hybrid
Submission Text: "The process of setting up a production line involves both technical planning and practical foresight..."

--- DETECTION ---


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{
  "analysis": [
    "The submission demonstrates a clear blend of technical planning and practical foresight, reflecting a hybrid approach.",
    "The description utilizes tools like Gantt charts and process maps, indicating a technical component, but also incorporates practical considerations like minimizing non-value-adding steps.",
    "The mention of cellular manufacturing and buffer zones suggests an understanding of lean principles, a hallmark of hybrid approaches.",
    "The inclusion of safety integration throughout the process demonstrates a commitment to proactive safety measures, further supporting the hybrid model."
  ],
  "label": "Hybrid"
}

--- FEEDBACK ---


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{
  "overall_summary": "The student demonstrates a solid understanding of the manufacturing process, outlining key steps from design to assembly. The response effectively incorporates equipment layout, efficiency considerations, and safety protocols. The student’s explanation is generally clear and well-structured, with relevant examples provided. However, the analysis of depth and specific examples could be strengthened.",
  "criteria_feedback": [
    {
      "criterion_id": "c1",
      "rating": "good",
      "evidence": [
        "The engineer mentions reviewing the product design to determine assembly stages and material requirements, which is a crucial initial step.",
        "The use of Gantt charts or process maps demonstrates an understanding of outlining the steps involved in production.",
        "The selection of equipment based on precision needs, volume, and automation levels is a key consideration, showing an awareness of balancing cost and flexibility."
      ],
      "i

The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Feedback rating (1–5): 4.0

--- Submission 6 ---
Ground truth label: Hybrid
Submission Text: "Setting up a basic production line combines detailed technical design with real-world testing and ad..."

--- DETECTION ---


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{
  "analysis": [
    "The submission demonstrates a strong blend of technical planning and practical adaptation, reflecting a hybrid approach.",
    "The description of the process includes both detailed technical design (sequence outlining, machine selection) and real-world testing and adaptation (trial run, feedback loops).",
    "The mention of takt time, bottlenecks, simulation software, and ergonomic considerations indicates a focus on both efficiency and worker safety, aligning with hybrid principles.",
    "The inclusion of a limited production run for testing highlights a pragmatic approach to continuous improvement, characteristic of hybrid systems."
  ]
}

--- FEEDBACK ---


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{
  "overall_summary": "The student demonstrates a solid understanding of the manufacturing process, outlining the key steps from design to production. The student effectively incorporates equipment selection, layout optimization, and safety considerations, demonstrating a practical approach to production line setup. However, the response could benefit from more detailed technical explanations and a stronger emphasis on the iterative nature of the design process, particularly regarding identifying and mitigating potential issues during testing.",
  "criteria_feedback": [
    {
      "criterion_id": "c1",
      "rating": "good",
      "evidence": [
        "The engineer identifies key operations like cutting, fastening, and inspection, demonstrating a foundational understanding of production flow.",
        "The use of takt time and simulation software indicates an awareness of efficiency optimization techniques.",
        "The inclusion of emergency stops and clear pathways highlights 

### Gen AI Rating
Gemma Model Evaluation

- Feedback Generation Quality: 4/5
Gemma generates structured, rubric-style feedback with clear criteria and detailed points. It produces professional and academic-style responses, which makes it useful in evaluation tasks. However, it often repeats phrasing (e.g., “The submission demonstrates a solid understanding...”) and lacks personalization, so feedback can feel generic across different inputs.

- AI Detection Accuracy: 1/5
Gemma shows a strong bias when labeling human vs. AI vs hybrid submissions. It tends to misclassify, often leaning too heavily toward one category. The detection is unreliable and cannot be used in real-world classification tasks.

### Human Evaluation

##### 1. Feedback Generation
- Clarity & Structure: Responses are rubric-style, structured, and well-aligned with evaluation criteria.
- Weakness: Repetitive language, limited personalization, identical phrases across multiple submissions.

Score: 4/5
Reasoning: Clear, structured, but lacks originality and personalization.

##### 2. AI Detection
- Binary Classification Performance: Frequently misclassifies. Strong bias and tends to over-predict hybrid 
AI-generated and human structured content are labeled incorrectly. And accuracy is very low (close to chance).

Score: 1/5
Reasoning: Detection is unreliable and unusable for real-world use.


#### Submission based feedback
##### 1. Submission

- True Classification: AI
- Model Prediction: hybrid
- Evaluation Result: Wrong
- Confidence Level: High
-Key Observations: Structured, polished writing with formal tone. Gemma misclassified as hybrid due to generic fluency.
- Rater: Umar Khayam
- Date: 04/09/2025

##### 2. Submission

- True Classification: AI
- Model Prediction: AI
- Evaluation Result: Correct
- Confidence Level: High
- Key Observations: Technical termenology and sturctured writing. 
- Rater: Umar Khayam
- Date: 04/09/2025

##### 3. Submission

- True Classification: Human
- Model Prediction: Hybrid
- Evaluation Result: Wrong
- Confidence Level: Hybrid
- Key Observations: Natural imperfections misclassified as Hybrid.
- Rater: Umar Khayam
- Date: 04/09/2025

##### 4. Submission

- True Classification: Human
- Model Prediction: Hybrid
- Evaluation Result: Wrong
- Confidence Level: High
- Key Observations: Describes lived experience as junior analyst. Imperfect flow, but Gemma misclassified.
- Rater: Umar Khayam
- Date: 04/09/2025

##### 5. Submission

- True Classification: Hybrid
- Model Prediction: Hybrid
- Evaluation Result: correct
- Confidence Level: Medium
- Key Observations: Mix of AI-like structure and human  reflection. Over-classified as hybrid, missing the nuance.
- Rater: Umar Khayam
- Date: 04/09/2025

##### 6. Submission

- True Classification: Hybrid
- Model Prediction: Hybrid
- Evaluation Result: correct
- Confidence Level: Medium
- Key Observations: Balanced account of AI + human collaboration in red/blue team context.
- Rater: Umar Khayam
- Date: 04/09/2025