In [1]:
import torch
import pandas as pd
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
trained_model = "./gemma3-phase1"
tokenizer = AutoTokenizer.from_pretrained(trained_model)
model = AutoModelForCausalLM.from_pretrained(trained_model)
model.eval()

The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


Gemma3ForCausalLM(
  (model): Gemma3TextModel(
    (embed_tokens): Gemma3TextScaledWordEmbedding(262144, 1152, padding_idx=0)
    (layers): ModuleList(
      (0-25): 26 x Gemma3DecoderLayer(
        (self_attn): Gemma3Attention(
          (q_proj): lora.Linear(
            (base_layer): Linear(in_features=1152, out_features=1024, bias=False)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.05, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=1152, out_features=8, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=8, out_features=1024, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (k_proj): Linear(in_features=1152, out_features=256, bias=False)
          (v_proj): lora.Linear(
            (base_layer): Li

In [3]:
# ====== Load dataset ======
def load_partition(path: str) -> Dataset:
    df = pd.read_csv(path).head(10)
    return Dataset.from_pandas(df)

dataset = load_partition("./Student_Training_Data/GPT.csv")

print(f"Loaded {len(dataset)} samples from dataset.")

Loaded 10 samples from dataset.


In [4]:
import json

predictions = []

for example in dataset:
    prompt = (
        "<instruction>Classify the following scientific text as one of [background, method, result].\n\n"
        f"Text: {example['string']}\n"
        "Provide your classification and reasoning in JSON format.</instruction>"
    )

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=100,
            do_sample=False,
            num_beams=1
        )

    generated = tokenizer.decode(output[0], skip_special_tokens=True)

    # generated output will be in this format: <response>{"classification": "method", "reasoning": "It describes the method used..."}
    if "<response>" in generated:
        json_start = generated.split("<response>")[-1].strip()
        try:
            parsed = json.loads(json_start)
            classification = parsed.get("classification", "")
            reasoning = parsed.get("reasoning", "")
        except json.JSONDecodeError:
            classification = "parse_error"
            reasoning = "Could not parse output."
    else:
        classification = "no_response"
        reasoning = "Missing <response> tag."

    predictions.append({
        "predicted_classification": classification,
        "predicted_reasoning": reasoning
    })
    print(f"Generated: {generated} \n Predicted: {classification}")

# ====== Save to CSV ======
df = dataset.to_pandas()
df["predicted_classification"] = [p["predicted_classification"] for p in predictions]
df["predicted_reasoning"] = [p["predicted_reasoning"] for p in predictions]
df.to_csv("gemma3_classification_results.csv", index=False)



Generated: <instruction>Classify the following scientific text as one of [background, method, result].

Text: However, how frataxin interacts with the Fe-S cluster biosynthesis components remains unclear as direct one-to-one interactions with each component were reported (IscS [12,22], IscU/Isu1 [6,11,16] or ISD11/Isd11 [14,15]).
Provide your classification and reasoning in JSON format.</instruction>

**JSON Output:**

```json
{
  "classification": "method",
  "reasoning": "The text describes an investigation into the interaction between frataxin and Fe-S cluster biosynthesis components, which is a core aspect of a scientific method."
}
```
 
 Predicted: no_response
Generated: <instruction>Classify the following scientific text as one of [background, method, result].

Text: In the study by Hickey et al. (2012), spikes were sampled from the field at the point of physiological robinson et al.: genomic regions influencing root traits in barley 11 of 13 maturity, dried, grain threshed by h

In [None]:
# Perform inference step here
def generate_output(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=50)

    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text

predictions = []

for index, row in dataset.iterrows():
    text_input = row["string"]
    predicted_label = generate_output(text_input)
    predictions.append(predicted_label)
    print("classification: ", predicted_label)

# Save to DataFrame
dataset["predicted_classification"] = predictions
dataset.to_csv("gemma3_classification_results.csv", index=False)
print("Inference for first 10 data points completed.")
# Save results
# dataset = dataset.add_column("generated_reasoning", predictions)
# output_csv_path = "./llama-student-phase2.csv"
# dataset.to_pandas().to_csv(output_csv_path, index=False)

# print(f"Inference completed. Results saved to {output_csv_path}")

In [None]:
for i, prediction in enumerate(predictions):
    print(f"Index {i}: {prediction} \n")