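Download the pre-trained model from https://osf.io/ytesn/ (file name: `gibbscycle.zip`).
Extract the archive and point `model_path` in the cells below to the extracted folder.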

In [3]:
from google.colab import drive
# Mount Google Drive at /content/drive; the model and data paths used in the
# cells below all live under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


# Testing


Change the example inputs in `sample_texts` to try out the model on your own sentences:


```
# Example inputs
sample_texts = [
    "I just don't know how exactly one would react in the real situation.",
    "However  I found the practical second sub-task particularly fun  where you could come up with a concept for future lesson planning."
]
```



In [18]:
import torch
import numpy as np
from transformers import ElectraForSequenceClassification, ElectraTokenizer
from scipy.special import expit  # Sigmoid function

# Directory that holds the fine-tuned checkpoint files
model_path = "/content/drive/MyDrive/Work/reflective-writing/gibbscycle"

# Restore the tokenizer and the sequence classifier from that directory
tokenizer = ElectraTokenizer.from_pretrained(model_path)
model = ElectraForSequenceClassification.from_pretrained(model_path)

# Prefer the GPU when PyTorch can see one, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model = model.to(device)

# Class index -> label name. Hard-coded here; presumably mirrors the
# checkpoint's own config -- verify if the model is ever swapped.
id2label = dict(enumerate([
    "Description",
    "Feelings",
    "Evaluation",
    "Analysis",
    "Conclusion",
    "Action_Plan",
]))

# Function to classify input text and return the most confident label
def classify_text(text):
    """Classify one text, print a short report, and return the winning label.

    The text is tokenized (truncated to at most 512 tokens), run through the
    module-level ``model`` with gradient tracking disabled, and every logit is
    passed through a sigmoid. The index of the largest score is looked up in
    ``id2label``.

    Args:
        text: Input handed to the tokenizer.

    Returns:
        The ``id2label`` entry of the highest-scoring class.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)

    # The input tensors have to sit on the same device as the model weights.
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}

    model.eval()
    with torch.no_grad():
        result = model(**on_device)

    raw_scores = result.logits.cpu().numpy()

    # Element-wise sigmoid: each class gets an independent score in (0, 1).
    probabilities = expit(raw_scores)

    # Only the first (and only) row of the batch is inspected.
    best_index = probabilities[0].argmax()
    predicted_label = id2label[best_index]

    report_lines = (
        "\n=== Model Prediction ===",
        f"Text: {text}",
        f"Logits: {raw_scores}",
        f"Probabilities: {probabilities}",
        f"Predicted Label: {predicted_label}\n",
    )
    for report_line in report_lines:
        print(report_line)

    return predicted_label


# Demo sentences -- edit this list to try the classifier on other inputs
sample_texts = [
    "I just don't know how exactly one would react in the real situation.",
    "However  I found the practical second sub-task particularly fun  where you could come up with a concept for future lesson planning.",
]

# Print one prediction report per demo sentence
for sample in sample_texts:
    classify_text(sample)


=== Model Prediction ===
Text: I just don't know how exactly one would react in the real situation.
Logits: [[ 4.1373234  -0.73890877  0.07697752 -1.1037709  -1.2404493  -1.6696248 ]]
Probabilities: [[0.9842853  0.3232428  0.5192349  0.24903403 0.22435777 0.1584742 ]]
Predicted Label: Description


=== Model Prediction ===
Text: However  I found the practical second sub-task particularly fun  where you could come up with a concept for future lesson planning.
Logits: [[ 0.48400927  0.69441974  2.6320114  -0.7879639  -0.5637402  -2.2761202 ]]
Probabilities: [[0.6186941  0.6669494  0.9328936  0.312606   0.3626825  0.09312008]]
Predicted Label: Evaluation



# Output predictions for PapagAI test set

Change `model_path`, `input_file`, and `output_file` to match your setup.

In [14]:
import json
import torch
import numpy as np
from transformers import ElectraForSequenceClassification, ElectraTokenizer
from scipy.special import expit  # Sigmoid function

# Directory that holds the fine-tuned checkpoint files
model_path = "/content/drive/MyDrive/Work/reflective-writing/gibbscycle"

# Restore the tokenizer and the sequence classifier from that directory
tokenizer = ElectraTokenizer.from_pretrained(model_path)
model = ElectraForSequenceClassification.from_pretrained(model_path)

# Prefer the GPU when PyTorch can see one, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model = model.to(device)

# Function to classify text and return only the numeric prediction
def classify_text(text):
    """Return the index of the highest-scoring class for ``text`` as a string.

    The text is tokenized (truncated to at most 512 tokens) and run through
    the module-level ``model`` with gradient tracking disabled. Each logit is
    passed through a sigmoid and the position of the largest score is taken.

    Args:
        text: Input handed to the tokenizer.

    Returns:
        The winning class index converted with ``str`` so it can be stored as
        text in the JSON output.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)

    # The input tensors have to sit on the same device as the model weights.
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}

    model.eval()
    with torch.no_grad():
        result = model(**on_device)

    # Element-wise sigmoid over the raw logits.
    scores = expit(result.logits.cpu().numpy())

    # Only the first (and only) row of the batch is inspected.
    best_index = scores[0].argmax()
    return str(best_index)

# File paths
input_file = "/content/drive/MyDrive/Work/reflective-writing/spacy/test_data.jsonl"   # Change this to your actual JSONL file
output_file = "/content/drive/MyDrive/Work/reflective-writing/spacy/data/output/elektra_test_predictions.jsonl"  # Output file with predictions

# Stream the input JSONL one record at a time: classify the "text" field,
# store the result under "predicted", and write the record back out as JSONL.
with open(input_file, "r", encoding="utf-8") as infile, open(output_file, "w", encoding="utf-8") as outfile:
    for line in infile:
        line = line.strip()
        if not line:
            # A blank line (e.g. a trailing newline at the end of the file)
            # is not a record; json.loads("") would raise JSONDecodeError.
            continue

        entry = json.loads(line)  # One JSON object per line
        entry["predicted"] = classify_text(entry["text"])  # Predicted class index as a string

        # Write updated entry to output file
        outfile.write(json.dumps(entry) + "\n")

print(f"Predictions saved to {output_file}")


Predictions saved to /content/drive/MyDrive/Work/reflective-writing/spacy/data/output/elektra_test_predictions.jsonl


# Output predictions for Reflect data - 50 samples

Change `model_path`, `input_file`, and `output_file` to match your setup.

In [19]:
import json
import torch
import numpy as np
import csv
from transformers import ElectraForSequenceClassification, ElectraTokenizer
from scipy.special import expit  # Sigmoid function

# Directory that holds the fine-tuned checkpoint files
model_path = "/content/drive/MyDrive/Work/reflective-writing/gibbscycle"

# Restore the tokenizer and the sequence classifier from that directory
tokenizer = ElectraTokenizer.from_pretrained(model_path)
model = ElectraForSequenceClassification.from_pretrained(model_path)

# Prefer the GPU when PyTorch can see one, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model = model.to(device)

# Function to classify text and return only the numeric prediction
def classify_text(text):
    """Return the index of the highest-scoring class for ``text`` as a string.

    The text is tokenized (truncated to at most 512 tokens) and run through
    the module-level ``model`` with gradient tracking disabled. Each logit is
    passed through a sigmoid and the position of the largest score is taken.

    Args:
        text: Input handed to the tokenizer.

    Returns:
        The winning class index converted with ``str`` so it can be written
        as a TSV cell.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)

    # The input tensors have to sit on the same device as the model weights.
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}

    model.eval()
    with torch.no_grad():
        result = model(**on_device)

    # Element-wise sigmoid over the raw logits.
    scores = expit(result.logits.cpu().numpy())

    # Only the first (and only) row of the batch is inspected.
    best_index = scores[0].argmax()
    return str(best_index)

# File paths
input_file = "/content/drive/MyDrive/Work/reflective-writing/spacy/data/bfh/segmented_reflections.jsonl"   # Change this to your actual JSONL file
output_file = "/content/drive/MyDrive/Work/reflective-writing/spacy/data/output/bfh_elektra_predictions.tsv"  # Output file in TSV format

# Stream the input JSONL one record at a time and write one TSV row per record.
# newline="" lets the csv module control line endings itself.
with open(input_file, "r", encoding="utf-8") as infile, open(output_file, "w", encoding="utf-8", newline="") as outfile:
    tsv_writer = csv.writer(outfile, delimiter="\t")

    # Write header row
    tsv_writer.writerow(["text_id", "text", "label"])

    for line in infile:
        line = line.strip()
        if not line:
            # A blank line (e.g. a trailing newline at the end of the file)
            # is not a record; json.loads("") would raise JSONDecodeError.
            continue

        entry = json.loads(line)  # One JSON object per line
        text_id = entry["text_id"]  # Extract text ID
        text = entry["text"]  # Extract input text
        prediction = classify_text(text)  # Predicted class index as a string

        # Write to TSV file
        tsv_writer.writerow([text_id, text, prediction])

print(f"Predictions saved to {output_file}")


Predictions saved to /content/drive/MyDrive/Work/reflective-writing/spacy/data/output/bfh_elektra_predictions.tsv
