In [None]:
!pip install datasets
import warnings

import nltk
import torch
from datasets import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer

# Download NLTK tokenizer data.
# Recent NLTK releases load the "punkt_tab" resource inside word_tokenize,
# while older releases load the pickled "punkt" models, so fetch both to keep
# tokenization working regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')

# Function to convert sentence and entities to BIO-tagged format
def convert_to_tagged_sentence(sentence, entities):
    """Return ``sentence`` as space-separated ``token/TAG`` pairs with BIO tags.

    The sentence is tokenized with ``nltk.word_tokenize``. Every token starts
    as ``O``; the first token of each entity becomes ``B-<label>`` and the
    remaining tokens of its span become ``I-<label>``. Spaces and slashes in a
    label are replaced by underscores so that every ``token/TAG`` pair stays a
    single whitespace-free chunk whose tag follows the last slash.

    Args:
        sentence: Raw sentence text.
        entities: Iterable of dicts with ``"start"``/``"end"`` token indices
            (end exclusive) and a ``"label"``. When a dict also carries a
            non-empty ``"text"``, the annotated phrase is used to verify the
            span and, if the indices point at different tokens, to re-locate
            it (a warning is emitted in that case).

    Returns:
        The tagged sentence as one string.

    Raises:
        ValueError: If an entity's span is empty or lies outside the
            tokenized sentence.
    """
    tokens = nltk.word_tokenize(sentence)
    tags = ["O"] * len(tokens)
    for entity in entities:
        start, end = entity["start"], entity["end"]

        phrase = entity.get("text")
        phrase_tokens = nltk.word_tokenize(phrase) if phrase else []
        # Negative indices would slice from the end, so treat them as a mismatch.
        annotated = tokens[start:end] if 0 <= start <= end else None
        if phrase_tokens and annotated != phrase_tokens:
            width = len(phrase_tokens)
            matches = [
                i for i in range(len(tokens) - width + 1)
                if tokens[i:i + width] == phrase_tokens
            ]
            if matches:
                # Hand-counted indices drift when the tokenizer splits
                # punctuation; trust the phrase and pick the closest match.
                nearest = min(matches, key=lambda i: abs(i - start))
                warnings.warn(
                    f"Entity {phrase!r} annotated at [{start}, {end}) but found at "
                    f"[{nearest}, {nearest + width}) in {sentence!r}; using the located span.",
                    stacklevel=2,
                )
                start, end = nearest, nearest + width
            else:
                warnings.warn(
                    f"Entity {phrase!r} does not match tokens {annotated!r} and could not "
                    f"be located in {sentence!r}; keeping the annotated span.",
                    stacklevel=2,
                )

        if not 0 <= start < end <= len(tokens):
            raise ValueError(
                f"Entity span [{start}, {end}) is outside the {len(tokens)} tokens of {sentence!r}"
            )

        label = entity["label"].replace(" ", "_").replace("/", "_")
        tags[start] = f"B-{label}"
        for i in range(start + 1, end):
            tags[i] = f"I-{label}"
    return " ".join(f"{token}/{tag}" for token, tag in zip(tokens, tags))

# Initial examples (15 from previous setup)
# "start"/"end" are token indices (end exclusive) into the output of
# nltk.word_tokenize, which is what convert_to_tagged_sentence indexes into.
# That tokenizer emits a comma as its own token and splits a curly-apostrophe
# possessive such as "garden’s" into three tokens ("garden", "’", "s"), so
# entities that follow either one sit further right than a whitespace count
# would suggest.
initial_examples = [
    {"input_text": "The child was delighted with the surprise party.",
     "entities": [{"text": "delighted", "start": 3, "end": 4, "label": "Affect - Un/happiness"}]},
    {"input_text": "She felt anxious about the storm approaching.",
     "entities": [{"text": "anxious", "start": 2, "end": 3, "label": "Affect - In/Security"}]},
    {"input_text": "He was miserable after losing the game.",
     "entities": [{"text": "miserable", "start": 2, "end": 3, "label": "Affect - Dis/satisfaction"}]},
    {"input_text": "The audience applauded enthusiastically at the end of the play.",
     "entities": [{"text": "enthusiastically", "start": 3, "end": 4, "label": "Affect - Realis"}]},
    {"input_text": "She plans to travel the world someday.",
     "entities": [{"text": "plans", "start": 1, "end": 2, "label": "Affect - Irrealis"}]},
    {"input_text": "He was regarded as extraordinarily fortunate in his career.",
     "entities": [{"text": "extraordinarily fortunate", "start": 4, "end": 6, "label": "Judgement - Social Esteem (Normality)"}]},
    {"input_text": "Her vast experience made her highly capable in crises.",
     "entities": [{"text": "highly capable", "start": 5, "end": 7, "label": "Judgement - Social Esteem (Capacity)"}]},
    # The comma after "setbacks" is token 2, so "tireless" is token 5.
    {"input_text": "Despite setbacks, he remained tireless in his efforts.",
     "entities": [{"text": "tireless", "start": 5, "end": 6, "label": "Judgement - Social Esteem (Tenacity)"}]},
    {"input_text": "Her candid confession earned everyone’s respect.",
     "entities": [{"text": "candid", "start": 1, "end": 2, "label": "Judgement - Social Sanction (Veracity)"}]},
    {"input_text": "His charitable acts exemplified true generosity.",
     "entities": [{"text": "true generosity", "start": 4, "end": 6, "label": "Judgement - Social Sanction (Propriety)"}]},
    {"input_text": "The fireworks display was breathtakingly intense.",
     "entities": [{"text": "breathtakingly intense", "start": 4, "end": 6, "label": "Appreciation - Reaction (Impact)"}]},
    {"input_text": "The garden’s fragrance was utterly enchanting.",
     "entities": [{"text": "utterly enchanting", "start": 6, "end": 8, "label": "Appreciation - Reaction (Quality)"}]},
    {"input_text": "The architecture had a perfectly balanced symmetry.",
     "entities": [{"text": "perfectly balanced", "start": 4, "end": 6, "label": "Appreciation - Composition (Balance)"}]},
    {"input_text": "The novel’s plot was intricate yet elegantly simple.",
     "entities": [{"text": "intricate yet elegantly simple", "start": 6, "end": 10, "label": "Appreciation - Composition (Complexity)"}]},
    {"input_text": "The artist’s style was truly inimitable.",
     "entities": [{"text": "truly inimitable", "start": 6, "end": 8, "label": "Appreciation - Valuation"}]},
]

# New examples with annotations (partial list for brevity)
# "start"/"end" are token indices (end exclusive) into the output of
# nltk.word_tokenize, which is what convert_to_tagged_sentence indexes into.
# A comma is its own token and a curly-apostrophe possessive such as
# "artist’s" is split into three tokens ("artist", "’", "s"), so an entity
# directly after "The <noun>’s" starts at token 4.
new_examples = [
    {"input_text": "The child felt cheerful after receiving a surprise gift.",
     "entities": [{"text": "cheerful", "start": 3, "end": 4, "label": "Affect - Un/happiness"}]},
    {"input_text": "She was anxious about the upcoming medical test.",
     "entities": [{"text": "anxious", "start": 2, "end": 3, "label": "Affect - In/Security"}]},
    {"input_text": "He grew miserable after his proposal was rejected.",
     "entities": [{"text": "miserable", "start": 2, "end": 3, "label": "Affect - Dis/satisfaction"}]},
    {"input_text": "The captain wept when he heard the tragic news.",
     "entities": [{"text": "wept", "start": 2, "end": 3, "label": "Affect - Realis"}]},
    {"input_text": "She plans to travel the world once she saves enough money.",
     "entities": [{"text": "plans", "start": 1, "end": 2, "label": "Affect - Irrealis"}]},
    {"input_text": "The artist’s unconventional style made her stand out in the exhibition.",
     "entities": [{"text": "unconventional", "start": 4, "end": 5, "label": "Judgement - Social Esteem (Normality)"}]},
    {"input_text": "His experienced leadership guided the team through the crisis.",
     "entities": [{"text": "experienced", "start": 1, "end": 2, "label": "Judgement - Social Esteem (Capacity)"}]},
    {"input_text": "Despite setbacks, she remained resolute in finishing the project.",
     "entities": [{"text": "resolute", "start": 5, "end": 6, "label": "Judgement - Social Esteem (Tenacity)"}]},
    {"input_text": "The politician’s deceptive statement angered the public.",
     "entities": [{"text": "deceptive", "start": 4, "end": 5, "label": "Judgement - Social Sanction (Veracity)"}]},
    {"input_text": "Their charitable donations to the homeless shelter were widely praised.",
     "entities": [{"text": "charitable", "start": 1, "end": 2, "label": "Judgement - Social Sanction (Propriety)"}]},
    {"input_text": "The movie’s intense climax left the audience speechless.",
     "entities": [{"text": "intense", "start": 4, "end": 5, "label": "Appreciation - Reaction (Impact)"}]},
    {"input_text": "The enchanting melody of the song brought tears to her eyes.",
     "entities": [{"text": "enchanting", "start": 1, "end": 2, "label": "Appreciation - Reaction (Quality)"}]},
    {"input_text": "The garden’s symmetrical layout created a harmonious atmosphere.",
     "entities": [{"text": "symmetrical", "start": 4, "end": 5, "label": "Appreciation - Composition (Balance)"}]},
    {"input_text": "The byzantine plot of the novel confused even seasoned readers.",
     "entities": [{"text": "byzantine", "start": 1, "end": 2, "label": "Appreciation - Composition (Complexity)"}]},
    {"input_text": "Her unique painting style was celebrated as groundbreaking.",
     "entities": [{"text": "groundbreaking", "start": 7, "end": 8, "label": "Appreciation - Valuation"}]},
    {"input_text": "The crowd cheered wildly when the team scored.",
     "entities": [{"text": "cheered wildly", "start": 2, "end": 4, "label": "Affect - Behavioural surge"}]},
    {"input_text": "He felt a vague sense of ennui on the rainy afternoon.",
     "entities": [{"text": "ennui", "start": 6, "end": 7, "label": "Affect - Undirected mood"}]},
    {"input_text": "The witness’s frank testimony clarified the case.",
     "entities": [{"text": "frank", "start": 4, "end": 5, "label": "Judgement - Social Sanction (Veracity)"}]},
    {"input_text": "The altruistic effort to rebuild the village was deeply meaningful.",
     "entities": [{"text": "meaningful", "start": 9, "end": 10, "label": "Appreciation - Valuation"}]},
    {"input_text": "The audience was charmed by the pianist’s delicate performance.",
     "entities": [{"text": "charmed", "start": 3, "end": 4, "label": "Affect - Un/happiness"}]},
    {"input_text": "After the argument, he sank into a melancholy silence.",
     "entities": [{"text": "melancholy", "start": 8, "end": 9, "label": "Affect - Un/happiness"}]},
    # Add remaining examples as needed
]

# Merge both annotated example lists into one training pool
all_examples = [*initial_examples, *new_examples]

# Pair every raw sentence with its BIO-tagged rendering (the seq2seq target)
# and wrap the resulting records in a Hugging Face Dataset
data = [
    {
        "input_text": ex["input_text"],
        "target_text": convert_to_tagged_sentence(ex["input_text"], ex["entities"]),
    }
    for ex in all_examples
]
dataset = Dataset.from_list(data)

# Load the pretrained t5-small tokenizer and model. The device is chosen at
# runtime so the script also runs on hosts without a CUDA GPU instead of
# failing on a hard-coded "cuda" placement.
tokenizer = T5Tokenizer.from_pretrained("t5-small")
device = "cuda" if torch.cuda.is_available() else "cpu"
model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)

# Tokenize input and target texts
def tokenize_function(examples):
    """Encode input and target texts into model features for seq2seq training.

    Both texts are padded/truncated to 512 tokens with the module-level
    ``tokenizer``. Padding positions in the labels are replaced by ``-100`` so
    the cross-entropy loss ignores them; otherwise the loss would be dominated
    by the padding that fills most of each 512-token target.

    Args:
        examples: Mapping with ``"input_text"`` and ``"target_text"``, each
            either a single string or a list of strings (as passed by
            ``Dataset.map(..., batched=True)``).

    Returns:
        Dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``.
    """
    input_encodings = tokenizer(examples["input_text"], padding="max_length", truncation=True, max_length=512)
    target_encodings = tokenizer(examples["target_text"], padding="max_length", truncation=True, max_length=512)

    pad_id = tokenizer.pad_token_id

    def mask_padding(token_ids):
        # -100 is the index the loss function skips.
        return [-100 if token_id == pad_id else token_id for token_id in token_ids]

    target_ids = target_encodings["input_ids"]
    if isinstance(examples["target_text"], str):
        # Single (non-batched) example: one flat list of token ids.
        labels = mask_padding(target_ids)
    else:
        labels = [mask_padding(sequence) for sequence in target_ids]

    return {
        "input_ids": input_encodings["input_ids"],
        "attention_mask": input_encodings["attention_mask"],
        "labels": labels,
    }

# Apply the tokenizer to the whole dataset in batches
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    # No evaluation strategy is set because the Trainer below is given no
    # evaluation dataset.
    # The previous value of 5e-8 is too small to move the weights in the few
    # dozen optimizer steps this dataset yields; the T5 documentation
    # recommends 1e-4 to 3e-4 when fine-tuning with AdamW.
    learning_rate=3e-4,
    per_device_train_batch_size=8,
    num_train_epochs=10,  # Increased epochs for better learning
    weight_decay=0.01,
)

# Initialize and train the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

trainer.train()

# Prediction function
def predict(sentence):
    """Generate the tagged rendering of ``sentence`` with the fine-tuned model.

    Args:
        sentence: Raw sentence text to tag.

    Returns:
        The decoded model output with special tokens removed; expected to be
        a string of ``token/TAG`` pairs.
    """
    # Training leaves the model in train mode; switch to eval so dropout is
    # disabled and generation is deterministic.
    model.eval()
    # Use the full encoding so generate() also receives the attention mask,
    # and truncate to the same 512-token limit used during training.
    encoded = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=512).to(model.device)
    output = model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_length=512,
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Parse tagged output to extract entities
def parse_tagged_sentence(tagged_sentence):
    """Extract entity spans from a string of whitespace-separated ``token/TAG`` chunks.

    Each chunk is split at its last slash; chunks without a slash are skipped
    and do not interrupt an open span. A ``B-<label>`` tag opens a new span,
    an ``I-<label>`` tag extends the open span only when the label matches,
    and any other tag closes the open span.

    Args:
        tagged_sentence: Text such as ``"felt/O very/B-X happy/I-X"``.

    Returns:
        List of ``{"text", "label"}`` dicts in order of appearance; ``text``
        joins the span's tokens with spaces and ``label`` has its underscores
        turned back into spaces.
    """
    spans = []        # every span ever opened, as (label, [tokens])
    open_span = None  # the span still accepting I- tokens, if any

    for chunk in tagged_sentence.split():
        word, slash, tag = chunk.rpartition("/")
        if not slash:
            # No "/" at all: not a token/TAG pair.
            continue

        prefix, label = tag[:2], tag[2:]
        if prefix == "B-":
            open_span = (label, [word])
            spans.append(open_span)
        elif prefix == "I-" and open_span is not None and open_span[0] == label:
            open_span[1].append(word)
        else:
            # "O", an unknown tag, or an I- tag that continues nothing.
            open_span = None

    return [
        {"text": " ".join(words), "label": label.replace("_", " ")}
        for label, words in spans
    ]

# Smoke-test the fine-tuned model on a few sentences
test_sentences = [
    "The fireworks display was breathtakingly intense.",
    "She felt cheerful after winning the award.",
    "His deceptive tactics upset the team."
]
for sentence in test_sentences:
    # Generate the tagged string, then recover the entity spans from it
    predicted_tags = predict(sentence)
    entities = parse_tagged_sentence(predicted_tags)
    # One call prints both lines; the trailing "\n" keeps the blank separator line
    print(f"Sentence: {sentence}", f"Predicted entities: {entities}\n", sep="\n")