In [5]:
import csv

# Initialize lists to store input_texts and target_texts
input_texts = []
target_texts = []

# Open and read the TSV file
with open("HomographData/train/winds.tsv", "r", encoding="utf-8") as file:
    reader = csv.DictReader(file, delimiter="\t")
    for row in reader:
        # Construct input_text and target_text
        startpos = int(row["start"])
        endpos = int(row["end"])
        input_text = row["sentence"][:startpos] + "<" + row["homograph"] + ">" + row["sentence"][endpos:]
        target_text = row["wordid"]

        # Add to respective lists
        input_texts.append(input_text)
        target_texts.append(target_text)

# Example output
print("Input Texts:", input_texts[:7])  # Display first 2 inputs
print("Target Texts:", target_texts[:7])  # Display first 2 targets


Input Texts: ['Three other ships reported tropical storm-force <winds>.', 'Showers and storms can also develop along stationary fronts, and <winds> move them down the front.', 'The strongest <winds> hit Wales and Northwest English coasts.', 'Throughout southwestern Puerto Rico, the storm brought moderate rains, strong <winds>, and rough seas.', 'After attaining peak <winds> of 125 mph (205 km/h), the effects of cooler ocean temperatures began to weaken the system.', 'Hurricane Sandy had tropical storm-force <winds> when it reached Connecticut October 29, 2012, with four deaths blamed on the storm.', 'The R3Y set a transcontinental seaplane record of 403 mph in 1954 by utilizing the speed of high-altitude jetstream <winds>.']
Target Texts: ['winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou']


In [6]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5ForSequenceClassification

# Load the T5 tokenizer and model
tokenizer = T5Tokenizer.from_pretrained("t5-small")
print('T5 tokenizer loaded')

# print('nCategories ' , nCategories)
# model = T5ForSequenceClassification.from_pretrained("t5-small" , num_labels = nCategories )
# model = T5ForSequenceClassification.from_pretrained("t5-small" , num_labels=2)

model = T5ForConditionalGeneration.from_pretrained("t5-small")
print('T5 model loaded')

  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


T5 tokenizer loaded
T5 model loaded


In [None]:
import torch
from torch.optim import AdamW
from sklearn.metrics import accuracy_score



# batch = tokenizer(input_texts, padding=True, truncation=True, return_tensors="pt")
model_inputs = tokenizer(input_texts, max_length=64, truncation=True, padding="max_length")
labels = tokenizer(target_texts, max_length=8, truncation=True, padding="max_length")
model_inputs["labels"] = labels["input_ids"]

maxnrtokens = 0
nrtexts = 0
# Get the number of tokens for each sentence
for sentence in labels:
    tokens = tokenizer.tokenize(sentence)  # Tokenize the sentence

    nrtexts += 1
    if nrtexts < 5:
        print(f"Sentence: {sentence}")
        print(f"Number of Tokens: {len(tokens)}")
        print(f"Tokens: {tokens}")
        print("-" * 30)

    nrtokens = len(tokens)

    if nrtokens > maxnrtokens:
        maxnrtokens = nrtokens

print('maxnrtokens' , maxnrtokens)

# Perform inference
print('untrained')

# Example inference
def predict_wordid(misspelled_name):
    input_ids = tokenizer(misspelled_name, return_tensors="pt", max_length=32, truncation=True).input_ids
    outputs = model.generate(input_ids)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test the fine-tuned model
sentence = input_texts[10]  # Example input
wordid = predict_wordid(sentence)
print(f"sentence: {sentence}, predicted wordid: {wordid}")



Sentence: input_ids
Number of Tokens: 5
Tokens: ['▁input', '_', 'i', 'd', 's']
------------------------------
Sentence: attention_mask
Number of Tokens: 4
Tokens: ['▁attention', '_', 'mas', 'k']
------------------------------
maxnrtokens 5
untrained
sentence: The strongest cyclone of the season, the third hurricane, peaked at Category 3 strength with 125 mph (205 km/h) <winds>., predicted wordid: 


In [15]:
from datasets import Dataset

# Define a small dataset (replace this with your real dataset)
# data = {
#     "input_text": [
#         "classify: The movie was amazing!",
#         "classify: I didn't like the product.",
#         "classify: The service was excellent.",
#     ],
#     "target_text": ["Positive", "Negative", "Positive"],
# }

# Define  real dataset
data = {
    "input_text": input_texts ,
    "target_text": target_texts,
}


# Create a Dataset object
dataset = Dataset.from_dict(data)

# Split into train and validation sets
dataset = dataset.train_test_split(test_size=0.2)
train_dataset = dataset["train"]
valid_dataset = dataset["test"]


# Tokenize the data
def preprocess_data(example):

    inputs = tokenizer(example["input_text"], padding="max_length", truncation=True, max_length=16)
    targets = tokenizer(example["target_text"], padding="max_length", truncation=True, max_length=16)

    print('target texte' , example["target_text"])
   
    inputs["labels"] = targets["input_ids"]
    return inputs

# Map preprocessing function to datasets
train_dataset = train_dataset.map(preprocess_data, batched=True)
valid_dataset = valid_dataset.map(preprocess_data, batched=True)

# data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

for example in train_dataset:
    print(f"Input: {len(example['input_ids'])}, Target: {example['labels']}")

Map: 100%|██████████| 69/69 [00:00<00:00, 369.01 examples/s]


target texte ['winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_vrb', 'winds_nou', 'winds_vrb', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_vrb', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_vrb', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_vrb', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_vrb', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou']


Map:   0%|          | 0/18 [00:00<?, ? examples/s]

target texte ['winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou', 'winds_nou']


Map: 100%|██████████| 18/18 [00:00<00:00, 204.55 examples/s]

Input: 16, Target: [13551, 834, 15358, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Input: 16, Target: [13551, 834, 15358, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Input: 16, Target: [13551, 834, 15358, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Input: 16, Target: [13551, 834, 15358, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Input: 16, Target: [13551, 834, 15358, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Input: 16, Target: [13551, 834, 15358, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Input: 16, Target: [13551, 834, 15358, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Input: 16, Target: [13551, 834, 15358, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Input: 16, Target: [13551, 834, 208, 52, 115, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Input: 16, Target: [13551, 834, 15358, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Input: 16, Target: [13551, 834, 208, 52, 115, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Input: 16, Target: [13551, 834, 15358, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Input: 16, Target: [13551, 834, 15358, 1, 0, 0, 0, 0, 0, 0, 0,




In [35]:
from transformers import Trainer, TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    output_dir = "./LanguageModels/T5/model/t5-fine-tuned",
    evaluation_strategy = "epoch",
    learning_rate = 5e-5,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    num_train_epochs = 100,
    weight_decay = 0.01,
    save_steps = 500,
    save_total_limit = 2,
    # logging_dir = "./logs",
    logging_steps = 10,
    logging_first_step = True,
    # predict_with_generate=True,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
    # data_collator=data_collator, 
)

for batch in trainer.get_train_dataloader():
    print(f"Input Batch Shape: {batch['input_ids'].shape}")
    print(f"Label Batch Shape: {batch['labels'].shape}")
    break


# Start training
trainer.train()


  trainer = Trainer(


Input Batch Shape: torch.Size([16, 16])
Label Batch Shape: torch.Size([16, 16])


Epoch,Training Loss,Validation Loss
1,2.1899,1.337901
2,2.0613,1.116598
3,2.0613,0.917605
4,1.7547,0.737129
5,1.7547,0.591374
6,1.2717,0.461593
7,1.2717,0.361116
8,1.0321,0.276275
9,1.0321,0.2018
10,0.7115,0.152891


TrainOutput(global_step=500, training_loss=0.20270701378583908, metrics={'train_runtime': 4119.0725, 'train_samples_per_second': 1.675, 'train_steps_per_second': 0.121, 'total_flos': 29183075942400.0, 'train_loss': 0.20270701378583908, 'epoch': 100.0})

In [48]:


# Example inference
def predict_wordid(sentence):
    input_ids = tokenizer(sentence, return_tensors="pt", max_length=32, truncation=True).input_ids
    outputs = model.generate(input_ids)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


input_texts = []
target_texts = []

# Open and read the TSV file
with open("HomographData/train/winds.tsv", "r", encoding="utf-8") as file:
    reader = csv.DictReader(file, delimiter="\t")
    for row in reader:
        # Construct input_text and target_text
        startpos = int(row["start"])
        endpos = int(row["end"])
        input_text = row["sentence"][:startpos] + "<" + row["homograph"] + ">" + row["sentence"][endpos:]
        target_text = row["wordid"]

        # print('input_text' ,input_text)

        # Add to respective lists
        input_texts.append(input_text)
        target_texts.append(target_text)


# Test the fine-tuned model
# sentence = input_texts[1]  # Example input
# wordid = predict_wordid(sentence)
# print(f"sentence: {sentence}, predicted wordid: {wordid}")

nRight = 0
nWrong = 0

for index , sentence in enumerate(input_texts):
    # print('sentance' , sentence)
    wordid = predict_wordid(sentence)
    if wordid == target_texts[index]:
        nRight += 1
    else:
        print('predicted wordid: ' , wordid , 'target' , target_texts[index]  , 'sentence: ' , sentence)
        nWrong +=1

print('right' , nRight , 'nWrong' , nWrong)        



# # Tokenize the validation dataset input
# valid_inputs = tokenizer(
#     train_dataset["input_text"], 
#     padding="max_length", 
#     truncation=True, 
#     max_length=64, 
#     return_tensors="pt"
# )

# # Generate predictions
# outputs = model.generate(
#     input_ids=valid_inputs["input_ids"], 
#     attention_mask=valid_inputs["attention_mask"], 
#     max_length=64
# )

# # Decode predictions
# decoded_predictions = tokenizer.batch_decode(outputs, skip_special_tokens=True)

# # Display predictions
# for i, prediction in enumerate(decoded_predictions):
#     print(f"Validation Example {i}: {prediction}")




predicted wordid:  winds_nou target winds_vrb sentence:  He sees Isabelle mingling, but bumps straight into Kay, an ex-girlfriend, who <winds> him up, lambasting him.
predicted wordid:  winds_nou target winds_vrb sentence:  The continuation of the street <winds> up to the tomb of Samuel the prophet, after whom the street is named.
predicted wordid:  winds_nou target winds_vrb sentence:  Besides the "big, wild, and connected" lands, ACT proposed the Hogtown Creek Greenway, which <winds> through western Gainesville.
predicted wordid:  winds_nou target winds_vrb sentence:  The road <winds> through the scenic, mountainous country of New Hampshire north of the White Mountain National Forest.
right 83 nWrong 4


In [38]:
model.save_pretrained("./t5-fine-tuned-model")
tokenizer.save_pretrained("./t5-tokenizer")


('./t5-tokenizer\\tokenizer_config.json',
 './t5-tokenizer\\special_tokens_map.json',
 './t5-tokenizer\\spiece.model',
 './t5-tokenizer\\added_tokens.json')

In [11]:
# from transformers import T5Tokenizer, T5ForConditionalGeneration

# # Load fine-tuned model
# model = T5ForConditionalGeneration.from_pretrained("./t5-fine-tuned")
# tokenizer = T5Tokenizer.from_pretrained("./t5-fine-tuned")

# # Test with new input
# input_text = "classify: The product was okay but could be better."
# input_ids = tokenizer(input_text, return_tensors="pt").input_ids
# outputs = model.generate(input_ids)
# predicted_label = tokenizer.decode(outputs[0], skip_special_tokens=True)

# print(f"Predicted Label: {predicted_label}")
