In [92]:
pip install transformers



In [93]:
import pandas as pd, os
pd.set_option('display.max_colwidth', None)

FILE_NAME = "eng_laptop_train_alltasks.jsonl"

file_name = os.path.join('/content', FILE_NAME)
data = pd.read_json(open(file_name, encoding='utf8'), lines=True)
data


Unnamed: 0,ID,Text,Quadruplet
0,laptop_quad_dev_1,"this unit is ` ` pretty ` ` and stylish , so my high school daughter was attracted to it for that reason .","[{'Aspect': 'unit', 'Category': 'LAPTOP#DESIGN_FEATURES', 'Opinion': 'pretty', 'VA': '7.12#7.12'}, {'Aspect': 'unit', 'Category': 'LAPTOP#DESIGN_FEATURES', 'Opinion': 'stylish', 'VA': '7.12#7.12'}]"
1,laptop_quad_dev_2,for now i ' m okay with upping the experience & device to 3 out of 5 stars .,"[{'Aspect': 'device', 'Category': 'LAPTOP#GENERAL', 'Opinion': 'NULL', 'VA': '5.50#5.25'}]"
2,laptop_quad_dev_3,"seems unlikely but whatever , i ' ll go with it .","[{'Aspect': 'NULL', 'Category': 'LAPTOP#GENERAL', 'Opinion': 'NULL', 'VA': '5.00#5.12'}]"
3,laptop_quad_dev_4,this version has been my least favorite version i ' ve had for the following reasons listed bellow the pros .,"[{'Aspect': 'version', 'Category': 'LAPTOP#GENERAL', 'Opinion': 'least favorite', 'VA': '3.30#6.60'}]"
4,laptop_quad_dev_5,- biggest disappointment is the track pad .,"[{'Aspect': 'track pad', 'Category': 'HARDWARE#GENERAL', 'Opinion': 'disappointment', 'VA': '2.50#6.00'}]"
...,...,...,...
4071,laptop_quad_train_2930,i think the sound could be better .,"[{'Aspect': 'sound', 'Category': 'MULTIMEDIA_DEVICES#OPERATION_PERFORMANCE', 'Opinion': 'could be better', 'VA': '4.17#5.50'}]"
4072,laptop_quad_train_2931,"i didn ' t want to spend much on this as it is my first laptop , but i was convinced by a few reviews to upgrade to this model as it has more ram and performs better in several areas .","[{'Aspect': 'performs', 'Category': 'LAPTOP#OPERATION_PERFORMANCE', 'Opinion': 'better', 'VA': '7.00#6.88'}]"
4073,laptop_quad_train_2932,"an awesome product , well built - well worth your time and money .","[{'Aspect': 'product', 'Category': 'LAPTOP#GENERAL', 'Opinion': 'awesome', 'VA': '7.50#7.50'}, {'Aspect': 'product', 'Category': 'LAPTOP#QUALITY', 'Opinion': 'well built', 'VA': '6.83#7.00'}, {'Aspect': 'product', 'Category': 'LAPTOP#QUALITY', 'Opinion': 'well worth', 'VA': '7.00#7.17'}]"
4074,laptop_quad_train_2933,backlit keyboard is great ; feels sturdy ; fast processing .,"[{'Aspect': 'backlit keyboard', 'Category': 'KEYBOARD#GENERAL', 'Opinion': 'great', 'VA': '7.67#7.83'}, {'Aspect': 'NULL', 'Category': 'LAPTOP#DESIGN_FEATURES', 'Opinion': 'sturdy', 'VA': '7.83#8.17'}, {'Aspect': 'NULL', 'Category': 'CPU#OPERATION_PERFORMANCE', 'Opinion': 'fast', 'VA': '8.00#8.33'}]"


## Prepare Seq2Seq Data

### Subtask:
Transform the loaded dataset into a seq2seq format. This involves converting each sentence's quadruplets into a target string of aspect-opinion pairs (e.g., 'aspect1#opinion1|aspect2#opinion2'), suitable for a seq2seq model. Handle cases where aspects or opinions are 'NULL'.


In [94]:
def process_quadruplets_to_target_string(quadruplets):
    target_parts = []
    for q in quadruplets:
        aspect = q.get('Aspect', 'NULL')
        opinion = q.get('Opinion', 'NULL')

        # Handle 'NULL' values by replacing them with an empty string
        #processed_aspect = aspect if aspect != 'NULL' else ''
        #processed_opinion = opinion if opinion != 'NULL' else ''

        target_parts.append(f"{aspect}#{opinion}")

    return '|'.join(target_parts)

# Apply the function to create the 'TargetString' column
data['TargetString'] = data['Quadruplet'].apply(process_quadruplets_to_target_string)

# Display the first few rows with the new 'TargetString' column
display(data[['Text', 'Quadruplet', 'TargetString']].head())

Unnamed: 0,Text,Quadruplet,TargetString
0,"this unit is ` ` pretty ` ` and stylish , so my high school daughter was attracted to it for that reason .","[{'Aspect': 'unit', 'Category': 'LAPTOP#DESIGN_FEATURES', 'Opinion': 'pretty', 'VA': '7.12#7.12'}, {'Aspect': 'unit', 'Category': 'LAPTOP#DESIGN_FEATURES', 'Opinion': 'stylish', 'VA': '7.12#7.12'}]",unit#pretty|unit#stylish
1,for now i ' m okay with upping the experience & device to 3 out of 5 stars .,"[{'Aspect': 'device', 'Category': 'LAPTOP#GENERAL', 'Opinion': 'NULL', 'VA': '5.50#5.25'}]",device#NULL
2,"seems unlikely but whatever , i ' ll go with it .","[{'Aspect': 'NULL', 'Category': 'LAPTOP#GENERAL', 'Opinion': 'NULL', 'VA': '5.00#5.12'}]",NULL#NULL
3,this version has been my least favorite version i ' ve had for the following reasons listed bellow the pros .,"[{'Aspect': 'version', 'Category': 'LAPTOP#GENERAL', 'Opinion': 'least favorite', 'VA': '3.30#6.60'}]",version#least favorite
4,- biggest disappointment is the track pad .,"[{'Aspect': 'track pad', 'Category': 'HARDWARE#GENERAL', 'Opinion': 'disappointment', 'VA': '2.50#6.00'}]",track pad#disappointment


## Initialize Tokenizer and Model

### Subtask:
Load a pre-trained tokenizer (e.g., from T5 or BART) and an appropriate encoder-decoder model from the `transformers` library. Configure the tokenizer for both input text and target sequences.


**Reasoning**:
I need to initialize a tokenizer and an encoder-decoder model as per the subtask instructions. I will import the necessary classes from the `transformers` library and load a pre-trained 't5-small' model and its corresponding tokenizer.



In [95]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Choose a pre-trained model checkpoint
MODEL_CHECKPOINT = 't5-small'

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
print(f"Tokenizer '{MODEL_CHECKPOINT}' loaded successfully.")

# Initialize the Seq2Seq model
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)
print(f"Model '{MODEL_CHECKPOINT}' loaded successfully.")

# Optionally, print the model and tokenizer to verify
print("\nTokenizer details:", tokenizer)
print("\nModel details:", model)

Tokenizer 't5-small' loaded successfully.
Model 't5-small' loaded successfully.

Tokenizer details: T5TokenizerFast(name_or_path='t5-small', vocab_size=32100, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_

**Reasoning**:
The previous step successfully loaded the tokenizer and model. The next logical step is to preprocess the text and target strings using the tokenizer. This involves defining a tokenization function that prepares both the input `Text` and the `TargetString` for the model, including padding, truncation, and creating attention masks.



In [96]:
def tokenize_function(examples):
    model_inputs = tokenizer(examples['Text'], max_length=512, truncation=True, padding="max_length")

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['TargetString'], max_length=512, truncation=True, padding="max_length")

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Apply the tokenization function to the data. We'll convert the pandas DataFrame to a Hugging Face Dataset first.
from datasets import Dataset

hf_dataset = Dataset.from_pandas(data)
tokenized_data = hf_dataset.map(tokenize_function, batched=True)

print("Tokenized data example (first entry):")
print(tokenized_data[0])
print("\nDecoded input_ids:", tokenizer.decode(tokenized_data[0]['input_ids']))
print("Decoded labels:", tokenizer.decode(tokenized_data[0]['labels']))

Map:   0%|          | 0/4076 [00:00<?, ? examples/s]



Tokenized data example (first entry):
{'ID': 'laptop_quad_dev_1', 'Text': 'this unit is ` ` pretty ` ` and stylish , so my high school daughter was attracted to it for that reason .', 'Quadruplet': [{'Aspect': 'unit', 'Category': 'LAPTOP#DESIGN_FEATURES', 'Opinion': 'pretty', 'VA': '7.12#7.12'}, {'Aspect': 'unit', 'Category': 'LAPTOP#DESIGN_FEATURES', 'Opinion': 'stylish', 'VA': '7.12#7.12'}], 'TargetString': 'unit#pretty|unit#stylish', 'input_ids': [48, 1745, 19, 3, 2, 3, 2, 1134, 3, 2, 3, 2, 11, 7695, 3, 6, 78, 82, 306, 496, 3062, 47, 44, 11674, 12, 34, 21, 24, 1053, 3, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

**Reasoning**:
The tokenization step completed successfully, converting the raw text and target strings into token IDs. The next logical step is to define a data collator. This will be used to dynamically pad sequences to the longest length in a batch and ensure that the input IDs, attention masks, and labels are correctly formatted for model training. This is a crucial step in preparing the data for a Seq2Seq model.



In [97]:
from transformers import DataCollatorForSeq2Seq

# Initialize the data collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

print("DataCollatorForSeq2Seq initialized successfully.")
print("Data collator details:", data_collator)

DataCollatorForSeq2Seq initialized successfully.
Data collator details: DataCollatorForSeq2Seq(tokenizer=T5TokenizerFast(name_or_path='t5-small', vocab_size=32100, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extr

## Create Dataset and DataLoader

### Subtask:
Develop a custom PyTorch `Dataset` to manage the tokenized input and target sequences. Then, create `DataLoader` instances for training and evaluation to handle batching, shuffling, and data iteration efficiently.


In [98]:
from torch.utils.data import DataLoader

# Split the tokenized_data into training and evaluation sets
train_test_split_dataset = tokenized_data.train_test_split(test_size=0.2)

train_dataset = train_test_split_dataset['train']
eval_dataset = train_test_split_dataset['test']

print(f"Training dataset size: {len(train_dataset)}")
print(f"Evaluation dataset size: {len(eval_dataset)}")

BATCH_SIZE = 8 # Define a suitable batch size

# Create DataLoader instances
train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=BATCH_SIZE,
    collate_fn=data_collator
)

eval_dataloader = DataLoader(
    eval_dataset,
    shuffle=False,
    batch_size=BATCH_SIZE,
    collate_fn=data_collator
)

print(f"Train DataLoader created with batch size: {BATCH_SIZE}")
print(f"Evaluation DataLoader created with batch size: {BATCH_SIZE}")

# Optionally, inspect a batch
# for batch in train_dataloader:
#     print("Input IDs shape:", batch["input_ids"].shape)
#     print("Labels shape:", batch["labels"].shape)
#     break

Training dataset size: 3260
Evaluation dataset size: 816
Train DataLoader created with batch size: 8
Evaluation DataLoader created with batch size: 8


## Define Training Arguments and Trainer

### Subtask:
Set up training arguments including parameters like learning rate, number of epochs, batch size, and evaluation strategy. The `report_to` argument will be updated to exclude 'wandb'. Utilize the `Trainer` class from the `transformers` library for simplified training management.


In [99]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch

# Define output directory
output_dir = './results'

# Define TrainingArguments
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    num_train_epochs=20,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy='epoch',
    save_strategy='epoch',
    predict_with_generate=True,
    report_to=[], # Exclude wandb and other reporting services
    fp16=torch.cuda.is_available() # Enable FP16 if a GPU is available
)

# Instantiate the Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

print("TrainingArguments and Trainer initialized successfully.")
print("Training Arguments:", training_args)
print("Trainer instance:", trainer)

TrainingArguments and Trainer initialized successfully.
Training Arguments: Seq2SeqTrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=True,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
e

In [100]:
print("Starting model training...")
trainer.train()

Starting model training...


Epoch,Training Loss,Validation Loss
1,0.1138,0.092093
2,0.0275,0.017376
3,0.0174,0.013698
4,0.0177,0.012221
5,0.0165,0.011414
6,0.0114,0.010774
7,0.0109,0.010328
8,0.012,0.00988


KeyboardInterrupt: 

In [101]:
trainer.save_model('model')

In [102]:
predictions = trainer.predict(eval_dataset.select(range(5)))
display(eval_dataset.select(range(5)).to_pandas()[['ID', 'Text', 'Quadruplet']])

Epoch,Training Loss,Validation Loss
1,0.1138,0.092093
2,0.0275,0.017376
3,0.0174,0.013698
4,0.0177,0.012221
5,0.0165,0.011414
6,0.0114,0.010774
7,0.0109,0.010328
8,0.012,0.00988


Unnamed: 0,ID,Text,Quadruplet
0,laptop_quad_test_816,screen display is bright and has excellent graphics .,"[{'Aspect': 'screen display', 'Category': 'DISPLAY#OPERATION_PERFORMANCE', 'Opinion': 'bright', 'VA': '7.50#7.83'}, {'Aspect': 'graphics', 'Category': 'GRAPHICS#GENERAL', 'Opinion': 'excellent', 'VA': '8.00#8.33'}]"
1,laptop_quad_test_173,would definitely buy this again if i needed to do so .,"[{'Aspect': 'NULL', 'Category': 'LAPTOP#GENERAL', 'Opinion': 'NULL', 'VA': '5.50#5.62'}]"
2,laptop_quad_train_2174,"this laptop ' s construction is cheap and flimsy , the battery is not removable and the back case is nearly impossible to take off without damaging it .","[{'Aspect': 'laptop ' s construction', 'Category': 'LAPTOP#DESIGN_FEATURES', 'Opinion': 'cheap', 'VA': '5.25#6.50'}, {'Aspect': 'laptop ' s construction', 'Category': 'LAPTOP#DESIGN_FEATURES', 'Opinion': 'flimsy', 'VA': '5.50#6.75'}, {'Aspect': 'battery', 'Category': 'BATTERY#DESIGN_FEATURES', 'Opinion': 'not removable', 'VA': '4.00#5.75'}, {'Aspect': 'back case', 'Category': 'LAPTOP#DESIGN_FEATURES', 'Opinion': 'NULL', 'VA': '4.25#5.25'}]"
3,laptop_quad_train_2831,i ' ve found the touch screen is pretty handy .,"[{'Aspect': 'touch screen', 'Category': 'DISPLAY#USABILITY', 'Opinion': 'pretty handy', 'VA': '7.33#7.33'}]"
4,laptop_quad_train_22,"up date per may 13 / 2018 about two months ago , the charger wont work .","[{'Aspect': 'charger', 'Category': 'POWER_SUPPLY#QUALITY', 'Opinion': 'NULL', 'VA': '4.38#5.38'}]"


In [103]:
pred_ids = predictions.predictions

# Handle tuple output (happens with some models)
if isinstance(pred_ids, tuple):
    pred_ids = pred_ids[0]

decoded_preds = tokenizer.batch_decode(
    pred_ids,
    skip_special_tokens=True
)
decoded_preds

['screen display#bright|screen display#excellent',
 'NULL#definitely',
 'laptop#NULL|back case#fastly impossible',
 'touch screen#pretty handy',
 'charger#NULL']