In [1]:
path = 'data/'

In [2]:
import pandas as pd
import ast
from datasets import Dataset
from collections import Counter

# Load the labelled corpus and keep only the columns needed for aspect-term
# polarity classification, renamed to the names the prompt builder expects.
train_df = pd.read_csv(path + "Labelled_Training_Data_Preprocessed.csv")
train_df = train_df[['comment', 'aspect_term', 'Sentiment']]
train_df = train_df.rename(columns={'comment': 'Sentence', 'aspect_term': 'Term'})

# The first 3500 rows are the training portion (a later cell evaluates on rows
# 3500 onwards). .copy() makes this an independent frame so the column
# assignment below does not write into a slice of the loaded frame
# (pandas SettingWithCopyWarning).
train_df = train_df.iloc[:3500].copy()

# Numeric polarity codes -> the label words the model is trained to generate.
mapping_dict = {0: 'neutral', 1: 'positive', -1: 'negative'}
train_df['Sentiment'] = train_df['Sentiment'].map(mapping_dict)

# Series.map yields NaN for any code missing from mapping_dict; fail here with a
# clear message instead of later, when `.lower()` is called on a non-string.
if train_df['Sentiment'].isna().any():
    raise ValueError("Sentiment column contains values outside {-1, 0, 1}")

print("Polarity Counts:")
print(train_df.Sentiment.value_counts())

dataset = Dataset.from_pandas(train_df)
print("-"*10)
dataset

Polarity Counts:
Sentiment
neutral     1863
negative     895
positive     742
Name: count, dtype: int64
----------


Dataset({
    features: ['Sentence', 'Term', 'Sentiment'],
    num_rows: 3500
})

In [3]:
from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig
import torch


# Hugging Face Hub id of the checkpoint to fine-tune.
# Other candidate ids left by the author:
#   "google/flan-t5-base", "google/flan-t5-xl",
#   "shorthillsai/flan-t5-large-absa", "Shakhovak/flan-t5-large-absa-rest",
#   path + "t5 atpc"
model_id = "google/flan-t5-large"

# 4-bit NF4 quantisation with double quantisation; compute dtype is bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the checkpoint with the quantisation config above and automatic device placement.
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
)

In [4]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# model_id="google/flan-t5-large"

# Load the tokenizer that matches `model_id` (set in the model-loading cell above).
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [5]:
# Candidate delimiter tokens; the start token is disabled.
# NOTE(review): neither sep_token nor end_token appears in the prompt or target
# format built in the visible cells — confirm they are still needed.
# start_token = "[START]"
sep_token = "[NEXT]"
end_token = "[END]"

In [6]:
sep_token

'[NEXT]'

In [7]:
from random import randrange, seed

# Seed Python's RNG so the same few-shot examples are drawn on every run.
seed_value = 12
seed(seed_value)

# Draw three row indices (independently, so repeats are possible) to serve as
# in-context examples.
selected_indices = [randrange(len(dataset)) for _ in range(3)]

# Each example is rendered as
#   <sentence>\nAspect Term: <term>\nSentiment: <label>\n\n
# and the rendered blocks are concatenated into one prompt prefix.
examples = "".join(
    f"{shot['Sentence'].strip()}\n"
    f"Aspect Term: {shot['Term'].strip()}\n"
    f"Sentiment: {shot['Sentiment'].lower()}\n"
    "\n"
    for shot in (dataset[idx] for idx in selected_indices)
)

def input_format(sample):
    """Attach the few-shot prompt and the target label to one dataset row.

    Args:
        sample: mapping with 'Sentence', 'Term' and 'Sentiment' string fields.

    Returns:
        The same `sample` object with two keys added: 'input' (the few-shot
        examples followed by this row's sentence, aspect term and an open
        "Sentiment: " slot) and 'output' (the lower-cased sentiment label).
    """
    sentence = sample['Sentence'].strip()
    aspect = sample['Term'].strip()
    # `examples` ends with a blank line; dropping its last character and joining
    # with "\n" leaves exactly one empty line between the shots and the query.
    prompt_lines = [
        "",
        examples[:-1],
        sentence,
        f"Aspect Term: {aspect}",
        "Sentiment: ",
    ]
    sample['input'] = "\n".join(prompt_lines).lstrip()
    sample['output'] = sample['Sentiment'].lower()
    return sample

# Preview the prompt and target produced for one randomly chosen row.
ran = randrange(len(dataset))
preview = input_format(dataset[ran])
print(preview['input'])
print(preview['output'])

ashlee decides he is a lesbian trans must other lesbians accept his kkj as part of the deal or be condemned as transphobic ?
Aspect Term: trans
Sentiment: neutral

doesn t mean a thing imagine using this argument for section 377a .
Aspect Term: 377a
Sentiment: neutral

we don t know how many of the population opposes it , unlike 377a where we have ipos survey we have a survey showing that it s not a monolithic majority .
Aspect Term: 377a
Sentiment: neutral

i do see myself as being a moderate , but leaning towards affording the same rights .
Aspect Term: rights
Sentiment: 
positive


In [8]:
# Apply input_format to every row, adding the 'input' prompt and 'output' label columns.
dataset = dataset.map(input_format)
dataset

Map:   0%|          | 0/3500 [00:00<?, ? examples/s]

Dataset({
    features: ['Sentence', 'Term', 'Sentiment', 'input', 'output'],
    num_rows: 3500
})

In [9]:
from datasets import concatenate_datasets
import numpy as np
# Tokenize prompts and targets once, without padding, only to measure their
# token lengths. All text columns are dropped so only tokenizer output remains.
_text_columns = ['Sentence', 'Term', 'Sentiment', 'input', 'output']

# Source side: sequences longer than max_source_length will later be truncated,
# shorter ones padded.
tokenized_inputs = dataset.map(
    lambda batch: tokenizer(batch["input"], truncation=True),
    batched=True,
    remove_columns=_text_columns,
)
input_lenghts = [len(ids) for ids in tokenized_inputs["input_ids"]]
# Use the 95th percentile of prompt lengths rather than the maximum.
max_source_length = int(np.percentile(input_lenghts, 95))
print(f"Max source length: {max_source_length}")

# Target side: the 100th percentile, i.e. the longest target, so no label is truncated.
tokenized_targets = dataset.map(
    lambda batch: tokenizer(batch["output"], truncation=True),
    batched=True,
    remove_columns=_text_columns,
)
target_lenghts = [len(ids) for ids in tokenized_targets["input_ids"]]
max_target_length = int(np.percentile(target_lenghts, 100))
print(f"Max target length: {max_target_length}")

Map:   0%|          | 0/3500 [00:00<?, ? examples/s]

Max source length: 223


Map:   0%|          | 0/3500 [00:00<?, ? examples/s]

Max target length: 2


In [10]:
def preprocess_function(sample, padding="max_length"):
    """Tokenize a batch of prompts and targets for seq2seq training.

    Args:
        sample: batch with list-valued "input" and "output" fields.
        padding: padding strategy forwarded to the tokenizer for both sides.

    Returns:
        The tokenizer output for the prompts, with a "labels" entry added that
        holds the target token ids.
    """
    prompts = list(sample["input"])
    model_inputs = tokenizer(prompts, max_length=max_source_length, padding=padding, truncation=True)

    # `text_target` tells the tokenizer these strings are decoder targets.
    target_ids = tokenizer(
        text_target=sample["output"], max_length=max_target_length, padding=padding, truncation=True
    )["input_ids"]

    if padding == "max_length":
        # Padded label positions become -100 so they are ignored in the loss.
        target_ids = [
            [-100 if tok == tokenizer.pad_token_id else tok for tok in row]
            for row in target_ids
        ]

    model_inputs["labels"] = target_ids
    return model_inputs

# Tokenize the whole dataset in batches and drop the text columns, leaving only
# what preprocess_function returns.
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=['Sentence', 'Term', 'Sentiment', 'input', 'output'],
)
print(f"Keys of tokenized dataset: {list(tokenized_dataset.features)}")


Map:   0%|          | 0/3500 [00:00<?, ? examples/s]

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


In [11]:
from datasets import DatasetDict

# Hold out 20% of the tokenized rows for evaluation and train on the other 80%.
# The split is seeded with seed_value so it is reproducible.
split_datasets = tokenized_dataset.train_test_split(test_size=0.2, seed=seed_value)

# train_test_split names the held-out part 'test'; expose it as 'eval' instead.
train_split, eval_split = split_datasets['train'], split_datasets['test']
final_datasets = DatasetDict({'train': train_split, 'eval': eval_split})

In [12]:
final_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2800
    })
    eval: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 700
    })
})

In [13]:
del(dataset)

In [14]:
import numpy as np
import ast
from sklearn.metrics import f1_score


def get_metrics(y_true, y_pred):
    """Return the macro-averaged F1 score of `y_pred` against `y_true`."""
    return f1_score(y_true, y_pred, average='macro')

def extract_aspects_sentiments(decoded_output):
    """Normalise a decoded string to one of the three polarity labels.

    Surrounding whitespace is ignored, matching the inference-time helper of
    the same name defined later in the notebook, so a correct label decoded
    with a stray space or newline is not scored as wrong.

    Args:
        decoded_output: text decoded from the model output or from the labels.

    Returns:
        'negative', 'neutral' or 'positive' when the stripped text is exactly
        one of them, otherwise the string "invalid".
    """
    sent = ('negative', 'neutral', 'positive')
    label = decoded_output.strip()
    return label if label in sent else "invalid"

def compute_metrics(eval_preds):
    """Compute macro F1 between generated and reference labels.

    Args:
        eval_preds: pair `(preds, labels)` of token-id arrays; `preds` may be a
            tuple, in which case only its first element is used.

    Returns:
        dict with the single key "f1".
    """
    preds, labels = eval_preds
    preds = preds[0] if isinstance(preds, tuple) else preds

    def _to_labels(token_ids):
        # -100 is not a real token id, so swap it for the pad id before decoding.
        token_ids = np.where(token_ids != -100, token_ids, tokenizer.pad_token_id)
        texts = tokenizer.batch_decode(token_ids, skip_special_tokens=True)
        return [extract_aspects_sentiments(text) for text in texts]

    reference = _to_labels(labels)
    predicted = _to_labels(preds)
    return {"f1": get_metrics(reference, predicted)}

In [15]:
from transformers import DataCollatorForSeq2Seq

# -100 is used as the padding id for labels so that padded label positions are
# ignored in the loss (preprocess_function masks pad tokens with the same value).
label_pad_token_id = -100
# Collator used by the trainer to batch examples; it is given the tokenizer, the
# model, the label padding id above, and pads to a multiple of 8.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)


In [16]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType

# Define LoRA Config: rank-16 adapters (alpha 128, dropout 0.05) on the modules
# named "q" and "v", no bias training, seq2seq task type.
lora_config = LoraConfig(
 r=16,
 lora_alpha=128,
 target_modules=["q", "v"],
 lora_dropout=0.05,
 bias="none",
 task_type=TaskType.SEQ_2_SEQ_LM
)
# prepare the quantized model (loaded in 4-bit via bnb_config) for k-bit training
model = prepare_model_for_kbit_training(model)

# add LoRA adaptor; from here on `model` is the PEFT-wrapped model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 4,718,592 || all params: 787,868,672 || trainable%: 0.5989059049678777


In [17]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import GenerationConfig

# NOTE: the directory name says "xl" although the checkpoint loaded above is
# google/flan-t5-large; the name is left unchanged so the output path does not move.
output_dir = "lora-flan-t5-xl"

# Training configuration: evaluate, log and checkpoint once per epoch, and
# restore the checkpoint with the best eval F1 when training ends.
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    num_train_epochs=20,
    learning_rate=5e-5,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    optim='adafactor',
    # Evaluation must generate text, because compute_metrics decodes label words.
    predict_with_generate=True,
    logging_dir=f"{output_dir}/logs",
    logging_strategy="epoch",
    seed=seed_value,
    # For an encoder-decoder model the generation length bounds the decoder
    # output only, so it is sized from the target length plus a small margin
    # rather than from source + target.
    generation_max_length=max_target_length + 5,
    metric_for_best_model="f1",
    greater_is_better=True,
    load_best_model_at_end=True,
    report_to="tensorboard",
)

# Trainer over the train/eval splits, scoring each evaluation with compute_metrics.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=final_datasets['train'],
    eval_dataset=final_datasets['eval'],
    compute_metrics=compute_metrics,
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!

# The former `dataloader_config = DataLoaderConfiguration(...)` line was removed:
# DataLoaderConfiguration is never imported in this notebook (NameError on a
# fresh kernel) and the resulting object was not passed to anything.


In [18]:
# train model
trainer.train()

  0%|          | 0/5616 [00:00<?, ?it/s]



{'loss': 0.5029, 'grad_norm': 5.726001262664795, 'learning_rate': 4.791666666666667e-05, 'epoch': 1.0}


  0%|          | 0/59 [00:00<?, ?it/s]

{'eval_loss': 0.32466182112693787, 'eval_f1': 0.724978680452855, 'eval_runtime': 79.3885, 'eval_samples_per_second': 8.817, 'eval_steps_per_second': 0.743, 'epoch': 1.0}




{'loss': 0.3645, 'grad_norm': 7.2669453620910645, 'learning_rate': 4.5833333333333334e-05, 'epoch': 2.0}


  0%|          | 0/59 [00:00<?, ?it/s]

{'eval_loss': 0.2923431098461151, 'eval_f1': 0.7303703798671316, 'eval_runtime': 79.5699, 'eval_samples_per_second': 8.797, 'eval_steps_per_second': 0.741, 'epoch': 2.0}




{'loss': 0.3523, 'grad_norm': 6.77427339553833, 'learning_rate': 4.375e-05, 'epoch': 3.0}


  0%|          | 0/59 [00:00<?, ?it/s]

{'eval_loss': 0.2784854471683502, 'eval_f1': 0.7518244513578, 'eval_runtime': 79.5517, 'eval_samples_per_second': 8.799, 'eval_steps_per_second': 0.742, 'epoch': 3.0}




{'loss': 0.3087, 'grad_norm': 6.742316722869873, 'learning_rate': 4.166666666666667e-05, 'epoch': 4.0}


  0%|          | 0/59 [00:00<?, ?it/s]

{'eval_loss': 0.2698214650154114, 'eval_f1': 0.7535313284611225, 'eval_runtime': 79.6107, 'eval_samples_per_second': 8.793, 'eval_steps_per_second': 0.741, 'epoch': 4.0}




{'loss': 0.3072, 'grad_norm': 8.960515975952148, 'learning_rate': 3.958333333333333e-05, 'epoch': 5.0}


  0%|          | 0/59 [00:00<?, ?it/s]

{'eval_loss': 0.27339327335357666, 'eval_f1': 0.7543295915462322, 'eval_runtime': 79.3397, 'eval_samples_per_second': 8.823, 'eval_steps_per_second': 0.744, 'epoch': 5.0}




{'loss': 0.2934, 'grad_norm': 12.006804466247559, 'learning_rate': 3.7500000000000003e-05, 'epoch': 6.0}


  0%|          | 0/59 [00:00<?, ?it/s]

{'eval_loss': 0.28179171681404114, 'eval_f1': 0.7627014851906083, 'eval_runtime': 79.5199, 'eval_samples_per_second': 8.803, 'eval_steps_per_second': 0.742, 'epoch': 6.0}




{'loss': 0.2683, 'grad_norm': 6.27375602722168, 'learning_rate': 3.541666666666667e-05, 'epoch': 7.0}


  0%|          | 0/59 [00:00<?, ?it/s]

{'eval_loss': 0.2942456305027008, 'eval_f1': 0.7644527404254798, 'eval_runtime': 79.4066, 'eval_samples_per_second': 8.815, 'eval_steps_per_second': 0.743, 'epoch': 7.0}




{'loss': 0.2602, 'grad_norm': 7.800886154174805, 'learning_rate': 3.3333333333333335e-05, 'epoch': 8.0}


  0%|          | 0/59 [00:00<?, ?it/s]

{'eval_loss': 0.29425156116485596, 'eval_f1': 0.7609909265690411, 'eval_runtime': 79.4931, 'eval_samples_per_second': 8.806, 'eval_steps_per_second': 0.742, 'epoch': 8.0}




{'loss': 0.2625, 'grad_norm': 2.459376811981201, 'learning_rate': 3.125e-05, 'epoch': 9.0}


  0%|          | 0/59 [00:00<?, ?it/s]

{'eval_loss': 0.2961713969707489, 'eval_f1': 0.767504851917766, 'eval_runtime': 79.4438, 'eval_samples_per_second': 8.811, 'eval_steps_per_second': 0.743, 'epoch': 9.0}




{'loss': 0.248, 'grad_norm': 1.5121386051177979, 'learning_rate': 2.916666666666667e-05, 'epoch': 10.0}


  0%|          | 0/59 [00:00<?, ?it/s]

{'eval_loss': 0.285238653421402, 'eval_f1': 0.771800163917754, 'eval_runtime': 79.4681, 'eval_samples_per_second': 8.809, 'eval_steps_per_second': 0.742, 'epoch': 10.0}




{'loss': 0.2262, 'grad_norm': 1.17067551612854, 'learning_rate': 2.7083333333333332e-05, 'epoch': 11.0}


  0%|          | 0/59 [00:00<?, ?it/s]

{'eval_loss': 0.2916419804096222, 'eval_f1': 0.7705803609458094, 'eval_runtime': 79.579, 'eval_samples_per_second': 8.796, 'eval_steps_per_second': 0.741, 'epoch': 11.0}




{'loss': 0.2265, 'grad_norm': 8.155871391296387, 'learning_rate': 2.5e-05, 'epoch': 12.0}


  0%|          | 0/59 [00:00<?, ?it/s]

{'eval_loss': 0.296079158782959, 'eval_f1': 0.7716429237605836, 'eval_runtime': 79.6317, 'eval_samples_per_second': 8.79, 'eval_steps_per_second': 0.741, 'epoch': 12.0}




{'loss': 0.2192, 'grad_norm': 14.352913856506348, 'learning_rate': 2.2916666666666667e-05, 'epoch': 13.0}


  0%|          | 0/59 [00:00<?, ?it/s]

{'eval_loss': 0.295646071434021, 'eval_f1': 0.7653789300130764, 'eval_runtime': 79.4345, 'eval_samples_per_second': 8.812, 'eval_steps_per_second': 0.743, 'epoch': 13.0}




{'loss': 0.213, 'grad_norm': 1.6332697868347168, 'learning_rate': 2.0833333333333336e-05, 'epoch': 14.0}


  0%|          | 0/59 [00:00<?, ?it/s]

{'eval_loss': 0.28471067547798157, 'eval_f1': 0.7653790900451249, 'eval_runtime': 79.385, 'eval_samples_per_second': 8.818, 'eval_steps_per_second': 0.743, 'epoch': 14.0}




{'loss': 0.2088, 'grad_norm': 7.410924434661865, 'learning_rate': 1.8750000000000002e-05, 'epoch': 15.0}


  0%|          | 0/59 [00:00<?, ?it/s]

{'eval_loss': 0.27957475185394287, 'eval_f1': 0.776123496574559, 'eval_runtime': 79.3657, 'eval_samples_per_second': 8.82, 'eval_steps_per_second': 0.743, 'epoch': 15.0}




{'loss': 0.1986, 'grad_norm': 12.18996524810791, 'learning_rate': 1.6666666666666667e-05, 'epoch': 16.0}


  0%|          | 0/59 [00:00<?, ?it/s]

{'eval_loss': 0.3079544007778168, 'eval_f1': 0.764416330086323, 'eval_runtime': 79.4725, 'eval_samples_per_second': 8.808, 'eval_steps_per_second': 0.742, 'epoch': 16.0}




{'loss': 0.1907, 'grad_norm': 4.857995510101318, 'learning_rate': 1.4583333333333335e-05, 'epoch': 17.0}


  0%|          | 0/59 [00:00<?, ?it/s]

{'eval_loss': 0.3005698323249817, 'eval_f1': 0.7710936243136431, 'eval_runtime': 79.6167, 'eval_samples_per_second': 8.792, 'eval_steps_per_second': 0.741, 'epoch': 17.0}




{'loss': 0.1855, 'grad_norm': 13.741676330566406, 'learning_rate': 1.25e-05, 'epoch': 18.0}


  0%|          | 0/59 [00:00<?, ?it/s]

{'eval_loss': 0.3023965358734131, 'eval_f1': 0.7767910294008057, 'eval_runtime': 79.5507, 'eval_samples_per_second': 8.799, 'eval_steps_per_second': 0.742, 'epoch': 18.0}




{'loss': 0.1854, 'grad_norm': 0.5029414296150208, 'learning_rate': 1.0416666666666668e-05, 'epoch': 19.0}


  0%|          | 0/59 [00:00<?, ?it/s]

{'eval_loss': 0.31690746545791626, 'eval_f1': 0.7714144565134601, 'eval_runtime': 79.5463, 'eval_samples_per_second': 8.8, 'eval_steps_per_second': 0.742, 'epoch': 19.0}




{'loss': 0.1796, 'grad_norm': 15.396968841552734, 'learning_rate': 8.333333333333334e-06, 'epoch': 20.0}


  0%|          | 0/59 [00:00<?, ?it/s]

{'eval_loss': 0.32325026392936707, 'eval_f1': 0.7683311795252687, 'eval_runtime': 78.4106, 'eval_samples_per_second': 8.927, 'eval_steps_per_second': 0.752, 'epoch': 20.0}




{'loss': 0.1838, 'grad_norm': 8.27694034576416, 'learning_rate': 6.25e-06, 'epoch': 21.0}


  0%|          | 0/59 [00:00<?, ?it/s]

{'eval_loss': 0.32427266240119934, 'eval_f1': 0.7700521811719563, 'eval_runtime': 78.8117, 'eval_samples_per_second': 8.882, 'eval_steps_per_second': 0.749, 'epoch': 21.0}




{'loss': 0.1837, 'grad_norm': 1.1660550832748413, 'learning_rate': 4.166666666666667e-06, 'epoch': 22.0}


  0%|          | 0/59 [00:00<?, ?it/s]

{'eval_loss': 0.3275129497051239, 'eval_f1': 0.7651863417693398, 'eval_runtime': 77.4448, 'eval_samples_per_second': 9.039, 'eval_steps_per_second': 0.762, 'epoch': 22.0}




{'loss': 0.1704, 'grad_norm': 3.1274309158325195, 'learning_rate': 2.0833333333333334e-06, 'epoch': 23.0}


  0%|          | 0/59 [00:00<?, ?it/s]

{'eval_loss': 0.32231393456459045, 'eval_f1': 0.7646982991489509, 'eval_runtime': 78.9247, 'eval_samples_per_second': 8.869, 'eval_steps_per_second': 0.748, 'epoch': 23.0}




{'loss': 0.1666, 'grad_norm': 1.3683775663375854, 'learning_rate': 0.0, 'epoch': 24.0}


  0%|          | 0/59 [00:00<?, ?it/s]

{'eval_loss': 0.32264062762260437, 'eval_f1': 0.7687774413592988, 'eval_runtime': 84.3378, 'eval_samples_per_second': 8.3, 'eval_steps_per_second': 0.7, 'epoch': 24.0}
{'train_runtime': 14384.097, 'train_samples_per_second': 4.672, 'train_steps_per_second': 0.39, 'train_loss': 0.2460821082449367, 'epoch': 24.0}


TrainOutput(global_step=5616, training_loss=0.2460821082449367, metrics={'train_runtime': 14384.097, 'train_samples_per_second': 4.672, 'train_steps_per_second': 0.39, 'train_loss': 0.2460821082449367, 'epoch': 24.0})

In [19]:
import os
# Folder that receives the trained model and tokenizer; a later cell reloads both from it.
name = "t5 first"
os.makedirs(name, exist_ok=True)
# trainer.model is the PEFT-wrapped model that was handed to the trainer.
trainer.model.save_pretrained(name)
tokenizer.save_pretrained(name)

('t5 first\\tokenizer_config.json',
 't5 first\\special_tokens_map.json',
 't5 first\\spiece.model',
 't5 first\\added_tokens.json',
 't5 first\\tokenizer.json')

In [20]:
# torch.save(model.state_dict(), './t5 first/pytorch_model.bin')

In [21]:
import shutil

# Keep a second copy of the saved model folder under a separate name.
source_dir = "t5 first"
destination_dir = "t5 atpc xl"

# dirs_exist_ok=True lets this cell be re-run: without it copytree raises
# FileExistsError as soon as the destination folder already exists.
shutil.copytree(source_dir, destination_dir, dirs_exist_ok=True)

't5 atpc xl'

In [22]:
# Drop the training objects and release PyTorch's cached GPU memory before the
# model is reloaded for inference.
del trainer, model
torch.cuda.empty_cache()

In [23]:
import argparse
import bitsandbytes as bnb
from datasets import load_dataset
from functools import partial
import os
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, AutoPeftModelForSeq2SeqLM
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, set_seed,  BitsAndBytesConfig

In [24]:
path = ''

In [25]:
import os
# Reload the model from the "t5 first" folder written after training, in bfloat16
# with automatic device placement.
model = AutoPeftModelForSeq2SeqLM.from_pretrained("t5 first", torch_dtype=torch.bfloat16, device_map="auto")  #, torch_dtype=torch.bfloat16
# Disabled: merge the adapter into the base weights and save a standalone copy under "saved".
# model = model.merge_and_unload()
# os.makedirs("saved", exist_ok=True)
# # model.to(torch.float32)
# model.save_pretrained("saved", safe_serialization=True)
# del model

In [26]:
import pandas as pd
import ast
import torch

In [27]:
from transformers import AutoTokenizer

# Load the tokenizer files that were saved next to the model in "t5 first".
# NOTE(review): trust_remote_code=True looks unnecessary for a locally saved
# tokenizer folder — confirm and drop it if so.
tokenizer = AutoTokenizer.from_pretrained("t5 first", trust_remote_code=True)
# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.padding_side = "right"

In [28]:
import pandas as pd
import ast
from collections import Counter

def inf_format(text, term):
    """Build the few-shot inference prompt for one (sentence, aspect term) pair.

    The layout mirrors `input_format`, which builds the training prompts: both
    the sentence and the term are stripped, so a sentence with leading or
    trailing whitespace yields the same prompt at inference as in training.

    Args:
        text: the sentence to classify.
        term: the aspect term whose sentiment is requested.

    Returns:
        The prompt string, ending with an open "Sentiment: " slot.
    """
    return f'''
{examples[:-1]}
{text.strip()}
Aspect Term: {term.strip()}
Sentiment: '''.lstrip()

# Reload the labelled corpus and keep the rows from position 3500 onwards as the
# held-out test portion (the first 3500 rows were used for training).
test_df = pd.read_csv("data/Labelled_Training_Data_Preprocessed.csv")
test_df = test_df[['comment', 'aspect_term', 'Sentiment']]
test_df = test_df.rename(columns={'comment': 'Sentence', 'aspect_term': 'Term'})
# .copy() gives an independent frame: columns are assigned to it below and in the
# inference loop ('predicted'), which on a bare slice triggers pandas'
# SettingWithCopyWarning.
test_df = test_df.iloc[3500:].copy()

# Numeric polarity codes -> label words, the same mapping as for the training rows.
mapping_dict = {0: 'neutral', 1: 'positive', -1: 'negative'}
test_df['Sentiment'] = test_df['Sentiment'].map(mapping_dict)

print("Polarity Counts in Test Data:")
print(test_df.Sentiment.value_counts())

Polarity Counts in Test Data:
Sentiment
neutral     788
negative    395
positive    317
Name: count, dtype: int64


In [29]:
test_df

Unnamed: 0,Sentence,Term,Sentiment
3500,singapore is after all a pioneer on trans righ...,trans,positive
3501,there s no regardless of sexual orientation in...,sexual orientation,neutral
3502,isn t it unfair that it s easier to be a strai...,homosexual,negative
3503,starting with this one small group speaking ou...,rights,positive
3504,just continue to pressure pap and they will gi...,same sex marriage,positive
...,...,...,...
4995,"also , if the 377a is repealed , the next step...",gay marriage,positive
4996,but i think he will not bother since 377a he a...,377a,negative
4997,"singles can purchase 2 room housing , which sh...",housing,neutral
4998,we don t know how many of the population oppos...,377a,neutral


In [30]:
from tqdm import tqdm
import torch

# Device that prompt tensors are moved to before generation.
# NOTE(review): hard-coded to the first GPU while the model is loaded with
# device_map="auto" — confirm the two agree on this machine.
device = "cuda:0"
model.eval()

# The only strings accepted as valid predictions.
sent = ['negative','neutral','positive']

def extract_aspects_sentiments(decoded_output):
    """Validate a generated string against the allowed sentiment labels.

    Returns the whitespace-stripped text when it is one of the labels in the
    module-level `sent` list; otherwise prints the offending text and returns
    "invalid".
    """
    candidate = decoded_output.strip()
    if candidate not in sent:
        print("Invalid output: ", candidate)
        return "invalid"
    return candidate

def generate_output(inputs, max_tokens=50):
    """Generate from `inputs` and return the first sequence decoded to text.

    Args:
        inputs: input ids passed to `model.generate` as `input_ids`.
        max_tokens: forwarded to `model.generate` as `max_new_tokens`.

    Returns:
        The first generated sequence decoded with special tokens skipped.
    """
    # Gradients are not needed for generation.
    with torch.no_grad():
        generated = model.generate(input_ids=inputs, max_new_tokens=max_tokens)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Classify every test row, one prompt at a time, and store the validated label
# in the 'predicted' column.
test_rows = zip(test_df.index, test_df['Sentence'], test_df['Term'])
for index, text, term in tqdm(test_rows, total=test_df.shape[0]):
    prompt = inf_format(text, term)
    inputs = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

    decoded_output = generate_output(inputs, max_tokens=200)
    aspect_sentiments = extract_aspects_sentiments(decoded_output)
    test_df.at[index, 'predicted'] = str(aspect_sentiments)

print("Done")

  0%|          | 0/1500 [00:00<?, ?it/s]

100%|██████████| 1500/1500 [04:47<00:00,  5.22it/s]

Done





In [31]:
test_df['Sentiment'].value_counts()

Sentiment
neutral     788
negative    395
positive    317
Name: count, dtype: int64

In [32]:
# Lower-case the gold labels so they compare against the lower-case predictions.
# The vectorised .str accessor replaces a row-wise apply over the whole frame.
test_df['Sentiment'] = test_df['Sentiment'].str.lower()

In [33]:
from sklearn.metrics import confusion_matrix, classification_report

print(classification_report(test_df['Sentiment'], test_df['predicted'], digits=2))

              precision    recall  f1-score   support

    negative       0.73      0.81      0.77       395
     neutral       0.86      0.78      0.82       788
    positive       0.72      0.81      0.76       317

    accuracy                           0.79      1500
   macro avg       0.77      0.80      0.78      1500
weighted avg       0.80      0.79      0.79      1500



In [34]:
import pandas as pd
import ast
from collections import Counter

def inf_format(text, term):
    """Build the few-shot inference prompt for one (sentence, aspect term) pair.

    This cell re-defines the `inf_format` helper from the evaluation section.
    The layout mirrors `input_format`, which builds the training prompts: both
    the sentence and the term are stripped, so a sentence with leading or
    trailing whitespace yields the same prompt at inference as in training.

    Args:
        text: the sentence to classify.
        term: the aspect term whose sentiment is requested.

    Returns:
        The prompt string, ending with an open "Sentiment: " slot.
    """
    return f'''
{examples[:-1]}
{text.strip()}
Aspect Term: {term.strip()}
Sentiment: '''.lstrip()

# Load the data that is to be labelled by the model and rename its text columns
# to the names used by the prompt builder.
label_df = pd.read_csv("data/Rest_of_data.csv")
label_df = label_df.rename(columns={'comment': 'Sentence', 'aspect_term': 'Term'})




In [35]:
from tqdm import tqdm
import torch

# Same inference globals as in the evaluation section, set again for this section.
# NOTE(review): device is hard-coded to the first GPU while the model is loaded
# with device_map="auto" — confirm the two agree on this machine.
device = "cuda:0"
model.eval()

# The only strings accepted as valid predictions.
sent = ['negative','neutral','positive']

def extract_aspects_sentiments(decoded_output):
    """Validate a generated string against the allowed sentiment labels.

    Re-definition of the helper from the evaluation section. Returns the
    whitespace-stripped text when it is one of the labels in the module-level
    `sent` list; otherwise prints the offending text and returns "invalid".
    """
    candidate = decoded_output.strip()
    if candidate not in sent:
        print("Invalid output: ", candidate)
        return "invalid"
    return candidate

def generate_output(inputs, max_tokens=50):
    """Generate from `inputs` and return the first sequence decoded to text.

    Re-definition of the helper from the evaluation section.

    Args:
        inputs: input ids passed to `model.generate` as `input_ids`.
        max_tokens: forwarded to `model.generate` as `max_new_tokens`.

    Returns:
        The first generated sequence decoded with special tokens skipped.
    """
    # Gradients are not needed for generation.
    with torch.no_grad():
        generated = model.generate(input_ids=inputs, max_new_tokens=max_tokens)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Label every row of label_df, one prompt at a time. Each result is written to
# the 'Sentiment' column as soon as it is produced, so rows already processed
# keep their label if the loop is interrupted.
unlabelled_rows = zip(label_df.index, label_df['Sentence'], label_df['Term'])
for index, text, term in tqdm(unlabelled_rows, total=label_df.shape[0]):
    prompt = inf_format(text, term)
    inputs = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

    decoded_output = generate_output(inputs, max_tokens=200)
    aspect_sentiments = extract_aspects_sentiments(decoded_output)
    label_df.at[index, 'Sentiment'] = str(aspect_sentiments)

print("Done")

  0%|          | 0/20202 [00:00<?, ?it/s]

  0%|          | 78/20202 [00:15<1:05:38,  5.11it/s]


KeyboardInterrupt: 

In [None]:
# Convert the predicted label words back to numeric polarity codes.
mapping_dict = {'neutral': 0,'positive': 1, 'negative': -1}
numeric_sentiment = label_df['Sentiment'].map(mapping_dict)

# Anything that is not one of the three label words (e.g. "invalid" predictions,
# or rows the labelling loop never reached) maps to NaN and is written to the CSV
# as an empty field. Report how many such rows exist instead of staying silent.
unmapped_rows = int(numeric_sentiment.isna().sum())
if unmapped_rows:
    print(f"Warning: {unmapped_rows} of {len(label_df)} rows have no valid sentiment label")

label_df['Sentiment'] = numeric_sentiment

label_df.to_csv("data/Rest_of_data_labelled.csv", index=False)

In [None]:
# Ad-hoc check: send a free-form question (not the few-shot prompt format) to the model.
inputs = tokenizer("What is the sentiment of the term Laptop in the sentence: Laptop is good but phone is bad", return_tensors="pt").input_ids.to(device)

decoded_output = generate_output(inputs, max_tokens=150)

print(decoded_output)

positive


In [None]:
# Ad-hoc check: ask the model a general question unrelated to the sentiment prompt format.
inputs = tokenizer("What is the full form of trans in Singapore?", return_tensors="pt").input_ids.to(device)

decoded_output = generate_output(inputs, max_tokens=150)

print(decoded_output)

transgender


In [None]:
# from google.colab import runtime
# runtime.unassign()

In [None]:
# from google.colab import runtime
# runtime.unassign()