In [1]:
import pandas as pd
import numpy as np

from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
import sys

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, GenerationConfig, Trainer
from transformers import TrainingArguments
import time
import evaluate

import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('C:/Users/sodjs/RL/data/sentimentdataset.csv')
df.head(2)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Text,Sentiment,Timestamp,User,Platform,Hashtags,Retweets,Likes,Country,Year,Month,Day,Hour
0,0,0,Enjoying a beautiful day at the park! ...,Positive,2023-01-15 12:30:00,User123,Twitter,#Nature #Park,15.0,30.0,USA,2023,1,15,12
1,1,1,Traffic was terrible this morning. ...,Negative,2023-01-15 08:45:00,CommuterX,Twitter,#Traffic #Morning,5.0,10.0,Canada,2023,1,15,8


In [3]:
df1 = df.drop(columns=['Unnamed: 0.1', 'Unnamed: 0', 'Hashtags', 'Day', 'Hour'])
df1['Sentiment'] = df1['Sentiment'].str.lower().str.strip()

In [4]:
train_df, test_df = train_test_split(df1, test_size=0.1, random_state=42)
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

train_dataset = Dataset.from_pandas(train_df[['Text', 'Sentiment']])
test_dataset = Dataset.from_pandas(test_df[['Text', 'Sentiment']])

dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})


In [5]:
dataset.shape

{'train': (658, 2), 'test': (74, 2)}

In [6]:
dataset.column_names

{'train': ['Text', 'Sentiment'], 'test': ['Text', 'Sentiment']}

In [7]:
example = [20, 50]

dash_line = '-'.join('' for x in range(100))

for i, index in enumerate(example):
    print(dash_line)
    print("Example ", i+1)
    print(dash_line)
    print(dataset['test'][index]['Text'])
    print(dash_line)
    print('Sentiment: ')
    print(dataset['test'][index]['Sentiment'])
    print(dash_line)
    print()

---------------------------------------------------------------------------------------------------
Example  1
---------------------------------------------------------------------------------------------------
Attended a classical music concert, feeling the timeless melodies resonate. Music transcends generations. #ClassicalMusic #TimelessMelodies 
---------------------------------------------------------------------------------------------------
Sentiment: 
joy
---------------------------------------------------------------------------------------------------

---------------------------------------------------------------------------------------------------
Example  2
---------------------------------------------------------------------------------------------------
 Dancing through life with the exuberance of a carefree spirit, embracing joy and zest at every turn. 
---------------------------------------------------------------------------------------------------
Sentiment: 
zest


In [8]:
model_name = "google/flan-t5-base"
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [9]:
def print_no_trainable_param(model):
    trainable_param = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_param += param.numel()
    return f"Number of trainable model params: {trainable_param}\nAll model params: {all_param}\n{100*trainable_param/all_param}% of trainable params."
print(print_no_trainable_param(original_model))

Number of trainable model params: 247577856
All model params: 247577856
100.0% of trainable params.


## No Prompt Enginnering

In [10]:
for i, index in enumerate(example):
    text = dataset['test'][index]['Text'] \
    + "Emotion: "
    sentiment = dataset['test'][index]['Sentiment']

    input = tokenizer(text, return_tensors='pt')
    output = tokenizer.decode(
        original_model.generate(
            input['input_ids'],
            max_new_tokens=30
        )[0], skip_special_tokens=True
    )

    print(dash_line)
    print("Example ", i + 1)
    print(dash_line)
    print(f"INPUT PROMPT:\n{text}")
    print(dash_line)
    print(f"BASELINE GUIDED SENTIMENT:\n{sentiment}")
    print(dash_line)
    print(f"MODEL GENERATION - No Prompt Engineering:\n{output}\n")

---------------------------------------------------------------------------------------------------
Example  1
---------------------------------------------------------------------------------------------------
INPUT PROMPT:
Attended a classical music concert, feeling the timeless melodies resonate. Music transcends generations. #ClassicalMusic #TimelessMelodies Emotion: 
---------------------------------------------------------------------------------------------------
BASELINE GUIDED SENTIMENT:
joy
---------------------------------------------------------------------------------------------------
MODEL GENERATION - No Prompt Engineering:
The music transcends generations

---------------------------------------------------------------------------------------------------
Example  2
---------------------------------------------------------------------------------------------------
INPUT PROMPT:
 Dancing through life with the exuberance of a carefree spirit, embracing joy and zest at eve

## Few-shot Inference

In [11]:
def n_shot_prompt(example_full, example_index):
    prompt = ''
    for index in example_full:
        text = dataset['test'][index]['Text']
        sentiment = dataset['test'][index]['Sentiment']

        prompt += f"""
Post:

{text}

What is the sentiment from the text?
{sentiment}


"""
        text = dataset['test'][example_index]['Text']

        prompt += f"""
Post:

{text}

What is the sentiment from the text?
"""

    return prompt

In [12]:
example_full = [5]
example_index = 50

one_shot = n_shot_prompt(example_full, example_index)
print(one_shot)


Post:

Despite meticulous training, the swimmer faces disappointment as a split-second miscalculation costs them the lead in a crucial race. 

What is the sentiment from the text?
miscalculation



Post:

 Dancing through life with the exuberance of a carefree spirit, embracing joy and zest at every turn. 

What is the sentiment from the text?



In [13]:
Sentiment = dataset['test'][example_index]['Sentiment']

input = tokenizer(one_shot, return_tensors='pt')
output = tokenizer.decode(
    original_model.generate(
        input['input_ids'],
        max_new_tokens=50
    )[0],
    skip_special_tokens=True
)

print(dash_line)
print(f"BASELINE GUIDED Sentiment:\n{Sentiment}\n")
print(dash_line)
print(f"MODEL GENERATION - One-shot:\n{output}")

---------------------------------------------------------------------------------------------------
BASELINE GUIDED Sentiment:
zest

---------------------------------------------------------------------------------------------------
MODEL GENERATION - One-shot:
positive


In [14]:
example_full = [2, 5, 10, 15]
example_index = 50
three_shot_prompt = n_shot_prompt(example_full, example_index)
print(three_shot_prompt)


Post:

 Eyes wide open in the night, fearful shadows dancing on the walls, the mind a prisoner of imagined horrors. 

What is the sentiment from the text?
fearful



Post:

 Dancing through life with the exuberance of a carefree spirit, embracing joy and zest at every turn. 

What is the sentiment from the text?

Post:

Despite meticulous training, the swimmer faces disappointment as a split-second miscalculation costs them the lead in a crucial race. 

What is the sentiment from the text?
miscalculation



Post:

 Dancing through life with the exuberance of a carefree spirit, embracing joy and zest at every turn. 

What is the sentiment from the text?

Post:

Sorrowful echoes, a symphony of pain played by the strings of loss. 

What is the sentiment from the text?
sorrow



Post:

 Dancing through life with the exuberance of a carefree spirit, embracing joy and zest at every turn. 

What is the sentiment from the text?

Post:

 Heartfelt sadness after bidding farewell to a dear frien

In [15]:
Sentiment = dataset['test'][example_index]['Sentiment']

input = tokenizer(three_shot_prompt, return_tensors='pt')
output = tokenizer.decode(
    original_model.generate(
        input['input_ids'],
        max_new_tokens=50
    )[0],
    skip_special_tokens=True
)

print(dash_line)
print(f"BASELINE GUIDED Sentiment:\n{Sentiment}\n")
print(dash_line)
print(f"MODEL GENERATION - One-shot:\n{output}")

---------------------------------------------------------------------------------------------------
BASELINE GUIDED Sentiment:
zest

---------------------------------------------------------------------------------------------------
MODEL GENERATION - One-shot:
positive


In [16]:
generate_config = GenerationConfig(max_new_token=20, do_sample=True, temperature=0.1, top_k=1)

input = tokenizer(three_shot_prompt, return_tensors='pt')
output = tokenizer.decode(
    original_model.generate(
        input['input_ids'],
        generation_config=generate_config
    )[0],
    skip_special_tokens=True
)

print(dash_line)
print(f"BASELINE GUIDED SENTIMENT:\n{Sentiment}\n")
print(dash_line)
print(f"MODEL GENERATION - Three-shot:\n{output}")



---------------------------------------------------------------------------------------------------
BASELINE GUIDED SENTIMENT:
zest

---------------------------------------------------------------------------------------------------
MODEL GENERATION - Three-shot:
positive


In [17]:
def tokenize_function(example):
    start_prompt = 'What is the Sentiment from the post?\n\n'
    end_prompt = '\n\nSentiment: '
    prompt = [start_prompt + post + end_prompt for post in example["Text"]]
    example['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids
    example['labels'] = tokenizer(example["Sentiment"], padding="max_length", truncation=True, return_tensors="pt").input_ids

    return example

In [18]:
tokenized_dataset = dataset.map(tokenize_function, batched=True)
print(tokenized_dataset.column_names)


Map:   0%|          | 0/658 [00:00<?, ? examples/s]

Map: 100%|██████████| 658/658 [00:00<00:00, 2292.74 examples/s]
Map: 100%|██████████| 74/74 [00:00<00:00, 2551.83 examples/s]

{'train': ['Text', 'Sentiment', 'input_ids', 'labels'], 'test': ['Text', 'Sentiment', 'input_ids', 'labels']}





In [19]:
tokenized_dataset = tokenized_dataset.remove_columns(['Text', 'Sentiment'])
tokenized_dataset = tokenized_dataset.filter(lambda example, index: index % 100==0, with_indices=True)

Filter: 100%|██████████| 658/658 [00:00<00:00, 2464.42 examples/s]
Filter: 100%|██████████| 74/74 [00:00<00:00, 2466.85 examples/s]


In [20]:
print(f"Train: {tokenized_dataset['train'].shape}")
print(f"Test: {tokenized_dataset['test'].shape}")

Train: (7, 2)
Test: (1, 2)


In [21]:
output_dir = f'./LLM_SenAna-{str(int(time.time()))}'
training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-5,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps=1,
    max_steps=1
)

trainer = Trainer(
    model=original_model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test']
)

In [22]:
trainer.train()

100%|██████████| 1/1 [3:54:06<00:00, 14046.68s/it]

{'loss': 38.25, 'learning_rate': 0.0, 'epoch': 1.0}
{'train_runtime': 14046.6946, 'train_samples_per_second': 0.001, 'train_steps_per_second': 0.0, 'train_loss': 38.25, 'epoch': 1.0}





TrainOutput(global_step=1, training_loss=38.25, metrics={'train_runtime': 14046.6946, 'train_samples_per_second': 0.001, 'train_steps_per_second': 0.0, 'train_loss': 38.25, 'epoch': 1.0})

In [23]:
instruct_model_path = "./LLM_SenAna-checkpoint-local"
trainer.model.save_pretrained(instruct_model_path)
tokenizer.save_pretrained(instruct_model_path)

('./LLM_SenAna-checkpoint-local\\tokenizer_config.json',
 './LLM_SenAna-checkpoint-local\\special_tokens_map.json',
 './LLM_SenAna-checkpoint-local\\tokenizer.json')

In [24]:
instruct_model = AutoModelForSeq2SeqLM.from_pretrained("./LLM_SenAna-checkpoint-local", torch_dtype=torch.bfloat16)

In [25]:
rouge = evaluate.load('rouge')

Downloading builder script: 100%|██████████| 6.27k/6.27k [00:00<?, ?B/s]


In [26]:
posts = dataset['test'][0:10]['Text']
given_sentiment = dataset['test'][0:10]['Sentiment']

original_model_sentiment = []
instruct_model_sentiment = []

for _, post in enumerate(posts):
    prompt = f"""
What is the sentiment from the post?

{post}

Summary: """
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids

    original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)
    original_model_sentiment.append(original_model_text_output)

    instruct_model_outputs = instruct_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)
    instruct_model_sentiment.append(instruct_model_text_output)

zipped_summaries = list(zip(given_sentiment, original_model_sentiment, instruct_model_sentiment))

df = pd.DataFrame(zipped_summaries, columns = ['given_sentiment', 'original_model_sentiment', 'instruct_model_sentiment'])

In [27]:
original_model_results = rouge.compute(
    predictions=original_model_sentiment,
    references=given_sentiment[0:len(original_model_sentiment)],
    use_aggregator=True,
    use_stemmer=True,
)

instruct_model_results = rouge.compute(
    predictions=instruct_model_sentiment,
    references=given_sentiment[0:len(instruct_model_sentiment)],
    use_aggregator=True,
    use_stemmer=True,
)

print('ORIGINAL MODEL:')
print(original_model_results)
print('INSTRUCT MODEL:')
print(instruct_model_results)

ORIGINAL MODEL:
{'rouge1': 0.22857142857142856, 'rouge2': 0.0, 'rougeL': 0.2, 'rougeLsum': 0.22857142857142856}
INSTRUCT MODEL:
{'rouge1': 0.2, 'rouge2': 0.0, 'rougeL': 0.2, 'rougeLsum': 0.2}


In [28]:
improvement = (np.array(list(instruct_model_results.values())) - np.array(list(original_model_results.values())))
for key, value in zip(instruct_model_results.keys(), improvement):
    print(f'{key}: {value*100:.2f}%')

rouge1: -2.86%
rouge2: 0.00%
rougeL: 0.00%
rougeLsum: -2.86%


In [29]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=32,
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

In [30]:
peft_model = get_peft_model(original_model,
                            lora_config)
print(print_no_trainable_param(peft_model))

Number of trainable model params: 3538944
All model params: 251116800
1.4092820552029972% of trainable params.


In [31]:
output_dir = f'./peft-LLM_SenAna-training-{str(int(time.time()))}'

peft_training_args = TrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3,
    num_train_epochs=1,
    logging_steps=1,
    max_steps=1
)

peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_dataset["train"],
)

In [32]:
peft_trainer.train()
peft_model_path = "./peft_LLM_SenAna-checkpoint-local"
peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)

100%|██████████| 1/1 [2:11:01<00:00, 7861.49s/it]

{'loss': 38.0, 'learning_rate': 0.0, 'epoch': 1.0}
{'train_runtime': 7861.5008, 'train_samples_per_second': 0.001, 'train_steps_per_second': 0.0, 'train_loss': 38.0, 'epoch': 1.0}





('./peft_LLM_SenAna-checkpoint-local\\tokenizer_config.json',
 './peft_LLM_SenAna-checkpoint-local\\special_tokens_map.json',
 './peft_LLM_SenAna-checkpoint-local\\tokenizer.json')

In [33]:
from peft import PeftModel, PeftConfig

peft_model_base = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base", torch_dtype=torch.bfloat16)
peft_model = PeftModel.from_pretrained(peft_model_base,
                                       './peft_LLM_SenAna-checkpoint-local',
                                       torch_dtype=torch.bfloat16,
                                       is_trainable=False)

In [38]:
posts = dataset['test'][0:2]['Text']
sentiments = dataset['test'][0:2]['Sentiment']
original_model_sentiments = []
instruct_model_sentiments = []
peft_model_sentiments = []

for idx, post in enumerate(posts):
    prompt = f"""
What is the sentiment of the post?

{post}

Sentiment: """

    input_ids = tokenizer(prompt, return_tensors="pt").input_ids

    human_baseline_text_output = sentiment[idx]

    original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

    instruct_model_outputs = instruct_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)

    peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

    original_model_sentiments.append(original_model_text_output)
    instruct_model_sentiments.append(instruct_model_text_output)
    peft_model_sentiments.append(peft_model_text_output)

peft_model_results = rouge.compute(
    predictions=peft_model_sentiments,
    references=sentiments[0:len(peft_model_sentiments)],
    use_aggregator=True,
    use_stemmer=True,
)
print('ORIGINAL MODEL:')
print(original_model_results)
print('INSTRUCT MODEL:')
print(instruct_model_results)
print('PEFT MODEL:')
print(peft_model_results)

ORIGINAL MODEL:
{'rouge1': 0.22857142857142856, 'rouge2': 0.0, 'rougeL': 0.2, 'rougeLsum': 0.22857142857142856}
INSTRUCT MODEL:
{'rouge1': 0.2, 'rouge2': 0.0, 'rougeL': 0.2, 'rougeLsum': 0.2}
PEFT MODEL:
{'rouge1': 0.5, 'rouge2': 0.0, 'rougeL': 0.5, 'rougeLsum': 0.5}
