In [1]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast, GPT2Config
from transformers import get_linear_schedule_with_warmup

import torch
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import random_split, RandomSampler, SequentialSampler

import pandas as pd

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
# model_name: ['gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl']
model_name = "gpt2-medium" 
model_save_path = '/kaggle/working/'

In [3]:
configuration = GPT2Config.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name, config=configuration)

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [4]:
tokenizer = GPT2TokenizerFast.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [5]:
# Set pad_token_id to be the same as eos_token_id
tokenizer.pad_token = tokenizer.eos_token

In [6]:
input_sequence = "I am very sad"
input_ids = tokenizer.encode(input_sequence, return_tensors='pt')

In [7]:
input_ids

tensor([[  40,  716,  845, 6507]])

In [8]:
max_length = 5
padded_input_ids = torch.nn.functional.pad(input_ids, (0, max_length - input_ids.shape[1]), 
                                           value=tokenizer.pad_token_id)

In [9]:
# Create attention mask dynamically
attention_mask = (padded_input_ids != tokenizer.pad_token_id).long()

In [10]:
attention_mask

tensor([[1, 1, 1, 1, 0]])

In [11]:
padded_input_ids = padded_input_ids.to(device)
attention_mask = attention_mask.to(device)
model = model.to(device)

In [12]:
#combine both sampling techniques
sample_outputs = model.generate( input_ids=padded_input_ids,
    attention_mask=attention_mask,
    do_sample=True,
    max_length=50,
    top_k=50,
    top_p=0.85,
    num_return_sequences=3,
    pad_token_id=tokenizer.eos_token_id )

In [13]:
# Decode and print the generated texts
for i, sample_output in enumerate(sample_outputs):
    print(f"Sample {i + 1}: {tokenizer.decode(sample_output, skip_special_tokens=True)}")

Sample 1: I am very sadAs I write this I am in my apartment in Manhattan, thinking about my mother. She was so beautiful and innocent and kind. She had a heart of gold and I remember her smile as she would cry over the phone or
Sample 2: I am very sadA woman, who is trying her best to have a healthy baby'I am very sad,' she said.

'I can't bear to watch my child suffer.'

Her parents have had their daughter with her
Sample 3: I am very sadA lot has been lost" says Aravinda. The family feels the loss is irreparable and that all hope has died. "We are in mourning and trying to find a way of life for ourselves. All that


# data preparation

In [14]:
df = pd.read_csv('/kaggle/input/chat-data/chat_data.csv')

In [15]:
df.head()

Unnamed: 0,conversations,id
0,"[{'from': 'human', 'value': ""I've been feeling...",identity_0
1,"[{'from': 'human', 'value': ""Hi, I'm feeling r...",identity_1
2,"[{'from': 'human', 'value': ""Hey, I hope you'r...",identity_2
3,"[{'from': 'human', 'value': ""I'm feeling reall...",identity_3
4,"[{'from': 'human', 'value': ""I'm feeling reall...",identity_4


In [16]:
df_500 = df.head(500)

In [17]:
df_500 = df_500['conversations']

In [18]:
type(df_500[0])

str

In [19]:
df_500[0]

'[{\'from\': \'human\', \'value\': "I\'ve been feeling so sad and overwhelmed lately. Work has become such a massive source of stress for me."}\n {\'from\': \'gpt\', \'value\': "Hey there, I\'m here to listen and support you. It sounds like work has been really challenging lately. Can you tell me more about what\'s been going on?"}\n {\'from\': \'human\', \'value\': "I recently got a promotion at work, which I thought would be exciting. But the added responsibilities and pressure have just taken a toll on my mental health. It\'s been a really moving experience for me."}\n {\'from\': \'gpt\', \'value\': "I can understand how it can be overwhelming when we\'re faced with higher expectations. It\'s okay to acknowledge your emotions and allow yourself to feel sad in this situation. It\'s an important part of the healing process. What specific challenges have you been facing at work?"}\n {\'from\': \'human\', \'value\': "Well, the workload has increased significantly, and I find it hard to 

In [20]:
import ast
# Function to convert string to list of dictionaries
def str_to_list_of_dicts(s):
    # Replace \n with , to properly format the list
    s = s.replace('\n ', ', ')
    
    # Wrap with square brackets if not already wrapped
    if not s.startswith('['):
        s = f'[{s}]'
    
    try:
        return ast.literal_eval(s)
    except Exception as e:
        print(f"Error converting string to list of dicts: {e}")
        return None

In [21]:
df_500_converted = df_500.apply(str_to_list_of_dicts)
#df_500_converted = str_to_list_of_dicts(df_500[0])

In [22]:
df_500_converted[0]

[{'from': 'human',
  'value': "I've been feeling so sad and overwhelmed lately. Work has become such a massive source of stress for me."},
 {'from': 'gpt',
  'value': "Hey there, I'm here to listen and support you. It sounds like work has been really challenging lately. Can you tell me more about what's been going on?"},
 {'from': 'human',
  'value': "I recently got a promotion at work, which I thought would be exciting. But the added responsibilities and pressure have just taken a toll on my mental health. It's been a really moving experience for me."},
 {'from': 'gpt',
  'value': "I can understand how it can be overwhelming when we're faced with higher expectations. It's okay to acknowledge your emotions and allow yourself to feel sad in this situation. It's an important part of the healing process. What specific challenges have you been facing at work?"},
 {'from': 'human',
  'value': "Well, the workload has increased significantly, and I find it hard to maintain a work-life balance

In [23]:
#creating a function that processes each list of dictionaries to extract context-response pairs:
def process_conversation(conversation_list):
    conversations = []
    context = []
    for entry in conversation_list:
        if entry['from'] == 'gpt':
            if context:
                context_text = " ".join([turn['value'] for turn in context])
                conversations.append((context_text, entry['value']))
            context.append(entry)
        else:
            context.append(entry)
    return conversations


In [24]:
# Apply the function to each element in the Series
processed_conversations = df_500_converted.apply(process_conversation)




In [25]:
processed_conversations[0]

[("I've been feeling so sad and overwhelmed lately. Work has become such a massive source of stress for me.",
  "Hey there, I'm here to listen and support you. It sounds like work has been really challenging lately. Can you tell me more about what's been going on?"),
 ("I've been feeling so sad and overwhelmed lately. Work has become such a massive source of stress for me. Hey there, I'm here to listen and support you. It sounds like work has been really challenging lately. Can you tell me more about what's been going on? I recently got a promotion at work, which I thought would be exciting. But the added responsibilities and pressure have just taken a toll on my mental health. It's been a really moving experience for me.",
  "I can understand how it can be overwhelming when we're faced with higher expectations. It's okay to acknowledge your emotions and allow yourself to feel sad in this situation. It's an important part of the healing process. What specific challenges have you been f

In [26]:
# Flatten the list of lists into a single list of context-response pairs
flat_conversations = [item for sublist in processed_conversations for item in sublist]

In [27]:
flat_conversations[4]

("I've been feeling so sad and overwhelmed lately. Work has become such a massive source of stress for me. Hey there, I'm here to listen and support you. It sounds like work has been really challenging lately. Can you tell me more about what's been going on? I recently got a promotion at work, which I thought would be exciting. But the added responsibilities and pressure have just taken a toll on my mental health. It's been a really moving experience for me. I can understand how it can be overwhelming when we're faced with higher expectations. It's okay to acknowledge your emotions and allow yourself to feel sad in this situation. It's an important part of the healing process. What specific challenges have you been facing at work? Well, the workload has increased significantly, and I find it hard to maintain a work-life balance. I've been staying late at the office, and it feels like I'm constantly under a pile of never-ending tasks. It's just so hard to keep up, and it's impacting my 

In [28]:
# converting to a dataframe
conversation_df = pd.DataFrame(flat_conversations, columns=['context', 'response'])


In [29]:
conversation_df

Unnamed: 0,context,response
0,I've been feeling so sad and overwhelmed latel...,"Hey there, I'm here to listen and support you...."
1,I've been feeling so sad and overwhelmed latel...,I can understand how it can be overwhelming wh...
2,I've been feeling so sad and overwhelmed latel...,It sounds like you're dealing with a lot of pr...
3,I've been feeling so sad and overwhelmed latel...,It's great to hear that you're already impleme...
4,I've been feeling so sad and overwhelmed latel...,"It's completely normal to feel that way, but r..."
...,...,...
4101,"Hey Alex, I'm feeling really confused and sexu...","That's a fantastic approach, Charlie. Building..."
4102,"Hey Alex, I'm feeling really confused and sexu...","That's great to hear, Charlie. Opening up abou..."
4103,"Hey Alex, I'm feeling really confused and sexu...","Wonderful, Charlie. Self-exploration is a cruc..."
4104,"Hey Alex, I'm feeling really confused and sexu...","That's an excellent approach, Charlie. Mindful..."


In [30]:
df_200=df.tail(200)
df_200

Unnamed: 0,conversations,id
98886,"[{'from': 'human', 'value': ""Hi Alex, I'm feel...",identity_99270
98887,"[{'from': 'human', 'value': ""I'm afraid, Alex....",identity_99271
98888,"[{'from': 'human', 'value': ""Hi Alex, I've bee...",identity_99272
98889,"[{'from': 'human', 'value': ""Hey Alex, I'm so ...",identity_99273
98890,"[{'from': 'human', 'value': ""Hi Alex. I hope y...",identity_99274
...,...,...
99081,"[{'from': 'human', 'value': ""Hi Alex, I hope y...",identity_99465
99082,"[{'from': 'human', 'value': ""I'm feeling reall...",identity_99466
99083,"[{'from': 'human', 'value': ""I'm feeling regre...",identity_99467
99084,"[{'from': 'human', 'value': ""Hi Alex, I'm feel...",identity_99468


In [31]:
df_200 = df_200['conversations']


In [32]:
type(df_200[98886])

str

In [33]:
df_200[98886]

'[{\'from\': \'human\', \'value\': "Hi Alex, I\'m feeling really sad lately. I just went through a breakup and it\'s been really hard for me to cope."}\n {\'from\': \'gpt\', \'value\': "Hi Charlie, I\'m really sorry to hear that you\'re feeling sad after going through a breakup. Breakups can be incredibly difficult and I\'m here to support you. When did the breakup happen?"}\n {\'from\': \'human\', \'value\': \'Thanks, Alex. The breakup happened about a month ago, but it still feels fresh and painful. I find myself constantly replaying the memories and feeling overwhelmed with sadness.\'}\n {\'from\': \'gpt\', \'value\': "It\'s completely understandable that the pain is still raw for you. Breakups can bring up a lot of emotions and it takes time to heal. What are some specific situations or triggers that make you feel particularly sad?"}\n {\'from\': \'human\', \'value\': "Whenever I pass by the places we used to go together or when I see couples holding hands, it hits me hard. It feel

In [34]:
df_200_converted = df_200.apply(str_to_list_of_dicts)


In [35]:
df_200_converted[98886]

[{'from': 'human',
  'value': "Hi Alex, I'm feeling really sad lately. I just went through a breakup and it's been really hard for me to cope."},
 {'from': 'gpt',
  'value': "Hi Charlie, I'm really sorry to hear that you're feeling sad after going through a breakup. Breakups can be incredibly difficult and I'm here to support you. When did the breakup happen?"},
 {'from': 'human',
  'value': 'Thanks, Alex. The breakup happened about a month ago, but it still feels fresh and painful. I find myself constantly replaying the memories and feeling overwhelmed with sadness.'},
 {'from': 'gpt',
  'value': "It's completely understandable that the pain is still raw for you. Breakups can bring up a lot of emotions and it takes time to heal. What are some specific situations or triggers that make you feel particularly sad?"},
 {'from': 'human',
  'value': "Whenever I pass by the places we used to go together or when I see couples holding hands, it hits me hard. It feels like I've lost a part of my

In [36]:
processed_conversations_200 = df_200_converted.apply(process_conversation)

In [37]:
processed_conversations_200[98886]

[("Hi Alex, I'm feeling really sad lately. I just went through a breakup and it's been really hard for me to cope.",
  "Hi Charlie, I'm really sorry to hear that you're feeling sad after going through a breakup. Breakups can be incredibly difficult and I'm here to support you. When did the breakup happen?"),
 ("Hi Alex, I'm feeling really sad lately. I just went through a breakup and it's been really hard for me to cope. Hi Charlie, I'm really sorry to hear that you're feeling sad after going through a breakup. Breakups can be incredibly difficult and I'm here to support you. When did the breakup happen? Thanks, Alex. The breakup happened about a month ago, but it still feels fresh and painful. I find myself constantly replaying the memories and feeling overwhelmed with sadness.",
  "It's completely understandable that the pain is still raw for you. Breakups can bring up a lot of emotions and it takes time to heal. What are some specific situations or triggers that make you feel partic

In [38]:
flat_conversations_200 = [item for sublist in processed_conversations_200 for item in sublist]


In [39]:
flat_conversations_200[4]

("Hi Alex, I'm feeling really sad lately. I just went through a breakup and it's been really hard for me to cope. Hi Charlie, I'm really sorry to hear that you're feeling sad after going through a breakup. Breakups can be incredibly difficult and I'm here to support you. When did the breakup happen? Thanks, Alex. The breakup happened about a month ago, but it still feels fresh and painful. I find myself constantly replaying the memories and feeling overwhelmed with sadness. It's completely understandable that the pain is still raw for you. Breakups can bring up a lot of emotions and it takes time to heal. What are some specific situations or triggers that make you feel particularly sad? Whenever I pass by the places we used to go together or when I see couples holding hands, it hits me hard. It feels like I've lost a part of myself and I'm not sure how to move forward without them. It sounds like those reminders serve as painful triggers for you. Let's shift our focus to exploring ways

In [40]:
conversation_df_200 = pd.DataFrame(flat_conversations_200, columns=['context', 'response'])


In [41]:
conversation_df_200

Unnamed: 0,context,response
0,"Hi Alex, I'm feeling really sad lately. I just...","Hi Charlie, I'm really sorry to hear that you'..."
1,"Hi Alex, I'm feeling really sad lately. I just...",It's completely understandable that the pain i...
2,"Hi Alex, I'm feeling really sad lately. I just...",It sounds like those reminders serve as painfu...
3,"Hi Alex, I'm feeling really sad lately. I just...","I hear you, Charlie. Loss and heartbreak can m..."
4,"Hi Alex, I'm feeling really sad lately. I just...","That's a great mindset to have, Charlie. Takin..."
...,...,...
1591,"I've been feeling really depressed lately, and...",The fact that your girlfriend cares deeply abo...
1592,"I've been feeling really depressed lately, and...","Trauma can often be a factor in addiction, Cha..."
1593,"I've been feeling really depressed lately, and...",It's completely understandable that addiction ...
1594,"I've been feeling really depressed lately, and...","Charlie, your determination to overcome addict..."


In [42]:
from transformers import GPT2TokenizerFast

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2-medium")
tokenizer.pad_token = tokenizer.eos_token 

In [43]:
def preprocess_text(text):
    return tokenizer(text, truncation=True, padding='max_length', max_length=512, return_tensors='pt')

conversation_df['context_ids'] = conversation_df['context'].apply(lambda x: preprocess_text(x)['input_ids'])
conversation_df['response_ids'] = conversation_df['response'].apply(lambda x: preprocess_text(x)['input_ids'])

In [44]:
conversation_df

Unnamed: 0,context,response,context_ids,response_ids
0,I've been feeling so sad and overwhelmed latel...,"Hey there, I'm here to listen and support you....","[[tensor(40), tensor(1053), tensor(587), tenso...","[[tensor(10814), tensor(612), tensor(11), tens..."
1,I've been feeling so sad and overwhelmed latel...,I can understand how it can be overwhelming wh...,"[[tensor(40), tensor(1053), tensor(587), tenso...","[[tensor(40), tensor(460), tensor(1833), tenso..."
2,I've been feeling so sad and overwhelmed latel...,It sounds like you're dealing with a lot of pr...,"[[tensor(40), tensor(1053), tensor(587), tenso...","[[tensor(1026), tensor(5238), tensor(588), ten..."
3,I've been feeling so sad and overwhelmed latel...,It's great to hear that you're already impleme...,"[[tensor(40), tensor(1053), tensor(587), tenso...","[[tensor(1026), tensor(338), tensor(1049), ten..."
4,I've been feeling so sad and overwhelmed latel...,"It's completely normal to feel that way, but r...","[[tensor(40), tensor(1053), tensor(587), tenso...","[[tensor(1026), tensor(338), tensor(3190), ten..."
...,...,...,...,...
4101,"Hey Alex, I'm feeling really confused and sexu...","That's a fantastic approach, Charlie. Building...","[[tensor(10814), tensor(4422), tensor(11), ten...","[[tensor(2504), tensor(338), tensor(257), tens..."
4102,"Hey Alex, I'm feeling really confused and sexu...","That's great to hear, Charlie. Opening up abou...","[[tensor(10814), tensor(4422), tensor(11), ten...","[[tensor(2504), tensor(338), tensor(1049), ten..."
4103,"Hey Alex, I'm feeling really confused and sexu...","Wonderful, Charlie. Self-exploration is a cruc...","[[tensor(10814), tensor(4422), tensor(11), ten...","[[tensor(42337), tensor(913), tensor(11), tens..."
4104,"Hey Alex, I'm feeling really confused and sexu...","That's an excellent approach, Charlie. Mindful...","[[tensor(10814), tensor(4422), tensor(11), ten...","[[tensor(2504), tensor(338), tensor(281), tens..."


In [45]:
def preprocess_text(text):
    return tokenizer(text, truncation=True, padding='max_length', max_length=512, return_tensors='pt')

conversation_df_200['context_ids'] = conversation_df_200['context'].apply(lambda x: preprocess_text(x)['input_ids'])
conversation_df_200['response_ids'] = conversation_df_200['response'].apply(lambda x: preprocess_text(x)['input_ids'])

In [46]:
conversation_df_200

Unnamed: 0,context,response,context_ids,response_ids
0,"Hi Alex, I'm feeling really sad lately. I just...","Hi Charlie, I'm really sorry to hear that you'...","[[tensor(17250), tensor(4422), tensor(11), ten...","[[tensor(17250), tensor(11526), tensor(11), te..."
1,"Hi Alex, I'm feeling really sad lately. I just...",It's completely understandable that the pain i...,"[[tensor(17250), tensor(4422), tensor(11), ten...","[[tensor(1026), tensor(338), tensor(3190), ten..."
2,"Hi Alex, I'm feeling really sad lately. I just...",It sounds like those reminders serve as painfu...,"[[tensor(17250), tensor(4422), tensor(11), ten...","[[tensor(1026), tensor(5238), tensor(588), ten..."
3,"Hi Alex, I'm feeling really sad lately. I just...","I hear you, Charlie. Loss and heartbreak can m...","[[tensor(17250), tensor(4422), tensor(11), ten...","[[tensor(40), tensor(3285), tensor(345), tenso..."
4,"Hi Alex, I'm feeling really sad lately. I just...","That's a great mindset to have, Charlie. Takin...","[[tensor(17250), tensor(4422), tensor(11), ten...","[[tensor(2504), tensor(338), tensor(257), tens..."
...,...,...,...,...
1591,"I've been feeling really depressed lately, and...",The fact that your girlfriend cares deeply abo...,"[[tensor(40), tensor(1053), tensor(587), tenso...","[[tensor(464), tensor(1109), tensor(326), tens..."
1592,"I've been feeling really depressed lately, and...","Trauma can often be a factor in addiction, Cha...","[[tensor(40), tensor(1053), tensor(587), tenso...","[[tensor(15721), tensor(7487), tensor(460), te..."
1593,"I've been feeling really depressed lately, and...",It's completely understandable that addiction ...,"[[tensor(40), tensor(1053), tensor(587), tenso...","[[tensor(1026), tensor(338), tensor(3190), ten..."
1594,"I've been feeling really depressed lately, and...","Charlie, your determination to overcome addict...","[[tensor(40), tensor(1053), tensor(587), tenso...","[[tensor(37136), tensor(11), tensor(534), tens..."


In [47]:
# Create a Dataset and DataLoader
import torch
from torch.utils.data import Dataset, DataLoader

class ConversationalDataset(Dataset):
    def __init__(self, contexts, responses):
        self.contexts = contexts
        self.responses = responses

    def __len__(self):
        return len(self.contexts)

    def __getitem__(self, idx):
        context = self.contexts[idx]
        response = self.responses[idx]
        return {'context': context, 'response': response}

dataset = ConversationalDataset(conversation_df['context_ids'].tolist(), conversation_df['response_ids'].tolist())
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)


In [48]:
#set up the model
from transformers import GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup

model = GPT2LMHeadModel.from_pretrained("gpt2-medium").to(device)
num_epochs=3
optimizer = AdamW(model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(dataloader) * num_epochs
    
)

def compute_accuracy(preds, labels):
    _, preds_max = torch.max(preds, dim=-1)
    non_ignore = labels != -100  # Ignore padding index for labels
    num_tokens = non_ignore.sum().item()
    correct = (preds_max == labels) & non_ignore
    accuracy = correct.sum().item() / num_tokens
    return accuracy




In [49]:
for epoch in range(num_epochs):
    
    model.train()
    total_loss = 0
    total_accuracy=0
    for batch in dataloader:
        if isinstance(batch['context'], list):  # Check if it's a list of tensors
            context_ids = torch.cat([tensor for tensor in batch['context']], dim=0).to(device)
            response_ids = torch.cat([tensor for tensor in batch['response']], dim=0).to(device)
        else:  # If it's already a tensor
            context_ids = batch['context'].to(device)
            response_ids = batch['response'].to(device)

        inputs = {
            'input_ids': context_ids,
            'labels': response_ids
        }
    
        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        logits = outputs.logits
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        
        accuracy = compute_accuracy(logits, response_ids)
        total_accuracy += accuracy

    avg_loss = total_loss / len(dataloader)
    avg_accuracy = total_accuracy / len(dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs} - Loss: {avg_loss:.4f} - Accuracy: {avg_accuracy:.4f}")
#     print(f"Epoch {epoch + 1}/{num_epochs} - Loss: {avg_loss:.4f}")


Epoch 1/3 - Loss: 0.6388 - Accuracy: 0.9016
Epoch 2/3 - Loss: 0.6019 - Accuracy: 0.9031
Epoch 3/3 - Loss: 0.5899 - Accuracy: 0.9034


In [50]:
pip install rouge-score


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=20037ba63406c2d803b8de827b91bd71ed65b7ca8140d6446f0d857c3bdeca0a
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2
Note: you may need to restart the kernel to use updated packages.


In [51]:
pip install bert-score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0meta [36m0:00:01[0m
[?25hInstalling collected packages: bert-score
Successfully installed bert-score-0.3.13
Note: you may need to restart the kernel to use updated packages.


In [52]:
import torch
from datasets import load_metric
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import bert_score

class ConversationalDataset(Dataset):
    def __init__(self, contexts, responses):
        self.contexts = contexts
        self.responses = responses

    def __len__(self):
        return len(self.contexts)

    def __getitem__(self, idx):
        context = self.contexts[idx]
        response = self.responses[idx]
        return {'context': context, 'response': response}

# Load your evaluation dataset
# Assuming conversation_df_200 is a DataFrame containing 'context_ids' and 'response_ids'
eval_dataset = ConversationalDataset(conversation_df_200['context_ids'].tolist(), conversation_df_200['response_ids'].tolist())
eval_dataloader = DataLoader(eval_dataset, batch_size=2, shuffle=False)


In [53]:
tokenizer = AutoTokenizer.from_pretrained("gpt2-medium", padding_side='left')


In [54]:
def generate_predictions(model, tokenizer, dataloader, device):
    model.eval()
    predictions = []
    references = []
    
    for batch in dataloader:
        context_ids = batch['context'].to(device)
        response_ids = batch['response'].to(device)

        # Remove extra dimension if present
        if context_ids.dim() == 3 and context_ids.size(1) == 1:
            context_ids = context_ids.squeeze(1)  # Remove the extra dimension

        if response_ids.dim() == 3 and response_ids.size(1) == 1:
            response_ids = response_ids.squeeze(1)  # Remove the extra dimension

        # Create attention masks (if necessary)
        if tokenizer.pad_token_id is not None:
            attention_mask = (context_ids != tokenizer.pad_token_id).long().to(device)
        else:
            attention_mask = None

        with torch.no_grad():
            outputs = model.generate(
                input_ids=context_ids,
                attention_mask=attention_mask,  # Pass attention_mask if created
                max_new_tokens=50,
                num_beams=5,
                early_stopping=True
            )

        # Decode the generated and reference texts
        predictions.extend(tokenizer.batch_decode(outputs, skip_special_tokens=True))
        references.extend(tokenizer.batch_decode(response_ids, skip_special_tokens=True))
    
    return predictions, references


In [58]:
model_save_path = "/kaggle/working/"

model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)


('/kaggle/working/tokenizer_config.json',
 '/kaggle/working/special_tokens_map.json',
 '/kaggle/working/vocab.json',
 '/kaggle/working/merges.txt',
 '/kaggle/working/added_tokens.json',
 '/kaggle/working/tokenizer.json')

In [59]:
from IPython.display import FileLink

model_save_path = '/kaggle/working/model.safetensors'
FileLink(model_save_path)


In [55]:
tokenizer = AutoTokenizer.from_pretrained("gpt2-medium", padding_side='left')


In [56]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning, message=".*attention mask.*")
warnings.filterwarnings("ignore", category=UserWarning, message=".*pad token id.*")
warnings.filterwarnings("ignore", category=UserWarning, message=".*decoder-only architecture.*")
 
warnings.filterwarnings("ignore", category=UserWarning, module='transformers')

# Generate predictions
eval_predictions, eval_references = generate_predictions(model, tokenizer, eval_dataloader, device)

# Calculate ROUGE-L Score
rouge = load_metric("rouge")
rouge_l_score = rouge.compute(predictions=eval_predictions, references=eval_references, rouge_types=["rougeL"])["rougeL"].mid.fmeasure

# Calculate BERTScore
bert_scores = bert_score.score(eval_predictions, eval_references, lang="en", rescale_with_baseline=True)
bert_f1_score = bert_scores[2].mean().item()

print(f"ROUGE-L Score: {rouge_l_score:.4f}")
print(f"BERTScore (F1): {bert_f1_score:.4f}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation

KeyboardInterrupt: 

In [None]:
def predict_function(texts):
    inputs = tokenize_texts(texts)
    
    # Ensure inputs are on the same device as the model
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_new_tokens=10,  # Reduce number of new tokens
            num_beams=1,  # Reduce number of beams
            early_stopping=True
        )
        
    return outputs.cpu().numpy()  # Convert to CPU before returning

# Example usage
sample_index = 0  # Index of the sample you want to explain
sample_context = conversation_df_200['context'].iloc[sample_index]
sample_response = conversation_df_200['response'].iloc[sample_index]

# Generate explanations
explanation = explainer.explain_instance(sample_context, predict_function, num_features=10, labels=[0], num_samples=500)

# Display the explanation
print(f"Original text: {sample_context}")
