In [4]:
from datasets import load_dataset
from transformers import BartTokenizer, BartForConditionalGeneration
import argparse, os, string, sys
import torch
from pathlib import Path
import random 

from torch.optim import AdamW
import argparse, os, string, sys
import torch
from pathlib import Path
from datasets import load_dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import T5Tokenizer, T5ForConditionalGeneration, get_linear_schedule_with_warmup
from transformers import GPT2Tokenizer, GPT2LMHeadModel

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from tqdm import tqdm
import numpy as np

from peft import PromptEmbedding, PromptTuningConfig, PrefixTuningConfig
from peft import PromptTuningConfig, PromptTuningInit, get_peft_model

import json

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [37]:

class ConvoDataset(Dataset):
    def __init__(self, texts, tokenizer, max_len, num_virtual_tokens=None):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.num_vitual_tokens = num_virtual_tokens

    def __len__(self):
        return len(self.texts)

    def create_text(self, idx):
        """
        Extracts a segment of the chat up to a randomly selected system message 
        occurring after the third message in the conversation, along with the target system message.
        
        Parameters:
        - idx: Index of the conversation in the dataset.
        
        Returns:
        - processed_chat: A string containing the conversation up to the target system message.
        - target_message: The selected target system message.
        """
        
        chat = []
        sys_idx = []

        # Attempt to load the JSON data safely
        try:
            dialog = json.loads(self.texts[idx]['text'])['dialog']
        except (KeyError, ValueError) as e:
            print(f"Error loading conversation data: {e}")
            return "", ""


        for e, i in enumerate(dialog):
            if i['speaker'] == 'usr':
                chat.append(i['text'])
            if i['speaker'] == 'sys':
                sys_idx.append(e)
                chat.append(f"[{i['strategy']}] " + i['text'])
            continue
    
        target_idx = int(random.choice(sys_idx))
        while target_idx < 4:
            target_idx = int(random.choice(sys_idx))

        #target_idx = sys_idx[3]
        processed_chat = "\n".join([c for e, c in enumerate(chat[:target_idx]) if e not in sys_idx])

        # print(processed_chat + "\n\n")
        #print(chat[target_idx])

        return  "Provide suggestions, affirmations or reflection of feelings for the following person's needs. "+processed_chat, chat[target_idx]

    def __getitem__(self, idx):
        processed_chat, target_message = self.create_text(idx)

        # Tokenize the processed chat
        inputs = self.tokenizer(
            processed_chat,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors='pt',
        )

        # Tokenize the target message
        targets = self.tokenizer(
            target_message,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors='pt',
        )

        # Since the model expects labels to calculate loss, create labels by shifting the targets to the right
        # This will be automatically handled if using a model that supports labels (like T5 or BART from Hugging Face)
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'labels': targets['input_ids'].flatten(),
        }

        return inputs

from torch.utils.data import Dataset
import torch
import json
import random
from torch.nn.utils.rnn import pad_sequence

class CounselingDataset(Dataset):
    def __init__(self, texts, responses, tokenizer, max_len, num_virtual_tokens=None):
        self.texts = texts
        self.responses = responses
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.num_virtual_tokens = num_virtual_tokens

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        context = self.texts[idx]
        response = self.responses[idx]
        prompt = "Provide counseling, suggestions and emotional support for this person:"

        # Tokenize the processed chat
        inputs = self.tokenizer(
            prompt + context,
            add_special_tokens=True,
            max_length=self.max_len,
            #padding="max_length",
            truncation=True,
            return_tensors='pt',
        )

        # Tokenize the target message
        targets = self.tokenizer(
            response,
            add_special_tokens=True,
            max_length=self.max_len,
            #padding="max_length",
            truncation=True,
            return_tensors='pt',
        )

        #print(f"Input sequence length: {inputs['input_ids'].shape[1]}")
        #print(f"Target sequence length: {targets['input_ids'].shape[1]}")

        # Calculate the number of padding zeros needed
        num_padding_zeros = self.num_virtual_tokens

        # Create a tensor of zeros for padding the attention mask
        zeros_padding = torch.zeros(num_padding_zeros, dtype=torch.long).to(inputs['attention_mask'].device)

        # Concatenate the zeros to the beginning of the attention mask tensor
        #adjusted_attention_mask = torch.cat([zeros_padding, inputs['attention_mask'].flatten()], dim=0)
        adjusted_inputs = torch.cat([zeros_padding, inputs['input_ids'].flatten()], dim=0)
        return {
            'input_ids': adjusted_inputs,
            'attention_mask': inputs['attention_mask'].flatten(),
            'labels': targets['input_ids'].flatten(),
        }
        
def dynamic_padding_collate_fn(batch):
    # Separate the components of the batch
    input_ids = [item['input_ids'] for item in batch]
    attention_masks = [item['attention_mask'] for item in batch]
    labels = [item['labels'] for item in batch]
    
    # Dynamically pad the sequences based on the longest sequence in the batch
    input_ids_padded = pad_sequence(input_ids, batch_first=True, padding_value=0)
    attention_masks_padded = pad_sequence(attention_masks, batch_first=True, padding_value=0)
    labels_padded = pad_sequence(labels, batch_first=True, padding_value=-100)  # Assuming -100 is ignored by your model's loss function
    
    return {
        'input_ids': input_ids_padded,
        'attention_mask': attention_masks_padded,
        'labels': labels_padded
    }




In [38]:
import gc
torch.cuda.empty_cache()
gc.collect()

def read_data():
    # Load pre-trained DistilBERT model and tokenizer
    # tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', output_attentions=True)
    # model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2, output_attentions=True)
    prompt_tuning_init_text = "Provide counseling, suggestions and emotional support for this person:"#"Provide suggestions, affirmations or reflection of feelings for the following person's needs. "
    tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small",legacy=False)
    model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small")
    #tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
    #tokenizer.pad_token = tokenizer.eos_token
    #model = GPT2LMHeadModel.from_pretrained("distilgpt2")

    prompt_config = PromptTuningConfig(
    peft_type="PROMPT_TUNING",
    task_type="CAUSAL_LM",
    num_virtual_tokens=len(tokenizer(prompt_tuning_init_text)["input_ids"]),
    token_dim=768,
    num_transformer_submodules=1,
    num_attention_heads=12,
    num_layers=12,
    prompt_tuning_init="TEXT",
    prompt_tuning_init_text=prompt_tuning_init_text,
    tokenizer_name_or_path="t5-small",
    )
    
    num_virtual_tokens = 20
    prefix_config = PrefixTuningConfig(
    peft_type="PROMPT_TUNING",
    task_type="CAUSAL_LM",
    num_virtual_tokens=num_virtual_tokens,
    token_dim=768,
    num_transformer_submodules=1,
    num_attention_heads=12,
    num_layers=12,
    encoder_hidden_size=768,
    )
    
    peft_model = get_peft_model(model, prefix_config)
    peft_model.print_trainable_parameters()

    # Define optimizer and learning rate
    optimizer = AdamW(model.parameters(), lr=opts.lr)
    
    # Define training parameters
    epochs = opts.epochs
    batch_size = opts.batchsize

    max_len = 512  # Maximum sequence length for DistilBERT

    # Move model to appropriate device (GPU if available, else CPU)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    peft_model.to(device)

    dataset = load_dataset("Amod/mental_health_counseling_conversations")["train"]
    train_texts, val_texts, train_labels, val_labels = train_test_split(dataset['Context'], dataset['Response'], test_size=0.2, random_state=42)

    train_dataset = CounselingDataset(train_texts, train_labels, tokenizer, max_len, num_virtual_tokens)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=dynamic_padding_collate_fn)
    
    val_dataset = CounselingDataset(val_texts, val_labels, tokenizer, max_len, num_virtual_tokens)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=dynamic_padding_collate_fn)

    lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_loader) * epochs),
    )
    # Iterate over epochs
    for epoch in range(epochs):
        peft_model.train()
        total_loss = 0

        # Wrap train_loader with tqdm
        for batch_idx, batch in enumerate(tqdm(train_loader, desc=f'Epoch {epoch + 1}/{epochs}', unit='batch')):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()

            outputs = peft_model(input_ids, labels=labels,  attention_mask=attention_mask,)
            # print(dataset["Context"][0])
            inputs = tokenizer(dataset["Context"][0], return_tensors="pt")
            # print(inputs)
           
            # if batch_idx == 10:
            #     print(model.base_model.prepare_inputs_for_generation)
            #     output_token_ids = model.generate(input_ids=inputs["input_ids"].to(device), attention_mask=inputs["attention_mask"].to(device))
            #     decoded_texts = [tokenizer.decode(ids, skip_special_tokens=True) for ids in output_token_ids]
            #     for text in decoded_texts:
            #         print("Context \n", text)
            #     decoded_labels = [tokenizer.decode(ids, skip_special_tokens=True) for ids in labels]
            #     for label in decoded_labels:
            #         print("Predicted Response \n",  label)
            #     print("Real Response \n", dataset["Response"][0])
            #     break

            loss = outputs.loss
            total_loss += loss.item()

        """
        inputs = tokenizer(prompt_tuning_init_text + dataset["Context"][0], return_tensors="pt")
        #print(model.base_model.prepare_inputs_for_generation)
        output_token_ids = model.generate(input_ids=inputs["input_ids"].to(device), attention_mask=inputs["attention_mask"].to(device))
        decoded_texts = [tokenizer.decode(ids, skip_special_tokens=True) for ids in output_token_ids]
        print("Context \n", dataset["Context"][0])
        decoded_labels = [tokenizer.decode(ids, skip_special_tokens=True) for ids in labels]
        for label in decoded_labels:
            print("Predicted Response \n",  label)
        print("Real Response \n", dataset["Response"][0])
        """
        
        avg_train_loss = total_loss / len(train_loader)

        print(f'Epoch {epoch + 1}/{epochs}, Training Loss: {avg_train_loss:.4f}')


class Hyperparameters():
    def __init__(self):
        self.inputfile = os.path.join('data', 'input', 'ddr.csv')
        self.epochs = 100
        self.batchsize = 8
        self.lr = 5e-6

opts = Hyperparameters()        
read_data()
    

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


trainable params: 368,640 || all params: 60,875,264 || trainable%: 0.6055661623085528


Epoch 1/100:   0%|                                                                                                           | 0/352 [00:00<?, ?batch/s]


ValueError: There should be 4 past states. 2 (past / key) for cross attention. Got 2 past key / value states

-b 2
Context 
 I'm going through some things with my feelings and myself. I barely sleep and I do nothing but think about how I'm worthless and how I shouldn't be here.
   I've never tried or contemplated suicide. I've always wanted to fix my issues, but I never get around to it.
   How can I change my feeling of being worthless to everyone?
Predicted Response 
 It sounds like you may be asking two different questions.With regard to what you said about your husband dressing as a female in your bedroom, I wonder if you would consider asking him more about this. If you choose to do that, I would suggest that you ask him whether a certain time is a good time to have a conversation and asking questions for five minutes or more that are related to you learning more about his experience. This can be difficult to do at times, particularly when you may want to offer your own opinions or become very anxious or of type. Consider thinking of a phrase that may help you to stay calm during the discussion. It may be helpful to think of yourself as asking questions as if you were an investigative reporter and using questions that start with words like "what, how, who, where, when." Questions that start with "why," can be very difficult to answer for some people and can be overwhelming because it often links to answers involving emotions that may or may not be understood. Also try restating what your husband is saying to make sure that you are understanding correctly. If what he is telling you is different than what you have heard or thought of for many years, it may be challenging to follow his meaning initially. Remember that listening to your husband does not imply agreement with what he is saying, just that you are following and looking to understand what he is experiencing. I also recommend sticking to one topic for the conversation, but this could be done with many different topics over time.You could also see if he would be willing to have a discussion where he listens like an investigative reporter to learn more about the experience that you are having.As far as what you mentioned about the sexual experience, maybe if you can discuss what it is that you don't like and/or understand what it is that he does like, you could see if there is some middle ground here. It depends on what you both prefer.These types of conversations can be difficult to have for some couples, at least initially. Having structured conversations, such as the ones I've described briefly above, can feel awkward initially, but the reason it can be helpful is because it can lead to further understanding in a way that decreases the chances of having an argument.Also consider seeing a therapist in your area who specializes in couples to discuss some of these ideas.
Real Response 
 If everyone thinks you're worthless, then maybe you need to find new people to hang out with.Seriously, the social context in which a person lives is a big influence in self-esteem.Otherwise, you can go round and round trying to understand why you're not worthless, then go back to the same crowd and be knocked down again.There are many inspirational messages you can find in social media.  Maybe read some of the ones which state that no person is worthless, and that everyone has a good purpose to their life.Also, since our culture is so saturated with the belief that if someone doesn't feel good about themselves that this is somehow terrible.Bad feelings are part of living.  They are the motivation to remove ourselves from situations and relationships which do us more harm than good.Bad feelings do feel terrible.   Your feeling of worthlessness may be good in the sense of motivating you to find out that you are much better than your feelings today.
Epoch 1/1, Training Loss: 18.4784



Context 


 I'm going through some things with my feelings and myself. I barely sleep and I do nothing but think about how I'm worthless and how I shouldn't be here.
   I've never tried or contemplated suicide. I've always wanted to fix my issues, but I never get around to it.
   How can I change my feeling of being worthless to everyone?

   
Predicted Response 


 Firstly, I would like to say how amazing that you have been able to live through breast cancer, sexual abuse as well as your depression and anxiety. You are stronger than you may believe at this moment for being able to continue to grow and live through these experiences. Take a moment to acknowledge your strength and know that this strength will be an asset during the therapeutic process that you will be able to tap into to find increased self esteem and more strength to address the concerns you are having. There are never too many concerns to address in therapy sessions. I hope you can find a great therapist who you can trust to listen and work with you to identify and address the most concerning issues first. Once you begin to do this you may find an alleviation of symptoms and feelings which could allow you to grow your capacity to manage difficult feelings and situations and address additional concerns. Before you even realize it the issues you are having will begin to feel more manageable.

 
Real Response 


 If everyone thinks you're worthless, then maybe you need to find new people to hang out with.Seriously, the social context in which a person lives is a big influence in self-esteem.Otherwise, you can go round and round trying to understand why you're not worthless, then go back to the same crowd and be knocked down again.There are many inspirational messages you can find in social media.  Maybe read some of the ones which state that no person is worthless, and that everyone has a good purpose to their life.Also, since our culture is so saturated with the belief that if someone doesn't feel good about themselves that this is somehow terrible.Bad feelings are part of living.  They are the motivation to remove ourselves from situations and relationships which do us more harm than good.Bad feelings do feel terrible.   Your feeling of worthlessness may be good in the sense of motivating you to find out that you are much better than your feelings today.
Epoch 1/1, Training Loss: 18.5471

In [6]:


################## Developing Summarization ###################

# from transformers import BartTokenizer, BartForConditionalGeneration
# dataset = load_dataset("Amod/mental_health_counseling_conversations")
# print(dataset)
# Load BART
"""
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')
model.to(device)
print("loaded model")

chat = []
sys_idx = []
for e, i in enumerate(json.loads(dataset['test'][0]['text'])['dialog']):
    if i['speaker'] == 'usr':
        #chat.append('[Target] ' + i['text'])  
        chat.append(i['text'])
    if i['speaker'] == 'sys':
        #if e > 3:
        #    sys_idx.append(e)
        #chat.append(f" [Context] [{i['strategy']}] " + i['text'])
       continue
    
# target_idx = int(random.choice(sys_idx))
# processed_chat = "\n".join(chat[:target_idx])

# print(processed_chat + "\n\n")
#print(chat[target_idx])
processed_chat = "\n".join(chat)
# Tokenize and generate summary
inputs = tokenizer(processed_chat, return_tensors="pt", max_length=1024, truncation=True)
inputs.to(device)
summary_ids = model.generate(inputs['input_ids'], max_length=200, min_length=10, length_penalty=2.0, num_beams=4, early_stopping=True)
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print(summary)"""

'\ntokenizer = BartTokenizer.from_pretrained(\'facebook/bart-large\')\nmodel = BartForConditionalGeneration.from_pretrained(\'facebook/bart-large\')\nmodel.to(device)\nprint("loaded model")\n\nchat = []\nsys_idx = []\nfor e, i in enumerate(json.loads(dataset[\'test\'][0][\'text\'])[\'dialog\']):\n    if i[\'speaker\'] == \'usr\':\n        #chat.append(\'[Target] \' + i[\'text\'])  \n        chat.append(i[\'text\'])\n    if i[\'speaker\'] == \'sys\':\n        #if e > 3:\n        #    sys_idx.append(e)\n        #chat.append(f" [Context] [{i[\'strategy\']}] " + i[\'text\'])\n       continue\n    \n# target_idx = int(random.choice(sys_idx))\n# processed_chat = "\n".join(chat[:target_idx])\n\n# print(processed_chat + "\n\n")\n#print(chat[target_idx])\nprocessed_chat = "\n".join(chat)\n# Tokenize and generate summary\ninputs = tokenizer(processed_chat, return_tensors="pt", max_length=1024, truncation=True)\ninputs.to(device)\nsummary_ids = model.generate(inputs[\'input_ids\'], max_length=200