In [None]:
# Install the Hugging Face tooling plus the ROUGE / NLTK packages used by the evaluation cells below.
!pip install -q transformers evaluate rouge-score nltk

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m41.0/84.1 kB[0m [31m1.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone


In [None]:
# Extra packages; none of the cells visible in this notebook import pytorch-lightning or wandb directly.
# NOTE(review): confirm these are still required before keeping this cell.
!pip install -q pytorch-lightning wandb -q

In [None]:
# Training dependencies: transformers with the torch extra, accelerate, torch and datasets.
!pip install transformers[torch] -q
!pip install accelerate -U -q
!pip install torch -q
!pip install datasets -q


# Upgrade transformers last so the newest release is the one actually in use.
# NOTE(review): no versions are pinned, so behaviour may differ between runs of this notebook.
!pip install --upgrade transformers -q

In [None]:
# Mount Google Drive (Colab only); the fine-tuned model is saved under /content/drive later on.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Path of the intents JSON file, relative to the notebook's working directory.
dataset_path = "../post-partum.json"

# Load the dataset

In [None]:
import json

# Read the intents file. The context manager guarantees the handle is closed
# even if parsing raises, and the explicit encoding avoids depending on the
# platform's default text encoding.
with open(dataset_path, encoding="utf-8") as dataset_file:
    # json.load returns the parsed document as a dictionary.
    json_dataset = json.load(dataset_file)

# Print each intent (tag, patterns, responses) as a quick sanity check.
for intent in json_dataset['intents']:
    print(intent)

print(json_dataset)

{'tag': 'greeting', 'patterns': ['Hi', 'Hey', 'Is anyone there?', 'Hi there', 'Hello', 'Hey there', 'Howdy', 'Hola', 'Bonjour', 'Konnichiwa', 'Guten tag', 'Ola'], 'responses': ['Hello there. Tell me how are you feeling today?', 'Hi there. What brings you here today?', 'Hi there. How are you feeling today?', 'Great to see you. How do you feel currently?', "Hello there. Glad to see you're back. What's going on in your world right now?"]}
{'tag': 'morning', 'patterns': ['Good morning'], 'responses': ["Good morning. I hope you had a good night's sleep. How are you feeling today? "]}
{'tag': 'afternoon', 'patterns': ['Good afternoon'], 'responses': ['Good afternoon. How is your day going?']}
{'tag': 'evening', 'patterns': ['Good evening'], 'responses': ['Good evening. How has your day been?']}
{'tag': 'night', 'patterns': ['Good night'], 'responses': ['Good night. Get some proper sleep', 'Good night. Sweet dreams.']}
{'tag': 'goodbye', 'patterns': ['Bye', 'See you later', 'Goodbye', 'Au rev

# DataFrame Creation
Iterate through the intents in the JSON file and pair every pattern with every response of the same intent.

In [None]:
import pandas as pd

# Pair every pattern of an intent with every response of that same intent,
# in intent -> pattern -> response order.
pattern_response_pairs = [
    (pattern, response)
    for intent in json_dataset['intents']
    for pattern in intent['patterns']
    for response in intent['responses']
]

# Split the pairs back into two parallel columns.
patterns = [pattern for pattern, _ in pattern_response_pairs]
responses = [response for _, response in pattern_response_pairs]

df = pd.DataFrame({"patterns": patterns, "responses": responses})

In [None]:
# Preview the first few pattern/response pairs.
df.head()

Unnamed: 0,patterns,responses
0,Hi,Hello there. Tell me how are you feeling today?
1,Hi,Hi there. What brings you here today?
2,Hi,Hi there. How are you feeling today?
3,Hi,Great to see you. How do you feel currently?
4,Hi,Hello there. Glad to see you're back. What's g...


# Tokenization

In [None]:
# Checkpoint name passed to both the tokenizer and the model `from_pretrained` calls.
model_checkpoint = 't5-small'

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# Task prefix prepended to every input pattern before tokenization.
prefix = 'therapy: '

# Pre-processing function

In [None]:
max_input_length = 512
max_target_length = 128

def preprocess_function(patterns, responses):
    """Tokenize pattern/response pairs into model inputs with training labels.

    Args:
        patterns: List of user input strings; `prefix` is prepended to each.
        responses: List of target response strings, parallel to `patterns`.

    Returns:
        The tokenizer output for the inputs (`input_ids`, `attention_mask`)
        with an added `labels` tensor holding the tokenized responses.
        Padding positions in `labels` are set to -100.
    """
    inputs = [prefix + pattern for pattern in patterns]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding=True, return_tensors="pt")

    # Tokenize the targets via `text_target`, which replaces the deprecated
    # `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=responses, max_length=max_target_length, truncation=True, padding=True, return_tensors="pt")

    # The labels are padded here to a common length, so no later step can tell
    # real tokens from padding. Mask the pad positions with -100 so they are
    # excluded from the loss; compute_metrics maps -100 back to the pad id
    # before decoding.
    label_ids = labels["input_ids"]
    label_ids = label_ids.masked_fill(label_ids == tokenizer.pad_token_id, -100)

    model_inputs["labels"] = label_ids
    return model_inputs

In [None]:
# Pull the two DataFrame columns out as plain Python lists and tokenize them in one batch.
patterns = df["patterns"].tolist()
responses = df["responses"].tolist()
processed_data = preprocess_function(patterns, responses)



In [None]:
# Inspect the tokenized batch (input_ids, attention_mask, labels).
processed_data

{'input_ids': tensor([[3918,   10, 2018,  ...,    0,    0,    0],
        [3918,   10, 2018,  ...,    0,    0,    0],
        [3918,   10, 2018,  ...,    0,    0,    0],
        ...,
        [3918,   10,  363,  ...,    0,    0,    0],
        [3918,   10,  363,  ...,    0,    0,    0],
        [3918,   10, 1750,  ...,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[ 8774,   132,     5,  ...,     0,     0,     0],
        [ 2018,   132,     5,  ...,     0,     0,     0],
        [ 2018,   132,     5,  ...,     0,     0,     0],
        ...,
        [16059,    11,  6261,  ...,   514,    12,     1],
        [18875,   655,    19,  ...,  6526,     6,     1],
        [18875,   655,    19,  ...,  6526,     6,     1]])}

# Fine-tuning the model

In [None]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the pretrained seq2seq model for the same checkpoint as the tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
import torch

batch_size = 16
# Strip any "organisation/" part so the output directory is just the model name.
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-therapy",  # output directory for checkpoints
    # NOTE(review): recent transformers releases rename this argument to
    # `eval_strategy`; confirm against the installed version.
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=23,
    # Generate sequences during evaluation so compute_metrics receives token ids.
    predict_with_generate=True,
    # Mixed precision needs a CUDA device; fall back to full precision on CPU
    # so this cell does not fail when no GPU is attached.
    fp16=torch.cuda.is_available(),
    push_to_hub=False,
)

In [None]:
# Batch collator handed to the trainer, built from the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
from evaluate import load

# ROUGE metric object used by compute_metrics during evaluation.
metric = load("rouge")

In [None]:
import nltk
import numpy as np

nltk.download('punkt')
# NOTE(review): newer NLTK releases load sentence-tokenizer data from
# 'punkt_tab' instead of 'punkt'; fetching both keeps sent_tokenize working
# regardless of which NLTK version is installed.
nltk.download('punkt_tab')

def compute_metrics(eval_pred):
    """Compute ROUGE scores and mean generated length for one evaluation pass.

    Args:
        eval_pred: A `(predictions, labels)` pair of token-id arrays. The
            predictions may also arrive as a tuple whose first element holds
            the generated ids.

    Returns:
        Dict of the ROUGE scores scaled to 0-100 plus `gen_len`, the mean
        number of non-padding tokens per prediction, all rounded to 4 places.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 marks padding and cannot be decoded; map it back to the pad token
    # in both arrays before decoding.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    # Note that other metrics may not have a `use_aggregator` parameter
    # and thus will return a list, computing a metric for each sentence.
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True, use_aggregator=True)
    # Scale the scores to percentages
    result = {key: value * 100 for key, value in result.items()}

    # Add mean generated length (padding, including former -100 slots, is not counted)
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
from datasets import Dataset

# `processed_data` is the column-wise tokenizer output (input_ids,
# attention_mask, labels), not a list of per-example dictionaries.
dataset = Dataset.from_dict(processed_data)
# Hold out 10% of the pairs for evaluation. The fixed seed makes the split,
# and therefore the reported metrics, reproducible across runs.
train_test_split = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,  # needed because args request evaluation every epoch
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
# Run fine-tuning; metrics are reported once per epoch as configured in `args`.
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,0.246183,22.365,9.5613,20.8095,22.0762,16.0395
2,No log,0.244031,23.1533,10.1551,21.456,22.9175,15.5658
3,No log,0.242171,22.9803,10.1576,21.3576,22.8192,15.6053
4,No log,0.24022,23.2932,10.2523,21.6548,23.0323,15.4474
5,No log,0.238541,23.5008,10.31,21.6893,23.1441,15.3421
6,No log,0.236966,23.4354,10.3112,21.6682,23.0999,15.4342
7,No log,0.235576,23.5229,10.4055,21.7554,23.1861,15.3816
8,No log,0.234453,23.7437,10.2457,21.7674,23.1937,15.2632
9,No log,0.233109,23.4879,10.1908,21.6484,23.0368,15.5526
10,No log,0.232037,23.6056,10.5371,21.9022,23.0639,15.4737




TrainOutput(global_step=989, training_loss=0.2649064946343371, metrics={'train_runtime': 173.9866, 'train_samples_per_second': 90.156, 'train_steps_per_second': 5.684, 'total_flos': 116100003790848.0, 'train_loss': 0.2649064946343371, 'epoch': 23.0})

In [None]:
import torch

# Use the first CUDA device when one is present, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

In [None]:
# Raw user messages. The task prefix is added to all of them below so that
# every prompt has the same "therapy: ..." form used during fine-tuning;
# previously the last two prompts were missing it.
user_messages = [
    'I feel sad after having a baby',
    'I am feeling overwhelmed with work',
    'What should I eat after giving birth?',
    "I'm having trouble bonding with my baby",
    "How can I manage household chores and a newborn?"
]
inputs = [prefix + message for message in user_messages]

# Tokenize inputs as a batch
input_ids = tokenizer(inputs, return_tensors='pt', padding=True, truncation=True).to(device)

# Generate responses for each input. The attention mask is passed explicitly
# because the batch is padded to a common length.
outputs = model.generate(
    input_ids.input_ids,
    attention_mask=input_ids.attention_mask,
    max_length=50,  # Example hyperparameter, adjust as needed
    num_beams=5,    # Example hyperparameter, adjust as needed
    early_stopping=True  # Example hyperparameter, adjust as needed
)

# Decode and print each generated response
for i, output in enumerate(outputs):
    print(f"Input: {inputs[i]}")
    print(f"Generated response: {tokenizer.decode(output, skip_special_tokens=True)}\n")


Input: therapy: I feel sad after having a baby
Generated response: I'm sorry to hear that. I'm here for you. I'm here for you.

Input: therapy: I am feeling overwhelmed with work
Generated response: I'm sorry to hear that. What do you think is behind this?

Input: therapy: What should I eat after giving birth?
Generated response: It's important to seek advice from a nutritionist after birth. It's important to seek advice from a nutritionist to treat your condition.

Input: I'm having trouble bonding with my baby
Generated response: I'm sorry to hear that. I'm having trouble bonding with my baby.

Input: How can I manage household chores and a newborn?
Generated response: How can I manage household chores and a newborn?



In [None]:
# Beam search with repetition controls. The sampling-only options
# (temperature, top_k, top_p) were removed: they have no effect unless
# do_sample=True is set, so the decoding here is unchanged.
# NOTE(review): if sampling was intended, restore them together with do_sample=True.
outputs = model.generate(
    input_ids.input_ids,
    attention_mask=input_ids.attention_mask,  # batch is padded, so pass the mask explicitly
    max_length=100,
    min_length=20,
    num_beams=5,
    repetition_penalty=1.2,
    length_penalty=1.0,
    no_repeat_ngram_size=2,
    early_stopping=True,
    num_return_sequences=1
)

# Decode and print each generated response
for i, output in enumerate(outputs):
    print(f"Input: {inputs[i]}")
    print(f"Generated response: {tokenizer.decode(output, skip_special_tokens=True)}\n")

Input: therapy: I feel sad after having a baby
Generated response: I'm sorry to hear that. I am here to help you with grief, anxiety and anything else you may feel at this time.

Input: therapy: I am feeling overwhelmed with work
Generated response: I'm sorry to hear that. I am here for you. Tell me why do you think you feel this way?

Input: therapy: What should I eat after giving birth?
Generated response: It's important to seek advice from a nutritionist after birth. Ask for help with your intake of vitamins and minerals.

Input: I'm having trouble bonding with my baby
Generated response: I'm sorry to hear that. I am having trouble bonding with my baby. It's important to talk to your baby about how you're feeling.

Input: How can I manage household chores and a newborn?
Generated response: How can I manage household chores and a baby's day? What do you think is the reason behind this?



In [None]:
# Target folder on the mounted Google Drive; the trailing slash matters because
# sub-folder names are appended to it by plain string formatting.
save_path = "/content/drive/MyDrive/Github/post-partum-depression-ai/"

In [None]:
# Persist the fine-tuned model and its tokenizer into separate sub-folders of `save_path`.
model.save_pretrained(f"{save_path}model/")
tokenizer.save_pretrained(f"{save_path}tokenizer/")

('/content/drive/MyDrive/Github/post-partum-depression-ai/tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/Github/post-partum-depression-ai/tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/Github/post-partum-depression-ai/tokenizer/spiece.model',
 '/content/drive/MyDrive/Github/post-partum-depression-ai/tokenizer/added_tokens.json',
 '/content/drive/MyDrive/Github/post-partum-depression-ai/tokenizer/tokenizer.json')

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Reload the saved model and tokenizer from Drive, replacing the in-memory `model` and `tokenizer`.
# NOTE(review): the reloaded model is presumably on the CPU; move it to `device` before generating — confirm.
model = AutoModelForSeq2SeqLM.from_pretrained(f"{save_path}model/")
tokenizer = AutoTokenizer.from_pretrained(f"{save_path}tokenizer/")