<b> 1. LOADING THE DATASET </b>

We import the dataset from the referenced open-source repository.

In [None]:
# Authenticate against the Hugging Face Hub, then download the dataset
from datasets import load_dataset
from huggingface_hub import login

login(
    token='...',
    add_to_git_credential=True,
    new_session=False,
    write_permission=True,
)

# The TweetSumm dataset is available on Hugging Face in the Salesforce/dialogstudio repository
dataset = load_dataset("Salesforce/dialogstudio", "TweetSumm")
dataset  # Display the content of the dataset

<b> 2. DATA CLEANING </b>

The data comes from Twitter conversations and therefore contains information such as usernames preceded by @, URLs, and other contextual elements. These elements can be identified by looking at some data samples, and then removed.

In [None]:
def clean_data(data):
    """Strip Twitter-specific noise from a piece of dialog text.

    Removes http/https URLs, @-mentions and tokens introduced by ``^``
    (names and initials), and replaces underscores with spaces. Whitespace
    around a removed token is left untouched.

    Args:
        data: The raw text to clean.

    Returns:
        The cleaned text.
    """
    # Imported locally so the function does not depend on another cell
    # having imported ``re`` beforehand.
    import re

    # Raw strings keep ``\S`` from being read as an (invalid) string escape.
    data = re.sub(r"https?://\S+", "", data)  # Deleting urls (http and https)
    data = re.sub(r"@\S+", "", data)  # Deleting twitter usernames preceded by @
    data = data.replace("_", " ")  # Replacing underscores with spaces
    data = re.sub(r"\^[^ ]+", "", data)  # Deleting names and initials preceded by ^
    return data

<b> 3. FORMATTING THE DATASET </b>

The dataset contains a lot of information that will not be used during our training, so the content and the format of the dataset will be adjusted to match our needs.

In [None]:
# Load each split into a pandas DataFrame so it can be inspected and reshaped:
# training, validation and testing sets, in that order
import pandas as pd

training_data, validation_data, testing_data = (
    pd.DataFrame(dataset[split]) for split in ("train", "validation", "test")
)

In [None]:
# Only the "log" column (the conversation) and the "original dialog info" column
# (the summaries) will be used, so every other column is removed from all three splits.
unused_columns = ['original dialog id', 'new dialog id', 'prompt', 'dialog index']

training_data = training_data.drop(columns=unused_columns)  # Training data
validation_data = validation_data.drop(columns=unused_columns)  # Validation data
testing_data = testing_data.drop(columns=unused_columns)  # Testing data

summary_column = training_data["original dialog info"]


The log column contains several data entries. We select only the entries we need: "user utterance", containing the user message, and "system response", containing the agent response. We then group them into a single column to obtain the full dialog between user and agent.

In [None]:
# We create a helper function to update the content of the "log" column.
def formatting_log(column,data,index):
    """Store ``data`` in ``column`` at ``index``, replacing the previous entry in place."""
    column[index] = data

<b> We create a function that takes the "log" column from the dataset as a parameter and updates the content of every row of the column with the text dialog, using the previous helper function. </b>

In [None]:
# Function that will be used to format the dialog column of the training and validation data
import re

def format_column(column):
    """Replace every row of ``column`` with its dialog rendered as plain text.

    Each row is iterated as a sequence of turns, and every turn is read through
    the keys "user utterance" and "system response". Both texts are cleaned
    with ``clean_data`` and written as a "user:" line followed by an "agent:"
    line. The column is modified in place through ``formatting_log``; nothing
    is returned.
    """
    for i, row in enumerate(column):
        lines = []
        for turn in row:
            user = clean_data(turn["user utterance"])  # The user message
            agent = clean_data(turn["system response"])  # The agent response
            lines.append(f"user:{user}\nagent:{agent}\n")
        # Joining all turns gives the full dialog, which replaces the raw log entry
        formatting_log(column, "".join(lines), i)

In [None]:
#Formatting dialog training text using the previous function
column_log = training_data["log"]
format_column(column_log)
# Write the result back explicitly: in-place edits of a selected column are not
# guaranteed to reach the parent DataFrame (e.g. under pandas Copy-on-Write).
training_data["log"] = column_log
print(column_log[0])

In [None]:
#Formatting dialog validation text using the previous function
validation_dialog = validation_data["log"]
format_column(validation_dialog)
# Write the result back explicitly: in-place edits of a selected column are not
# guaranteed to reach the parent DataFrame (e.g. under pandas Copy-on-Write).
validation_data["log"] = validation_dialog

In [None]:
#Formatting dialog testing text using the previous function
testing_dialog = testing_data["log"]
format_column(testing_dialog)
# Write the result back explicitly: in-place edits of a selected column are not
# guaranteed to reach the parent DataFrame (e.g. under pandas Copy-on-Write).
testing_data["log"] = testing_dialog

In [None]:
# Print the first dialog of the validation dataset to make sure that its format is correct.
print(validation_dialog[0])

In [None]:
# We create a function used to format summary columns of the training and validation datasets
import json

def format_column_summary(column):
    """Replace every JSON entry of ``column`` with its first abstractive summary.

    Each entry is parsed as JSON. The first item found under
    ``["summaries"]["abstractive_summaries"]`` (a sequence of strings) is
    joined with single spaces and stored back at the same position. The column
    is modified in place and nothing is returned.
    """
    for i, raw in enumerate(column):
        info = json.loads(raw)
        sentences = info["summaries"]["abstractive_summaries"][0]
        column[i] = " ".join(sentences)

In [None]:
# We use the previously created function to update the content of the "original dialog info" column in the training data
summary_training = training_data["original dialog info"]
format_column_summary(summary_training)
# Write the result back explicitly: in-place edits of a selected column are not
# guaranteed to reach the parent DataFrame (e.g. under pandas Copy-on-Write).
training_data["original dialog info"] = summary_training

# We print the first row of the training column as a verification
print(summary_training[0])

In [None]:
# We do the same for the "original dialog info" column in the validation set
summary_validation = validation_data["original dialog info"]
format_column_summary(summary_validation)
# Write the result back explicitly: in-place edits of a selected column are not
# guaranteed to reach the parent DataFrame (e.g. under pandas Copy-on-Write).
validation_data["original dialog info"] = summary_validation

# We print the first row of the validation column as a verification
print(summary_validation[0])

In [None]:
# We do the same for the "original dialog info" column in the testing set
summary_testing = testing_data["original dialog info"]
format_column_summary(summary_testing)
# Write the result back explicitly: in-place edits of a selected column are not
# guaranteed to reach the parent DataFrame (e.g. under pandas Copy-on-Write).
testing_data["original dialog info"] = summary_testing

# We print the first row of the testing column as a verification
print(summary_testing[0])

In [None]:
# Give both remaining columns self-explanatory names ("dialog" and "summary")
# and fix their order, for every split

column_names = {'original dialog info': 'summary', 'log': 'dialog'}

training_data = training_data.rename(columns=column_names)[['dialog', 'summary']]

validation_data = validation_data.rename(columns=column_names)[['dialog', 'summary']]

testing_data = testing_data.rename(columns=column_names)[['dialog', 'summary']]

We create a dataset dictionary containing all the sets so that we can push it to the Hugging Face Hub and use it for the evaluation part.

In [None]:
# Convert each pandas DataFrame into a Hugging Face Dataset so that the splits can be
# gathered in a dataset dictionary, pushed to the Hub and used for the evaluation part.

from datasets import Dataset, DatasetDict

dataset_training, dataset_validation, dataset_testing = (
    Dataset.from_pandas(frame)
    for frame in (training_data, validation_data, testing_data)
)

In [None]:
# Gather the training, validation and testing sets in a single dataset dictionary
from datasets import Dataset, DatasetDict

splits = {
    'training': dataset_training,
    'validation': dataset_validation,
    'testing': dataset_testing,
}
final_dataset = DatasetDict(splits)

In [None]:
# We visualise the resulting dataset dictionary (training, validation and testing sets)
final_dataset

In [None]:
# We store the current dataset to Hugging Face Hub to use it for the model evaluation.
# The access token is read from the HF_TOKEN environment variable rather than being
# hard-coded in the notebook; when the variable is unset, None is passed and
# push_to_hub falls back to the token saved locally at login.
# NOTE: a token that has been committed in clear text should be revoked.
import os

final_dataset.push_to_hub("Dialog-Summarization-Dataset", token=os.environ.get("HF_TOKEN"))

<b> 4. CONVERTING THE DATASET INTO THE REQUIRED INSTRUCTION FORMAT </b>

To be able to fine-tune the Llama2-7b-chat model, we need to convert the dataset into the instruction format required by the model. Each row in the instruction dataset should be in the format:
<s> [INST] <<SYS>> {{system_prompt}} <</SYS>> {{input}} [/INST] {{summary}} </s>
Where: {{system_prompt}} represents the default prompt used in the dataset, {{input}} represents the dialog to be summarised, and {{summary}} represents the corresponding summary of the dialog.
So we need to reformat the current dataset to match this specific format.

In [None]:
# We create a default_prompt that will be used as the system prompt of every row in the instruction dataset
default_prompt = "The following text is a conversation between a user and an AI agent. Write a summary of the conversation."

In [None]:
# Wrap every training example in the instruction format of the Llama2 chat model
import pandas as pd

df_training = pd.DataFrame(dataset_training)


def _as_training_prompt(row):
    """Render one dialog/summary row in the Llama2 chat instruction format."""
    return f"""<s> [INST] <<SYS>> {default_prompt} <</SYS>> {row['dialog']} [/INST] {row['summary']} </s>"""


df_training['text'] = df_training.apply(_as_training_prompt, axis=1)

# Only the formatted "text" column is kept
dataset_training = df_training[['text']]

In [None]:
# We convert the validation set into the instruction format for the Llama2 chat model
df_validation = pd.DataFrame(dataset_validation)

df_validation['text'] = df_validation.apply(
    # The opening tag must be exactly "[INST]" (no inner space), as for the training set
    lambda row: f"""<s> [INST] <<SYS>> {default_prompt} <</SYS>> {row['dialog']} [/INST] {row['summary']} </s>""",axis=1
)

# Only the formatted "text" column is kept
dataset_validation = df_validation[['text']]


In [None]:
# Convert the pandas DataFrames into Hugging Face Datasets so they can be stored on the Hub;
# the testing set is left out because it will not be used during the training
dataset_training_formatted, dataset_validation_formatted = (
    Dataset.from_pandas(frame) for frame in (dataset_training, dataset_validation)
)

In [None]:
# Gather the formatted training and validation sets in a single dataset dictionary
from datasets import Dataset, DatasetDict

formatted_splits = {
    'training': dataset_training_formatted,
    'validation': dataset_validation_formatted,
}
final_dataset_formatted = DatasetDict(formatted_splits)


In [None]:
# We visualise the content of the formatted dataset dictionary (training and validation sets)
final_dataset_formatted

Our final dataset is now ready, so we can push it to the Hugging Face Hub.

In [None]:
# We store the final formatted dataset in Hugging Face Hub to use it for finetuning.
# The access token is read from the HF_TOKEN environment variable rather than being
# hard-coded in the notebook; when the variable is unset, None is passed and
# push_to_hub falls back to the token saved locally at login.
# NOTE: a token that has been committed in clear text should be revoked.
import os

final_dataset_formatted.push_to_hub("Dialog-Summarization-Dataset-Formatted", token=os.environ.get("HF_TOKEN"))

Reference: TweetSumm Dataset https://huggingface.co/datasets/Salesforce/dialogstudio/tree/main/dialogue_summarization/TweetSumm