## Load Data

In [1]:
import pandas as pd

data = pd.read_csv('filtered_df.csv')

data.head()

Unnamed: 0.1,Unnamed: 0,questionType,asin,answerTime,unixTime,question,answerType,answer,question_lang,answer_lang,text
0,10,open-ended,4847676011,"Oct 17, 2013",1381993000.0,Where is this made?,,Made in USA Distributed by: Sergeant's Pet Car...,en,en,User: Where is this made?\nAssistant: Made in ...
1,11,open-ended,4847676011,"Nov 10, 2013",1384070000.0,Does this have an expiration date? Does it onl...,,"yes it does have an expiration date. However, ...",en,en,User: Does this have an expiration date? Does ...
2,12,open-ended,4847676011,"Jan 11, 2014",1389427000.0,"does this have any type of sugar, grane alchol...",,"Here is the list of ingredients: Sorbitol, hyd...",en,en,"User: does this have any type of sugar, grane ..."
3,13,yes/no,4847676011,"Aug 7, 2013",1375859000.0,Does this contain citric acid?,?,it is not listed as an ingredient.,en,en,User: Does this contain citric acid?\nAssistan...
4,14,yes/no,4847676011,"Aug 20, 2014",1408518000.0,"IS this product VEGAN, specifically the glycerin?",?,It is my understanding that this product is no...,en,en,"User: IS this product VEGAN, specifically the ..."


In [2]:
data=data[['asin','question','answer']]

In [3]:
data.head()

Unnamed: 0,asin,question,answer
0,4847676011,Where is this made?,Made in USA Distributed by: Sergeant's Pet Car...
1,4847676011,Does this have an expiration date? Does it onl...,"yes it does have an expiration date. However, ..."
2,4847676011,"does this have any type of sugar, grane alchol...","Here is the list of ingredients: Sorbitol, hyd..."
3,4847676011,Does this contain citric acid?,it is not listed as an ingredient.
4,4847676011,"IS this product VEGAN, specifically the glycerin?",It is my understanding that this product is no...
...,...,...,...
21960,B00JZA00HE,Does this have only one level of correction or...,It has 6 levels... But I think they are for se...
21961,B00JZA00HE,"Swimming, will it hold up to my dog swimming w...","Fully immersed in water, absolutely not. They ..."
21962,B00JZA00HE,Is this product waterproof?,We left it out in the rain on our dog and it s...
21963,B00JZA00HE,Is this too bulky for a 5 lb. Chihuahua? Will ...,It is bulky for a small dog and it is a piece ...


In [4]:
# Create dialogue text
def create_dialogue_text(row):
    return f"User: {row['question']}\nAssistant: {row['answer']}\n"

data['dialogue'] = data.apply(create_dialogue_text, axis=1)

In [5]:
from sklearn.model_selection import train_test_split
# Initialize training and test sets
train_data_list = []
test_data_list = []
# Split for each asin
for asin, group in data.groupby('asin'):
    # If sample size > 1 for this asin, split 80-20 for train/test
    if len(group) > 1:
        train_group, test_group = train_test_split(group, test_size=0.2, random_state=42)
    else:
        # If only one sample, add it to training set
        train_group = group
        test_group = pd.DataFrame(columns=group.columns)  # Empty test set portion
    
    # Add results to training and test set lists
    train_data_list.append(train_group)
    test_data_list.append(test_group)

# Combine all asin training and test sets
train_data = pd.concat(train_data_list).reset_index(drop=True)
test_data = pd.concat(test_data_list).reset_index(drop=True)

# View count comparison of each asin in training and test sets
train_asin_counts = train_data['asin'].value_counts().reset_index()
train_asin_counts.columns = ['asin', 'train_count']

test_asin_counts = test_data['asin'].value_counts().reset_index()
test_asin_counts.columns = ['asin', 'test_count']

# Merge training and test set statistics
asin_counts = pd.merge(train_asin_counts, test_asin_counts, on='asin', how='outer').fillna(0)
asin_counts['train_count'] = asin_counts['train_count'].astype(int)
asin_counts['test_count'] = asin_counts['test_count'].astype(int)

In [43]:
asin_counts.head()

Unnamed: 0,asin,train_count,test_count
0,4847676011,5,2
1,B00004X14K,7,2
2,B00006H36X,5,2
3,B00006H373,4,1
4,B00006JHRE,8,2


In [42]:
train_data.head()

Unnamed: 0,asin,question,answer,dialogue
0,4847676011,does anyone know where this is made?,Believe it or not.... the USA!,User: does anyone know where this is made?\nAs...
1,4847676011,"does this have any type of sugar, grane alchol...","Here is the list of ingredients: Sorbitol, hyd...","User: does this have any type of sugar, grane ..."
2,4847676011,"IS this product VEGAN, specifically the glycerin?",It is my understanding that this product is no...,"User: IS this product VEGAN, specifically the ..."
3,4847676011,Does this contain citric acid?,it is not listed as an ingredient.,User: Does this contain citric acid?\nAssistan...
4,4847676011,is this a paste or a gel?,It is a gel.,User: is this a paste or a gel?\nAssistant: It...


In [41]:
test_data.head()

Unnamed: 0,asin,question,answer,dialogue
0,4847676011,Where is this made?,Made in USA Distributed by: Sergeant's Pet Car...,User: Where is this made?\nAssistant: Made in ...
1,4847676011,Does this have an expiration date? Does it onl...,"yes it does have an expiration date. However, ...",User: Does this have an expiration date? Does ...
2,B00004X14K,Are these containers BPA free?,Sorry I do not know!,User: Are these containers BPA free?\nAssistan...
3,B00004X14K,Is it airtight?,"Not air tight, but it clicks closed. There is ...",User: Is it airtight?\nAssistant: Not air tigh...
4,B00006H36X,Want for something safe for my 18 year old ind...,the advantage II is a good product. have used ...,User: Want for something safe for my 18 year o...


## Tune Pre-trained GPT-2

### Import required libraries

In [11]:
pip install 'accelerate>=0.26.0'

Note: you may need to restart the kernel to use updated packages.


In [23]:
!pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
Installing collected packages: bert_score
Successfully installed bert_score-0.3.13


In [19]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=ceeefb798c08648a00585436658103badad58d99577f2ddf0a9a117b79b38861
  Stored in directory: /home/sagemaker-user/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [17]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.3.1-py3-none-any.whl (268 kB)
Installing collected packages: sentence_transformers
Successfully installed sentence_transformers-3.3.1


In [9]:
import os
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
import gzip
import json
import random
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import torch
import nltk
import evaluate
import contractions
import re
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    EarlyStoppingCallback,
    get_linear_schedule_with_warmup
)
from peft import LoraConfig, get_peft_model
import warnings

# Ignore all warnings
warnings.filterwarnings("ignore")

print('Loading complete')

# Set random seed to ensure reproducible results
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)

2024-12-13 22:54:40.770708: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-13 22:54:40.784821: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-13 22:54:40.802622: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-13 22:54:40.808119: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-13 22:54:40.820615: I tensorflow/core/platform/cpu_feature_guar

Loading complete


In [25]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /home/sagemaker-
[nltk_data]     user/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

### Model Training

In [None]:
data_dir = '/home/sagemaker-user/Data/'
model_dir = '/home/sagemaker-user/Models/'
log_dir = '/home/sagemaker-user/Logs/'

In [10]:
# Load GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Add padding token
tokenizer.pad_token = tokenizer.eos_token

train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

In [None]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

class LossLoggerCallback(TrainerCallback):
    def __init__(self):
        self.training_losses = []
        self.validation_losses = []
    
    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs and 'loss' in logs:
            self.training_losses.append(logs['loss'])
        if logs and 'eval_loss' in logs:
            self.validation_losses.append(logs['eval_loss'])

class ConversationDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=512):
        self.encodings = tokenizer(
            dataframe['dialogue'].tolist(),
            truncation=True,
            max_length=max_length,
            padding=True,
            return_tensors='pt'
        )
        self.input_ids = self.encodings['input_ids']
        self.attention_mask = self.encodings['attention_mask']
    
    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.input_ids[idx],
        }


# Create dataset objects for training and validation sets
train_dataset = ConversationDataset(train_data, tokenizer)
val_dataset = ConversationDataset(test_data, tokenizer)

# Clear cache
torch.cuda.empty_cache()

# Load pre-trained GPT-2 model
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Resize model's vocabulary to match tokenizer
model.resize_token_embeddings(len(tokenizer))

# Configure LoRA parameters
lora_config = LoraConfig(
    r=8,  # Adjust to 8
    lora_alpha=16,  # Adjust to 16
    target_modules=["attn.c_proj"],  # Ensure correct target modules
    lora_dropout=0.1,  # Reduce dropout
    bias="none",
    task_type="CAUSAL_LM"
)

# Wrap model with PEFT
model = get_peft_model(model, lora_config)

# Define training arguments
training_args = TrainingArguments(
    output_dir=model_dir+"gpt2_all",
    overwrite_output_dir=True,
    num_train_epochs=100,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=1,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    learning_rate=3e-5,
    weight_decay=0.01,
    logging_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
    fp16=True,
    report_to="none",
    dataloader_num_workers=16,
    label_smoothing_factor=0.0,
    remove_unused_columns=False,  # Add this line
)


# Calculate total training steps and warm-up steps
total_steps = len(train_dataset) // (training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps) * training_args.num_train_epochs
warmup_steps = int(0.1 * total_steps)

# Define optimizer and scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=training_args.learning_rate)

scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)

# Use DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

# Enable early stopping
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)  # Reduce patience steps

# Initialize the loss logger
loss_logger = LossLoggerCallback()

# Define Trainer with the loss logger callback
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    optimizers=(optimizer, scheduler),
    callbacks=[early_stopping, loss_logger],  # Add loss logger here
)

# Start training
trainer.train()

# Plot the losses after training
epochs = range(1, len(loss_logger.training_losses) + 1)

plt.plot(epochs, loss_logger.training_losses, label="Training Loss")
plt.plot(epochs, loss_logger.validation_losses, label="Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training Loss vs. Validation Loss")
plt.legend()
plt.show()


Epoch,Training Loss,Validation Loss
1,4.2526,3.976957
2,3.9942,3.693734
3,3.7332,3.512443
4,3.6321,3.465721
5,3.5953,3.44022
6,3.5753,3.423561
7,3.5647,3.412681
8,3.5394,3.404006
9,3.5324,3.396531
10,3.5177,3.389813


In [None]:
print("Model training completed.")

In [None]:
# Save final model
trainer.save_model(model_dir+"models_cli/gpt2")
tokenizer.save_pretrained(model_dir+"models_cli/gpt2")

In [20]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextGenerationPipeline
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import re
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer

# Load sentence transformer model
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

# Initialize other models and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained(model_dir+"models_cli/gpt2")
tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained(model_dir+"models_cli/gpt2")
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)
model.eval()

generator = TextGenerationPipeline(model=model, tokenizer=tokenizer, device=0 if device == 'cuda' else -1)

samples = test_data.reset_index(drop=True)

# Separate user and assistant dialogues
def extract_user_assistant(dialogue):
    user_pattern = r'User:(.*?)\n'
    assistant_pattern = r'Assistant:(.*?)\n'
    
    user_match = re.search(user_pattern, dialogue, re.DOTALL)
    assistant_match = re.search(assistant_pattern, dialogue, re.DOTALL)
    
    user = user_match.group(1).strip() if user_match else ''
    assistant = assistant_match.group(1).strip() if assistant_match else ''
    
    return user, assistant

samples[['User', 'Assistant']] = samples['dialogue'].apply(
    lambda x: pd.Series(extract_user_assistant(x))
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/360 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Device set to use cuda:0


In [40]:
samples[['User', 'Assistant']].head()

Unnamed: 0,User,Assistant
0,Where is this made?,Made in USA Distributed by: Sergeant's Pet Car...
1,Does this have an expiration date? Does it onl...,"yes it does have an expiration date. However, ..."
2,Are these containers BPA free?,Sorry I do not know!
3,Is it airtight?,"Not air tight, but it clicks closed. There is ..."
4,Want for something safe for my 18 year old ind...,the advantage II is a good product. have used ...


In [26]:
# Generate model replies
def generate_answer(question, tokenizer, model, device, max_length=150):
    prompt = f"User: {question}\nAssistant:"
    inputs = tokenizer.encode(prompt, return_tensors='pt').to(device)
    
    with torch.no_grad():
        outputs = model.generate(
            inputs,
            max_length=inputs.shape[1] + max_length,
            temperature=0.6,
            top_p=0.9,
            do_sample=True,
            eos_token_id=tokenizer.encode('\n')[0],
            pad_token_id=tokenizer.eos_token_id
        )
    
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    answer = generated_text.split("Assistant:")[-1].strip()
    answer = answer.split('\n')[0]
    
    return answer

samples['Generated_Assistant'] = samples['User'].apply(lambda x: generate_answer(x, tokenizer, model, device))

# Initialize ROUGE scorer
rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

from bert_score import score as bert_score

def compute_metrics(row):
    actual = row['Assistant']
    generated = row['Generated_Assistant']
    
    # Calculate BLEU score
    smoothie = SmoothingFunction().method4
    bleu_score = sentence_bleu(
        [nltk.word_tokenize(actual.lower())],
        nltk.word_tokenize(generated.lower()),
        smoothing_function=smoothie
    )
    
    # Calculate ROUGE scores
    rouge_scores = rouge.score(actual, generated)
    
    # Calculate semantic similarity (Cosine Similarity)
    cosine_sim = util.pytorch_cos_sim(
        sbert_model.encode(actual, convert_to_tensor=True),
        sbert_model.encode(generated, convert_to_tensor=True)
    ).item()
    
    # Calculate BERTScore (commented out)
    # P, R, F1 = bert_score([generated], [actual], lang="en", verbose=False)
    # bert_f1_score = F1.mean().item()
    
    # Return single values instead of Series
    return {
        'BLEU': float(bleu_score),  # Ensure float return type
        'ROUGE-1': float(rouge_scores['rouge1'].fmeasure),
        'ROUGE-2': float(rouge_scores['rouge2'].fmeasure),
        'ROUGE-L': float(rouge_scores['rougeL'].fmeasure),
        'Cosine_Similarity': float(cosine_sim),
        # 'BERTScore_F1': bert_f1_score
    }

# Calculate metrics for each sample
metrics_results = []
for idx, row in samples.iterrows():
    metrics = compute_metrics(row)
    metrics_results.append(metrics)

# Convert metrics to DataFrame and add to samples data
metrics_df = pd.DataFrame(metrics_results)
samples = samples.assign(**metrics_df)


# # Output results for each sample and format metrics
# for i in range(sample_size):
#     row = samples.iloc[i]
#     print(f"\nSample {i+1}:")
#     print(f"User: {row['User']}")
#     print(f"Actual Assistant: {row['Assistant']}")
#     print(f"Generated Assistant: {row['Generated_Assistant']}")
#     print("\nMetrics:")
    
#     # Convert Series to float format for formatted output
#     metrics = {
#         'BLEU': float(row['BLEU'].iloc[0] if isinstance(row['BLEU'], pd.Series) else row['BLEU']),
#         'ROUGE-1': float(row['ROUGE-1'].iloc[0] if isinstance(row['ROUGE-1'], pd.Series) else row['ROUGE-1']),
#         'ROUGE-2': float(row['ROUGE-2'].iloc[0] if isinstance(row['ROUGE-2'], pd.Series) else row['ROUGE-2']),
#         'ROUGE-L': float(row['ROUGE-L'].iloc[0] if isinstance(row['ROUGE-L'], pd.Series) else row['ROUGE-L']),
#         'Cosine_Similarity': float(row['Cosine_Similarity'].iloc[0] if isinstance(row['Cosine_Similarity'], pd.Series) else row['Cosine_Similarity']),
#         # 'BERTScore_F1': float(row['BERTScore_F1'].iloc[0] if isinstance(row['BERTScore_F1'], pd.Series) else row['BERTScore_F1'])
#     }
    
#     # Output formatted metrics
#     print(f"BLEU Score: {metrics['BLEU']:.4f}")
#     print(f"ROUGE-1: {metrics['ROUGE-1']:.4f}")
#     print(f"ROUGE-2: {metrics['ROUGE-2']:.4f}")
#     print(f"ROUGE-L: {metrics['ROUGE-L']:.4f}")
#     print(f"Cosine Similarity: {metrics['Cosine_Similarity']:.4f}")
#     # print(f"BERTScore (F1): {metrics['BERTScore_F1']:.4f}")
#     print("-" * 80)


# Output summary statistics
print("\nSummary Statistics:")
metrics_columns = ['BLEU', 'ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'Cosine_Similarity']
summary_stats = samples[metrics_columns].apply(lambda x: pd.to_numeric(x.iloc[0] if isinstance(x, pd.Series) else x)).describe()
print(summary_stats)



Summary Statistics:
count    5.000000
mean     0.071472
std      0.159817
min      0.000000
25%      0.000000
50%      0.000000
75%      0.000000
max      0.357362
dtype: float64


In [27]:
metrics_columns

['BLEU', 'ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'Cosine_Similarity']

In [28]:
samples.describe()

Unnamed: 0,BLEU,ROUGE-1,ROUGE-2,ROUGE-L,Cosine_Similarity
count,5124.0,5124.0,5124.0,5124.0,5124.0
mean,0.019874,0.193598,0.036133,0.146402,0.354417
std,0.029541,0.104699,0.056829,0.083182,0.201352
min,0.0,0.0,0.0,0.0,-0.177093
25%,0.006497,0.122449,0.0,0.095238,0.207933
50%,0.013905,0.1875,0.019231,0.139535,0.357313
75%,0.024147,0.258142,0.053333,0.185567,0.499388
max,0.840896,1.0,1.0,1.0,0.994396


In [39]:
samples.head()

Unnamed: 0,asin,question,answer,dialogue,User,Assistant,Generated_Assistant,BLEU,ROUGE-1,ROUGE-2,ROUGE-L,Cosine_Similarity
0,4847676011,Where is this made?,Made in USA Distributed by: Sergeant's Pet Car...,User: Where is this made?\nAssistant: Made in ...,Where is this made?,Made in USA Distributed by: Sergeant's Pet Car...,I have no idea. I have used it on my dog and s...,0.0,0.0,0.0,0.0,0.357362
1,4847676011,Does this have an expiration date? Does it onl...,"yes it does have an expiration date. However, ...",User: Does this have an expiration date? Does ...,Does this have an expiration date? Does it onl...,"yes it does have an expiration date. However, ...",I have a 10 lb. German Shepherd that is 10 yea...,0.010964,0.17284,0.0,0.148148,0.195674
2,B00004X14K,Are these containers BPA free?,Sorry I do not know!,User: Are these containers BPA free?\nAssistan...,Are these containers BPA free?,Sorry I do not know!,"Yes, they are BPA free.",0.0,0.0,0.0,0.0,0.14632
3,B00004X14K,Is it airtight?,"Not air tight, but it clicks closed. There is ...",User: Is it airtight?\nAssistant: Not air tigh...,Is it airtight?,"Not air tight, but it clicks closed. There is ...",No. The door is open but the cat is not able t...,0.003271,0.225352,0.028986,0.169014,0.254841
4,B00006H36X,Want for something safe for my 18 year old ind...,the advantage II is a good product. have used ...,User: Want for something safe for my 18 year o...,Want for something safe for my 18 year old ind...,the advantage II is a good product. have used ...,I would say go with the old ones. I think you ...,0.02954,0.23622,0.048,0.141732,0.234486


In [30]:
sorted_samples = samples.sort_values(by='BLEU', ascending=False)

In [38]:
sorted_samples.head()

Unnamed: 0,asin,question,answer,dialogue,User,Assistant,Generated_Assistant,BLEU,ROUGE-1,ROUGE-2,ROUGE-L,Cosine_Similarity
1624,B00164PW9S,Does this come with a thermostat? If so can it...,It does not come with a thermostat,User: Does this come with a thermostat? If so ...,Does this come with a thermostat? If so can it...,It does not come with a thermostat,It does not come with a thermostat.,0.840896,1.0,1.0,1.0,0.95847
939,B000ELSM1E,Can you please confirm that this item comes wi...,yes it comes with 6 tray refills,User: Can you please confirm that this item co...,Can you please confirm that this item comes wi...,yes it comes with 6 tray refills,"Yes, it comes with 6 tray refills.",0.610474,1.0,1.0,1.0,0.994396
2014,B001OE1RDK,is this product silica free?,yes is silica free.,User: is this product silica free?\nAssistant:...,is this product silica free?,yes is silica free.,"Yes, it is silica free.",0.434721,0.888889,0.571429,0.888889,0.981865
3912,B007EDRCVG,Is this product made in the USA?,"Yes, this product is made in the USA.",User: Is this product made in the USA?\nAssist...,Is this product made in the USA?,"Yes, this product is made in the USA.",Yes it is made in the USA,0.419174,0.8,0.615385,0.8,0.922722
3363,B00500ITEO,What is the interior height of small size? Thanks,"Hello- it is about 18"", very roomy.",User: What is the interior height of small siz...,What is the interior height of small size? Thanks,"Hello- it is about 18"", very roomy.","I think it is about 18"" tall.",0.418013,0.571429,0.5,0.571429,0.713786
