## Load Data

In [1]:
import pandas as pd
data_dir = '/home/sagemaker-user/Data/'
model_dir = '/home/sagemaker-user/Models/'
log_dir = '/home/sagemaker-user/Logs/'

data = pd.read_csv(data_dir+'filtered_df.csv')

data.head()

Unnamed: 0.1,Unnamed: 0,questionType,asin,answerTime,unixTime,question,answerType,answer,question_lang,answer_lang,text
0,10,open-ended,4847676011,"Oct 17, 2013",1381993000.0,Where is this made?,,Made in USA Distributed by: Sergeant's Pet Car...,en,en,User: Where is this made?\nAssistant: Made in ...
1,11,open-ended,4847676011,"Nov 10, 2013",1384070000.0,Does this have an expiration date? Does it onl...,,"yes it does have an expiration date. However, ...",en,en,User: Does this have an expiration date? Does ...
2,12,open-ended,4847676011,"Jan 11, 2014",1389427000.0,"does this have any type of sugar, grane alchol...",,"Here is the list of ingredients: Sorbitol, hyd...",en,en,"User: does this have any type of sugar, grane ..."
3,13,yes/no,4847676011,"Aug 7, 2013",1375859000.0,Does this contain citric acid?,?,it is not listed as an ingredient.,en,en,User: Does this contain citric acid?\nAssistan...
4,14,yes/no,4847676011,"Aug 20, 2014",1408518000.0,"IS this product VEGAN, specifically the glycerin?",?,It is my understanding that this product is no...,en,en,"User: IS this product VEGAN, specifically the ..."


In [2]:
data=data[['asin','question','answer']]

In [3]:
data.head()

Unnamed: 0,asin,question,answer
0,4847676011,Where is this made?,Made in USA Distributed by: Sergeant's Pet Car...
1,4847676011,Does this have an expiration date? Does it onl...,"yes it does have an expiration date. However, ..."
2,4847676011,"does this have any type of sugar, grane alchol...","Here is the list of ingredients: Sorbitol, hyd..."
3,4847676011,Does this contain citric acid?,it is not listed as an ingredient.
4,4847676011,"IS this product VEGAN, specifically the glycerin?",It is my understanding that this product is no...


In [4]:
# Create dialogue text
def create_dialogue_text(row):
    return f"User: {row['question']}\nAssistant: {row['answer']}\n"

data['dialogue'] = data.apply(create_dialogue_text, axis=1)

In [5]:
from sklearn.model_selection import train_test_split
# Initialize training and test sets
train_data_list = []
test_data_list = []
# Split for each asin
for asin, group in data.groupby('asin'):
    # If sample size > 1 for this asin, split 80-20 for train/test
    if len(group) > 1:
        train_group, test_group = train_test_split(group, test_size=0.2, random_state=42)
    else:
        # If only one sample, add it to training set
        train_group = group
        test_group = pd.DataFrame(columns=group.columns)  # Empty test set portion
    
    # Add results to training and test set lists
    train_data_list.append(train_group)
    test_data_list.append(test_group)

# Combine all asin training and test sets
train_data = pd.concat(train_data_list).reset_index(drop=True)
test_data = pd.concat(test_data_list).reset_index(drop=True)

# View count comparison of each asin in training and test sets
train_asin_counts = train_data['asin'].value_counts().reset_index()
train_asin_counts.columns = ['asin', 'train_count']

test_asin_counts = test_data['asin'].value_counts().reset_index()
test_asin_counts.columns = ['asin', 'test_count']

# Merge training and test set statistics
asin_counts = pd.merge(train_asin_counts, test_asin_counts, on='asin', how='outer').fillna(0)
asin_counts['train_count'] = asin_counts['train_count'].astype(int)
asin_counts['test_count'] = asin_counts['test_count'].astype(int)

In [6]:
asin_counts.head()

Unnamed: 0,asin,train_count,test_count
0,4847676011,5,2
1,B00004X14K,7,2
2,B00006H36X,5,2
3,B00006H373,4,1
4,B00006JHRE,8,2


In [7]:
train_data.head()

Unnamed: 0,asin,question,answer,dialogue
0,4847676011,does anyone know where this is made?,Believe it or not.... the USA!,User: does anyone know where this is made?\nAs...
1,4847676011,"does this have any type of sugar, grane alchol...","Here is the list of ingredients: Sorbitol, hyd...","User: does this have any type of sugar, grane ..."
2,4847676011,"IS this product VEGAN, specifically the glycerin?",It is my understanding that this product is no...,"User: IS this product VEGAN, specifically the ..."
3,4847676011,Does this contain citric acid?,it is not listed as an ingredient.,User: Does this contain citric acid?\nAssistan...
4,4847676011,is this a paste or a gel?,It is a gel.,User: is this a paste or a gel?\nAssistant: It...


In [8]:
test_data.head()

Unnamed: 0,asin,question,answer,dialogue
0,4847676011,Where is this made?,Made in USA Distributed by: Sergeant's Pet Car...,User: Where is this made?\nAssistant: Made in ...
1,4847676011,Does this have an expiration date? Does it onl...,"yes it does have an expiration date. However, ...",User: Does this have an expiration date? Does ...
2,B00004X14K,Are these containers BPA free?,Sorry I do not know!,User: Are these containers BPA free?\nAssistan...
3,B00004X14K,Is it airtight?,"Not air tight, but it clicks closed. There is ...",User: Is it airtight?\nAssistant: Not air tigh...
4,B00006H36X,Want for something safe for my 18 year old ind...,the advantage II is a good product. have used ...,User: Want for something safe for my 18 year o...


## Tune Pre-trained GPT-2

### Import required libraries

In [11]:
# Import required libraries
import os
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
import gzip
import json
import random
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import torch
import nltk
import evaluate
import contractions
import re
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    EarlyStoppingCallback,
    get_linear_schedule_with_warmup
)
from peft import LoraConfig, get_peft_model
import warnings

# Ignore all warnings
warnings.filterwarnings("ignore")

print('Loading complete')

# Set random seed to ensure reproducible results
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)

2025-01-31 21:27:39.253144: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-31 21:27:39.267911: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-31 21:27:39.286396: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-31 21:27:39.292153: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-31 21:27:39.305274: I tensorflow/core/platform/cpu_feature_guar

Loading complete


In [11]:
!pip install contractions

Collecting contractions
  Using cached contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Using cached textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Using cached anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Using cached pyahocorasick-2.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Using cached contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Using cached textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Using cached anyascii-0.3.2-py3-none-any.whl (289 kB)
Using cached pyahocorasick-2.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (118 kB)
Installing collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.2 contractions-0.1.73 pyahocorasick-2.1.0 textsearch-0.0.24


In [13]:
!pip install langdetect

Collecting langdetect
  Using cached langdetect-1.0.9-py3-none-any.whl
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [None]:
!pip install peft

In [19]:
!pip install 'accelerate>=0.26.0'



In [18]:
pip install transformers[torch]

Collecting accelerate>=0.26.0 (from transformers[torch])
  Using cached accelerate-1.3.0-py3-none-any.whl.metadata (19 kB)
Using cached accelerate-1.3.0-py3-none-any.whl (336 kB)
Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.21.0
    Uninstalling accelerate-0.21.0:
      Successfully uninstalled accelerate-0.21.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
autogluon-multimodal 1.1.1 requires nvidia-ml-py3==7.352.0, which is not installed.
autogluon-multimodal 1.1.1 requires accelerate<0.22.0,>=0.21.0, but you have accelerate 1.3.0 which is incompatible.
autogluon-multimodal 1.1.1 requires jsonschema<4.22,>=4.18, but you have jsonschema 4.23.0 which is incompatible.
autogluon-multimodal 1.1.1 requires omegaconf<2.3.0,>=2.1.1, but you have omegaconf 2.3.0 which is incompatible.


In [9]:
pip install --upgrade transformers

Note: you may need to restart the kernel to use updated packages.


In [22]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.4.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.4.1-py3-none-any.whl (275 kB)
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-3.4.1


In [31]:
!pip install bert_score

Collecting bert_score
  Using cached bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Using cached bert_score-0.3.13-py3-none-any.whl (61 kB)
Installing collected packages: bert_score
Successfully installed bert_score-0.3.13


In [None]:
import nltk
nltk.download('punkt_tab')

### Model Training

In [9]:
data_dir = '/home/sagemaker-user/Data/'
model_dir = '/home/sagemaker-user/Models/'
log_dir = '/home/sagemaker-user/Logs/'

In [12]:
# Load GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Add padding token
tokenizer.pad_token = tokenizer.eos_token

train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

In [None]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

class ConversationDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=512):
        self.encodings = tokenizer(
            dataframe['dialogue'].tolist(),
            truncation=True,
            max_length=max_length,
            padding=True,
            return_tensors='pt'
        )
        self.input_ids = self.encodings['input_ids']
        self.attention_mask = self.encodings['attention_mask']
    
    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.input_ids[idx],
        }


# Create dataset objects for training and validation sets
train_dataset = ConversationDataset(train_data, tokenizer)
val_dataset = ConversationDataset(test_data, tokenizer)

# Clear cache
torch.cuda.empty_cache()

# Load pre-trained GPT-2 model
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Resize model's vocabulary to match tokenizer
model.resize_token_embeddings(len(tokenizer))

# Configure LoRA parameters
lora_config = LoraConfig(
    r=16,  # More expressive adaptation
    lora_alpha=32,  # Strengthen LoRA scaling
    target_modules=["attn.c_proj", "attn.q_proj", "attn.k_proj", "attn.v_proj"],  # Apply LoRA to more layers
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

# Wrap model with PEFT
model = get_peft_model(model, lora_config)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./gpt2_all",
    overwrite_output_dir=True,
    num_train_epochs=200,  # Longer training
    per_device_train_batch_size=16,  # Larger batch size
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=1,  # Better weight updates
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    learning_rate=3e-5,
    weight_decay=0.01,
    logging_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
    fp16=True,
    dataloader_num_workers=16,
    label_smoothing_factor=0.0,  # Helps prevent overfitting
    remove_unused_columns=False,  # Efficient batching
    # max_grad_norm=1.0,  # Gradient clipping
)

# Calculate total training steps and warm-up steps
total_steps = len(train_dataset) // (training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps) * training_args.num_train_epochs
warmup_steps = int(0.2 * total_steps)

# Define optimizer and scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=training_args.learning_rate)

scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)

# Use DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

# Enable early stopping
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)  # Reduce patience steps


# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    optimizers=(optimizer, scheduler),
    callbacks=[early_stopping],
)

# Start training
trainer.train()

Epoch,Training Loss,Validation Loss
1,4.2736,4.024562
2,4.2196,3.937354
3,4.0391,3.760592
4,3.8738,3.639539
5,3.7409,3.52208
6,3.6604,3.484627
7,3.6341,3.46074
8,3.6009,3.443944
9,3.588,3.43179
10,3.5682,3.42266


In [1]:
print("Model training completed.")

Model training completed.


In [None]:
# Save final model
trainer.save_model(model_dir+"models_cli/gpt2")
tokenizer.save_pretrained(model_dir+"models_cli/gpt2")

In [28]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextGenerationPipeline
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import re
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer

# Load sentence transformer model
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

# Initialize other models and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained(model_dir+'models_cli/gpt2')
tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained(model_dir+'models_cli/gpt2')
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)
model.eval()

generator = TextGenerationPipeline(model=model, tokenizer=tokenizer, device=0 if device == 'cuda' else -1)

samples = test_data.reset_index(drop=True)

# Separate user and assistant dialogues
def extract_user_assistant(dialogue):
    user_pattern = r'User:(.*?)\n'
    assistant_pattern = r'Assistant:(.*?)\n'
    
    user_match = re.search(user_pattern, dialogue, re.DOTALL)
    assistant_match = re.search(assistant_pattern, dialogue, re.DOTALL)
    
    user = user_match.group(1).strip() if user_match else ''
    assistant = assistant_match.group(1).strip() if assistant_match else ''
    
    return user, assistant

samples[['User', 'Assistant']] = samples['dialogue'].apply(
    lambda x: pd.Series(extract_user_assistant(x))
)

Device set to use cuda:0


In [29]:
samples[['User', 'Assistant']].head()

Unnamed: 0,User,Assistant
0,Where is this made?,Made in USA Distributed by: Sergeant's Pet Car...
1,Does this have an expiration date? Does it onl...,"yes it does have an expiration date. However, ..."
2,Are these containers BPA free?,Sorry I do not know!
3,Is it airtight?,"Not air tight, but it clicks closed. There is ..."
4,Want for something safe for my 18 year old ind...,the advantage II is a good product. have used ...


In [32]:
# Generate model replies
def generate_answer(question, tokenizer, model, device, max_length=150):
    prompt = f"User: {question}\nAssistant:"
    inputs = tokenizer.encode(prompt, return_tensors='pt').to(device)
    
    with torch.no_grad():
        outputs = model.generate(
            inputs,
            max_length=inputs.shape[1] + max_length,
            temperature=0.6,
            top_p=0.9,
            do_sample=True,
            eos_token_id=tokenizer.encode('\n')[0],
            pad_token_id=tokenizer.eos_token_id
        )
    
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    answer = generated_text.split("Assistant:")[-1].strip()
    answer = answer.split('\n')[0]
    
    return answer

samples['Generated_Assistant'] = samples['User'].apply(lambda x: generate_answer(x, tokenizer, model, device))

# Initialize ROUGE scorer
rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

from bert_score import score as bert_score

def compute_metrics(row):
    actual = row['Assistant']
    generated = row['Generated_Assistant']
    
    # Calculate BLEU score
    smoothie = SmoothingFunction().method4
    bleu_score = sentence_bleu(
        [nltk.word_tokenize(actual.lower())],
        nltk.word_tokenize(generated.lower()),
        smoothing_function=smoothie
    )
    
    # Calculate ROUGE scores
    rouge_scores = rouge.score(actual, generated)
    
    # Calculate semantic similarity (Cosine Similarity)
    cosine_sim = util.pytorch_cos_sim(
        sbert_model.encode(actual, convert_to_tensor=True),
        sbert_model.encode(generated, convert_to_tensor=True)
    ).item()
    
    # Calculate BERTScore (commented out)
    # P, R, F1 = bert_score([generated], [actual], lang="en", verbose=False)
    # bert_f1_score = F1.mean().item()
    
    # Return single values instead of Series
    return {
        'BLEU': float(bleu_score),  # Ensure float return type
        'ROUGE-1': float(rouge_scores['rouge1'].fmeasure),
        'ROUGE-2': float(rouge_scores['rouge2'].fmeasure),
        'ROUGE-L': float(rouge_scores['rougeL'].fmeasure),
        'Cosine_Similarity': float(cosine_sim),
        # 'BERTScore_F1': bert_f1_score
    }

# Calculate metrics for each sample
metrics_results = []
for idx, row in samples.iterrows():
    metrics = compute_metrics(row)
    metrics_results.append(metrics)

# Convert metrics to DataFrame and add to samples data
metrics_df = pd.DataFrame(metrics_results)
samples = samples.assign(**metrics_df)


# # Output results for each sample and format metrics
# for i in range(sample_size):
#     row = samples.iloc[i]
#     print(f"\nSample {i+1}:")
#     print(f"User: {row['User']}")
#     print(f"Actual Assistant: {row['Assistant']}")
#     print(f"Generated Assistant: {row['Generated_Assistant']}")
#     print("\nMetrics:")
    
#     # Convert Series to float format for formatted output
#     metrics = {
#         'BLEU': float(row['BLEU'].iloc[0] if isinstance(row['BLEU'], pd.Series) else row['BLEU']),
#         'ROUGE-1': float(row['ROUGE-1'].iloc[0] if isinstance(row['ROUGE-1'], pd.Series) else row['ROUGE-1']),
#         'ROUGE-2': float(row['ROUGE-2'].iloc[0] if isinstance(row['ROUGE-2'], pd.Series) else row['ROUGE-2']),
#         'ROUGE-L': float(row['ROUGE-L'].iloc[0] if isinstance(row['ROUGE-L'], pd.Series) else row['ROUGE-L']),
#         'Cosine_Similarity': float(row['Cosine_Similarity'].iloc[0] if isinstance(row['Cosine_Similarity'], pd.Series) else row['Cosine_Similarity']),
#         # 'BERTScore_F1': float(row['BERTScore_F1'].iloc[0] if isinstance(row['BERTScore_F1'], pd.Series) else row['BERTScore_F1'])
#     }
    
#     # Output formatted metrics
#     print(f"BLEU Score: {metrics['BLEU']:.4f}")
#     print(f"ROUGE-1: {metrics['ROUGE-1']:.4f}")
#     print(f"ROUGE-2: {metrics['ROUGE-2']:.4f}")
#     print(f"ROUGE-L: {metrics['ROUGE-L']:.4f}")
#     print(f"Cosine Similarity: {metrics['Cosine_Similarity']:.4f}")
#     # print(f"BERTScore (F1): {metrics['BERTScore_F1']:.4f}")
#     print("-" * 80)


# Output summary statistics
print("\nSummary Statistics:")
metrics_columns = ['BLEU', 'ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'Cosine_Similarity']
summary_stats = samples[metrics_columns].apply(lambda x: pd.to_numeric(x.iloc[0] if isinstance(x, pd.Series) else x)).describe()
print(summary_stats)



Summary Statistics:
count    5.000000
mean     0.224270
std      0.202562
min      0.003081
25%      0.125000
50%      0.222222
75%      0.222222
max      0.548827
dtype: float64


In [33]:
metrics_columns

['BLEU', 'ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'Cosine_Similarity']

In [34]:
samples.describe()

Unnamed: 0,BLEU,ROUGE-1,ROUGE-2,ROUGE-L,Cosine_Similarity
count,5124.0,5124.0,5124.0,5124.0,5124.0
mean,0.019936,0.196107,0.036671,0.148743,0.367421
std,0.030105,0.103517,0.057322,0.084737,0.1985
min,0.0,0.0,0.0,0.0,-0.19223
25%,0.005946,0.125,0.0,0.095482,0.225701
50%,0.013544,0.191553,0.01878,0.14124,0.373625
75%,0.024359,0.26087,0.054545,0.187634,0.508313
max,1.0,1.0,1.0,1.0,1.0


In [35]:
samples.head()

Unnamed: 0,asin,question,answer,dialogue,User,Assistant,Generated_Assistant,BLEU,ROUGE-1,ROUGE-2,ROUGE-L,Cosine_Similarity
0,4847676011,Where is this made?,Made in USA Distributed by: Sergeant's Pet Car...,User: Where is this made?\nAssistant: Made in ...,Where is this made?,Made in USA Distributed by: Sergeant's Pet Car...,Made in China.,0.003081,0.222222,0.125,0.222222,0.548827
1,4847676011,Does this have an expiration date? Does it onl...,"yes it does have an expiration date. However, ...",User: Does this have an expiration date? Does ...,Does this have an expiration date? Does it onl...,"yes it does have an expiration date. However, ...","I have not had this issue with my cat, so I ca...",0.020189,0.204545,0.0,0.159091,0.122072
2,B00004X14K,Are these containers BPA free?,Sorry I do not know!,User: Are these containers BPA free?\nAssistan...,Are these containers BPA free?,Sorry I do not know!,They are not. I have been using them for over ...,0.019731,0.222222,0.0,0.111111,0.140181
3,B00004X14K,Is it airtight?,"Not air tight, but it clicks closed. There is ...",User: Is it airtight?\nAssistant: Not air tigh...,Is it airtight?,"Not air tight, but it clicks closed. There is ...",It is not airtight. It is a very sturdy piece ...,0.012754,0.25,0.076923,0.225,0.594254
4,B00006H36X,Want for something safe for my 18 year old ind...,the advantage II is a good product. have used ...,User: Want for something safe for my 18 year o...,Want for something safe for my 18 year old ind...,the advantage II is a good product. have used ...,I would recommend that you look at the safety ...,0.006321,0.144737,0.0,0.105263,0.282334


In [36]:
sorted_samples = samples.sort_values(by='BLEU', ascending=False)

In [37]:
sorted_samples.head()

Unnamed: 0,asin,question,answer,dialogue,User,Assistant,Generated_Assistant,BLEU,ROUGE-1,ROUGE-2,ROUGE-L,Cosine_Similarity
1835,B001CHXJSK,Does this filter come with one or more bio-bag...,"Yes, it comes with one.",User: Does this filter come with one or more b...,Does this filter come with one or more bio-bag...,"Yes, it comes with one.","Yes, it comes with one.",1.0,1.0,1.0,1.0,1.0
984,B000FPH2I8,is it made in the USA,Yes it is made in the USA and is a great product.,User: is it made in the USA\nAssistant: Yes it...,is it made in the USA,Yes it is made in the USA and is a great product.,Yes it is made in the USA.,0.465379,0.736842,0.705882,0.736842,0.94275
4542,B00AN0PRNW,Is this a bark collar as well in that it preve...,"No, not a bark collar.",User: Is this a bark collar as well in that it...,Is this a bark collar as well in that it preve...,"No, not a bark collar.","Yes, it is a bark collar.",0.365555,0.545455,0.444444,0.545455,0.943956
2341,B002RT8M9I,Can you tell the water level by looking at it?...,"Yes, it is clear enough to tell",User: Can you tell the water level by looking ...,Can you tell the water level by looking at it?...,"Yes, it is clear enough to tell","Yes, it is solid.",0.364093,0.545455,0.444444,0.545455,0.316334
957,B000F8IXOW,Is it made in United States?,Yes my label says made in the USA,User: Is it made in United States?\nAssistant:...,Is it made in United States?,Yes my label says made in the USA,It is made in the USA.,0.356403,0.571429,0.5,0.571429,0.739864
