## Import Required Libraries

In [1]:
!pip install contractions



In [2]:
!pip install langdetect



In [3]:
!pip install peft



In [4]:
pip install 'accelerate>=0.26.0'

Note: you may need to restart the kernel to use updated packages.


In [5]:
pip install --upgrade transformers

Note: you may need to restart the kernel to use updated packages.


In [6]:
import os
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
import gzip
import json
import random
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import torch
import nltk
import evaluate
import contractions
import re
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    EarlyStoppingCallback,
    get_linear_schedule_with_warmup
)
from peft import LoraConfig, get_peft_model
import warnings

# Ignore all warnings 
warnings.filterwarnings("ignore")

print('Import Finished')

# Set random seed to ensure reproducibility
def set_seed(seed):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch;
    when CUDA is available, all CUDA devices are seeded as well.

    Args:
        seed: Integer seed shared by all generators.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    cuda_ready = torch.cuda.is_available()
    if cuda_ready:
        torch.cuda.manual_seed_all(seed)

set_seed(42)

2024-12-21 01:26:36.543277: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-21 01:26:36.557943: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-21 01:26:36.576742: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-21 01:26:36.582446: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-21 01:26:36.595625: I tensorflow/core/platform/cpu_feature_guar

Import Finished


## Data Preprocessing

In [7]:
data_dir = '/home/sagemaker-user/Data/'
model_dir = '/home/sagemaker-user/Models/'
log_dir = '/home/sagemaker-user/Logs/'

In [8]:
# Data loading and preprocessing functions
def parse(path):
    """Yield one parsed record per non-blank line of a gzip-compressed file.

    Every line is expected to contain a Python literal (the loader below
    treats each record as a dict). Lines are parsed with
    ``ast.literal_eval`` rather than ``eval`` so that nothing stored in the
    file can be executed as code.

    Args:
        path: Path to the ``.gz`` file to read.

    Yields:
        The Python object described by each non-blank line.

    Raises:
        ValueError, SyntaxError: If a line is not a valid Python literal.
    """
    import ast  # local import so the block does not depend on the import cell

    # Text mode: literal_eval only parses str input, not bytes.
    with gzip.open(path, 'rt', encoding='utf-8') as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            yield ast.literal_eval(line)

def getDF(path):
    """Load every record yielded by ``parse(path)`` into a DataFrame.

    Records are keyed by their position in the file, so the resulting
    frame has one row per record with a 0..n-1 index.
    """
    records_by_position = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records_by_position, orient='index')

# Load the raw Q&A dataset into a DataFrame
data_path = data_dir + 'qa_Pet_Supplies.json.gz'  # The file must exist under data_dir
df = getDF(data_path)
print("Dataset loaded. First 5 rows example:")
print(df.head())

# Drop rows where either the question or the answer is missing
df = df.dropna(subset=['question', 'answer'])

# Text preprocessing function
def preprocess_text(text):
    """Clean one raw question/answer string.

    Steps, in order: strip HTML tags, replace URLs with the ``[URL]``
    placeholder, drop non-ASCII characters, and collapse runs of
    whitespace into single spaces.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Replace URLs. The dot after "www" is escaped so that only a literal
    # "www." prefix counts as a URL; unescaped, it matched any character
    # and rewrote ordinary words containing "www" (e.g. "awwwesome").
    text = re.sub(r'http\S+|www\.\S+', '[URL]', text)
    # Remove non-ASCII characters
    text = text.encode('ascii', 'ignore').decode('ascii')
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['question'] = df['question'].apply(preprocess_text)
df['answer'] = df['answer'].apply(preprocess_text)

# Language detection
DetectorFactory.seed = 0

def detect_language(text):
    """Return the language code ``detect`` reports for ``text``.

    Falls back to the string "unknown" when detection raises
    ``LangDetectException``.
    """
    try:
        language = detect(text)
    except LangDetectException:
        language = "unknown"
    return language

# Tag every question and answer with its detected language code
df['question_lang'] = df['question'].apply(detect_language)
df['answer_lang'] = df['answer'].apply(detect_language)

# Count language distribution
question_lang_counts = df['question_lang'].value_counts()
answer_lang_counts = df['answer_lang'].value_counts()

print("Question language distribution:")
print(question_lang_counts)

print("\nAnswer language distribution:")
print(answer_lang_counts)

# Define target languages
target_languages = ['en']

# Keep rows whose question AND answer are both in a target language.
# .copy() gives filtered_df its own data: later cells assign columns on it,
# and without the copy those writes go to a slice of df (the chained-assignment
# case pandas warns about, which the global warnings filter currently hides).
filtered_df = df[
    (df['question_lang'].isin(target_languages)) &
    (df['answer_lang'].isin(target_languages))
].copy()

print(f"\nDataset size after filtering: {filtered_df.shape}")

# Expand contractions
def expand_contractions(text):
    """Return ``text`` after passing it through ``contractions.fix``."""
    expanded = contractions.fix(text)
    return expanded

# Expand contractions in both text columns. .assign builds a new frame, so
# nothing is written into a slice of the original df.
filtered_df = filtered_df.assign(
    question=filtered_df['question'].apply(expand_contractions),
    answer=filtered_df['answer'].apply(expand_contractions),
)

# Load ASIN list (looked up under data_dir)
id_asin_path = data_dir+'unique_asin.csv'
if not os.path.exists(id_asin_path):
    # Name the directory that is actually searched, not the current directory.
    raise FileNotFoundError(f"File {id_asin_path} does not exist. Please ensure the file is in {data_dir}.")
id_asin = pd.read_csv(id_asin_path)

# Keep only rows whose ASIN appears in the list
filtered_df = filtered_df[filtered_df['asin'].isin(id_asin['asin'])]
print(f"\nDataset size after ASIN filtering: {filtered_df.shape}")

Dataset loaded. First 5 rows example:
  questionType        asin    answerTime      unixTime  \
0       yes/no  0975412809  Nov 11, 2014  1.415693e+09   
1       yes/no  0975412809  Apr 13, 2014  1.397372e+09   
2       yes/no  0975412809  Dec 29, 2014  1.419840e+09   
3       yes/no  0975412809  Nov 18, 2014  1.416298e+09   
4       yes/no  0975412809  Nov 11, 2014  1.415693e+09   

                                            question answerType  \
0  I don't know what grooming tools I need for my...          ?   
1              Is the bouvier des flandres included?          Y   
2  Does it specifically cover how to keep male ge...          ?   
3                    Does it cover anything on cats?          N   
4  I am having trouble grooming my Irish terrier'...          ?   

                                              answer  
0  Hello. It is very difficult to say "all groomi...  
1  Yes it is, in the herding group. The book has ...  
2                        Not that I have seen

In [9]:
# Drop repeated question/answer pairs, keeping the first occurrence
filtered_df = filtered_df.drop_duplicates(subset=['question', 'answer'], keep='first')

# Keep rows whose question and answer are both longer than 10 characters
for column in ('question', 'answer'):
    filtered_df = filtered_df[filtered_df[column].str.len() > 10]

# Drop rows whose question or answer still contains a non-ASCII character
for column in ('question', 'answer'):
    filtered_df = filtered_df[~filtered_df[column].str.contains('[^\x00-\x7F]+')]

In [10]:
# Create dialogue text
def create_dialogue_text(row):
    """Format one record as a two-line "User:" / "Assistant:" dialogue.

    Args:
        row: Mapping-like object with 'question' and 'answer' entries.

    Returns:
        The dialogue string; each of the two lines ends with a newline.
    """
    question, answer = row['question'], row['answer']
    return "User: {}\nAssistant: {}\n".format(question, answer)

filtered_df['text'] = filtered_df.apply(create_dialogue_text, axis=1)

print(f"\nFinal dataset size ready for training: {filtered_df.shape}")


Final dataset size ready for training: (24570, 10)


In [11]:
filtered_df.head(1)

Unnamed: 0,questionType,asin,answerTime,unixTime,question,answerType,answer,question_lang,answer_lang,text
10,open-ended,4847676011,"Oct 17, 2013",1381993000.0,Where is this made?,,Made in USA Distributed by: Sergeant's Pet Car...,en,en,User: Where is this made?\nAssistant: Made in ...


In [12]:
filtered_df.asin.value_counts()

asin
B0045UGVKY    10
B000256962    10
B00006OALW    10
B0045LAS3O    10
B0045L29UO    10
              ..
B001IMN8GW     1
B000OAVN0W     1
B000OXAES6     1
B000OXAER2     1
B000QFWCJ6     1
Name: count, Length: 3520, dtype: int64

In [13]:
# Number of remaining question/answer rows per ASIN
asin_counts = filtered_df.asin.value_counts()

# Keep only ASINs that have at least 5 rows
filtered_asins = asin_counts[asin_counts >= 5]

# Report how many ASINs pass the threshold and show the first ten
# entries of the value_counts result
print(f"Number of ASINs after filtering: {len(filtered_asins)}")
print("\nTop 10 frequent ASINs and their counts:")
print(filtered_asins.head(10))

Number of ASINs after filtering: 2733

Top 10 frequent ASINs and their counts:
asin
B0045UGVKY    10
B000256962    10
B00006OALW    10
B0045LAS3O    10
B0045L29UO    10
B00H8MDLUE    10
B004ABHBWK    10
B0045Y1JGQ    10
B00006JHRE    10
B0045Y1JG6    10
Name: count, dtype: int64


In [14]:
filtered_df = filtered_df[filtered_df.asin.isin(filtered_asins.index)]

In [15]:
filtered_df.to_csv(data_dir+'filtered_df.csv')

In [16]:
data=filtered_df[['asin','question','answer','questionType']]

In [17]:
# Create dialogue text
def create_dialogue_text(row):
    """Format one record as a two-line "User:" / "Assistant:" dialogue.

    Args:
        row: Mapping-like object with 'question' and 'answer' entries.

    Returns:
        The dialogue string; each of the two lines ends with a newline.
    """
    question, answer = row['question'], row['answer']
    return "User: {}\nAssistant: {}\n".format(question, answer)

# `data` is a column slice of filtered_df; .assign returns a new frame instead
# of writing the new column into that slice.
data = data.assign(dialogue=data.apply(create_dialogue_text, axis=1))

In [18]:
from sklearn.model_selection import train_test_split
# Per-ASIN train/test partitions, concatenated after the loop
train_data_list, test_data_list = [], []

for asin, group in data.groupby('asin'):
    if len(group) <= 1:
        # A single sample cannot be split: it goes to training and this
        # ASIN contributes an empty test portion.
        train_group, test_group = group, pd.DataFrame(columns=group.columns)
    else:
        # Hold out 20% of this ASIN's rows for the test set
        train_group, test_group = train_test_split(group, test_size=0.2, random_state=42)
    train_data_list.append(train_group)
    test_data_list.append(test_group)

# Combine the per-ASIN parts into the final training and test sets
train_data = pd.concat(train_data_list).reset_index(drop=True)
test_data = pd.concat(test_data_list).reset_index(drop=True)

# Rows per ASIN in each split
train_asin_counts = (
    train_data['asin'].value_counts().reset_index().set_axis(['asin', 'train_count'], axis=1)
)
test_asin_counts = (
    test_data['asin'].value_counts().reset_index().set_axis(['asin', 'test_count'], axis=1)
)

# Side-by-side statistics; an ASIN missing from one split gets a count of 0
asin_counts = (
    pd.merge(train_asin_counts, test_asin_counts, on='asin', how='outer')
    .fillna(0)
    .astype({'train_count': int, 'test_count': int})
)

In [19]:
asin_counts.head()

Unnamed: 0,asin,train_count,test_count
0,4847676011,5,2
1,B00004X14K,7,2
2,B00006H36X,5,2
3,B00006H373,4,1
4,B00006JHRE,8,2


In [20]:
train_data.head()

Unnamed: 0,asin,question,answer,questionType,dialogue
0,4847676011,does anyone know where this is made?,Believe it or not.... the USA!,open-ended,User: does anyone know where this is made?\nAs...
1,4847676011,"does this have any type of sugar, grane alchol...","Here is the list of ingredients: Sorbitol, hyd...",open-ended,"User: does this have any type of sugar, grane ..."
2,4847676011,"IS this product VEGAN, specifically the glycerin?",It is my understanding that this product is no...,yes/no,"User: IS this product VEGAN, specifically the ..."
3,4847676011,Does this contain citric acid?,it is not listed as an ingredient.,yes/no,User: Does this contain citric acid?\nAssistan...
4,4847676011,is this a paste or a gel?,It is a gel.,open-ended,User: is this a paste or a gel?\nAssistant: It...


In [21]:
test_data.head()

Unnamed: 0,asin,question,answer,questionType,dialogue
0,4847676011,Where is this made?,Made in USA Distributed by: Sergeant's Pet Car...,open-ended,User: Where is this made?\nAssistant: Made in ...
1,4847676011,Does this have an expiration date? Does it onl...,"yes it does have an expiration date. However, ...",open-ended,User: Does this have an expiration date? Does ...
2,B00004X14K,Are these containers BPA free?,Sorry I do not know!,yes/no,User: Are these containers BPA free?\nAssistan...
3,B00004X14K,Is it airtight?,"Not air tight, but it clicks closed. There is ...",yes/no,User: Is it airtight?\nAssistant: Not air tigh...
4,B00006H36X,Want for something safe for my 18 year old ind...,the advantage II is a good product. have used ...,open-ended,User: Want for something safe for my 18 year o...


## Question Types Classifier (temp)

In [22]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch

# Load tokenizer and model
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)  # 2 represents two types: Yes/No and Open-ended

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
from torch.utils.data import Dataset

class QuestionTypeDataset(Dataset):
    """Dataset of questions labelled by question type.

    Questions are tokenized lazily, one at a time, in ``__getitem__``.
    The label is 1 when ``questionType`` equals 'open-ended' and 0 for
    every other value.
    """

    def __init__(self, dataframe, tokenizer, max_length=128):
        """Store the questions, their binary labels and the tokenizer settings."""
        self.questions = dataframe['question'].tolist()
        # 1 for open-ended, 0 for yes/no
        self.labels = [
            1 if kind == 'open-ended' else 0
            for kind in dataframe['questionType']
        ]
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of questions in the dataset."""
        return len(self.questions)

    def __getitem__(self, idx):
        """Tokenize question ``idx`` and return its tensors plus the label."""
        encoded = self.tokenizer.encode_plus(
            str(self.questions[idx]),
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # The tokenizer output carries a leading batch axis; flatten it away
        item = {
            name: encoded[name].flatten()
            for name in ('input_ids', 'attention_mask')
        }
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Create dataset objects
train_dataset = QuestionTypeDataset(train_data, tokenizer)
val_dataset = QuestionTypeDataset(test_data, tokenizer)

In [24]:
from transformers import TrainingArguments

# Training configuration for the question-type classifier.
# Evaluation and checkpointing both run once per epoch so that
# load_best_model_at_end can restore the checkpoint with the best accuracy.
training_args = TrainingArguments(
    output_dir=model_dir+'question_type_classifier',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=log_dir+'question_type_classifier',
    logging_steps=10,
    # `eval_strategy` is the current name of this option; the older
    # `evaluation_strategy` spelling is deprecated, and this notebook
    # upgrades transformers to the latest release.
    eval_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    greater_is_better=True,
)


In [25]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Compute classification metrics from a prediction object.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and
            ``predictions`` (per-class scores; the arg-max over the last
            axis is taken as the predicted label).

    Returns:
        Dict with 'accuracy', 'precision', 'recall' and 'f1'; the last
        three use binary averaging.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
    scores = dict(accuracy=accuracy_score(y_true, y_pred))
    scores.update(precision=precision, recall=recall, f1=f1)
    return scores


In [26]:
# import torch
torch.cuda.empty_cache()

In [None]:
from transformers import Trainer

# Create dataset objects
train_dataset = QuestionTypeDataset(train_data, tokenizer)
val_dataset = QuestionTypeDataset(test_data, tokenizer)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Start training
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0677,0.052921,0.986924,0.987508,0.989848,0.988677
2,0.0019,0.036589,0.992584,0.990252,0.996954,0.993592
3,0.008,0.023809,0.994536,0.992595,0.99797,0.995275
4,0.0005,0.03211,0.994145,0.993921,0.995939,0.994929


In [None]:
# Save model and tokenizer
model_save_path = model_dir+'question_type_classifier'
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

print(f"Question type classifier has been saved to {model_save_path}")

In [None]:
# Create dialogue text
def create_dialogue_text(row):
    """Format one record as a two-line "User:" / "Assistant:" dialogue.

    Args:
        row: Mapping-like object with 'question' and 'answer' entries.

    Returns:
        The dialogue string; each of the two lines ends with a newline.
    """
    question, answer = row['question'], row['answer']
    return "User: {}\nAssistant: {}\n".format(question, answer)

# Add the dialogue column. filtered_df was last produced by a boolean-mask
# selection, so build a new frame with .assign instead of writing into it.
filtered_df = filtered_df.assign(dialogue=filtered_df.apply(create_dialogue_text, axis=1))

# Group by question type
yes_no_df = filtered_df[filtered_df['questionType'] == 'yes/no']
open_ended_df = filtered_df[filtered_df['questionType'] == 'open-ended']

print(f"Number of Yes/No questions: {yes_no_df.shape[0]}")
print(f"Number of open-ended questions: {open_ended_df.shape[0]}")

In [None]:
# Save dialogue texts
yes_no_df['dialogue'].to_csv(data_dir+'yes_no_dialogues.txt', index=False, header=False)
open_ended_df['dialogue'].to_csv(data_dir+'open_ended_dialogues.txt', index=False, header=False)

In [None]:
filtered_df.shape

In [None]:
from sklearn.model_selection import train_test_split

# Split yes_no_df into training and validation sets
train_yes_no_df, val_yes_no_df = train_test_split(yes_no_df, test_size=0.1, random_state=42)

print(f"Training set size: {train_yes_no_df.shape[0]}")
print(f"Validation set size: {val_yes_no_df.shape[0]}")

In [None]:
val_yes_no_df['dialogue']

In [None]:
yes_no_df.head()['dialogue'][13]

In [None]:
yes_no_df.head()['dialogue'][14]

In [None]:
# Load GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Add padding token
tokenizer.pad_token = tokenizer.eos_token

# Reset indices
train_yes_no_df = train_yes_no_df.reset_index(drop=True)
val_yes_no_df = val_yes_no_df.reset_index(drop=True)

In [None]:
# Clear cache
torch.cuda.empty_cache()

In [None]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

class ConversationDataset(Dataset):
    """Dataset of dialogue strings tokenized once, at construction time.

    The whole 'dialogue' column is tokenized in a single call with
    truncation to ``max_length`` and padding enabled. Each item exposes
    the token ids both as the model input and as the labels.
    """

    def __init__(self, dataframe, tokenizer, max_length=512):
        """Tokenize every dialogue in ``dataframe`` and keep the tensors."""
        dialogues = dataframe['dialogue'].tolist()
        self.encodings = tokenizer(
            dialogues,
            truncation=True,
            max_length=max_length,
            padding=True,
            return_tensors='pt'
        )
        self.input_ids = self.encodings['input_ids']
        self.attention_mask = self.encodings['attention_mask']

    def __len__(self):
        """Number of tokenized dialogues."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the input ids, attention mask and labels for dialogue ``idx``."""
        token_ids = self.input_ids[idx]
        return dict(
            input_ids=token_ids,
            attention_mask=self.attention_mask[idx],
            labels=token_ids,
        )

# Tokenize the yes/no training and validation dialogues
train_dataset = ConversationDataset(train_yes_no_df, tokenizer)
val_dataset = ConversationDataset(val_yes_no_df, tokenizer)

# Release cached GPU memory before loading the next model
torch.cuda.empty_cache()

# Load pre-trained GPT-2 model (this rebinds `model`, previously the BERT classifier)
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Resize model's vocabulary to match tokenizer
# NOTE(review): the pad token reuses the existing EOS token, so no tokens appear
# to have been added and this is presumably a no-op - confirm.
model.resize_token_embeddings(len(tokenizer))

# Configure LoRA parameters
lora_config = LoraConfig(
    r=8,  # LoRA rank
    lora_alpha=16,  # LoRA scaling factor
    target_modules=["attn.c_proj"],  # Only modules whose name matches this get adapters
    lora_dropout=0.1,  # Dropout applied inside the adapters
    bias="none",
    task_type="CAUSAL_LM"
)

# Wrap model with PEFT so that training goes through the LoRA adapters
model = get_peft_model(model, lora_config)

# Define training arguments for the yes/no GPT-2 model.
# num_train_epochs is an upper bound: the Trainer in this cell is given an
# early-stopping callback that monitors eval_loss.
training_args = TrainingArguments(
    output_dir=model_dir+"gpt2_yes_no",
    overwrite_output_dir=True,
    num_train_epochs=100,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=1,
    # `eval_strategy` is the current name of this option; the older
    # `evaluation_strategy` spelling is deprecated, and this notebook
    # upgrades transformers to the latest release.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    learning_rate=3e-5,
    weight_decay=0.01,
    logging_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
    fp16=True,
    report_to="none",
    dataloader_num_workers=16,
    label_smoothing_factor=0.0,
    remove_unused_columns=False,  # Keep every field the custom dataset returns
)

import math

# Calculate total training steps and warm-up steps.
# The final partial batch of each epoch still produces a step, so the number
# of batches is rounded up; flooring it made the schedule shorter than training.
batches_per_epoch = math.ceil(len(train_dataset) / training_args.per_device_train_batch_size)
steps_per_epoch = max(batches_per_epoch // training_args.gradient_accumulation_steps, 1)
total_steps = int(steps_per_epoch * training_args.num_train_epochs)
warmup_steps = int(0.1 * total_steps)  # Warm up over the first 10% of the steps

# Define optimizer and scheduler.
# weight_decay is passed explicitly so the value configured in training_args
# is the one actually used by this hand-built optimizer.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=training_args.learning_rate,
    weight_decay=training_args.weight_decay,
)

scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)

# Collator for causal language modelling (mlm=False disables masked-LM behaviour)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

# Early stopping with a patience of 3; evaluation is configured to run once per epoch
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)  # Reduced patience steps

# Define Trainer, using the hand-built optimizer and scheduler from above
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    optimizers=(optimizer, scheduler),
    callbacks=[early_stopping],
)

# Start training
trainer.train()

print("Model training completed.")

In [None]:
# Save final model
trainer.save_model(model_dir+"gpt2_yes_no")
tokenizer.save_pretrained(model_dir+"gpt2_yes_no")

In [None]:
# Create dialogue text
def create_dialogue_text(row):
    """Format one record as a two-line "User:" / "Assistant:" dialogue.

    Args:
        row: Mapping-like object with 'question' and 'answer' entries.

    Returns:
        The dialogue string; each of the two lines ends with a newline.
    """
    question, answer = row['question'], row['answer']
    return "User: {}\nAssistant: {}\n".format(question, answer)

# Recompute the dialogue column (an earlier cell already builds the same column)
filtered_df['dialogue'] = filtered_df.apply(create_dialogue_text, axis=1)

# Select the open-ended questions
open_ended_df = filtered_df[filtered_df['questionType'] == 'open-ended']

print(f"Number of open-ended questions: {open_ended_df.shape[0]}")

# Save dialogue text (writes the same file name as the earlier save cell)
open_ended_df['dialogue'].to_csv(data_dir+'open_ended_dialogues.txt', index=False, header=False)

from sklearn.model_selection import train_test_split

# Split open_ended_df into training (90%) and validation (10%) sets
train_open_ended_df, val_open_ended_df = train_test_split(open_ended_df, test_size=0.1, random_state=42)

# Save the training and validation dialogues to separate files
train_open_ended_df['dialogue'].to_csv(data_dir+'open_ended_dialogues_train.txt', index=False, header=False)
val_open_ended_df['dialogue'].to_csv(data_dir+'open_ended_dialogues_val.txt', index=False, header=False)

print(f"Training set size: {train_open_ended_df.shape[0]}")
print(f"Validation set size: {val_open_ended_df.shape[0]}")

# Load GPT-2 tokenizer
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling
import torch
from torch.utils.data import Dataset
from peft import LoraConfig, get_peft_model

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Reset indices
train_open_ended_df = train_open_ended_df.reset_index(drop=True)
val_open_ended_df = val_open_ended_df.reset_index(drop=True)


class ConversationDataset(Dataset):
    """Dataset of dialogue strings tokenized once, at construction time.

    The whole 'dialogue' column is tokenized in a single call with
    truncation to ``max_length`` and padding enabled. Each item exposes
    the token ids both as the model input and as the labels.
    """

    def __init__(self, dataframe, tokenizer, max_length=512):
        """Tokenize every dialogue in ``dataframe`` and keep the tensors."""
        dialogues = dataframe['dialogue'].tolist()
        self.encodings = tokenizer(
            dialogues,
            truncation=True,
            max_length=max_length,
            padding=True,
            return_tensors='pt'
        )
        self.input_ids = self.encodings['input_ids']
        self.attention_mask = self.encodings['attention_mask']

    def __len__(self):
        """Number of tokenized dialogues."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the input ids, attention mask and labels for dialogue ``idx``."""
        token_ids = self.input_ids[idx]
        return dict(
            input_ids=token_ids,
            attention_mask=self.attention_mask[idx],
            labels=token_ids,
        )

# Tokenize the open-ended training and validation dialogues
train_dataset = ConversationDataset(train_open_ended_df, tokenizer)
val_dataset = ConversationDataset(val_open_ended_df, tokenizer)

# Load a fresh pre-trained GPT-2 model (rebinds `model`)
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Resize model's vocabulary to match tokenizer
# NOTE(review): the pad token reuses the existing EOS token, so no tokens appear
# to have been added and this is presumably a no-op - confirm.
model.resize_token_embeddings(len(tokenizer))

# Configure LoRA parameters
lora_config = LoraConfig(
    r=8,  # LoRA rank
    lora_alpha=16,  # LoRA scaling factor
    target_modules=["attn.c_proj"],  # Only modules whose name matches this get adapters
    lora_dropout=0.1,  # Dropout applied inside the adapters
    bias="none",
    task_type="CAUSAL_LM"
)

# Wrap model with PEFT so that training goes through the LoRA adapters
model = get_peft_model(model, lora_config)

# Define training arguments for the open-ended GPT-2 model.
# num_train_epochs is an upper bound: the Trainer in this cell is given an
# early-stopping callback that monitors eval_loss.
training_args = TrainingArguments(
    output_dir=model_dir+"gpt2_open_ended",
    overwrite_output_dir=True,
    num_train_epochs=100,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=1,
    # `eval_strategy` is the current name of this option; the older
    # `evaluation_strategy` spelling is deprecated, and this notebook
    # upgrades transformers to the latest release.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    learning_rate=3e-5,
    weight_decay=0.01,
    logging_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
    fp16=True,
    report_to="none",
    dataloader_num_workers=16,
    label_smoothing_factor=0.0,
    remove_unused_columns=False,  # Keep every field the custom dataset returns
)

# Collator for causal language modelling (mlm=False disables masked-LM behaviour)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

# Early stopping with a patience of 3; evaluation is configured to run once per epoch
from transformers import EarlyStoppingCallback
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Define Trainer. Unlike the yes/no run, no custom optimizer/scheduler is
# passed here, so the Trainer's own defaults apply.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[early_stopping],
)

# Start training
trainer.train()

print("Model training completed.")

In [None]:
# Save final model
trainer.save_model(model_dir+"gpt2_open_ended")
tokenizer.save_pretrained(model_dir+"gpt2_open_ended")

In [46]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextGenerationPipeline
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import re
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer

# Load sentence transformer model (used for the cosine-similarity metric)
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

# Reload the tokenizer and model saved for the yes/no run
tokenizer = GPT2Tokenizer.from_pretrained(model_dir+'gpt2_yes_no')
tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained(model_dir+'gpt2_yes_no')
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)
model.eval()

# NOTE(review): `generator` is not referenced by the visible evaluation code,
# which calls model.generate directly - confirm whether it is still needed.
generator = TextGenerationPipeline(model=model, tokenizer=tokenizer, device=0 if device == 'cuda' else -1)

# Draw a fixed random sample of 100 validation rows
sample_size = 100
samples = val_yes_no_df.sample(n=sample_size, random_state=42).reset_index(drop=True)

# Separate user and assistant dialogues
def extract_user_assistant(dialogue):
    """Split a formatted dialogue into its user and assistant texts.

    Each part is the stripped text between its "User:" / "Assistant:"
    marker and the next newline. A part whose marker (or terminating
    newline) is missing comes back as an empty string.

    Returns:
        Tuple ``(user, assistant)``.
    """
    def _first_capture(pattern):
        found = re.search(pattern, dialogue, re.DOTALL)
        return found.group(1).strip() if found else ''

    return _first_capture(r'User:(.*?)\n'), _first_capture(r'Assistant:(.*?)\n')

samples[['User', 'Assistant']] = samples['dialogue'].apply(
    lambda x: pd.Series(extract_user_assistant(x))
)

Device set to use cuda:0


In [47]:
samples[['User', 'Assistant']]

Unnamed: 0,User,Assistant
0,Does it fit well on all types of handlebars?,Well I tried it on a mountain bike and it work...
1,Will these work for a 5.7lb 4 month old puppy?,"I would say yes. I use them on my doxies, most..."
2,"Does anyone know where replacement blades, etc...",lister blades will work from what i am told fr...
3,Can a Dwarf Hamster escape from this cage?,Very easily. The gaps in the wires are huge. M...
4,Can I use this skimmer with a 110volt power so...,Yes. It has a standard (American) three-prong ...
...,...,...
95,I have a Ford F150. Is this long enough to cov...,I have a 2006 tundra 4door and it is long enough.
96,I cannot find the instructions. Are they avail...,HI.......the instruction booklet is included i...
97,will this lamp work in dog house safely?,"Yes, it should help significantly, but I would..."
98,is this for a small dog? looking for one to fi...,Yes. we use it on a 10 pound Dachshund. Will h...


In [48]:
# Generate model replies
def generate_answer(question, tokenizer, model, device, max_length=150):
    """Sample one assistant reply for ``question`` from a causal language model.

    The prompt is "User: <question>", a newline, then "Assistant:".
    Generation stops at the newline token, and only the newly generated
    tokens are decoded. The original code decoded the whole sequence and
    split it on "Assistant:", which returned the wrong span whenever the
    generated text repeated that marker.

    Args:
        question: The user question to answer.
        tokenizer: Tokenizer providing ``encode``, ``decode`` and
            ``eos_token_id``.
        model: Model providing ``generate``.
        device: Device the prompt tensor is moved to.
        max_length: Maximum number of NEW tokens to generate.

    Returns:
        The first line of the generated continuation, stripped.
    """
    prompt = f"User: {question}\nAssistant:"
    inputs = tokenizer.encode(prompt, return_tensors='pt').to(device)
    prompt_len = inputs.shape[1]

    with torch.no_grad():
        outputs = model.generate(
            inputs,
            # Single unpadded prompt: every position is a real token.
            attention_mask=torch.ones_like(inputs),
            # Same budget as the old `max_length=prompt length + max_length`.
            max_new_tokens=max_length,
            temperature=0.6,
            top_p=0.9,
            do_sample=True,
            eos_token_id=tokenizer.encode('\n')[0],
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode only what the model added after the prompt.
    completion = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
    answer = completion.strip().split('\n')[0]

    return answer

samples['Generated_Assistant'] = samples['User'].apply(lambda x: generate_answer(x, tokenizer, model, device))

# Initialize ROUGE scorer
rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
from bert_score import score as bert_score

def compute_metrics(row):
    """Score one generated answer against its reference answer.

    Args:
        row: Record with 'Assistant' (reference text) and
            'Generated_Assistant' (generated text).

    Returns:
        Dict of floats keyed 'BLEU', 'ROUGE-1', 'ROUGE-2', 'ROUGE-L'
        and 'Cosine_Similarity'.
    """
    reference, candidate = row['Assistant'], row['Generated_Assistant']

    # Smoothed sentence-level BLEU on lower-cased word tokens
    bleu_score = sentence_bleu(
        [nltk.word_tokenize(reference.lower())],
        nltk.word_tokenize(candidate.lower()),
        smoothing_function=SmoothingFunction().method4,
    )

    # ROUGE scores from the module-level scorer
    rouge_scores = rouge.score(reference, candidate)

    # Cosine similarity between the sentence embeddings of both texts
    reference_vec = sbert_model.encode(reference, convert_to_tensor=True)
    candidate_vec = sbert_model.encode(candidate, convert_to_tensor=True)
    cosine_sim = util.pytorch_cos_sim(reference_vec, candidate_vec).item()

    # BERTScore is currently disabled and therefore not part of the result.
    result = {'BLEU': float(bleu_score)}
    for label, key in (('ROUGE-1', 'rouge1'), ('ROUGE-2', 'rouge2'), ('ROUGE-L', 'rougeL')):
        result[label] = float(rouge_scores[key].fmeasure)
    result['Cosine_Similarity'] = float(cosine_sim)
    return result

# Calculate metrics for each sample
metrics_results = [compute_metrics(row) for _, row in samples.iterrows()]

# Convert metrics to DataFrame and add to samples (both use a 0..n-1 index)
metrics_df = pd.DataFrame(metrics_results)
samples = samples.assign(**metrics_df)

# Output results for each sample and format metrics
for i in range(len(samples)):
    row = samples.iloc[i]
    print(f"\nSample {i+1}:")
    print(f"User: {row['User']}")
    print(f"Actual Assistant: {row['Assistant']}")
    print(f"Generated Assistant: {row['Generated_Assistant']}")
    print("\nMetrics:")

    # Each metric is a plain scalar in the row, so it can be formatted directly
    print(f"BLEU Score: {float(row['BLEU']):.4f}")
    print(f"ROUGE-1: {float(row['ROUGE-1']):.4f}")
    print(f"ROUGE-2: {float(row['ROUGE-2']):.4f}")
    print(f"ROUGE-L: {float(row['ROUGE-L']):.4f}")
    print(f"Cosine Similarity: {float(row['Cosine_Similarity']):.4f}")
    print("-" * 80)

# Output summary statistics.
# describe() must see every sample of every metric. The previous lambda
# reduced each column to its first value, so the "summary" described the
# first sample's five numbers instead of each metric across all samples.
print("\nSummary Statistics:")
metrics_columns = ['BLEU', 'ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'Cosine_Similarity']
summary_stats = samples[metrics_columns].apply(pd.to_numeric).describe()
print(summary_stats)



Sample 1:
User: Does it fit well on all types of handlebars?
Actual Assistant: Well I tried it on a mountain bike and it worked nicely because the handle bars have a good distance from the wheel. Probably not good with a racing bike where the handle bars are very low.
Generated Assistant: I have one that fits perfectly.

Metrics:
BLEU Score: 0.0004
ROUGE-1: 0.0930
ROUGE-2: 0.0000
ROUGE-L: 0.0930
Cosine Similarity: 0.0680
--------------------------------------------------------------------------------

Sample 2:
User: Will these work for a 5.7lb 4 month old puppy?
Actual Assistant: I would say yes. I use them on my doxies, mostly an eight pound mini dachshund and they work great.
Generated Assistant: I would not recommend this product as it is not strong enough to hold a dog.

Metrics:
BLEU Score: 0.0268
ROUGE-1: 0.1111
ROUGE-2: 0.0588
ROUGE-L: 0.1111
Cosine Similarity: 0.4110
--------------------------------------------------------------------------------

Sample 3:
User: Does anyone 

In [49]:


# Validate model generation performance
from transformers import TextGenerationPipeline
import pandas as pd
import re
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer, util

# Load sentence transformer model (used for the cosine-similarity metric)
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

# Initialize other models and tokenizer from the saved 'gpt2_open_ended'
# checkpoint; model_dir is expected to be defined by an earlier cell.
tokenizer = GPT2Tokenizer.from_pretrained(model_dir+'gpt2_open_ended')
# Reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained(model_dir+'gpt2_open_ended')
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)
model.eval()  # inference mode: disables dropout

generator = TextGenerationPipeline(model=model, tokenizer=tokenizer, device=0 if device == 'cuda' else -1)

# Extract up to 100 validation samples. DataFrame.sample raises ValueError when
# n exceeds the number of rows (without replacement), so clamp the request to
# what is available; with >= 100 rows the draw is unchanged.
sample_size = min(100, len(val_open_ended_df))
samples = val_open_ended_df.sample(n=sample_size, random_state=42).reset_index(drop=True)

# Separate user and assistant dialogues
def extract_user_assistant(dialogue):
    """Pull the first user turn and the first assistant turn out of a dialogue.

    Each turn is the text that follows the ``User:`` / ``Assistant:`` marker up
    to the end of that line. A turn that is the last line of the string is
    captured even when the string has no trailing newline.

    Args:
        dialogue: Dialogue text containing ``User:`` and ``Assistant:`` markers.
            Non-string values (e.g. NaN from a DataFrame) are treated as empty.

    Returns:
        tuple[str, str]: ``(user, assistant)``, each stripped of surrounding
        whitespace; an empty string is returned for a turn that is not found.
    """
    if not isinstance(dialogue, str):
        return '', ''

    def _first_turn(speaker):
        # [^\n]* stops at the first newline OR at end of string, so a final
        # line without a terminating newline is still matched.
        match = re.search(rf'{speaker}:([^\n]*)', dialogue)
        return match.group(1).strip() if match else ''

    return _first_turn('User'), _first_turn('Assistant')

# Parse every dialogue once into a (user, assistant) tuple, then expand the
# tuples into a two-column frame aligned on the samples index; the two
# columns are assigned positionally to 'User' and 'Assistant'.
samples[['User', 'Assistant']] = pd.DataFrame(
    samples['dialogue'].map(extract_user_assistant).tolist(),
    index=samples.index,
)

# Generate model replies
def generate_answer(question, tokenizer, model, device, max_length=150):
    """Sample a single-line assistant reply for ``question``.

    The question is placed in the "User: <question>" / "Assistant:" prompt
    layout, the model samples a continuation that ends at the first newline
    token, and only the newly generated tokens are decoded.

    Args:
        question: User question text inserted into the prompt.
        tokenizer: Tokenizer providing ``encode``, ``decode`` and ``eos_token_id``.
        model: Causal language model exposing ``generate``.
        device: Device the prompt tensor is moved to (e.g. ``'cuda'``/``'cpu'``).
        max_length: Maximum number of tokens to generate beyond the prompt.

    Returns:
        str: First line of the generated reply with surrounding whitespace
        removed (empty string if nothing but whitespace was generated).
    """
    prompt = f"User: {question}\nAssistant:"
    inputs = tokenizer.encode(prompt, return_tensors='pt').to(device)
    prompt_len = inputs.shape[1]

    with torch.no_grad():
        outputs = model.generate(
            inputs,
            # Single, unpadded prompt: every position is a real token. Passing
            # the mask explicitly matters because pad and EOS share one id.
            attention_mask=torch.ones_like(inputs),
            # Budget counted in NEW tokens (same limit as prompt_len + max_length).
            max_new_tokens=max_length,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            # Stop at the first newline so the reply is a single line
            eos_token_id=tokenizer.encode('\n')[0],
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode only the continuation. Splitting the full decoded text on
    # "Assistant:" would cut off any reply that itself contains that marker.
    completion = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
    first_line = completion.strip().split('\n')[0]

    return first_line.strip()

# Produce one sampled reply per user question, in row order.
samples['Generated_Assistant'] = [
    generate_answer(question, tokenizer, model, device)
    for question in samples['User']
]

# Initialize ROUGE scorer (unigram, bigram and longest-common-subsequence variants)
rouge = rouge_scorer.RougeScorer(
    ['rouge1', 'rouge2', 'rougeL'],
    use_stemmer=True,
)

from bert_score import score as bert_score
def compute_metrics(row):
    """Score one generated reply against its reference reply.

    Args:
        row: Mapping/Series with the reference text under ``'Assistant'`` and
            the model output under ``'Generated_Assistant'``.

    Returns:
        dict: Float scores keyed by ``'BLEU'``, ``'ROUGE-1'``, ``'ROUGE-2'``,
        ``'ROUGE-L'`` (F-measures), ``'Cosine_Similarity'`` and
        ``'BERTScore_F1'``.
    """
    # Function-scope import: bert_score is already a dependency of this
    # notebook; BERTScorer is its reusable, model-holding scorer class.
    from bert_score import BERTScorer

    actual = row['Assistant']
    generated = row['Generated_Assistant']

    # Calculate BLEU score on lower-cased word tokens; smoothing keeps short
    # replies with missing higher-order n-gram matches from collapsing to 0.
    smoothie = SmoothingFunction().method4
    bleu_score = sentence_bleu(
        [nltk.word_tokenize(actual.lower())],
        nltk.word_tokenize(generated.lower()),
        smoothing_function=smoothie
    )

    # Calculate ROUGE scores (reference first, candidate second)
    rouge_scores = rouge.score(actual, generated)

    # Calculate semantic similarity (Cosine Similarity) between sentence embeddings
    cosine_sim = util.pytorch_cos_sim(
        sbert_model.encode(actual, convert_to_tensor=True),
        sbert_model.encode(generated, convert_to_tensor=True)
    ).item()

    # Calculate BERTScore. The module-level bert_score.score() helper loads
    # the underlying model on every call; build one scorer lazily and cache it
    # on the function so the model is loaded once for the whole evaluation.
    scorer = getattr(compute_metrics, '_bert_scorer', None)
    if scorer is None:
        scorer = BERTScorer(lang="en")
        compute_metrics._bert_scorer = scorer
    P, R, F1 = scorer.score([generated], [actual], verbose=False)
    bert_f1_score = F1.mean().item()

    # Return single values instead of Series
    return {
        'BLEU': float(bleu_score),  # Ensure float return type
        'ROUGE-1': float(rouge_scores['rouge1'].fmeasure),
        'ROUGE-2': float(rouge_scores['rouge2'].fmeasure),
        'ROUGE-L': float(rouge_scores['rougeL'].fmeasure),
        'Cosine_Similarity': float(cosine_sim),
        'BERTScore_F1': float(bert_f1_score)
    }

# Calculate metrics for each sample (one dict of scores per row)
metrics_results = []
for idx, row in samples.iterrows():
    metrics_results.append(compute_metrics(row))

# Convert metrics to DataFrame and add to samples (one column per metric)
metrics_df = pd.DataFrame(metrics_results)
samples = samples.assign(**metrics_df)

metrics_columns = ['BLEU', 'ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'Cosine_Similarity', 'BERTScore_F1']

# Output results for each sample and format metrics.
# Iterate over the rows actually present rather than a separately tracked
# count, so the loop cannot index past the end of the frame.
for i in range(len(samples)):
    row = samples.iloc[i]
    print(f"\nSample {i+1}:")
    print(f"User: {row['User']}")
    print(f"Actual Assistant: {row['Assistant']}")
    print(f"Generated Assistant: {row['Generated_Assistant']}")
    print("\nMetrics:")

    # Convert each metric to a plain float for formatting; if a label ever
    # resolves to a Series (duplicate column names), use its first entry.
    metrics = {
        name: float(row[name].iloc[0] if isinstance(row[name], pd.Series) else row[name])
        for name in metrics_columns
    }

    # Output formatted metrics
    print(f"BLEU Score: {metrics['BLEU']:.4f}")
    print(f"ROUGE-1: {metrics['ROUGE-1']:.4f}")
    print(f"ROUGE-2: {metrics['ROUGE-2']:.4f}")
    print(f"ROUGE-L: {metrics['ROUGE-L']:.4f}")
    print(f"Cosine Similarity: {metrics['Cosine_Similarity']:.4f}")
    print(f"BERTScore (F1): {metrics['BERTScore_F1']:.4f}")
    print("-" * 80)

# Output summary statistics
print("\nSummary Statistics:")
# DataFrame.apply passes each metric column as a whole Series; coerce the full
# column to numeric so describe() aggregates over every sample. Taking the
# column's first element here would summarise only the first sample's scores.
summary_stats = samples[metrics_columns].apply(pd.to_numeric).describe()
print(summary_stats)

Device set to use cuda:0
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and ar


Sample 1:
User: How do you clean the litter box? Is there a liner that fits the box for easy kitty litter clean-up?
Actual Assistant: I have never thought of a liner i just use the large metal scoop and i have it on a cabinet waist high because of my back and i have 6 cats so i only need one litter box i love it and would recommend to everyone
Generated Assistant: The liner is small and does not fit the box. The liner fits easily on the box. I have a cat who does not like to use the liner and this litter box does not fit him. The liner is not very strong so I put it in the box and he does not like to use it.

Metrics:
BLEU Score: 0.0256
ROUGE-1: 0.3429
ROUGE-2: 0.0971
ROUGE-L: 0.2286
Cosine Similarity: 0.7120
BERTScore (F1): 0.8594
--------------------------------------------------------------------------------

Sample 2:
User: Does this product have to be used year round or simply during Flea, Mosquito, Tick season for seasonal areas? I traditionally only when needed.
Actual Assistan