In [10]:
from google.colab import drive
drive.mount('/content/drive')

import torch

# Check if CUDA (GPU) is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Using device: cuda


In [3]:
import os
import re

# Load the Cornell Movie Dialogs Corpus files
data_dir = '/content/drive/MyDrive/AAI520-NLP/Final_Project/data'
lines_file = os.path.join(data_dir, 'movie_lines.txt')
conversations_file = os.path.join(data_dir, 'movie_conversations.txt')

# Load movie lines and conversations
def load_lines(file_path):
    # Load all the lines from the movie_lines.txt file and store them in a dictionary.
    lines = {}
    with open(file_path, 'r', encoding='iso-8859-1') as file:
        for line in file.readlines():
            parts = line.split(" +++$+++ ")
            if len(parts) == 5:
                # Line ID -> Dialogue text
                lines[parts[0]] = parts[4].strip()
    return lines

def load_conversations(file_path, lines):
    # Load conversations from movie_conversations.txt and match with the corresponding lines.
    conversations = []
    with open(file_path, 'r', encoding='iso-8859-1') as file:
        for line in file.readlines():
            parts = line.split(" +++$+++ ")
            if len(parts) == 4:
                line_ids = eval(parts[3])  # Extract the list of line IDs
                conversation = [lines[line_id] for line_id in line_ids if line_id in lines]
                conversations.append(conversation)
    return conversations

# Load the data
lines = load_lines(lines_file)
conversations = load_conversations(conversations_file, lines)

# Create input-output pairs from conversations
def create_conversation_pairs(conversations, context_size=2):
    """Create input-output pairs from the conversations using a sliding window approach."""
    input_texts = []
    target_texts = []
    for conversation in conversations:
        for i in range(len(conversation) - context_size):
            # Join the context lines as input
            input_text = " ".join(conversation[i:i + context_size])
            # Next line is the target
            target_text = conversation[i + context_size]
            input_texts.append(input_text)
            target_texts.append(target_text)
    return input_texts, target_texts

# Generate input and target pairs
input_texts, target_texts = create_conversation_pairs(conversations, context_size=2)

# Print a sample input-output pair
print("Input:", input_texts[0])
print("Target:", target_texts[0])


Input: Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again. Well, I thought we'd start with pronunciation, if that's okay with you.
Target: Not the hacking and gagging and spitting part.  Please.


In [5]:
# Use a small sample of the data

input_texts = input_texts[:1000]
target_texts = target_texts[:1000]

In [18]:
import torch
from torch.utils.data import Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Load the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Set pad_token as eos_token

# Ensure you're using the right device (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Custom Dataset class for tokenized inputs
class CustomDataset(Dataset):
    def __init__(self, input_texts, target_texts, tokenizer, max_length=128):
        self.input_texts = input_texts
        self.target_texts = target_texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.input_texts)

    def __getitem__(self, idx):
        # Tokenize inputs and targets
        inputs = self.tokenizer(self.input_texts[idx], padding="max_length", truncation=True, max_length=self.max_length, return_tensors="pt")
        targets = self.tokenizer(self.target_texts[idx], padding="max_length", truncation=True, max_length=self.max_length, return_tensors="pt")

        # Move to the appropriate device (GPU/CPU)
        inputs = {key: value.squeeze(0).to(device) for key, value in inputs.items()}
        targets = {key: value.squeeze(0).to(device) for key, value in targets.items()}

        # Set labels
        inputs['labels'] = targets['input_ids']

        return inputs

# Create the dataset for training and evaluation
train_input_texts = input_texts[:800]
train_target_texts = target_texts[:800]

eval_input_texts = input_texts[800:1000]
eval_target_texts = target_texts[800:1000]

# Instantiate custom datasets
train_dataset = CustomDataset(train_input_texts, train_target_texts, tokenizer)
eval_dataset = CustomDataset(eval_input_texts, eval_target_texts, tokenizer)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",      # Evaluate at the end of each epoch
    save_strategy="epoch",            # Save model at the end of each epoch
    logging_dir="./logs",             # Directory for logs
    logging_steps=10,                 # Log every 10 steps
    per_device_train_batch_size=2,    # Batch size for training
    num_train_epochs=3,               # Number of epochs
    report_to="all",                  # Report to stdout and log file
    load_best_model_at_end=True,      # Load the best model at the end
)

# Data collator for padding the data
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Load the pre-trained GPT-2 model
model = GPT2LMHeadModel.from_pretrained("gpt2")
model = model.to(device)  # Move model to the GPU if available




Using device: cuda


In [19]:
# Initialize the Trainer with train and eval datasets
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,      # Use the custom train_dataset
    eval_dataset=eval_dataset,        # Use the custom eval_dataset
    data_collator=data_collator,      # Collate the data with padding
)

# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss
1,3.3654,3.766725
2,2.5145,3.983235
3,2.5083,4.198936


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=1200, training_loss=2.8556591272354126, metrics={'train_runtime': 179.5998, 'train_samples_per_second': 13.363, 'train_steps_per_second': 6.682, 'total_flos': 156775219200000.0, 'train_loss': 2.8556591272354126, 'epoch': 3.0})

In [30]:
# Function to generate responses using the fine-tuned model
def generate_response(prompt, model, tokenizer, max_length=50):
    inputs = tokenizer.encode(prompt, return_tensors="pt").to(device)

    # Generate response with modified settings
    outputs = model.generate(
        inputs,
        max_length=max_length,                 # Limit response length
        pad_token_id=tokenizer.eos_token_id,   # Ensure padding uses EOS token
        no_repeat_ngram_size=3,                # Prevent repeating 3-grams
        top_k=50,                              # Consider top 50 words by probability
        top_p=0.9,                             # Use nucleus sampling with 90% probability mass
        temperature=0.7,                       # Control randomness
        early_stopping=True                    # Stop early at a coherent response
    )

    # Decode the output and stop at the first period, exclamation mark, or question mark
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Stop at the first complete sentence
    for end_char in [".", "!", "?"]:
        if end_char in response:
            response = response.split(end_char)[0] + end_char
            break

    return response

# Sample conversation
user_input = "Hi! How are you?"
response = generate_response(user_input, model, tokenizer)
print("Chatbot:", response)

user_input = "What's your favorite movie?"
response = generate_response(user_input, model, tokenizer)
print("Chatbot:", response)

user_input = "Do you like pizza?"
response = generate_response(user_input, model, tokenizer)
print("Chatbot:", response)



Chatbot: Hi! How are you?  I'm fine.
Chatbot: What's your favorite movie?  You know, I'm not a fan of movies.
Chatbot: Do you like pizza?  I like pizza.
