***Final Team Project: Advanced Generative Chatbot Design***

**Group Name:** ENTROPY

**Group Members:**
1. Nitin Mishra
2. Puneet Chopra
3. Rohan Sharma

**Date:** October 2024

***Project Overview***

- **Goal:** Build a chatbot that can carry out multi-turn conversations, adapt to context, and handle a variety of topics.
- **Output:** A web or app interface where users can converse with the chatbot.

***About the Dataset***

This corpus contains a large metadata-rich collection of fictional conversations extracted from raw movie scripts:
- 220,579 conversational exchanges between 10,292 pairs of movie characters
- involves 9,035 characters from 617 movies
- in total 304,713 utterances



In [1]:
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


**Step 1: Data Loading**

In [3]:
# Mounting Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Defining the file paths from Google Drive
movie_titles_file = '/content/drive/MyDrive/group_project_NLP/movie_titles_metadata.txt'
movie_characters_file = '/content/drive/MyDrive/group_project_NLP/movie_characters_metadata.txt'
movie_lines_file = '/content/drive/MyDrive/group_project_NLP/movie_lines.txt'
movie_conversations_file = '/content/drive/MyDrive/group_project_NLP/movie_conversations.txt'

Mounted at /content/drive


**Step 2: Data Exploration and Summarization**

In [None]:
import pandas as pd

# Correct separator and handling encoding issues
movie_titles = pd.read_csv(movie_titles_file, sep=r'\s\+\+\+\$\+\+\+\s', engine='python', header=None, encoding='ISO-8859-1')
movie_titles.columns = ['movieID', 'movie_title', 'movie_year', 'IMDB_rating', 'IMDB_votes', 'genres']

# Preview the movie titles data
print("Movie Titles Metadata:")
print(movie_titles.head())

# Load movie characters data with the correct separator and encoding fix
movie_characters = pd.read_csv(movie_characters_file, sep=r'\s\+\+\+\$\+\+\+\s', engine='python', header=None, encoding='ISO-8859-1')
movie_characters.columns = ['characterID', 'character_name', 'movieID', 'movie_title', 'gender', 'position_in_credits']

# Preview the movie characters data
print("Movie Characters Metadata:")
print(movie_characters.head())

# Load movie lines data with the correct separator and encoding fix
movie_lines = pd.read_csv(movie_lines_file, sep=r'\s\+\+\+\$\+\+\+\s', engine='python', header=None, encoding='ISO-8859-1')
movie_lines.columns = ['lineID', 'characterID', 'movieID', 'character_name', 'text']

# Preview movie lines data
print("Movie Lines:")
print(movie_lines.head())

# Load movie conversations data with the correct separator and encoding fix
movie_conversations = pd.read_csv(movie_conversations_file, sep=r'\s\+\+\+\$\+\+\+\s', engine='python', header=None, encoding='ISO-8859-1')
movie_conversations.columns = ['charID_1', 'charID_2', 'movieID', 'utterance_list']

# Preview movie conversations data
print("Movie Conversations:")
print(movie_conversations.head())

Movie Titles Metadata:
  movieID                 movie_title movie_year  IMDB_rating  IMDB_votes  \
0      m0  10 things i hate about you       1999          6.9       62847   
1      m1  1492: conquest of paradise       1992          6.2       10421   
2      m2                  15 minutes       2001          6.1       25854   
3      m3       2001: a space odyssey       1968          8.4      163227   
4      m4                     48 hrs.       1982          6.9       22289   

                                              genres  
0                              ['comedy', 'romance']  
1     ['adventure', 'biography', 'drama', 'history']  
2           ['action', 'crime', 'drama', 'thriller']  
3                 ['adventure', 'mystery', 'sci-fi']  
4  ['action', 'comedy', 'crime', 'drama', 'thrill...  
Movie Characters Metadata:
  characterID character_name movieID                 movie_title gender  \
0          u0         BIANCA      m0  10 things i hate about you      f   
1      

**Step 3: Data Preprocessing**

In [None]:
import pandas as pd
import nltk
from transformers import GPT2Tokenizer
import multiprocessing
from tqdm import tqdm
import torch

nltk.download('punkt')

# Initialize the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Add padding token to the tokenizer (GPT-2 uses no padding token by default)
tokenizer.pad_token = tokenizer.eos_token

# Function to preprocess the text
def preprocess_text(text):
    if pd.isna(text):
        return ""
    return text.lower()

# Apply preprocessing function to the text data (vectorized for speed)
movie_lines['text_clean'] = movie_lines['text'].apply(preprocess_text)

# Print sample preprocessed text
print("Sample preprocessed text:")
print(movie_lines[['text', 'text_clean']].head())

# Tokenization function (for batching)
def tokenize_batch(text_list):
    return tokenizer.batch_encode_plus(
        text_list,
        padding=True,  # Pad sequences to the same length
        truncation=True,  # Truncate sequences that exceed the maximum length
        max_length=512,  # Set a maximum sequence length
        return_tensors='pt'  # Return PyTorch tensors for faster processing
    )

# Tokenize conversations in batches using multiprocessing for efficiency
def tokenize_conversations(conversation_texts):
    # Split the data into manageable chunks (use a chunk size of 10,000 to reduce overhead)
    num_workers = multiprocessing.cpu_count() - 1  # Use all but one CPU core
    batch_size = len(conversation_texts) // num_workers

    # Process the tokenization in parallel using multiprocessing
    with multiprocessing.Pool(processes=num_workers) as pool:
        tokenized_batches = list(
            tqdm(pool.imap(tokenize_batch, [conversation_texts[i:i + batch_size] for i in range(0, len(conversation_texts), batch_size)]),
            total=num_workers)
        )

    # Combine the batches into a single dataset
    input_ids = torch.cat([batch['input_ids'] for batch in tokenized_batches], dim=0)
    attention_masks = torch.cat([batch['attention_mask'] for batch in tokenized_batches], dim=0)

    return input_ids, attention_masks

# Create the conversation text (already optimized)
def get_conversation_lines(utterance_list, lines_df):
    conv_lines = []
    for line_id in eval(utterance_list):  # Convert string list to Python list
        line = lines_df.loc[lines_df['lineID'] == line_id, 'text_clean'].values[0]
        conv_lines.append(line)
    return ' '.join(conv_lines)

# Apply the function to create a new column in movie_conversations with the actual text of the conversations
movie_conversations['conversation_text'] = movie_conversations['utterance_list'].apply(lambda x: get_conversation_lines(x, movie_lines))

# Print sample conversation text
print("Sample conversation text:")
print(movie_conversations[['charID_1', 'charID_2', 'conversation_text']].head())

# Tokenize the entire dataset in parallel using optimized batching
input_ids, attention_masks = tokenize_conversations(movie_conversations['conversation_text'].tolist())

# Print a sample of tokenized input and attention mask
print("Sample tokenized input_ids:")
print(input_ids[:2])
print("Sample attention_mask:")
print(attention_masks[:2])

# Save the tokenized conversations to Google Drive (as before)
import pickle
with open('/content/drive/MyDrive/optimized_tokenized_conversations.pkl', 'wb') as f:
    pickle.dump((input_ids, attention_masks), f)

print("Optimized tokenized conversations saved to Google Drive.")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]



Sample preprocessed text:
           text    text_clean
0  They do not!  they do not!
1   They do to!   they do to!
2    I hope so.    i hope so.
3     She okay?     she okay?
4     Let's go.     let's go.
Sample conversation text:
  charID_1 charID_2                                  conversation_text
0       u0       u2  can we make this quick?  roxanne korrine and a...
1       u0       u2  you're asking me out.  that's so cute. what's ...
2       u0       u2  no, no, it's my fault -- we didn't have a prop...
3       u0       u2  why? unsolved mystery.  she used to be really ...
4       u0       u2  gosh, if only we could find kat a boyfriend......


100%|██████████| 1/1 [00:47<00:00, 47.56s/it]


Sample tokenized input_ids:
tensor([[ 5171,   356,   787,  ..., 50256, 50256, 50256],
        [ 5832,   821,  4737,  ..., 50256, 50256, 50256]])
Sample attention_mask:
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
Optimized tokenized conversations saved to Google Drive.


**Step 4: Model Design and Training**

In [4]:
import pickle
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from tqdm import tqdm  # Progress bar

# Step 1: Load the preprocessed data from Step 3 (from pickle file)
with open('/content/drive/MyDrive/optimized_tokenized_conversations.pkl', 'rb') as f:
    input_ids, attention_masks = pickle.load(f)

print("Tokenized conversations loaded successfully.")

# Step 2: Load the GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Add padding token to the tokenizer (GPT-2 uses no padding token by default)
tokenizer.pad_token = tokenizer.eos_token

# Define larger batch size to process more data in parallel (adjust based on GPU memory)
batch_size = 8  # You can try larger sizes depending on your GPU memory (e.g., 8, 16)

# Custom Dataset class for our tokenized conversations
class ConversationDataset(Dataset):
    def __init__(self, input_ids, attention_masks):
        self.input_ids = input_ids
        self.attention_masks = attention_masks

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attention_masks[idx]

# Step 3: Create the dataset and dataloader
train_dataset = ConversationDataset(input_ids, attention_masks)

# Custom collate function to handle padding in batches
def collate_fn(batch):
    input_ids = torch.stack([item[0] for item in batch])
    attention_mask = torch.stack([item[1] for item in batch])
    return input_ids, attention_mask

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# Step 4: Define the optimizer
optimizer = AdamW(model.parameters(), lr=3e-5)  # Reduced learning rate for larger batch size

# Step 5: Mixed Precision Training for Speed
scaler = torch.cuda.amp.GradScaler()  # Gradient scaling for mixed precision training

# Step 6: Move the model to GPU if available
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Step 7: Training loop with optimizations
model.train()
epochs = 2  # Reduced the number of epochs to fit within time limits
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    epoch_loss = 0
    progress_bar = tqdm(train_loader, desc="Training", leave=False)

    for batch in progress_bar:
        input_ids, attention_mask = [x.to(device) for x in batch]

        optimizer.zero_grad()

        # Forward pass using mixed precision
        with torch.cuda.amp.autocast():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
            loss = outputs.loss

        # Backpropagation with gradient scaling
        scaler.scale(loss).backward()

        # Gradient descent step
        scaler.step(optimizer)
        scaler.update()

        epoch_loss += loss.item()
        progress_bar.set_postfix({"loss": loss.item()})

    print(f"Epoch {epoch+1} completed. Average loss: {epoch_loss/len(train_loader)}")

# Step 8: Save the trained model after optimization
model.save_pretrained("./optimized_movie_chatbot_model")
tokenizer.save_pretrained("./optimized_movie_chatbot_tokenizer")

print("Model training complete and saved!")

Tokenized conversations loaded successfully.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

  scaler = torch.cuda.amp.GradScaler()  # Gradient scaling for mixed precision training


Epoch 1/2


  with torch.cuda.amp.autocast():


Epoch 1 completed. Average loss: 0.38710993139484223
Epoch 2/2




Epoch 2 completed. Average loss: 0.3575426038988666
Model training complete and saved!


In [5]:
from google.colab import drive
drive.mount('/content/drive')

# Save the trained model and tokenizer to Google Drive
model.save_pretrained('/content/drive/MyDrive/optimized_movie_chatbot_model')
tokenizer.save_pretrained('/content/drive/MyDrive/optimized_movie_chatbot_tokenizer')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


('/content/drive/MyDrive/optimized_movie_chatbot_tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/optimized_movie_chatbot_tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/optimized_movie_chatbot_tokenizer/vocab.json',
 '/content/drive/MyDrive/optimized_movie_chatbot_tokenizer/merges.txt',
 '/content/drive/MyDrive/optimized_movie_chatbot_tokenizer/added_tokens.json')

**Step 5: Evaluation (Generate Responses)**

In [7]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

# Load the fine-tuned model and tokenizer from the saved model
model = GPT2LMHeadModel.from_pretrained("./optimized_movie_chatbot_model")
tokenizer = GPT2Tokenizer.from_pretrained("./optimized_movie_chatbot_tokenizer")

# Ensure the model is in evaluation mode
model.eval()

# Move the model to GPU if available
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Function to generate chatbot responses with attention mask
def generate_response(prompt_text, model, tokenizer, max_length=150, num_return_sequences=1):
    # Tokenize the input prompt with padding and attention mask
    inputs = tokenizer(prompt_text, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)

    # Pass both input_ids and attention_mask to the model
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    # Generate responses
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,  # Pass the attention mask to the model
        max_length=max_length,  # Adjust maximum length of generated text
        num_return_sequences=num_return_sequences,  # Generate one or more responses
        no_repeat_ngram_size=2,  # Avoid repeating the same n-grams
        do_sample=True,  # Enable sampling for varied responses
        top_k=50,  # Sample from top k tokens
        top_p=0.95,  # Use nucleus sampling (top-p sampling)
        temperature=0.7,  # Lower temperature makes output more deterministic
        pad_token_id=tokenizer.eos_token_id  # Set the pad token to eos_token_id
    )

    # Decode and return the generated text
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Test the chatbot with a user input prompt
prompt = "Hello, how are you?"
response = generate_response(prompt, model, tokenizer)
print(f"Chatbot response: {response}")

# Try another prompt
prompt = "Can you tell me a joke?"
response = generate_response(prompt, model, tokenizer)
print(f"Chatbot response: {response}")

Chatbot response: Hello, how are you? i'm fine.  i feel like a hero. i know.
Chatbot response: Can you tell me a joke? i don't have to.  i'm just kidding.


In [9]:
!pip install gradio

Collecting gradio
  Downloading gradio-4.44.1-py3-none-any.whl.metadata (15 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0 (from gradio)
  Downloading fastapi-0.115.0-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.3.0 (from gradio)
  Downloading gradio_client-1.3.0-py3-none-any.whl.metadata (7.1 kB)
Collecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting orjson~=3.0 (from gradio)
  Downloading orjson-3.10.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.9 (from g

**Step 6: Deploy the Chatbot with Gradio**

In [10]:
# Step 6: Deploying the Chatbot with Gradio
# =========================================

import gradio as gr
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

# Step 1: Load the fine-tuned model and tokenizer
model = GPT2LMHeadModel.from_pretrained("./optimized_movie_chatbot_model")
tokenizer = GPT2Tokenizer.from_pretrained("./optimized_movie_chatbot_tokenizer")

# Ensure the model is in evaluation mode
model.eval()

# Step 2: Move the model to GPU if available
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Step 3: Function to generate chatbot responses
def generate_response_gradio(prompt_text):
    # Tokenize the input prompt with padding and attention mask
    inputs = tokenizer(prompt_text, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)

    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    # Generate response from the model
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=150,  # Adjust maximum length of generated text
        num_return_sequences=1,  # Generate one response
        no_repeat_ngram_size=2,  # Avoid repeating the same n-grams
        do_sample=True,  # Enable sampling for varied responses
        top_k=50,  # Sample from top k tokens
        top_p=0.95,  # Use nucleus sampling
        temperature=0.7,  # Lower temperature makes output more deterministic
        pad_token_id=tokenizer.eos_token_id  # Set the pad token to eos_token_id
    )

    # Decode and return the generated response
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Step 4: Create a Gradio interface
interface = gr.Interface(
    fn=generate_response_gradio,  # The function to call when the user submits input
    inputs="text",  # Input is a text box
    outputs="text",  # Output is also text
    title="Movie Chatbot",
    description="Chatbot based on movie dialogues. Ask it anything!"
)

# Step 5: Launch the Gradio interface
interface.launch(share=True)  # Use share=True to get a public link in Colab

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://5fefad4472e94b7b27.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


