Source: https://www.youtube.com/watch?v=UjDpW_SOrlw&ab_channel=freeCodeCamp.org

In [4]:

import glob
import logging
import os
import pickle
import random
import re
import shutil
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm.notebook import tqdm, trange

from pathlib import Path

from transformers import (
    MODEL_WITH_LM_HEAD_MAPPING,
    WEIGHTS_NAME,
    AdamW,
    AutoConfig,
    PreTrainedModel,
    PreTrainedTokenizer,
    get_linear_schedule_with_warmup,
)


try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    from tensorboardX import SummaryWriter

2023-06-10 14:37:25.790458: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-06-10 14:37:28.658140: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Location of the raw script CSV, relative to the notebook's working directory.
DATA_FILE = "../../input/RickAndMortyScripts.csv"
# Stop here if the file is missing instead of failing later inside the CSV reader.
assert Path(DATA_FILE).is_file()

In [7]:
# Read the raw script table from DATA_FILE into a DataFrame.
df = pd.read_csv(filepath_or_buffer=DATA_FILE)

In [8]:
# Preview the first five rows of the raw table.
df.head(n=5)

Unnamed: 0,index,season no.,episode no.,episode name,name,line
0,0,1,1,Pilot,Rick,Morty! You gotta come on. Jus'... you gotta co...
1,1,1,1,Pilot,Morty,"What, Rick? What’s going on?"
2,2,1,1,Pilot,Rick,"I got a surprise for you, Morty."
3,3,1,1,Pilot,Morty,It's the middle of the night. What are you tal...
4,4,1,1,Pilot,Rick,"Come on, I got a surprise for you. Come on, h..."


In [9]:
# Show the column labels of the raw table before choosing which ones to keep.
df.columns

Index(['index', 'season no.', 'episode no.', 'episode name', 'name', 'line'], dtype='object')

In [10]:
# Discard the bookkeeping columns; the remaining columns are what the later cells use
# (`name` for the speaker filter, `line` for the dialogue text).
clean_df = df.drop(
    ['index', 'season no.', 'episode no.', 'episode name'],
    axis="columns",
)

In [11]:
# Preview the first five rows after dropping the bookkeeping columns.
clean_df.head(n=5)

Unnamed: 0,name,line
0,Rick,Morty! You gotta come on. Jus'... you gotta co...
1,Morty,"What, Rick? What’s going on?"
2,Rick,"I got a surprise for you, Morty."
3,Morty,It's the middle of the night. What are you tal...
4,Rick,"Come on, I got a surprise for you. Come on, h..."


In [17]:
# Total number of lines, then how many of them are spoken by Rick and by Morty.
# The counts are read from clean_df (the frame whose length is printed) rather than
# from `df`: `df` is rebound to a different table further down the notebook, which
# would make this cell fail with a missing 'name' column if it were run again.
print(len(clean_df))
for speaker in ('Rick', 'Morty'):
    print((clean_df['name'] == speaker).sum())


1905
420
347


In [21]:
# Create dialog with context, previous lines are context.
# Each output row holds one line spoken by CHARACTER_NAME (the response) followed by
# the `context_size` lines that immediately precede it in clean_df, newest first.
contexted = []

context_size = 7
CHARACTER_NAME = 'Rick'
# The speaker mask is built from clean_df, not df: this cell rebinds `df` on its last
# line, so masking with `df.name` would break the cell the second time it is run.
for i in clean_df[clean_df.name == CHARACTER_NAME].index:
    if i < context_size:
        # Too close to the start of the table to fill a whole context window.
        continue
    # Walk backwards from the response (row i) through the context_size rows before it,
    # giving context_size + 1 entries per record. Context lines are taken as-is,
    # whoever the speaker is.
    row = [clean_df.line[j] for j in range(i, i - 1 - context_size, -1)]
    contexted.append(row)


# One 'response' column plus context_size context columns, matching the row layout above.
columns = ['response', 'context']
columns = columns + ['context/' + str(i) for i in range(context_size - 1)]

# NOTE: `df` now refers to the response/context table, no longer to the raw CSV.
df = pd.DataFrame.from_records(contexted, columns=columns)

In [22]:
# Inspect six randomly chosen response/context rows.
df.sample(n=6)

Unnamed: 0,response,context,context/0,context/1,context/2,context/3,context/4,context/5
320,"Man, I missed having hands and blood and a sto...",Weird breed.,"Jesus. Jesus Christ. Therapists, man.",Eh.,"Oh, my God, yes! Dad, it's in my purse. Oh, I'...","Sweetie, could I get Get that syringe now?",Are we gonna go back?,"Well, there was so much more at stake. I mean,..."
241,A sequel.,Fine! Excuse me. Coming through. What are you...,"Go in the waiting room, Dad.",What? Every hospital claims to have the best d...,Is he going to die?,"Alright fine, but you're not touching my CRISPR.",You're overreacting!,Not anymore.
358,"Is there coffee? Hey, Morty, can you be a pal?...","Man, fuck you.",Really? You don't say. You would have used a g...,I could've just used a ghost train.,"Hey, I can't help if I can't see.",RICK!,"Uh, my God, that's better.",Uh-huh.
102,"Oh, no, Morty. His subconscious is panicking.",Mrs. Pancakes! AAAAAAAAHHHH!!!!!,Because we're both rational adults that don't ...,"Goldenfold, we're coming out! We just want to ...",Hmm?,I should call Bob Saget. Is that still a thing?,"He's saying ""I love Obama"". So cute! I'm post...",Ooooyayawawa!
179,"Ah, God, gross and weird!",I cannot argue that.,"Alright, hear me out on this. You're immortal,...",You really think I'm that stupid?,What if I told you there's a huge ticket up ri...,You think I wanna be an omniscient immortal be...,Why are you doing this?,"Uh-huh, no motha- no, Earth, dude, I'm talking..."
101,Because we're both rational adults that don't ...,"Goldenfold, we're coming out! We just want to ...",Hmm?,I should call Bob Saget. Is that still a thing?,"He's saying ""I love Obama"". So cute! I'm post...",Ooooyayawawa!,"Aw, he's saying ""I love lasagna"".",Aaaawwaaaaawaawa!


In [23]:
# Hold out 10% of the response/context rows for validation.
# The split is seeded so that re-running the notebook produces the same
# train/validation partition instead of a different shuffle every time.
SPLIT_SEED = 42
train_df, val_df = train_test_split(df, test_size=0.1, random_state=SPLIT_SEED)
print(train_df.head())


                                              response  \
290                   Good job, Morty. Let's go, kids.   
151  Morty, hand me that screwdriver, huh?  I'm alm...   
38                 Okay, hold on just a second, Morty.   
368  Look, I'm a lit-- little more complex than you...   
133                         Psst, Beth, Jerry, Summer.   

                                               context  \
290  That was amazing, Morty. Oh, my God. Wow. Okay...   
151       Yeah, I'm just going to...check on your mom.   
38   Are you kidding me?! That's it, Rick! That's t...   
368  Obviously? You came here and defeated our arch...   
133                   This is why I choose to get C's.   

                                             context/0  \
290                       Morty, we just got him back!   
151  Alright, well, I'm gonna go get dressed for th...   
38   Yeah, I can see that. But do you think you'll ...   
368  Obviously, I came here last night during a bla...   
133  I can't

In [None]:
# Create dataset for model
def construct_conversation(row, tokenizer, eos=True):
    flatten = lambda line: 

## Build Model

In [None]:


from transformers import AutoModelWithLMHead, AutoModelForCausalLM, AutoTokenizer
import torch

# Fetch the pretrained tokenizer and weights for the DialoGPT-small checkpoint
# (downloaded on first use).
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")
# Load through AutoModelForCausalLM: AutoModelWithLMHead is deprecated in transformers
# in favour of the task-specific auto classes. The old name stays in the import line
# above in case other cells still refer to it.
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small")



Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/641 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]



Downloading pytorch_model.bin:   0%|          | 0.00/351M [00:00<?, ?B/s]