In [76]:
from convokit import Corpus, download
corpus = Corpus(filename=download("movie-corpus"))

Downloading movie-corpus to C:\Users\PC\.convokit\downloads\movie-corpus
Downloading movie-corpus from http://zissou.infosci.cornell.edu/convokit/datasets/movie-corpus/movie-corpus.zip (40.9MB)... Done


In [77]:
corpus.print_summary_stats()

Number of Speakers: 9035
Number of Utterances: 304713
Number of Conversations: 83097


Creating Speakers

In [63]:
from convokit import Speaker,Utterance
from collections import defaultdict
from tqdm import tqdm
import csv
data_dir = "cornell movie-dialogs corpus/"
with open(data_dir + "movie_characters_metadata.txt", "r", encoding='utf-8', errors='ignore') as f:
    speaker_data = f.readlines()

print(speaker_data)

speaker_meta = {}
for speaker in speaker_data:
    speaker_info = [info.strip() for info in speaker.split("+++$+++")]
    speaker_meta[speaker_info[0]] = {"character_name": speaker_info[1],
                               "movie_idx": speaker_info[2],
                               "movie_name": speaker_info[3],
                               "gender": speaker_info[4],
                               "credit_pos": speaker_info[5]}
corpus_speakers = {k: Speaker(id = k, meta = v) for k,v in speaker_meta.items()}
print("number of speakers in the data = {}".format(len(corpus_speakers)))
corpus_speakers['u0'].meta


['u0 +++$+++ BIANCA +++$+++ m0 +++$+++ 10 things i hate about you +++$+++ f +++$+++ 4\n', 'u1 +++$+++ BRUCE +++$+++ m0 +++$+++ 10 things i hate about you +++$+++ ? +++$+++ ?\n', 'u2 +++$+++ CAMERON +++$+++ m0 +++$+++ 10 things i hate about you +++$+++ m +++$+++ 3\n', 'u3 +++$+++ CHASTITY +++$+++ m0 +++$+++ 10 things i hate about you +++$+++ ? +++$+++ ?\n', 'u4 +++$+++ JOEY +++$+++ m0 +++$+++ 10 things i hate about you +++$+++ m +++$+++ 6\n', 'u5 +++$+++ KAT +++$+++ m0 +++$+++ 10 things i hate about you +++$+++ f +++$+++ 2\n', 'u6 +++$+++ MANDELLA +++$+++ m0 +++$+++ 10 things i hate about you +++$+++ f +++$+++ 7\n', 'u7 +++$+++ MICHAEL +++$+++ m0 +++$+++ 10 things i hate about you +++$+++ m +++$+++ 5\n', 'u8 +++$+++ MISS PERKY +++$+++ m0 +++$+++ 10 things i hate about you +++$+++ ? +++$+++ ?\n', 'u9 +++$+++ PATRICK +++$+++ m0 +++$+++ 10 things i hate about you +++$+++ m +++$+++ 1\n', 'u10 +++$+++ SHARON +++$+++ m0 +++$+++ 10 things i hate about you +++$+++ ? +++$+++ ?\n', 'u11 +++$+++ W

{'character_name': 'BIANCA',
 'movie_idx': 'm0',
 'movie_name': '10 things i hate about you',
 'gender': 'f',
 'credit_pos': '4'}

Creating Utterances

In [64]:
with open(data_dir + "movie_lines.txt", "r", encoding='utf-8', errors='ignore') as f:
    utterance_data = f.readlines()

    utterance_corpus = {}

count = 0
for utterance in tqdm(utterance_data):

    utterance_info = [info.strip() for info in utterance.split("+++$+++")]

    if len(utterance_info) < 4:
        print(utterance_info)

    try:
        idx, speaker, movie_id, text = utterance_info[0], utterance_info[1], utterance_info[2], utterance_info[4]
    except:
        print(utterance_info)

    meta = {'movie_id': movie_id}

    # root & reply_to will be updated later, timestamp is not applicable
    utterance_corpus[idx] = Utterance(id=idx, speaker=corpus_speakers[speaker], text=text, meta=meta)

print("Total number of utterances = {}".format(len(utterance_corpus)))
utterance_corpus['L1044']

100%|██████████| 304713/304713 [01:21<00:00, 3748.33it/s] 

Total number of utterances = 304713





Utterance({'obj_type': 'utterance', 'vectors': [], 'speaker_': Speaker({'obj_type': 'speaker', 'vectors': [], 'owner': None, 'id': 'u2', 'temp_backend': {}, 'meta': {'character_name': 'CAMERON', 'movie_idx': 'm0', 'movie_name': '10 things i hate about you', 'gender': 'm', 'credit_pos': '3'}}), 'owner': None, 'id': 'L1044', 'temp_backend': {'speaker_id': 'u2', 'conversation_id': None, 'reply_to': None, 'timestamp': None, 'text': 'They do to!'}, 'meta': {'movie_id': 'm0'}})

Updating Root for utterances

In [65]:
with open(data_dir + "movie_conversations.txt", "r", encoding='utf-8', errors='ignore') as f:
    convo_data = f.readlines()
import ast
for info in tqdm(convo_data):

    speaker1, speaker2, m, convo = [info.strip() for info in info.split("+++$+++")]

    convo_seq = ast.literal_eval(convo)

    # update utterance
    conversation_id = convo_seq[0]

    # convo_seq is a list of utterances ids, arranged in conversational order
    for i, line in enumerate(convo_seq):

        # sanity checking: speaker giving the utterance is indeed in the pair of characters provided
        if utterance_corpus[line].speaker.id not in [speaker1, speaker2]:
            print("speaker mismatch in line {0}".format(i))

        utterance_corpus[line].conversation_id = conversation_id

        if i == 0:
            utterance_corpus[line].reply_to = None
        else:
            utterance_corpus[line].reply_to = convo_seq[i-1]

utterance_corpus['L666499']

100%|██████████| 83097/83097 [00:01<00:00, 49926.47it/s]


Utterance({'obj_type': 'utterance', 'vectors': [], 'speaker_': Speaker({'obj_type': 'speaker', 'vectors': [], 'owner': None, 'id': 'u9028', 'temp_backend': {}, 'meta': {'character_name': 'COGHILL', 'movie_idx': 'm616', 'movie_name': 'zulu dawn', 'gender': '?', 'credit_pos': '?'}}), 'owner': None, 'id': 'L666499', 'temp_backend': {'speaker_id': 'u9028', 'conversation_id': 'L666497', 'reply_to': 'L666498', 'timestamp': None, 'text': 'How quickly can you move your artillery forward?'}, 'meta': {'movie_id': 'm616'}})

In [66]:
utterance_list = utterance_corpus.values()
#By default the version number is incremented
movie_corpus = Corpus(utterances=utterance_list)
print("number of conversations in the dataset = {}".format(len(movie_corpus.get_conversation_ids())))
convo_ids = movie_corpus.get_conversation_ids()
for i, convo_idx in enumerate(convo_ids[0:5]):
    print("sample conversation {}:".format(i))
    print(movie_corpus.get_conversation(convo_idx).get_utterance_ids())

number of conversations in the dataset = 83097
sample conversation 0:
['L1045', 'L1044']
sample conversation 1:
['L985', 'L984']
sample conversation 2:
['L925', 'L924']
sample conversation 3:
['L872', 'L871', 'L870']
sample conversation 4:
['L869', 'L868', 'L867', 'L866']


In [67]:
with open(data_dir + "movie_titles_metadata.txt", "r", encoding='utf-8', errors='ignore') as f:
    movie_extra = f.readlines()
movie_meta = defaultdict(dict)

for movie in movie_extra:
    movie_id, title, year, rating, votes, genre  = [info.strip() for info in movie.split("+++$+++")]
    movie_meta[movie_id] = {"movie_name": title,
                            "release_year": year,
                            "rating": rating,
                            "votes": votes,
                            "genre": genre}

for convo in movie_corpus.iter_conversations():

    # get the movie_id for the conversation by checking from utterance info
    convo_id = convo.get_id()
    movie_idx = movie_corpus.get_utterance(convo_id).meta['movie_id']

    # add movie idx as meta, and update meta with additional movie information
    convo.meta['movie_idx'] = movie_idx
    convo.meta.update(movie_meta[movie_idx])

movie_corpus.get_conversation("L609301").meta


ConvoKitMeta({'movie_idx': 'm570', 'movie_name': 'three kings', 'release_year': '1999', 'rating': '7.30', 'votes': '69757', 'genre': "['action', 'adventure', 'comedy', 'drama', 'war']"})

In [68]:
with open(data_dir + "raw_script_urls.txt", "r", encoding='utf-8', errors='ignore') as f:
    urls = f.readlines()
movie2url = {}
for movie in urls:
    movie_id, _, url = [info.strip() for info in movie.split("+++$+++")]
    movie2url[movie_id] = url
movie_corpus.meta['url'] = movie2url
movie_corpus.meta['name'] = "Cornell Movie-Dialogs Corpus"


In [69]:
from convokit.text_processing import TextProcessor

utt_ids = corpus.get_utterance_ids()
print(utt_ids[:5])

test_utt_id = 'L1045'
utt = corpus.get_utterance(test_utt_id)
print(utt)

def preprocess_text(text):
    text = text.replace(' -- ', ' ')
    return text

['L1045', 'L1044', 'L985', 'L984', 'L925']
Utterance(id: 'L1045', conversation_id: L1044, reply-to: L1044, speaker: Speaker(id: 'u0', vectors: [], meta: ConvoKitMeta({'character_name': 'BIANCA', 'movie_idx': 'm0', 'movie_name': '10 things i hate about you', 'gender': 'f', 'credit_pos': '4'})), timestamp: None, text: 'They do not!', vectors: [], meta: ConvoKitMeta({'movie_id': 'm0', 'parsed': [{'rt': 1, 'toks': [{'tok': 'They', 'tag': 'PRP', 'dep': 'nsubj', 'up': 1, 'dn': []}, {'tok': 'do', 'tag': 'VBP', 'dep': 'ROOT', 'dn': [0, 2, 3]}, {'tok': 'not', 'tag': 'RB', 'dep': 'neg', 'up': 1, 'dn': []}, {'tok': '!', 'tag': '.', 'dep': 'punct', 'up': 1, 'dn': []}]}], 'clean_text': 'They do not!'}))


In [70]:
utt.text

'They do not!'

In [71]:
utt.meta

ConvoKitMeta({'movie_id': 'm0', 'parsed': [{'rt': 1, 'toks': [{'tok': 'They', 'tag': 'PRP', 'dep': 'nsubj', 'up': 1, 'dn': []}, {'tok': 'do', 'tag': 'VBP', 'dep': 'ROOT', 'dn': [0, 2, 3]}, {'tok': 'not', 'tag': 'RB', 'dep': 'neg', 'up': 1, 'dn': []}, {'tok': '!', 'tag': '.', 'dep': 'punct', 'up': 1, 'dn': []}]}], 'clean_text': 'They do not!'})

In [72]:
prep = TextProcessor(proc_fn=preprocess_text, output_field='clean_text')
corpus = prep.transform(corpus)

In [73]:
utt.retrieve_meta('clean_text')

'They do not!'

In [74]:
person_data = defaultdict(list)

for utt in utterance_corpus.values():
    speaker_id = utt.speaker.id
    movie_id = utt.meta.get('movie_id', '')
    conversation_id = utt.conversation_id
    text = utt.text
    clean_text = utt.meta.get('clean_text', '')

    person_data[speaker_id].append({
        'speaker_id': speaker_id,
        'movie_id': movie_id,
        'conversation_id': conversation_id,
        'text': text,
        'clean_text': clean_text
    })

for speaker_id, data_list in person_data.items():
    csv_filename = f"{speaker_id}_data.csv"
    with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['speaker_id', 'movie_id', 'conversation_id', 'text', 'clean_text']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data_list)

In [75]:
topic_lists = []

for convo_id in tqdm(convo_data):
    convo_seq = convo_data[convo_id]
    movie_id = utterance_corpus[convo_seq[0]].meta.get('movie_id', '')
    topic_list = []

    for line in convo_seq:
        utt = utterance_corpus[line]
        # Assuming you have a 'topics' field in the utterance metadata
        topics = utt.meta.get('topics', [])
        topic_list.extend(topics)

    # Remove duplicates from the topic list
    topic_list = list(set(topic_list))

    topic_lists.append({
        'conversation_id': convo_id,
        'movie_id': movie_id,
        'topics': topic_list
    })

# Create a CSV file for topic lists
with open('topic_lists.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['conversation_id', 'movie_id', 'topics']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(topic_lists)

  0%|          | 0/83097 [00:00<?, ?it/s]


TypeError: list indices must be integers or slices, not str

In [None]:
from convokit.text_processing import TextParser
parser = TextParser(input_field='clean_text', verbosity=50)
corpus = parser.transform(corpus)
