# Cornell movie dialog corpus prepocessing
## construct Convo exchanges by replcing list of line id with dialogue!!!

The cornell movie dialog corpus' raw text fomat requires extensive parsing including:
1. extracting metadata and dialog for each movie,
2. cleaning movie dialog of non alpha numeric characters
3. reconstruction of movies' conversations according to Line ID seqencing in 'movie_conversations.txt' 

In [1]:
# boilerplate
import re
import pandas as pd
import numpy as np
import spacy
import matplotlib.pyplot as plt
import matplotlib
from ast import literal_eval
from tqdm import tqdm
% matplotlib inline

In [2]:
# read in meta data
with open('/Users/markespina/Downloads/movie-dialogs-corpus/movie_titles_metadata.txt', encoding='ISO-8859-1') as text:
    data = []
    data.append(text.read())

# initialize raw data
raw_text = data[0]

# flag all movie IDs -- format 'mXX'
flag_id = r"m\d+"

# flag all movie genres in sq brackets
flag_brackets = "[\[].*?[\]]"


## save movie_ids to var
movie_ids = re.findall(flag_id, raw_text)
genres =re.findall(flag_brackets, raw_text)



# look at data
print(movie_ids[:5])
print(genres[:5])

['m0', 'm1', 'm2', 'm3', 'm4']
["['comedy', 'romance']", "['adventure', 'biography', 'drama', 'history']", "['action', 'crime', 'drama', 'thriller']", "['adventure', 'mystery', 'sci-fi']", "['action', 'comedy', 'crime', 'drama', 'thriller']"]


In [3]:
# ## genre seems to be should be list format
# Let's convert with ast module
genre_lists = [literal_eval(genre) for genre in genres]

# extract movie ID - Genres pairs, save to Dict: movie_genres
movie_genres = dict(zip(movie_ids,genre_lists))


# display genres
movie_genres

{'m0': ['comedy', 'romance'],
 'm1': ['adventure', 'biography', 'drama', 'history'],
 'm2': ['action', 'crime', 'drama', 'thriller'],
 'm3': ['adventure', 'mystery', 'sci-fi'],
 'm4': ['action', 'comedy', 'crime', 'drama', 'thriller'],
 'm5': ['action', 'adventure', 'romance', 'sci-fi', 'thriller'],
 'm6': ['crime', 'mystery', 'thriller'],
 'm7': ['fantasy', 'horror', 'thriller'],
 'm8': ['fantasy', 'horror', 'thriller'],
 'm9': ['sci-fi', 'thriller'],
 'm10': ['drama', 'mystery', 'thriller'],
 'm11': ['action', 'drama', 'thriller'],
 'm12': ['comedy', 'romance', 'sci-fi'],
 'm13': ['comedy', 'romance'],
 'm14': ['crime', 'drama', 'sci-fi', 'thriller'],
 'm15': ['action', 'sci-fi', 'thriller'],
 'm16': ['biography', 'drama', 'music'],
 'm17': ['horror', 'romance'],
 'm18': ['drama'],
 'm19': ['action', 'western'],
 'm20': ['drama', 'thriller'],
 'm21': ['drama', 'thriller'],
 'm22': ['action', 'adventure', 'comedy', 'crime'],
 'm23': ['action', 'adventure', 'thriller'],
 'm24': ['comed

In [4]:
# read in movie dialog
with open('/Users/markespina/Downloads/movie-dialogs-corpus/movie_titles_metadata.txt', encoding='ISO-8859-1') as text:
    data = []
    data.append(text.read())


raw_text = data[0]

In [5]:
# delete non-conventional separator
text = raw_text.replace('+++$+++', '')


flag_year = r'm\d+ \d+|\w+'



movie_data = re.findall(flag_year, text)

# look at metadata
print(movie_data[:500])

['m0', '10', 'things', 'i', 'hate', 'about', 'you', '1999', '6', '90', '62847', 'comedy', 'romance', 'm1', '1492', 'conquest', 'of', 'paradise', '1992', '6', '20', '10421', 'adventure', 'biography', 'drama', 'history', 'm2', '15', 'minutes', '2001', '6', '10', '25854', 'action', 'crime', 'drama', 'thriller', 'm3', '2001', 'a', 'space', 'odyssey', '1968', '8', '40', '163227', 'adventure', 'mystery', 'sci', 'fi', 'm4', '48', 'hrs', '1982', '6', '90', '22289', 'action', 'comedy', 'crime', 'drama', 'thriller', 'm5', 'the', 'fifth', 'element', '1997', '7', '50', '133756', 'action', 'adventure', 'romance', 'sci', 'fi', 'thriller', 'm6', '8mm', '1999', '6', '30', '48212', 'crime', 'mystery', 'thriller', 'm7', 'a', 'nightmare', 'on', 'elm', 'street', '4', 'the', 'dream', 'master', '1988', '5', '20', '13590', 'fantasy', 'horror', 'thriller', 'm8', 'a', 'nightmare', 'on', 'elm', 'street', 'the', 'dream', 'child', '1989', '4', '70', '11092', 'fantasy', 'horror', 'thriller', 'm9', 'the', 'atomic',

## Messy formatting

Titles precede release year, let's extract years values and use them to find and parse metadata for title extraction

In [6]:
years =[]
for item in movie_data:
    if item.isdigit() and len(item) ==4:
        years.append(int(item))

# initialize dataframe of years for easy filtering
years_df = pd.Series(years).reset_index()
years_df.columns = ['index', 'year']



# filter for integers for existing years
years = list(years_df[(years_df.year>=1900) & (years_df.year<=2018)].year.unique())
years= [str(x) for x in years]


# extract movie titles
titles = []
for i, item in enumerate(movie_data):
    if item.startswith('m') and item[-1].isdigit():
        start_idx = i
        for idx, elem in enumerate(movie_data[start_idx+1:]):
            if elem in years:
                end_idx = i+idx+1

                titles.append(movie_data[start_idx:end_idx])
                break
                
# Check etracted text
print(titles[:5])

[['m0', '10', 'things', 'i', 'hate', 'about', 'you'], ['m1', '1492', 'conquest', 'of', 'paradise'], ['m2', '15', 'minutes'], ['m3'], ['m4', '48', 'hrs']]


In [7]:
movie_titles = [" ".join(t[1:]) for t in  titles]

#look at movies
print(movie_titles)

['10 things i hate about you', '1492 conquest of paradise', '15 minutes', '', '48 hrs', 'the fifth element', '8mm', 'a nightmare on elm street 4 the dream master', 'a nightmare on elm street the dream child', 'the atomic submarine', 'affliction', 'air force one', 'airplane ii the sequel', 'airplane', 'alien nation', 'aliens', 'amadeus', 'an american werewolf in london', 'american madness', 'american outlaws', 'american psycho', 'antitrust', 'austin powers international man of mystery', 'the avengers', 'bachelor party', 'backdraft', 'bad lieutenant', 'bamboozled', 'barry lyndon', 'basic', 'big fish', 'birthday girl', 'black snake moan', 'black rain', 'blade runner', 'blast from the past', 'blue velvet', 'the boondock saints', 'bottle rocket', 'the bourne supremacy', 'braveheart', 'the butterfly effect', 'casablanca', 'cast away', 'the cider house rules', 'confidence', 'croupier', 'dark star', 'dark angel', 'detroit rock city', 'donnie darko', 'drop dead gorgeous', 'duck soup', 'the elep

In [8]:
# fix entry, title contains year, was ignoring by year flag
movie_titles[3] = '2001 a space odyssey'

# intialize dataframe of movies
df = pd.DataFrame(movie_titles, columns=['title'])

print("{} movies in corpus".format(df.shape[0]))

# add metadata
df['genre'] = genre_lists
df['ids'] = movie_ids

#reorder columns
df = df[['ids', 'title', 'genre']]

# look at data
df.head()

617 movies in corpus


Unnamed: 0,ids,title,genre
0,m0,10 things i hate about you,"[comedy, romance]"
1,m1,1492 conquest of paradise,"[adventure, biography, drama, history]"
2,m2,15 minutes,"[action, crime, drama, thriller]"
3,m3,2001 a space odyssey,"[adventure, mystery, sci-fi]"
4,m4,48 hrs,"[action, comedy, crime, drama, thriller]"


In [9]:
# read in dialog
dialogue =[]
with open('/Users/markespina/Downloads/movie-dialogs-corpus/movie_lines.txt', encoding='ISO-8859-1') as f:
    dialogue.append(f.read())

dialogue = dialogue[0]


# Look at text
dialogue[:1000]

'L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!\nL1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!\nL985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.\nL984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?\nL925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let\'s go.\nL924 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ Wow\nL872 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Okay -- you\'re gonna need to learn how to lie.\nL871 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ No\nL870 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I\'m kidding.  You know how sometimes you just become this "persona"?  And you don\'t know how to quit?\nL869 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Like my fear of wearing pastels?\nL868 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ The "real you".\nL867 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ What good stuff?\nL866 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ I figured you\'d get to the good stuff eventual

In [10]:
dialogue = dialogue.replace('+++$+++', '')

unstructured_text = " ".join(dialogue.split('\n'))

print("Unstructered text: \n\n{}".format(unstructured_text[:500]))

print("\ncorpus is {} characters long".format(len(unstructured_text)))

Unstructered text: 

L1045  u0  m0  BIANCA  They do not! L1044  u2  m0  CAMERON  They do to! L985  u0  m0  BIANCA  I hope so. L984  u2  m0  CAMERON  She okay? L925  u0  m0  BIANCA  Let's go. L924  u2  m0  CAMERON  Wow L872  u0  m0  BIANCA  Okay -- you're gonna need to learn how to lie. L871  u2  m0  CAMERON  No L870  u0  m0  BIANCA  I'm kidding.  You know how sometimes you just become this "persona"?  And you don't know how to quit? L869  u0  m0  BIANCA  Like my fear of wearing pastels? L868  u2  m0  CAMERON  The "r

corpus is 26109956 characters long


In [27]:

#clean text of unnecessary ids
unnecessary_chars = 'u\d+  m\d+  '
other_char_flag = 'u\d+  '
dialogue = re.sub(unnecessary_chars, '', dialogue)
clean_lines = dialogue.split('\nL')
# get rid of first 'Lid char'
clean_lines[0] = clean_lines[0][1:]

In [42]:
dialog_order =[]
with open('/Users/markespina/Downloads/movie-dialogs-corpus/movie_conversations.txt', encoding='ISO-8859-1') as f:
    dialog_order.append(f.read())

dialog_order = dialog_order[0]





dialog_order = dialog_order.replace('+++$+++', '')
structure = " ".join(dialog_order.split('\n'))





other_chars = r"u\d+  u\d+  "
convo_order = re.sub(other_chars, "", structure)



In [43]:
convo_orders = re.findall(flag_brackets, convo_order)




convo_order_ls = [literal_eval(order) for order in convo_orders]

In [44]:
convo_order_ls[:1000]

[['L194', 'L195', 'L196', 'L197'],
 ['L198', 'L199'],
 ['L200', 'L201', 'L202', 'L203'],
 ['L204', 'L205', 'L206'],
 ['L207', 'L208'],
 ['L271', 'L272', 'L273', 'L274', 'L275'],
 ['L276', 'L277'],
 ['L280', 'L281'],
 ['L363', 'L364'],
 ['L365', 'L366'],
 ['L367', 'L368'],
 ['L401', 'L402', 'L403'],
 ['L404', 'L405', 'L406', 'L407'],
 ['L575', 'L576'],
 ['L577', 'L578'],
 ['L662', 'L663'],
 ['L693', 'L694', 'L695'],
 ['L696', 'L697', 'L698', 'L699'],
 ['L860', 'L861'],
 ['L862', 'L863', 'L864', 'L865'],
 ['L866', 'L867', 'L868', 'L869'],
 ['L870', 'L871', 'L872'],
 ['L924', 'L925'],
 ['L984', 'L985'],
 ['L1044', 'L1045'],
 ['L49', 'L50', 'L51'],
 ['L571', 'L572', 'L573'],
 ['L579', 'L580'],
 ['L595', 'L596', 'L597'],
 ['L598', 'L599', 'L600'],
 ['L659', 'L660'],
 ['L952', 'L953'],
 ['L394', 'L395'],
 ['L396', 'L397'],
 ['L589', 'L590', 'L591'],
 ['L592', 'L593'],
 ['L756', 'L757', 'L758'],
 ['L759', 'L760'],
 ['L164', 'L165'],
 ['L319', 'L320'],
 ['L441', 'L442', 'L443', 'L444', 'L445']

In [13]:
def extract_num(line):
    return int(line.split()[0])

In [14]:
line_nums = list(map(extract_num, clean_lines))

In [15]:
nums = '\d+  '
plain_lines = list(map(lambda x: re.sub(nums, '', x), clean_lines))

In [17]:
temp_df = pd.Series(plain_lines,line_nums).reset_index()

In [18]:
ordered_df = temp_df.sort_values('index', ascending=True)
ordered_df.columns = ['line_id', 'dialogue']

In [19]:
ordered_df['dialogue']

86                        BIANCA  Did you change your hair?
85                                            CHASTITY  No.
84                   BIANCA  You might wanna think about it
648                                  PATRICK  I missed you.
647       MISS PERKY  It says here you exposed yourself ...
646       PATRICK  It was a bratwurst.  I was eating lunch.
645              MISS PERKY  With the teeth of your zipper?
266                               MICHAEL  You the new guy?
265                             CAMERON  So they tell me...
264       MICHAEL  C'mon.  I'm supposed to give you the ...
263                   MICHAEL  So -- which Dakota you from?
262                CAMERON  North, actually.  How'd you   ?
261       MICHAEL  I was kidding. People actually live t...
260       CAMERON  Yeah.  A couple.  We're outnumbered b...
259       MICHAEL  How many people were in your old school?
258                                    CAMERON  Thirty-two.
257                                     

In [20]:
# helper function to parse documents by custom deliminators
def split_text_by(doc, list_of_markers):
    """returns a dictionary of document segments based on list of substring markers,
       the items in the list of markers corresponds to a key in the dictionary
       NOTE: list of markers must occur sequentially in document!"""
    segment_index = dict()
    end = len(doc)
    # find index of last occurance of each movie id in text 
    for marker in list_of_markers[::-1]:
        idx = 0
        segment_index[marker]= doc[idx:].index(marker)
        idx = segment_index[marker]
        
    return segment_index


In [45]:
convo_order[:1000]

"m0  ['L194', 'L195', 'L196', 'L197'] m0  ['L198', 'L199'] m0  ['L200', 'L201', 'L202', 'L203'] m0  ['L204', 'L205', 'L206'] m0  ['L207', 'L208'] m0  ['L271', 'L272', 'L273', 'L274', 'L275'] m0  ['L276', 'L277'] m0  ['L280', 'L281'] m0  ['L363', 'L364'] m0  ['L365', 'L366'] m0  ['L367', 'L368'] m0  ['L401', 'L402', 'L403'] m0  ['L404', 'L405', 'L406', 'L407'] m0  ['L575', 'L576'] m0  ['L577', 'L578'] m0  ['L662', 'L663'] m0  ['L693', 'L694', 'L695'] m0  ['L696', 'L697', 'L698', 'L699'] m0  ['L860', 'L861'] m0  ['L862', 'L863', 'L864', 'L865'] m0  ['L866', 'L867', 'L868', 'L869'] m0  ['L870', 'L871', 'L872'] m0  ['L924', 'L925'] m0  ['L984', 'L985'] m0  ['L1044', 'L1045'] m0  ['L49', 'L50', 'L51'] m0  ['L571', 'L572', 'L573'] m0  ['L579', 'L580'] m0  ['L595', 'L596', 'L597'] m0  ['L598', 'L599', 'L600'] m0  ['L659', 'L660'] m0  ['L952', 'L953'] m0  ['L394', 'L395'] m0  ['L396', 'L397'] m0  ['L589', 'L590', 'L591'] m0  ['L592', 'L593'] m0  ['L756', 'L757', 'L758'] m0  ['L759', 'L760'] m0

In [46]:
# segment text by movie id
convo_index = split_text_by(convo_order, movie_ids)

# inspect dictiony items and if indices match star of each movie
print(list(convo_index.items())[:5])
print(convo_order[convo_index['m616']:])

[('m616', 3871524), ('m615', 3864054), ('m614', 3861716), ('m613', 3856449), ('m612', 3851621)]
m616  ['L666361', 'L666362', 'L666363', 'L666364'] m616  ['L666365', 'L666366', 'L666367', 'L666368'] m616  ['L666462', 'L666463'] m616  ['L666388', 'L666389', 'L666390', 'L666391', 'L666392', 'L666393', 'L666394', 'L666395', 'L666396', 'L666397', 'L666398'] m616  ['L666503', 'L666504', 'L666505'] m616  ['L666246', 'L666247', 'L666248'] m616  ['L666249', 'L666250'] m616  ['L666357', 'L666358'] m616  ['L666480', 'L666481', 'L666482'] m616  ['L666483', 'L666484'] m616  ['L666487', 'L666488'] m616  ['L666251', 'L666252'] m616  ['L666383', 'L666384', 'L666385'] m616  ['L666460', 'L666461'] m616  ['L666485', 'L666486'] m616  ['L666546', 'L666547'] m616  ['L666497', 'L666498', 'L666499', 'L666500', 'L666501', 'L666502'] m616  ['L666262', 'L666263', 'L666264'] m616  ['L666324', 'L666325', 'L666326', 'L666327'] m616  ['L666575', 'L666576'] m616  ['L666256', 'L666257'] m616  ['L666369', 'L666370', 'L

In [47]:
# helper function to extract a single Line ID
def extract_line(pattern, text, start):
    end = len(text)
    '''get raw text, remove trailing non-alpha chars,
       return first character string in sequence'''

    match = re.search(pattern, text[start:end])
    clean_str = str(match)[:-2].split("='")[-1]
    return clean_str

# flag Line ID
flag_line_id = 'L\d+'

# extract ID of first line of dialogue per movie

first_lines =[]
for movie_id, idx in convo_index.items():

    first_lines.append(extract_line(flag_line_id, convo_order, idx))


# IDS extracted in reverse, flip list order
first_lines = first_lines[::-1]
first_lines_ids =list(map(lambda x: int(x[1:]), first_lines))

In [49]:
first_lines_ids

[194,
 2170,
 3380,
 3712,
 5039,
 5643,
 6335,
 8029,
 8419,
 55383,
 12735,
 16651,
 18372,
 19395,
 24411,
 26385,
 31162,
 31661,
 34746,
 35642,
 36712,
 44782,
 56247,
 57755,
 59160,
 60125,
 61728,
 62848,
 64601,
 66017,
 84020,
 85782,
 86905,
 87018,
 88117,
 93271,
 96765,
 102214,
 103068,
 108371,
 109838,
 117629,
 119105,
 122039,
 130810,
 141302,
 148390,
 152060,
 154561,
 163186,
 169004,
 170956,
 172516,
 180345,
 187950,
 194393,
 195409,
 195644,
 203557,
 206496,
 208677,
 210902,
 221471,
 222757,
 225215,
 227810,
 241667,
 247158,
 229121,
 232574,
 233083,
 235683,
 236665,
 237695,
 238802,
 239783,
 242408,
 251294,
 253977,
 255448,
 257466,
 260937,
 267434,
 269546,
 270196,
 273698,
 274146,
 274609,
 279672,
 282051,
 283734,
 285452,
 290246,
 294330,
 295064,
 296483,
 307212,
 298764,
 299617,
 300310,
 302360,
 303685,
 306521,
 310939,
 327838,
 313897,
 315916,
 317475,
 319350,
 329469,
 333811,
 334550,
 336366,
 338171,
 346845,
 339681,
 34

In [51]:
clean_lines[:1000]

['1045  BIANCA  They do not!',
 '1044  CAMERON  They do to!',
 '985  BIANCA  I hope so.',
 '984  CAMERON  She okay?',
 "925  BIANCA  Let's go.",
 '924  CAMERON  Wow',
 "872  BIANCA  Okay -- you're gonna need to learn how to lie.",
 '871  CAMERON  No',
 '870  BIANCA  I\'m kidding.  You know how sometimes you just become this "persona"?  And you don\'t know how to quit?',
 '869  BIANCA  Like my fear of wearing pastels?',
 '868  CAMERON  The "real you".',
 '867  BIANCA  What good stuff?',
 "866  CAMERON  I figured you'd get to the good stuff eventually.",
 '865  CAMERON  Thank God!  If I had to hear one more story about your coiffure...',
 "864  BIANCA  Me.  This endless ...blonde babble. I'm like, boring myself.",
 '863  CAMERON  What crap?',
 '862  BIANCA  do you listen to this crap?',
 '861  CAMERON  No...',
 '860  BIANCA  Then Guillermo says, "If you go any lighter, you\'re gonna look like an extra on 90210."',
 '699  CAMERON  You always been this selfish?',
 '698  BIANCA  But',
 "697

In [52]:
# split conversation sequences by movie id with extracted first line IDs
lines_dict = dict()
start, end, x = 0, 0, 0
for i, convo in enumerate(convo_order_ls):
    for line in first_lines:
        if line == 'L194':
            continue
        if line in convo:
            end = i
            lines_dict['m'+str(x)] = convo_order_ls[start:end]
            start = end
            x +=1
            
# add final movie
lines_dict['m616'] = convo_order_ls[start:]

In [53]:
import itertools


In [54]:
def strip_letter(text):
    return int(text[1:])

In [55]:
first_lines_ids = []
for movie_id in movie_ids:
    
    codes = lines_dict[movie_id]
    
    codes =itertools.chain.from_iterable(codes)
    
    # Strip L from id, get lowest index value
    first_idx = min(map(lambda x: strip_letter(x),codes))
    first_lines_ids.append(first_idx)

In [60]:
first_lines_ids

[49,
 1930,
 2557,
 3622,
 4453,
 5396,
 6335,
 7981,
 8362,
 55321,
 12104,
 16572,
 18372,
 19279,
 24144,
 26253,
 29991,
 31352,
 34616,
 35589,
 36554,
 44782,
 55879,
 57464,
 59059,
 59960,
 61711,
 62559,
 64444,
 66017,
 83658,
 85534,
 86108,
 87018,
 87655,
 92354,
 96753,
 101486,
 103068,
 107908,
 109822,
 117610,
 118384,
 122039,
 130796,
 140743,
 148200,
 152048,
 153965,
 162618,
 168570,
 170689,
 172019,
 179846,
 187356,
 194043,
 195181,
 195542,
 203019,
 206423,
 208329,
 210515,
 220965,
 222084,
 224799,
 227435,
 241560,
 247050,
 228848,
 232173,
 232978,
 235544,
 236343,
 237572,
 238736,
 239541,
 242408,
 250520,
 253116,
 255240,
 257393,
 260818,
 267168,
 269326,
 270034,
 273482,
 273889,
 274523,
 279094,
 281828,
 283359,
 284998,
 290042,
 294330,
 295064,
 296402,
 307179,
 298298,
 299241,
 300013,
 302181,
 303665,
 306194,
 310436,
 327491,
 312777,
 315296,
 317328,
 319335,
 329305,
 333563,
 334262,
 336303,
 338145,
 346697,
 339468,
 343

In [61]:
#Delete first id, intialize start at starting index of lines
first_lines_ids.pop(0)
start=49
lines = dict()
x = 0

for num in first_lines_ids:
    
    lines['m'+str(x)]= ordered_df[(ordered_df.line_id>=start)&(ordered_df.line_id<num)]['dialogue']
    x +=1
    start=num
lines['m616'] = ordered_df[(ordered_df.line_id>=start)&(ordered_df.line_id<num)]['dialogue']

In [62]:
ordered_df

Unnamed: 0,line_id,dialogue
86,49,BIANCA Did you change your hair?
85,50,CHASTITY No.
84,51,BIANCA You might wanna think about it
648,59,PATRICK I missed you.
647,60,MISS PERKY It says here you exposed yourself ...
646,61,PATRICK It was a bratwurst. I was eating lunch.
645,62,MISS PERKY With the teeth of your zipper?
266,63,MICHAEL You the new guy?
265,64,CAMERON So they tell me...
264,65,MICHAEL C'mon. I'm supposed to give you the ...


In [67]:
movie_ids

['m0',
 'm1',
 'm2',
 'm3',
 'm4',
 'm5',
 'm6',
 'm7',
 'm8',
 'm9',
 'm10',
 'm11',
 'm12',
 'm13',
 'm14',
 'm15',
 'm16',
 'm17',
 'm18',
 'm19',
 'm20',
 'm21',
 'm22',
 'm23',
 'm24',
 'm25',
 'm26',
 'm27',
 'm28',
 'm29',
 'm30',
 'm31',
 'm32',
 'm33',
 'm34',
 'm35',
 'm36',
 'm37',
 'm38',
 'm39',
 'm40',
 'm41',
 'm42',
 'm43',
 'm44',
 'm45',
 'm46',
 'm47',
 'm48',
 'm49',
 'm50',
 'm51',
 'm52',
 'm53',
 'm54',
 'm55',
 'm56',
 'm57',
 'm58',
 'm59',
 'm60',
 'm61',
 'm62',
 'm63',
 'm64',
 'm65',
 'm66',
 'm67',
 'm68',
 'm69',
 'm70',
 'm71',
 'm72',
 'm73',
 'm74',
 'm75',
 'm76',
 'm77',
 'm78',
 'm79',
 'm80',
 'm81',
 'm82',
 'm83',
 'm84',
 'm85',
 'm86',
 'm87',
 'm88',
 'm89',
 'm90',
 'm91',
 'm92',
 'm93',
 'm94',
 'm95',
 'm96',
 'm97',
 'm98',
 'm99',
 'm100',
 'm101',
 'm102',
 'm103',
 'm104',
 'm105',
 'm106',
 'm107',
 'm108',
 'm109',
 'm110',
 'm111',
 'm112',
 'm113',
 'm114',
 'm115',
 'm116',
 'm117',
 'm118',
 'm119',
 'm120',
 'm121',
 'm122',
 'm

In [68]:
for movie_id in movie_ids:
    lines[movie_id] = " ".join(lines[movie_id])

In [74]:
dialog_df = pd.DataFrame.from_dict(lines, orient='index')
dialog_df.columns = ['dialogue']

In [76]:
dialog_df.to_csv('/Users/markespina/Downloads/movie-dialogs-corpus/movies_dialogs_ordered.csv')