## This notebook contains code for processing the cornell movie dialogues corpus into a dataframe 
please refer to the readme file for additional info on the raw text files 

In [1]:
import re
import pandas as pd
import numpy as np
from ast import literal_eval

In [2]:
# read in movie metadata
with open('/Users/markespina/Downloads/movie-dialogs-corpus/movie_titles_metadata.txt', encoding='ISO-8859-1') as text:
    data = []
    data.append(text.read())

raw_text = data[0]



# remove non-standard separator
text = raw_text.replace('+++$+++', '')


#flag
pattern_year = r'm\d+ \d+|\w+'


# convert data to list to filter for years and movie titles
movie_data = re.findall(pattern_year, text)

In [3]:
movie_data[:2500]

['m0',
 '10',
 'things',
 'i',
 'hate',
 'about',
 'you',
 '1999',
 '6',
 '90',
 '62847',
 'comedy',
 'romance',
 'm1',
 '1492',
 'conquest',
 'of',
 'paradise',
 '1992',
 '6',
 '20',
 '10421',
 'adventure',
 'biography',
 'drama',
 'history',
 'm2',
 '15',
 'minutes',
 '2001',
 '6',
 '10',
 '25854',
 'action',
 'crime',
 'drama',
 'thriller',
 'm3',
 '2001',
 'a',
 'space',
 'odyssey',
 '1968',
 '8',
 '40',
 '163227',
 'adventure',
 'mystery',
 'sci',
 'fi',
 'm4',
 '48',
 'hrs',
 '1982',
 '6',
 '90',
 '22289',
 'action',
 'comedy',
 'crime',
 'drama',
 'thriller',
 'm5',
 'the',
 'fifth',
 'element',
 '1997',
 '7',
 '50',
 '133756',
 'action',
 'adventure',
 'romance',
 'sci',
 'fi',
 'thriller',
 'm6',
 '8mm',
 '1999',
 '6',
 '30',
 '48212',
 'crime',
 'mystery',
 'thriller',
 'm7',
 'a',
 'nightmare',
 'on',
 'elm',
 'street',
 '4',
 'the',
 'dream',
 'master',
 '1988',
 '5',
 '20',
 '13590',
 'fantasy',
 'horror',
 'thriller',
 'm8',
 'a',
 'nightmare',
 'on',
 'elm',
 'street',
 

In [4]:
years =[]
for item in movie_data:
    if item.isdigit() and len(item) ==4:
        years.append(int(item))


# In[35]:


years_df = pd.Series(years).reset_index()
years_df.columns = ['index', 'year']


years = list(years_df[(years_df.year>=1900) & (years_df.year<=2018)].year.unique())
years= [str(x) for x in years]


titles = []
for i, item in enumerate(movie_data):
    if item.startswith('m') and item[-1].isdigit():
        start_idx = i
        for idx, elem in enumerate(movie_data[start_idx+1:]):
            if elem in years:
                end_idx = i+idx+1

                titles.append(movie_data[start_idx:end_idx])
                break

movie_titles = [" ".join(t[1:]) for t in  titles]


print("dialogue from {} movies present in corpus\n\nfirst ten titles:\n".format(len(movie_titles)))

# title contains year, was ignoring by year flag
movie_titles[3] = '2001 a space odyssey'
for title in movie_titles[:10]:
    print(title)

dialogue from 617 movies present in corpus

first ten titles:

10 things i hate about you
1492 conquest of paradise
15 minutes
2001 a space odyssey
48 hrs
the fifth element
8mm
a nightmare on elm street 4 the dream master
a nightmare on elm street the dream child
the atomic submarine


In [5]:
movie_years = []
for i, item in enumerate(movie_data):
    if item in years and (movie_data[i+1].isdigit() or movie_data[i-1].isalpha()): 
        idx = i

        movie_years.append(movie_data[idx])
        

movie_years


['1999',
 '1992',
 '2001',
 '1968',
 '1982',
 '1997',
 '1999',
 '1988',
 '1989',
 '1959',
 '1997',
 '1997',
 '1982',
 '1980',
 '1988',
 '1986',
 '1984',
 '1981',
 '1932',
 '2001',
 '2000',
 '2001',
 '1997',
 '1998',
 '1984',
 '1991',
 '1992',
 '2000',
 '1975',
 '2003',
 '2003',
 '2001',
 '2006',
 '1989',
 '1982',
 '1999',
 '1986',
 '1999',
 '1996',
 '2004',
 '1995',
 '2004',
 '1942',
 '2000',
 '1999',
 '2003',
 '1998',
 '1974',
 '1990',
 '1999',
 '2001',
 '1999',
 '1933',
 '1980',
 '1989',
 '2004',
 '1993',
 '1997',
 '2005',
 '1982',
 '1998',
 '2005',
 '1982',
 '1931',
 '2009',
 '1996',
 '1997',
 '1998',
 '1999',
 '2000',
 '1995',
 '2002',
 '2001',
 '1996',
 '1989',
 '1986',
 '2000',
 '1967',
 '1932',
 '1990',
 '1998',
 '1995',
 '1971',
 '1979',
 '1971',
 '1988',
 '2004',
 '1987',
 '2000',
 '1986',
 '1940',
 '1987',
 '2003',
 '2001',
 '1961',
 '2007',
 '1953',
 '1996',
 '1989',
 '1984',
 '1987',
 '1999',
 '2003',
 '1934',
 '1991',
 '1997',
 '1990',
 '2001',
 '1975',
 '2007',
 '1993',
 

In [6]:
scores = []
for i, item in enumerate(movie_data):
    if item in years and movie_data[i+2].isdigit(): 
        idx = i+1
        if movie_data[idx] =='I':
            scores.append(movie_data[idx+1])
        else:
            
            scores.append(movie_data[idx])

len(scores)


617

In [7]:
# flag all movie IDs -- format 'mXX'
m_ids = r"m\d+"
# flag all movie genres in sq brackets
genre_brackets = "[\[].*?[\]]"

## save movie_ids to var
movies_ids = re.findall(m_ids, raw_text)
genres =re.findall(genre_brackets, raw_text)

# look at data
print("movie IDS:\n")
for movie_id in movies_ids[:10]:
    print(movie_id)
print("genre of first 10 movies:\n")
for genre in genres[:10]:
    print(genre)

movie IDS:

m0
m1
m2
m3
m4
m5
m6
m7
m8
m9
genre of first 10 movies:

['comedy', 'romance']
['adventure', 'biography', 'drama', 'history']
['action', 'crime', 'drama', 'thriller']
['adventure', 'mystery', 'sci-fi']
['action', 'comedy', 'crime', 'drama', 'thriller']
['action', 'adventure', 'romance', 'sci-fi', 'thriller']
['crime', 'mystery', 'thriller']
['fantasy', 'horror', 'thriller']
['fantasy', 'horror', 'thriller']
['sci-fi', 'thriller']


In [8]:
# genre seems to be should be list format
# Let's convert with ast module
genre_lists = [literal_eval(genre) for genre in genres]

# extract movie ID - Genres pairs, save to Dict: movie_genres
movie_genres = dict(zip(movies_ids,genre_lists))

movie_genres

{'m0': ['comedy', 'romance'],
 'm1': ['adventure', 'biography', 'drama', 'history'],
 'm2': ['action', 'crime', 'drama', 'thriller'],
 'm3': ['adventure', 'mystery', 'sci-fi'],
 'm4': ['action', 'comedy', 'crime', 'drama', 'thriller'],
 'm5': ['action', 'adventure', 'romance', 'sci-fi', 'thriller'],
 'm6': ['crime', 'mystery', 'thriller'],
 'm7': ['fantasy', 'horror', 'thriller'],
 'm8': ['fantasy', 'horror', 'thriller'],
 'm9': ['sci-fi', 'thriller'],
 'm10': ['drama', 'mystery', 'thriller'],
 'm11': ['action', 'drama', 'thriller'],
 'm12': ['comedy', 'romance', 'sci-fi'],
 'm13': ['comedy', 'romance'],
 'm14': ['crime', 'drama', 'sci-fi', 'thriller'],
 'm15': ['action', 'sci-fi', 'thriller'],
 'm16': ['biography', 'drama', 'music'],
 'm17': ['horror', 'romance'],
 'm18': ['drama'],
 'm19': ['action', 'western'],
 'm20': ['drama', 'thriller'],
 'm21': ['drama', 'thriller'],
 'm22': ['action', 'adventure', 'comedy', 'crime'],
 'm23': ['action', 'adventure', 'thriller'],
 'm24': ['comed

In [9]:
# intialize df of etracted metadata
df = pd.DataFrame(movie_titles, columns=['title'])


# In[45]:


df.shape
df['year'] =movie_years
df['genre'] =genre_lists
df['ids'] = movies_ids
df['rating'] = scores
df.head()

Unnamed: 0,title,year,genre,ids,rating
0,10 things i hate about you,1999,"[comedy, romance]",m0,6
1,1492 conquest of paradise,1992,"[adventure, biography, drama, history]",m1,6
2,15 minutes,2001,"[action, crime, drama, thriller]",m2,6
3,2001 a space odyssey,1968,"[adventure, mystery, sci-fi]",m3,8
4,48 hrs,1982,"[action, comedy, crime, drama, thriller]",m4,6


In [10]:
dialogue_df = pd.read_csv('/Users/markespina/Downloads/movie-dialogs-corpus/movies_dialogs_ordered.csv')
dialogue_df.columns = ['ids','dialogue']

In [11]:
df = df[['ids', 'title', 'year', 'genre', 'rating']]

# final DataFrame
df = df.merge(dialogue_df, on='ids')
df.head()

Unnamed: 0,ids,title,year,genre,rating,dialogue
0,m0,10 things i hate about you,1999,"[comedy, romance]",6,BIANCA Did you change your hair? CHASTITY No...
1,m1,1492 conquest of paradise,1992,"[adventure, biography, drama, history]",6,"ISABEL Is that the man I knew, Treasurer Sanc..."
2,m2,15 minutes,2001,"[action, crime, drama, thriller]",6,EMIL Just do what I do. Say the same thing I...
3,m3,2001 a space odyssey,1968,"[adventure, mystery, sci-fi]",8,"FLOYD How do you do, Mr. Miller? MILLER I'm ..."
4,m4,48 hrs,1982,"[action, comedy, crime, drama, thriller]",6,"GANZ Maybe you shoulda stole a better truck, ..."


In [12]:
# save data to file
df.to_csv('/Users/markespina/Downloads/movie-dialogs-corpus/movies_dialogs_corpus.csv')

In [52]:
df.rating.astype('float64').describe()

count    617.000000
mean       6.403566
std        1.255192
min        2.000000
25%        6.000000
50%        7.000000
75%        7.000000
max        9.000000
Name: rating, dtype: float64

In [51]:
df.rating.describe()

count     617
unique      8
top         7
freq      197
Name: rating, dtype: object