In [3]:
# Additional quotes to append to the dataset, keyed by lower-case movie title.
# Each value is the list of quote strings for that title.
extra_quotes = {
    "the lord of the rings: the fellowship of the ring": [
        "A wizard is never late, Frodo Baggins. Nor is he early. He arrives precisely when he means to.",
        "You shall not pass!",
        "Fly you fools!",
        "Even the smallest person can change the course of the future.",
        "Many that live deserve death. Some that die deserve life...Do not be too eager to deal out death in judgement. Even the very wise cannot see all ends.",
        "So do all who live to see such times. But that is not for them to decide. All we have to decide is what to do with the time that is given to us.",
        "One ring to rule them all. One ring to find them. One ring to bring them all and in the darkness bind them!",
        "If by my life or death I can protect you, I will. You have my sword",
    ],
    "the lord of the rings: the return of the king": [
        "Certainty of death. Small chance of success. What are we waiting for?",
        "The journey doesn't end here. Death is just another path... One that we all must take.",
        "I see in your eyes the same fear that would take the heart of me.",
        "But it is not this day!",
    ],
}

In [9]:
import pandas as pd

class QuoteCleaner:
    """Load a raw movie-quote text file into a DataFrame and tidy its titles.

    The resulting frame is stored on ``self.df`` and has two columns,
    ``title`` and ``quote``.
    """

    # Title as it appears in the raw quote file -> corrected title.
    # Kept at class level so the mapping is built once, not once per row.
    _TITLE_FIXES = {
        'terminator': 'the terminator',
        "pirates of the caribbean dead man's chest": "pirates of the caribbean: dead man's chest",
        'the planet of the apes': 'planet of the apes',
        'star trek the motion picture': 'star trek: the motion picture',
        'star trek: the wrath of khan': 'star trek ii: the wrath of khan',
        'robin hood prince of thieves': 'robin hood: prince of thieves',
        'lord of the rings the two towers': 'the lord of the rings: the two towers',
    }

    def __init__(self, filepath):
        """Build ``self.df`` from ``filepath`` and apply the title corrections.

        Args:
            filepath: Path of the raw quote text file to read.
        """
        self.df = self.create_quote_dataset(filepath)
        self.df['title'] = self.df['title'].apply(self.title_corrector)

    def create_quote_dataset(self, filepath):
        """Parse the raw quote file into a ``title``/``quote`` DataFrame.

        Lines consisting only of a newline are discarded. The remaining lines
        are consumed in groups of three: the first line of each group is the
        title, the second is the quote, and the third is ignored.

        Args:
            filepath: Path of the raw quote text file to read.

        Returns:
            A DataFrame with columns ``title`` and ``quote`` and a 0..n-1 index.

        Raises:
            ValueError: If the file yields a different number of titles and
                quotes (raised by the DataFrame constructor).
        """
        with open(filepath, 'r') as raw:
            lines = [line for line in raw if line != '\n']

        titles = [line.strip('\n') for line in lines[0::3]]
        quotes = [line.strip('\n') for line in lines[1::3]]
        return pd.DataFrame({'title': titles, 'quote': quotes})

    def title_corrector(self, x):
        """Return the corrected form of title ``x``, or ``x`` itself if unknown.

        Args:
            x: A title value taken from the ``title`` column.

        Returns:
            The replacement title when ``x`` is a key of the correction table,
            otherwise ``x`` unchanged.
        """
        try:
            return self._TITLE_FIXES[x]
        except (KeyError, TypeError):
            # KeyError: no correction registered for this title.
            # TypeError: x is unhashable, so it cannot be a known title either.
            return x

    def add_quotes(self, extra_quotes):
        """Append additional quotes to ``self.df``.

        All new rows are collected first and appended with a single
        ``pd.concat``; ``DataFrame.append`` no longer exists in pandas 2.x and
        copied the whole frame on every call.

        Args:
            extra_quotes: Mapping of title -> iterable of quote strings.
        """
        rows = [
            {'title': title, 'quote': quote}
            for title, quotes in extra_quotes.items()
            for quote in quotes
        ]
        if not rows:
            # Nothing to add: leave the existing frame untouched.
            return
        self.df = pd.concat([self.df, pd.DataFrame(rows)], ignore_index=True)

    def save_quote_df(self, path):
        """Write ``self.df`` to ``path`` as CSV (index included) and report it.

        Args:
            path: Destination passed to ``DataFrame.to_csv``.
        """
        self.df.to_csv(path)
        print('Successfully saved Dataframe.')

In [10]:
import os

# The raw quote file is looked up in the `data` directory that sits in the
# parent of the current working directory.
root = os.path.dirname(os.getcwd())
raw_path = os.sep.join([root, 'data', 'moviequotes.memorable_quotes.txt'])

# Parse the file and apply the title corrections.
q = QuoteCleaner(raw_path)

In [12]:
# Append every quote listed in `extra_quotes` to the cleaner's DataFrame.
q.add_quotes(extra_quotes)

In [13]:
# Display the cleaner's quote table (pandas truncates the middle rows).
q.df

Unnamed: 0,title,quote
0,10 things i hate about you,Who knocked up your sister?
1,10 things i hate about you,"I was watching you out there, before. I've nev..."
2,10 things i hate about you,"You're 18, you don't know what you want. And y..."
3,10 things i hate about you,"Ooh, see that, there. Who needs affection when..."
4,10 things i hate about you,"Just 'cause you're beautiful, that doesn't mea..."
...,...,...
6289,the lord of the rings: the fellowship of the ring,"If by my life or death I can protect you, I wi..."
6290,the lord of the rings: the return of the king,Certainty of death. Small chance of success. W...
6291,the lord of the rings: the return of the king,The journey doesn't end here. Death is just an...
6292,the lord of the rings: the return of the king,I see in your eyes the same fear that would ta...


In [76]:
# NOTE(review): `create_quote_dataset` and `title_corrector` are called here as
# plain functions; the visible notebook only defines them as QuoteCleaner
# methods. Confirm they exist in the kernel before running this cell.
df = create_quote_dataset('moviequotes.memorable_quotes.txt')
df['title'] = df['title'].apply(title_corrector)
df.head()

Unnamed: 0,title,quote
0,10 things i hate about you,Who knocked up your sister?
1,10 things i hate about you,"I was watching you out there, before. I've nev..."
2,10 things i hate about you,"You're 18, you don't know what you want. And y..."
3,10 things i hate about you,"Ooh, see that, there. Who needs affection when..."
4,10 things i hate about you,"Just 'cause you're beautiful, that doesn't mea..."


In [77]:
# Number of rows currently in the quote table.
len(df)

6282

In [72]:
# Quotes to append to the quote table, keyed by lower-case movie title.
# Each value is the list of quote strings for that title.
dic = {
    "the lord of the rings: the fellowship of the ring": [
        "A wizard is never late, Frodo Baggins. Nor is he early. He arrives precisely when he means to.",
        "You shall not pass!",
        "Fly you fools!",
        "Even the smallest person can change the course of the future.",
        "Many that live deserve death. Some that die deserve life...Do not be too eager to deal out death in judgement. Even the very wise cannot see all ends.",
        "So do all who live to see such times. But that is not for them to decide. All we have to decide is what to do with the time that is given to us.",
        "One ring to rule them all. One ring to find them. One ring to bring them all and in the darkness bind them!",
        "If by my life or death I can protect you, I will. You have my sword",
    ],
    "the lord of the rings: the return of the king": [
        "Certainty of death. Small chance of success. What are we waiting for?",
        "The journey doesn't end here. Death is just another path... One that we all must take.",
        "I see in your eyes the same fear that would take the heart of me.",
        "But it is not this day!",
    ],
}

In [81]:
# Append all extra quotes to `df` in a single concat.
# - The rows come from the `dic` mapping (title -> list of quotes), whose two
#   keys are exactly the titles this cell used to hard-code.
# - DataFrame.append was removed in pandas 2.0 and copied the frame on every
#   call, so the rows are built first and added once.
# - Using a comprehension also avoids rebinding the notebook-level name `q`
#   (the QuoteCleaner instance) as a loop variable.
df = pd.concat(
    [
        df,
        pd.DataFrame(
            [
                {'title': title, 'quote': quote}
                for title, quotes in dic.items()
                for quote in quotes
            ]
        ),
    ],
    ignore_index=True,
)

In [82]:
# Row count of the quote table after the extra quotes were appended.
len(df)

6294

In [40]:
# Load the tab-separated title table, expose `originalTitle` as `title`, keep
# only rows whose titleType is 'movie', and lower-case the titles.
titles = (
    pd.read_csv('titles.tsv', sep='\t', low_memory=False)
    .rename(columns={'originalTitle': 'title'})
    .loc[lambda frame: frame['titleType'] == 'movie']
    # .str.lower() keeps missing titles as NaN instead of raising
    # AttributeError, and .assign avoids writing into a filtered slice.
    .assign(title=lambda frame: frame['title'].str.lower())
)
titles.head()

Unnamed: 0,Sctconst,titleType,primaryTitle,title,isAdult,startYear,endYear,runtimeMinutes,genres
498,tt0000502,movie,Bohemios,bohemios,0,1905,\N,100,\N
570,tt0000574,movie,The Story of the Kelly Gang,the story of the kelly gang,0,1906,\N,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,l'enfant prodigue,0,1907,\N,90,Drama
610,tt0000615,movie,Robbery Under Arms,robbery under arms,0,1907,\N,\N,Drama
625,tt0000630,movie,Hamlet,amleto,0,1908,\N,\N,Drama


In [35]:
# Number of rows left in the title table after the 'movie' filter.
len(titles)

577186

In [83]:
# Join the title metadata onto each quote, then keep one row per quote text.
# NOTE(review): the join key is the lower-cased title only, so a quote is
# paired with every title row that shares that title; drop_duplicates then
# keeps whichever match comes first. Confirm this picks the intended movie.
id_df = df.merge(titles, on='title').drop_duplicates(subset='quote')
id_df

Unnamed: 0,title,quote,Sctconst,titleType,primaryTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,10 things i hate about you,Who knocked up your sister?,tt0147800,movie,10 Things I Hate About You,0,1999,\N,97,"Comedy,Drama,Romance"
1,10 things i hate about you,"I was watching you out there, before. I've nev...",tt0147800,movie,10 Things I Hate About You,0,1999,\N,97,"Comedy,Drama,Romance"
2,10 things i hate about you,"You're 18, you don't know what you want. And y...",tt0147800,movie,10 Things I Hate About You,0,1999,\N,97,"Comedy,Drama,Romance"
3,10 things i hate about you,"Ooh, see that, there. Who needs affection when...",tt0147800,movie,10 Things I Hate About You,0,1999,\N,97,"Comedy,Drama,Romance"
4,10 things i hate about you,"Just 'cause you're beautiful, that doesn't mea...",tt0147800,movie,10 Things I Hate About You,0,1999,\N,97,"Comedy,Drama,Romance"
...,...,...,...,...,...,...,...,...,...,...
10863,the lord of the rings: the fellowship of the ring,"If by my life or death I can protect you, I wi...",tt0120737,movie,The Lord of the Rings: The Fellowship of the Ring,0,2001,\N,178,"Action,Adventure,Drama"
10864,the lord of the rings: the return of the king,Certainty of death. Small chance of success. W...,tt0167260,movie,The Lord of the Rings: The Return of the King,0,2003,\N,201,"Action,Adventure,Drama"
10865,the lord of the rings: the return of the king,The journey doesn't end here. Death is just an...,tt0167260,movie,The Lord of the Rings: The Return of the King,0,2003,\N,201,"Action,Adventure,Drama"
10866,the lord of the rings: the return of the king,I see in your eyes the same fear that would ta...,tt0167260,movie,The Lord of the Rings: The Return of the King,0,2003,\N,201,"Action,Adventure,Drama"


In [85]:
# List the column names of the merged quote/title frame.
id_df.columns

Index(['title', 'quote', 'Sctconst', 'titleType', 'primaryTitle', 'isAdult',
       'startYear', 'endYear', 'runtimeMinutes', 'genres'],
      dtype='object')

In [88]:
# Keep only the columns needed downstream, give the identifier and year
# columns shorter names, and write the result to BASE.csv (index included).
base = (
    id_df[['title', 'quote', 'Sctconst', 'startYear', 'genres']]
    .rename(columns={'Sctconst': 'ID', 'startYear': 'Year'})
)
base.to_csv('BASE.csv')

In [89]:
# Preview the first rows of the trimmed table.
base.head()

Unnamed: 0,title,quote,ID,Year,genres
0,10 things i hate about you,Who knocked up your sister?,tt0147800,1999,"Comedy,Drama,Romance"
1,10 things i hate about you,"I was watching you out there, before. I've nev...",tt0147800,1999,"Comedy,Drama,Romance"
2,10 things i hate about you,"You're 18, you don't know what you want. And y...",tt0147800,1999,"Comedy,Drama,Romance"
3,10 things i hate about you,"Ooh, see that, there. Who needs affection when...",tt0147800,1999,"Comedy,Drama,Romance"
4,10 things i hate about you,"Just 'cause you're beautiful, that doesn't mea...",tt0147800,1999,"Comedy,Drama,Romance"


In [4]:
import pandas as pd

# Reload the saved table, using the first CSV column as the index.
df = pd.read_csv('BASE.csv', index_col=0)
# A stray 'Unnamed: 0.1' column only exists when the file has been re-saved
# with an extra index column; errors='ignore' lets this cell run either way
# instead of raising KeyError when the column is absent.
df = df.drop(columns='Unnamed: 0.1', errors='ignore')
df.head()

Unnamed: 0,title,quote,ID,Year,Genre
0,10 things i hate about you,Who knocked up your sister?,tt0147800,1999,"Comedy,Drama,Romance"
1,10 things i hate about you,"I was watching you out there, before. I've nev...",tt0147800,1999,"Comedy,Drama,Romance"
2,10 things i hate about you,"You're 18, you don't know what you want. And y...",tt0147800,1999,"Comedy,Drama,Romance"
3,10 things i hate about you,"Ooh, see that, there. Who needs affection when...",tt0147800,1999,"Comedy,Drama,Romance"
4,10 things i hate about you,"Just 'cause you're beautiful, that doesn't mea...",tt0147800,1999,"Comedy,Drama,Romance"


In [5]:
# Rename the `genres` column to `Genre` and overwrite BASE.csv with the result.
df = df.rename(columns={'genres': 'Genre'})
df.to_csv('BASE.csv')

In [None]:
from nlp_mining import WordCleaner

# Hand the quote table to the project's WordCleaner helper.
cleaner = WordCleaner(df)
# NOTE(review): expand_df() is defined in nlp_mining and not visible here;
# presumably it returns a DataFrame with a `Genre` column. Confirm.
expanse = cleaner.expand_df()
# Show the distinct values found in the `Genre` attribute/column.
expanse.Genre.unique()