# Capstone: Musical Recommender

Kelly Slatery | US-DSI-10

In [1]:
# Imports
import re

import numpy as np
import pandas as pd

In [2]:
# # Set view options
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.width', None)
# pd.set_option('display.max_colwidth', None)

# Import Data

#### allmusicals.com synopses

In [3]:
# Read the allmusicals.com synopses from disk and report (rows, columns)
synopses_path = './data/musical_synopses.csv'
allmusicals = pd.read_csv(synopses_path)
allmusicals.shape

(196, 2)

In [4]:
# Preview the first rows of the allmusicals.com data (title + synopsis text)
allmusicals.head()

Unnamed: 0,musical,synopsis
0,les misérables,The musical takes place at the beginning of th...
1,the phantom of the opera,At the beginning of the XX century in the Pari...
2,hamilton,Can we call a thing more boring than the histo...
3,west side story,"The two gangs – Jets, consisting of white & Sh..."
4,wicked,Good Witch Glinda of Oz country reported that ...


In [5]:
# Count missing values in each column
allmusicals.isna().sum()

musical      0
synopsis    19
dtype: int64

In [6]:
# Replace every missing value with a single-space placeholder; later cells
# identify "no synopsis" rows by comparing against ' '.
allmusicals = allmusicals.fillna(value=' ')

# Confirm that no nulls remain
allmusicals.isna().sum()

musical     0
synopsis    0
dtype: int64

#### Wikipedia summaries

In [7]:
# Read the Wikipedia page text for each musical and report (rows, columns)
summaries_path = './data/musical_summaries.csv'
wiki = pd.read_csv(summaries_path)
wiki.shape

(196, 2)

In [8]:
# Preview the first rows of the Wikipedia data (title + page text)
wiki.head()

Unnamed: 0,musical,description
0,Les Misérables,Les Misérables (; French pronunciation: ​[le m...
1,The Phantom of the Opera,The Phantom of the Opera is a musical with mus...
2,Hamilton,"Hamilton is a musical with music, lyrics, and ..."
3,West Side Story,West Side Story is a musical with a book by Ar...
4,Wicked,Wicked is a 2003 Broadway musical by Stephen S...


In [9]:
# Count missing values in each column
wiki.isna().sum()

musical        0
description    0
dtype: int64

# Clean Data

In [10]:
# Inspect one raw Wikipedia description to see how its sections are delimited
wiki.loc[60, 'description']

'Annie Get Your Gun is a musical with lyrics and music by Irving Berlin and a book by Dorothy Fields and her brother Herbert Fields. The story is a fictionalized version of the life of Annie Oakley (1860–1926), a sharpshooter who starred in Buffalo Bill\'s Wild West, and her romance with sharpshooter Frank E. Butler (1847–1926).The 1946 Broadway production was a hit, and the musical had long runs in both New York (1,147 performances) and London, spawning revivals, a 1950 film version and television versions. Songs that became hits include "There\'s No Business Like Show Business", "Doin\' What Comes Natur\'lly", "You Can\'t Get a Man with a Gun", "They Say It\'s Wonderful", and "Anything You Can Do (I Can Do Better)."\n\n\n== History and background ==\nDorothy Fields had the idea for a musical about Annie Oakley, to star her friend, Ethel Merman. Producer Mike Todd turned the project down, so Fields approached a new producing team, Richard Rodgers and Oscar Hammerstein II. After the su

In [11]:
# The Wikipedia page text marks sections as "== Heading ==". Keep only the
# plot-like section of each page, trying these headings in priority order.
PLOT_HEADINGS = ('Synopsis', 'Plot', 'Sketches')


def extract_plot_section(page_text, headings=PLOT_HEADINGS):
    """Return the first plot-like section found in a Wikipedia page's text.

    Parameters
    ----------
    page_text : str
        Full page text, with section titles written as ``== Title ==``.
    headings : iterable of str
        Section titles to look for, in priority order.

    Returns
    -------
    str or None
        The text between the first matching ``== <heading>`` marker and the
        next top-level section (``'\\n== '``), with newlines and ``=``
        characters removed. ``None`` when no heading matches.
    """
    for heading in headings:
        # Detect AND split case-insensitively. Detecting on lowercased text but
        # splitting on the capitalised marker raises IndexError for pages whose
        # heading is cased differently (e.g. "== synopsis ==").
        pieces = re.split(re.escape('== ' + heading), page_text, flags=re.IGNORECASE)
        if len(pieces) < 2:
            continue
        # Stop at the next top-level heading; sub-headings ("=== Act I ===")
        # stay inside the section and lose their '=' markers below.
        section = pieces[1].split('\n== ')[0]
        return section.replace('\n', '').replace('=', '')
    return None


plot_summaries = []
no_summary = []

# Iterate titles and page text together so the pairing does not depend on the
# dataframe's index labels.
for title, page_text in zip(wiki['musical'], wiki['description']):
    section = extract_plot_section(page_text)
    if section is None:
        # Single-space placeholder keeps the list aligned with wiki's rows
        plot_summaries.append(' ')
        no_summary.append(title)
    else:
        plot_summaries.append(section)

print(len(plot_summaries))
print(no_summary)

196
['Finding Neverland', 'Chitty Chitty Bang Bang', 'Grey Gardens', 'Flower Drum Song', 'Tarzan of the Apes', "Ain't Misbehavin'", 'Fosse', "Movin' Out", 'Cyrano', "Dancin'"]


In [12]:
# Attach the extracted plot sections to the wiki dataframe as a new column
wiki = wiki.assign(plot_summary=plot_summaries)

In [13]:
# List the titles whose allmusicals.com synopsis is only the ' ' placeholder
has_blank_synopsis = allmusicals['synopsis'].eq(' ')
allmusicals.loc[has_blank_synopsis, 'musical'].values

array(['grease', 'beauty and the beast', 'cinderella', 'porgy and bess',
       'once', 'pajama game', 'bonnie & clyde', 'beetlejuice',
       'tuck everlasting', "dr. seuss' how the grinch stole christmas!",
       'oh! calcutta!', 'dogfight', 'twisted', 'shenandoah', 'coco',
       'cyrano', 'ballroom', 'the magic show', "dancin'"], dtype=object)

In [14]:
# Find musicals that have no plot text from either source.
# allmusicals titles are lowercase, so wiki titles are lowercased before lookup.
blank_on_allmusicals = set(allmusicals.loc[allmusicals['synopsis'].eq(' '), 'musical'])

remove_for_now = []
for musical in no_summary:
    if musical.lower() not in blank_on_allmusicals:
        continue
    remove_for_now.append(musical)
    print(f'{musical} has no synopsis.')

Cyrano has no synopsis.
Dancin' has no synopsis.


# Concatenate Data

In [15]:
# Give wiki's title column the name 'name' so it is distinct from
# allmusicals' 'musical' column after the frames are combined
wiki = wiki.rename(columns={'musical': 'name'})

In [16]:
# Combine dataframes
# Rows are paired by index label, not by title (the two sources case their
# titles differently). concat(axis=1) would silently pad mismatched labels
# with NaN, so fail loudly if the indexes are not identical.
assert wiki.index.equals(allmusicals.index), (
    'wiki and allmusicals must share the same index to be combined side by side'
)

# Keep wiki's title and extracted plot plus allmusicals' synopsis; drop the raw
# page text and the duplicate title column.
df = pd.concat([wiki, allmusicals], axis=1).drop(columns=['description', 'musical'])

In [17]:
# Drop the musicals that have no plot text from either source
has_some_text = ~df['name'].isin(remove_for_now)
df = df.loc[has_some_text]

In [18]:
# Renumber rows 0..n-1 after the removal, then preview the last rows
df = df.reset_index(drop=True)
df.tail()

Unnamed: 0,name,plot_summary,synopsis
189,Redhead,"In Victorian London, the plain Essie Whimple ...",The action of the performance takes place in V...
190,Passing Strange,summary Act I The Narrator introduces himsel...,Storyteller told viewers about a life story of...
191,New girl in town,"Anna, a former streetwalker recovering from t...",New York. The beginning of XX century. The cap...
192,Grind,"Summary The PrologueThe singers, dancers, com...",The story begins in the theater. Not ordinary ...
193,Big Deal,In Chicago in the 1930s a group of small-time...,The story of this musical is almost complete r...


# Export Data

In [19]:
# Write the combined dataset to disk without the index column
output_path = './data/musical_data.csv'
df.to_csv(output_path, index=False)