# Capstone: Musical Recommender

Kelly Slatery | US-DSI-10

In [1]:
# Imports 
import numpy as np
import pandas as pd
import time
import requests
from bs4 import BeautifulSoup
import regex as re
import unicodedata

In [2]:
# Lift every pandas display limit so DataFrames (including the long synopsis
# text) are shown in full rather than truncated
for display_option in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(display_option, None)

# Import Data

In [3]:
# Load the list of musical titles and report its (rows, columns) shape
musical_names_csv = './data/musical_names.csv'
names = pd.read_csv(musical_names_csv)
names.shape

(196, 1)

In [4]:
# Preview the first few rows to confirm the titles loaded as expected
names.head()

Unnamed: 0,musical
0,Les Misérables
1,The Phantom of the Opera
2,Hamilton
3,West Side Story
4,Wicked


In [5]:
# Lowercase every title and correct the misspelling "suessical" -> "seussical"
names['musical'] = names['musical'].map(
    lambda title: title.lower().replace('suessical', 'seussical')
)
names.head()

Unnamed: 0,musical
0,les misérables
1,the phantom of the opera
2,hamilton
3,west side story
4,wicked


# Data Collection

In [6]:
# A synopsis URL is built as baseurl1 + <musical slug> + baseurl2,
# where the slug is the title reduced to letters and digits
baseurl1, baseurl2 = 'https://www.allmusicals.com/lyrics/', '/synopsis.htm'

In [7]:
# Collect one synopsis per musical from allmusicals.com.
# `synopses` receives exactly one entry per title (an empty string when nothing
# is found) so it stays aligned with the rows of `names`; `missed_musicals`
# records the cleaned titles that still need another source.
synopses = []
missed_musicals = []

# Seconds to wait on any single request before giving up instead of hanging
REQUEST_TIMEOUT = 30


def _clean_title(title):
    """Fold accents to ASCII and keep only letters, digits and spaces."""
    # Credits (unicodedata): @Mini Quark 02.05.2009, https://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
    # NFKD leaves plain-ASCII text untouched, so no separate "is it ASCII?" check is needed
    folded = unicodedata.normalize('NFKD', title).encode('ASCII', 'ignore').decode('ascii')
    return re.sub(r'[^a-zA-Z0-9 ]+', '', folded)


def _squash(text):
    """Remove everything that is not a letter or digit (including spaces)."""
    return re.sub('[^a-zA-Z0-9]', '', text)


def _candidate_slugs(og_name):
    """Return (base_slug, ordered_slugs) to try for a cleaned, non-empty title.

    `base_slug` is the squashed title used in the progress message;
    `ordered_slugs` lists every URL slug worth requesting, most likely first,
    with empty strings and duplicates removed.
    """
    words = og_name.split()
    first_word = words[0]
    compact = _squash(og_name)

    if first_word == 'the':
        # Titles starting with "The": try without the article, with it moved
        # to the end, unchanged, and with it repeated at the end
        base = compact
        primary = [compact[3:], compact[3:] + 'the', compact, compact + 'the']
    elif first_word == 'a':
        # Titles starting with "A": same idea as for "The"
        base = compact
        primary = [compact[1:], compact[1:] + 'a', compact, compact + 'a']
    elif 'the musical' in og_name:
        # Drop a "the musical" suffix/infix from the title
        base = _squash(og_name.replace('the musical', ''))
        primary = [base]
    else:
        base = compact
        primary = [base]

    # Fallback variants, tried only after every primary slug has failed.
    # NOTE(review): the first-word fallback could match an unrelated show
    # that happens to share that word - spot-check the scraped results.
    fallbacks = [
        first_word,
        ''.join(words[:-1]),
        base + 'themusical',
        base + 'musical',
        # Remove the whole word "and" only, never "and" inside another word
        _squash(re.sub(r'\band\b', '', og_name)),
    ]

    ordered = []
    for slug in primary + fallbacks:
        # An empty slug gives a meaningless URL; a repeated slug would only
        # repeat a request that has already failed
        if slug and slug not in ordered:
            ordered.append(slug)
    return base, ordered


def _first_successful_response(slugs):
    """Request each slug's synopsis page in order; return the first 200 response or None."""
    for slug in slugs:
        res = requests.get(baseurl1 + slug + baseurl2, timeout=REQUEST_TIMEOUT)
        if res.status_code == 200:
            return res
    return None


# Loop through all musicals in the list
for title in names['musical']:
    og_name = _clean_title(title)

    # A title with nothing left after cleaning has no slug to request
    if og_name.split():
        name, slugs = _candidate_slugs(og_name)
        res = _first_successful_response(slugs)
    else:
        name, res = '', None

    # For a successful lookup, locate the element holding the synopsis.
    # NOTE(review): no parser is named, so bs4 picks the best one installed;
    # pin one for reproducibility after confirming the [2:-2] slice below
    # still selects the same text.
    page = None
    if res is not None:
        soup = BeautifulSoup(res.content)
        page = soup.find('div', {'id': 'page'})

    # No page returned 200, or the page has no synopsis container
    if page is None:
        synopses.append('')
        missed_musicals.append(og_name)
        print(f'NOT LISTED: "{og_name}". Try another source like Wikipedia.')
        continue

    # Drop the first two and last two lines of the container's text
    # (presumably page boilerplate - verify) and join the rest into one string
    synopsis_html = page.text.replace('\r', '')
    synopsis_list = [words.strip() for words in synopsis_html.split('\n')[2:-2]]
    synopses.append(' '.join(synopsis_list).strip())
    print(f'Synopsis for "{name}" added to list. Moving on to the next musical...')

Synopsis for "lesmiserables" added to list. Moving on to the next musical...
Synopsis for "thephantomoftheopera" added to list. Moving on to the next musical...
Synopsis for "hamilton" added to list. Moving on to the next musical...
Synopsis for "westsidestory" added to list. Moving on to the next musical...
Synopsis for "wicked" added to list. Moving on to the next musical...
Synopsis for "chicago" added to list. Moving on to the next musical...
Synopsis for "rent" added to list. Moving on to the next musical...
Synopsis for "thelionking" added to list. Moving on to the next musical...
Synopsis for "thebookofmormon" added to list. Moving on to the next musical...
Synopsis for "sweeneytoddthedemonbarberoffleetstreet" added to list. Moving on to the next musical...
Synopsis for "thesoundofmusic" added to list. Moving on to the next musical...
Synopsis for "fiddlerontheroof" added to list. Moving on to the next musical...
Synopsis for "intothewoods" added to list. Moving on to the next m

Synopsis for "beautifulthecarolekingmusical" added to list. Moving on to the next musical...
Synopsis for "brigadoon" added to list. Moving on to the next musical...
Synopsis for "onthetown" added to list. Moving on to the next musical...
Synopsis for "thewhostommy" added to list. Moving on to the next musical...
Synopsis for "bemorechill" added to list. Moving on to the next musical...
Synopsis for "thewiz" added to list. Moving on to the next musical...
Synopsis for "funhome" added to list. Moving on to the next musical...
Synopsis for "catchmeifyoucan" added to list. Moving on to the next musical...
Synopsis for "sweetcharity" added to list. Moving on to the next musical...
Synopsis for "mame" added to list. Moving on to the next musical...
Synopsis for "sevenbridesforsevenbrothers" added to list. Moving on to the next musical...
Synopsis for "sisteract" added to list. Moving on to the next musical...
Synopsis for "shelovesme" added to list. Moving on to the next musical...
Synopsis

In [8]:
# Sanity check: the first line printed should be 196 (one synopsis slot per
# musical); the second line lists the titles that came back without a synopsis
print(len(synopses), missed_musicals, sep='\n')

196
['grease', 'beauty and the beast', 'cinderella', 'porgy and bess', 'once', 'pajama game', 'bonnie  clyde', 'beetlejuice', 'tuck everlasting', 'dr seuss how the grinch stole christmas', 'oh calcutta', 'dogfight', 'twisted', 'shenandoah', 'coco', 'cyrano', 'ballroom', 'the magic show', 'dancin']


In [9]:
# How many musicals were missed?
# (each of these titles received an empty-string synopsis in the scraping loop
# and needs another source)
len(missed_musicals)

19

# Export Data

In [10]:
# Add synopses to dataframe
# The assignment is positional: `synopses` holds one entry per row of `names`,
# in the same order, with an empty string for every title in `missed_musicals`.
names['synopsis'] = synopses
# Preview the first two rows
names.head(2)

Unnamed: 0,musical,synopsis
0,les misérables,"The musical takes place at the beginning of the XIX century. After 19 years of penal servitude, Jean Valjean is released by policeman Javert. He was arrested for theft of bread for his family. Every month he has to report to the lawyer. Jean isn't hired and despised. Once, he was sheltered by the archbishop. He treated him as brother. Jean Valjean, who still doesn't believe in love, steals all silver in the house and runs. In the morning, he is brought to the archbishop. The man is beat to semi-death. According to policemen, the man has told them that silver was presented him by the archbishop. He agrees with it and gives Jean two silver candlesticks, which Jean stores to his death. The man is so touched by care that he tears all his documents and begins new life. Jean Valjean sells all silver and becomes the mayor 8 years later. He is also an owner of the factory. Poor girl Fantine works on it. Fantine is exposed to harassment of the foreman and contempt of colleagues. She has a secret. A few years ago, the man has deceived her and has left. She became pregnant and has given birth to the daughter. The girl, Kozetta, grows at the innkeeper and his wife's family. They have their own daughter too. They cruelly manage with the girl. At the same time, the family writes to her mother and swears that she is often ill. Fantine sends them a lot of money and thinks that she rescues the daughter. Soon her secret is revealed by other workers. The mayor asks the foreman to resolve the conflict. Fantine is dismissed. Despaired woman sells her hair and teeth. She hopes to rescue her ""ill"" daughter. Then she becomes a prostitute. One night, when the client comes to her, she refuses to work. The man offends her and Fantine hits him. At this moment police officers appear, including Javert. Jean Valjean rescues Fantine from prison and carries in hospital. The woman asks him to take care of the daughter and dies. 
Javert understands that the mayor is Jean, and wants to put him in prison as he looked for fugitive for many years. Jean is escaped. At this time, Kozetta is sent to winter forest for water, where she is found by Valjean. He redeems the girl from innkeepers and asks the church gardener for a shelter for her. The girl lives in the monastery. Times of June revolt. In the center of a plot are the students who have started revolution. Little Gavroche, the son of impoverished innkeepers, helps them. Marius, one of the associates, notices Kozetta on the street and falls in love with her. The girl reciprocates the young man. By means of Eponine, who is also in love with Marius, he finds the house where lives Kozetta. They start to date in secret from everyone. Jeanne Valjean and Kozetta are forced to run because of Javert arrival. Gavroche brings them a note from barricades. To rescue Kozetta's beloved, the man goes to the heat of revolt. Just at this moment, students detain Javert. They allow Jean Valjean to kill him. He agrees, and doesn't kill Javert, but releases him on freedom. At this time on barricades, slaughter is started again. Gavroche and Eponina are killed during fight. When fight is resumed, everyone is in mourning. This time law enforcement authorities try to kill everyone. Jean Valjean rescues wounded Marius. During this he meets the innkeeper on the road (he has stolen a family ring from Marius). Valjean and half-dead Marius are awaited by Javert. He claims that he will arrest the man for all crimes, which Jean has made. However, he doesn't decide to shoot at the person, who has saved his life recently. The guard is exposed to remorse and commits suicide. Marius recovers. He gets married with Kozetta. Jean Valjean tells the guy his story and departs to the monastery to die. The innkeeper and his wife come to a wedding and tell Marius scurrilous things about Jean Valjean. They don't understand that they speak about the one who has rescued him. 
Marius and Kozetta run to the monastery, finding the last minutes of Jean's life. When he dies, he is met by Fantine, the archbishop and all dead on barricades. The last song sounds."
1,the phantom of the opera,"At the beginning of the XX century in the Paris Opera House was selling of old props. The most mysterious lot was broken chandelier. With its fall relates the story of the mysterious Phantom, who many years ago was the music patron of this place. Chandelier again rose to the ceiling of the theater & time turned back. It was 1881. In the hall was rehearsed the evening performance. Appeared Phantom so angered diva Carlotta that she categorically refused to act. New theater owners replaced the lead actress. The young chorus girl Christine knew a whole party of diva. She perfectly performed in the evening & very surprised new owners. After a triumphant debut of Christine, she told her friend about the mysterious music angel, who taught her singing. Soon, to her came a new sponsor of the theater Viscount & recognized a girl – in a childhood, they often played together. A girl told to a young man her deceased father’s telling, which included such music angel. A girl also admitted that this mysterious patron taught her singing. Laughed at her imagination, he invited her to dine. When the young man left the dressing room, in the mirror appeared Phantom. He took her into the mysterious world, which was located deep underground. Teaching his protégé singing, the patron brought the girl to his home & when she fainted, he gently laid her down on his bed. Waking up, she went to an angel & took off his mask to see the true face of her patron. Soon theater owners received a note from the Phantom. He demanded that the main role in the new opera performed Christine. Otherwise, the theater would suffer a terrible disaster. Owners calmed down furious diva, assuring her in her irreplaceability. During her performance, Phantom made so that the singer lost her voice. Trying to save the play, owners let the ballet on the stage. But then the audience saw the corpse under the ceiling, hanging on a rope & heard a sinister laugh of Phantom. 
Using the general turmoil, Christine took Mr. Chagny to the roof of the theater, where they explained to each other in love. Overhearing their conversation, Phantom has vowed to avenge to them two. After 6 months, Phantom visited Masquerade. He brought the owners a new musical creation & demanded a main part for Christine. Mrs. Giry told to viscount a story of Phantom. He was originally born with a disfigured face, the boy grew up in a mobile waxworks. When he managed to escape from there, the teenager found refuge in the theater. There he realized that his calling is to compose music. Viscount decided to catch the Phantom. Christine was choosing between her beloved & teacher. Visiting her father’s grave, the girl almost fell under the spell of Phantom, but she was rescued by viscount. During the premiere, Christine realized that her partner should be Phantom. When she tore off his mask, the audience saw the mutilated face of a man & he kidnapped a girl from the stage. Viscount was in search of his beloved, when Mrs. Giry showed a way to the dungeon, where Christine was forced to wear a wedding dress. Phantom soon caught the Viscount, who found a way to Phantom’s habitat. A man said that would let go Viscount, if only a girl stays with him. Otherwise her lover would die. Realizing feelings of Phantom, Christine kissed him. Having experienced the first time in life someone else's compassion, Phantom was shocked. He told the girl that he loves her, but she forced herself to turn away from her mentor. Christine & Viscount left. Closed by pallium, Phantom started to cry. Soon, in his lair broke angry mob. But under the pallium, people only found mask of a men – Phantom disappeared."


In [11]:
# Export new dataframe
# index=False leaves the row index out of the file, so the CSV contains only
# the `musical` and `synopsis` columns.
names.to_csv('./data/musical_synopses.csv', index=False)