# Capstone: Musical Recommender

Kelly Slatery | US-DSI-10

In [1]:
# Imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import spacy
import en_core_web_lg
from textblob import TextBlob, Word

# Import Data

In [2]:
df = pd.read_csv('./data/musical_sentiments.csv')
df.shape

(194, 7)

In [3]:
df.head()

Unnamed: 0,name,combined,clean_combined,vectors,km_labels,db_labels,sentiment
0,Les Misérables,"Act I In 1815 France, prisoners work at hard l...",France prisoner work hard labour work song yea...,[-2.55308696e-03 1.25066981e-01 -3.78664001e-...,1,0,0.031586
1,The Phantom of the Opera,"Prologue In 1911 Paris, the Paris Opéra hosts ...",prologue Paris Paris Opera host auction old th...,[ 4.53058779e-02 4.77313697e-02 -2.63329875e-...,3,0,0.001796
2,Hamilton,The musical details Hamilton's life in two act...,musical detail Hamilton life act historical ch...,[-6.79643005e-02 1.21977791e-01 1.33586982e-...,2,0,0.105111
3,West Side Story,"Act 1 Two rival teenage gangs, the Jets (White...",rival teenage gang jet White Americans Sharks ...,[-3.16287242e-02 6.83641210e-02 -4.60400395e-...,6,0,0.107123
4,Wicked,"Act I In the Land of Oz, the Ozians are rejoic...",Land Oz Ozians rejoice demise Elphaba Wicked W...,[ 1.18018501e-02 5.79135790e-02 -1.84583035e-...,3,0,0.151201


# Process Data

In [4]:
nlp = en_core_web_lg.load()

In [5]:
df['docs'] = [nlp(summ) for summ in df['clean_combined']]

# Export Data

In [6]:
df[['name', 'docs', 'sentiment']].to_csv()



# Build Recommender

In [7]:
# Define a function to take in user input and output a spaCy doc
def convert_to_doc(user_input):
    doc = nlp(user_input)
    return doc

In [8]:
# Define a function to take in user input and output its TextBlob sentiment
def get_sentiment(user_input):
    return TextBlob(user_input).sentiment.polarity

In [9]:
# Define a function to find each musical summary's sentiment and respective cosine similarity with the user input
def similarity_and_sentiments(user_input, df=df, summ_col='docs', sentiment_col='sentiment', name_col='name'):
    
    # Convert user input to a spaCy doc
    user_input_doc = convert_to_doc(user_input)
    
    # Calculate similarity with each musical and create dicitonary of similarities and sentiments
    similarity_dict = {}
    for i, summ in enumerate(df[summ_col]):
        sim = user_input_doc.similarity(summ)
        similarity_dict[sim] = [df[sentiment_col][i], df[name_col][i]]
    
    return similarity_dict

In [10]:
# Define a function to sort the list musical similarities and pull out top ten
def top_ten(similarity_dict):
    
    # Sort the musicals by similarity
    in_order = sorted(similarity_dict.items())
    
    # Consider only top 10 most similar musicals before including sentiment
    num_to_consider = 10
    
    # Pull out top 10 most similar musicals
    sentiment_list = []
    for i in range(num_to_consider):
        sentiment_list.append(in_order[-num_to_consider:][i][1])
    
    return sentiment_list

In [11]:
# Define a function to 
def get_recommendations(sentiment_list, user_input):
    
    # Calculate sentiment of user input
    user_sentiment = get_sentiment(user_input)
    
    # Pull out top 3 musicals with closest sentiment rating in either direction
    sentiment_differences = []
    for sentiment, musical in sentiment_list:
        diff = np.abs(user_sentiment - sentiment)
        sentiment_differences.append([diff, musical])
    
    # Extract musical names, in order
    top_three = sorted(sentiment_differences[:3])
    final_recommendations = []
    for sentiment, musical in top_three:
        final_recommendations.append(musical)
        
    return final_recommendations

In [12]:
# Define a function to take in user input and output 3 musical recommendations
def recommend(user_input):
    similarity_dict = similarity_and_sentiments(user_input)
    sentiment_list = top_ten(similarity_dict)
    final_recommendations = get_recommendations(sentiment_list, user_input)
    return final_recommendations

# Evaluate Recommender

#### Example 1

In [13]:
# Example prompt
prompt = """I feel kind of like trash. Today's been bad and my expectations haven't been met in a variety of ways. 
            I feel angered and honestly somewhat disrespected, and mostly in the mood to just relax and vent my
            emotions."""

In [14]:
# Get recommendations
recs_1 = recommend(prompt)
recs_1

["The Band's Visit", 'American Idiot', 'Rock of Ages']

In [15]:
# Look at recommended musicals' clusters
df.loc[df['name'].isin(recs_1), ['name', 'km_labels', 'db_labels']]

Unnamed: 0,name,km_labels,db_labels
101,American Idiot,4,0
135,Rock of Ages,5,0
139,The Band's Visit,5,0


#### Example 2

In [16]:
# Example prompt
prompt2 = '''I am feeling anxious but also excited. There is a lot of uncertainty in the world right now, 
with the coronavirus and the Trump presidency/election news. It's hard to feel settled. But we're
also doing fun and great things that are wonderful-- watching Kelly discover new skills and plan the next 
steps in her life, changing (drastically) the look of the home we've lived in for 17 years and planning 
exciting trips.
'''

In [17]:
# Get recommendations
recs_2 = recommend(prompt2)
recs_2

['Chitty Chitty Bang Bang', 'Come from Away', 'Urinetown']

In [18]:
# Look at recommended musicals' clusters
df.loc[df['name'].isin(recs_2), ['name', 'km_labels', 'db_labels']]

Unnamed: 0,name,km_labels,db_labels
76,Come from Away,5,0
85,Urinetown,5,0
94,Chitty Chitty Bang Bang,5,0


#### Example 3

In [19]:
# Example prompt
prompt3 = """I have no confidence in technology. I want to quit my job because of politics. 
My family thinks I am lazy at computers. I am tired of being disappointed in the country. I have no idea
what I will do in retirement. I am uncertain of the future and it makes me anxious.
"""

In [20]:
# Get recommendations
recs_3 = recommend(prompt3)
recs_3

['Urinetown', 'Next to Normal', 'The 25th Annual Putnam County Spelling Bee']

In [21]:
# Look at recommended musicals' clusters
df.loc[df['name'].isin(recs_3), ['name', 'km_labels', 'db_labels']]

Unnamed: 0,name,km_labels,db_labels
42,Next to Normal,4,0
85,Urinetown,5,0
95,The 25th Annual Putnam County Spelling Bee,5,0


As hoped, of the top three recommendations, at least 2 come from the same cluster from our earlier clustering models, which only clustered based on cosine similarity, without including any sentiment analysis. From my knowledge as a musical fan, these recommendations are extremely fitting, in terms of mood and topic.