# Capstone: Musical Recommender

Kelly Slatery | US-DSI-10

In [9]:
# !python -m spacy download en_vectors_web_lg

In [69]:
# Imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.cluster import DBSCAN, KMeans
from sklearn.metrics import silhouette_score

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

import gensim
from gensim.models import Word2Vec
import time

import spacy
import en_vectors_web_lg

# Import Data

In [52]:
df = pd.read_csv('./data/musical_data.csv')
df.shape

(194, 3)

In [53]:
df.head()

Unnamed: 0,name,plot_summary,synopsis
0,Les Misérables,"Act I In 1815 France, prisoners work at hard...",The musical takes place at the beginning of th...
1,The Phantom of the Opera,"Prologue In 1911 Paris, the Paris Opéra host...",At the beginning of the XX century in the Pari...
2,Hamilton,The musical details Hamilton's life in two ac...,Can we call a thing more boring than the histo...
3,West Side Story,"Act 1 Two rival teenage gangs, the Jets (Whi...","The two gangs – Jets, consisting of white & Sh..."
4,Wicked,"Act I In the Land of Oz, the Ozians are rejo...",Good Witch Glinda of Oz country reported that ...


In [54]:
df.isnull().sum()

name            0
plot_summary    0
synopsis        0
dtype: int64

In [56]:
df[df['plot_summary'] == ' '].count()

name            8
plot_summary    8
synopsis        8
dtype: int64

In [57]:
df[df['synopsis'] == ' '].count()

name            17
plot_summary    17
synopsis        17
dtype: int64

In [59]:
df[(df['plot_summary'] == ' ') | (df['synopsis'] == ' ')].count()

name            25
plot_summary    25
synopsis        25
dtype: int64

# Data Cleaning

In [65]:
# Strip all leading and trailing spaces
df['plot_summary'] = [summ.strip() for summ in df['plot_summary']]
df['synopsis'] = [synopsis.strip() for synopsis in df['synopsis']]
df.head()

Unnamed: 0,name,plot_summary,synopsis
0,Les Misérables,"Act I In 1815 France, prisoners work at hard l...",The musical takes place at the beginning of th...
1,The Phantom of the Opera,"Prologue In 1911 Paris, the Paris Opéra hosts ...",At the beginning of the XX century in the Pari...
2,Hamilton,The musical details Hamilton's life in two act...,Can we call a thing more boring than the histo...
3,West Side Story,"Act 1 Two rival teenage gangs, the Jets (White...","The two gangs – Jets, consisting of white & Sh..."
4,Wicked,"Act I In the Land of Oz, the Ozians are rejoic...",Good Witch Glinda of Oz country reported that ...


In [66]:
# Create a column combining both summaries/synopses
df['combined'] = df['plot_summary'] + df['synopsis']
df.head()

Unnamed: 0,name,plot_summary,synopsis,combined
0,Les Misérables,"Act I In 1815 France, prisoners work at hard l...",The musical takes place at the beginning of th...,"Act I In 1815 France, prisoners work at hard l..."
1,The Phantom of the Opera,"Prologue In 1911 Paris, the Paris Opéra hosts ...",At the beginning of the XX century in the Pari...,"Prologue In 1911 Paris, the Paris Opéra hosts ..."
2,Hamilton,The musical details Hamilton's life in two act...,Can we call a thing more boring than the histo...,The musical details Hamilton's life in two act...
3,West Side Story,"Act 1 Two rival teenage gangs, the Jets (White...","The two gangs – Jets, consisting of white & Sh...","Act 1 Two rival teenage gangs, the Jets (White..."
4,Wicked,"Act I In the Land of Oz, the Ozians are rejoic...",Good Witch Glinda of Oz country reported that ...,"Act I In the Land of Oz, the Ozians are rejoic..."


# Trial Text Parsing

In [7]:
# Credits: Code from GA 8.07-lesson-word-vectors by Matt Brems
# Start timer.
t0 = time.time()

# Import word vectors into "model"
model = gensim.models.KeyedVectors.load_word2vec_format('/Users/kelly/Code/data_files/lexvec.enwiki+newscrawl.300d.W.pos.vectors')

# Print results of timer.
print(time.time() - t0)

83.64822506904602


In [11]:
# Load Spacy's 300-dimensional GloVe vectors for over 1 million terms of English
# Credits: https://spacy.io/usage/vectors-similarity
nlp = en_vectors_web_lg.load()

In [14]:
# Explore Les Miserables vs. Phantom of the Opera similarity (should be quite similar)
# Credits: https://spacy.io/usage/vectors-similarity
first_summ = nlp(df['combined'][0])
second_summ = nlp(df['combined'][1])
first_summ.similarity(second_summ)

0.9832441745132257

In [51]:
# Look at the beginningof the vectorized form of the first summary
print(len(first_summ))
first_summ.vector[:50]

3845


array([ 1.3268165e-02,  1.7136703e-01, -9.3956098e-02, -4.3228902e-02,
        5.9436657e-02, -7.8738332e-03,  6.1197202e-03, -9.8434336e-02,
       -5.4305557e-02,  2.0646520e+00, -1.3217980e-01,  5.7758037e-02,
        2.3977818e-02, -7.6769933e-02, -1.4613271e-01, -1.0162838e-03,
       -6.8620749e-02,  7.1850091e-01, -1.2066544e-01, -2.2263097e-02,
       -1.6781308e-02, -6.0979396e-02, -6.7438938e-02, -1.2512242e-02,
        3.9138198e-03,  2.5537251e-02, -6.4160138e-02, -2.5191054e-02,
       -1.7199093e-02, -5.6538537e-02, -4.8680976e-02,  4.2920690e-02,
       -1.1270118e-01,  1.0742278e-01,  1.0010578e-01, -1.0242986e-01,
       -1.7652318e-02, -4.0684219e-02, -6.9165886e-03, -1.1480066e-02,
        1.9567847e-02,  1.8294971e-02,  3.6581561e-02, -9.3184873e-02,
        3.1907316e-02,  4.9237642e-03, -9.8600358e-02,  2.8881010e-02,
       -8.9285700e-03,  1.7194759e-02], dtype=float32)

In [41]:
# Explore Les Miserables vs. all other synopses' similarities (should not all be as high)
similarities = {}
lesmis = nlp(df['combined'][0])
for i, summ in enumerate(df['combined']):
    summ = nlp(summ)
    similarity = lesmis.similarity(summ)
    similarities[df['name'][i]] = similarity

In [42]:
# Look at all similarities under .95
[print(key, value) for key, value in similarities.items() if value < .95];

Fosse 0.94777400073351
Coco 0.9419108036631528


In [48]:
# Look at top 5 most similar musicals to Les Miserables
[print(key, value) for key, value in similarities.items() if value > .987];

Les Misérables 1.0
Spring Awakening 0.9887492997502907
Pippin 0.9873893923835498
Ragtime 0.9879187386125562
Carousel 0.987595750828035
Man of La Mancha 0.9872949756419365
Assassins 0.9877956099148062
A Funny Thing Happened on the Way to the Forum 0.9870503570441587
American Idiot 0.9877180639810607
A Gentleman's Guide to Love and Murder 0.9882046842230645
Aida 0.9883073948730837
The Scarlet Pimpernel 0.9872025528294902


Similarities are most likely all so high because stop words have not been removed yet. Because so many irrelevant words are being factored in, document similarity is being assessed more based on writing style than content. Remove stop words and look again.

# Data Cleaning: Stop Words (nltk)

The downsides to using this method to remove stopwords are (1) we didn't define the stopwords, so some more important words might be removed, and at the same time, this corpus might not be extensive enough, and (2) to remove stopwords, we have to tokenize our summaries, or break them into lists, which takes away from SpaCy's capability to classify them later on.

In [62]:
# Import nltk stopwords
from nltk.corpus import stopwords

In [78]:
# Remove all punctuation and split into tokens
tokenizer = RegexpTokenizer(r'\w+')
tokenized_summs = []
for i in range(len(df)):
    tokenized_summ = tokenizer.tokenize(df['combined'][i].lower())
    tokenized_summ = [token for token in tokenized_summ if token.isalpha()]
    tokenized_summs.append(tokenized_summ)
tokenized_summs[0][:10]

['act',
 'i',
 'in',
 'france',
 'prisoners',
 'work',
 'at',
 'hard',
 'labour',
 'work']

In [79]:
# Remove nltk stopwords
tokenized_no_sw = []
for i in range(len(tokenized_summs)):
    no_sw = [token for token in tokenized_summs[i] if token not in stopwords.words('english')]
    tokenized_no_sw.append(no_sw) 
tokenized_no_sw[0][:10]

['act',
 'france',
 'prisoners',
 'work',
 'hard',
 'labour',
 'work',
 'song',
 'years',
 'prison']

In [80]:
# Create a new column of tokenized summaries with no stopwords
df['tokenized_no_sw'] = tokenized_no_sw

In [86]:
# Explore Les Miserables vs. all synopses' similarities again
similarities_tksw = {}
lesmis_tksw = nlp(' '.join(df['tokenized_no_sw'][0]))
for i, summ in enumerate(df['tokenized_no_sw']):
    string = nlp(' '.join(summ))
    similarity = lesmis_tksw.similarity(string)
    similarities_tksw[df['name'][i]] = similarity

In [89]:
# Look at all similarities under .95
[print(key, value) for key, value in similarities_tksw.items() if value < .8];

Fosse 0.7839886959067401
Coco 0.790898964945354


Still the same two musicals with the lowest similarities. Let's see if the same musicals have the highest similarities.

Les Misérables 1.0
Spring Awakening 0.9887492997502907
Pippin 0.9873893923835498
Ragtime 0.9879187386125562
Carousel 0.987595750828035
Man of La Mancha 0.9872949756419365
Assassins 0.9877956099148062
A Funny Thing Happened on the Way to the Forum 0.9870503570441587
American Idiot 0.9877180639810607
A Gentleman's Guide to Love and Murder 0.9882046842230645
Aida 0.9883073948730837
The Scarlet Pimpernel 0.9872025528294902

In [94]:
# Look at top 5 most similar musicals to Les Miserables
[print(key, value) for key, value in similarities_tksw.items() if value > .92];

Les Misérables 1.0
Spring Awakening 0.933411306341679
Pippin 0.9230158656172904
Ragtime 0.9262608863317805
South Pacific 0.9249653063763591
Funny Girl 0.9200188753512265
Carousel 0.9269278724067395
Man of La Mancha 0.9212773122273784
A Little Night Music 0.9211707705593006
Assassins 0.925454081155838
A Funny Thing Happened on the Way to the Forum 0.921945917071289
The Pirates of Penzance 0.9262916518822016
American Idiot 0.9232707773437321
A Gentleman's Guide to Love and Murder 0.9282633370178898
Candide 0.9365248083161198
Aida 0.9315519171844839
The Light in the Piazza 0.9201185261115328
Passion 0.9241247874683887
Kiss of the Spider Woman 0.9208534590925479
Parade 0.9204888750890989
The Scarlet Pimpernel 0.9247059425897945


There is bit more spread in these similarities scores, which reflects perhaps a bit more of the differences between the musicals. There was also a bit of shuffling, and "American Idiot" was moved lower down the list, which seems appropriate, but I am still surprised to see it so high. Next, try removing all words that are not adjective, adverbs, verbs, or nouns with spaCy.

# Data Cleaning: Stop Words (spaCy)

In [None]:
# Code to base pulling important words out of the doc on
Loop over each token in the doc and check the token.pos_ attribute.
Use doc[token.i + 1] to check for the next token and its .pos_ attribute.
If a proper noun before a verb is found, print its token.text.

In [None]:
# Vectorize each doc w/ a word2vec (trained on that corpus from Matt I need to learn about)

# Word Mapping

In [None]:
# Do a bag of words FIRST to determine which words to map????????????

# Cluster Model: Alpha

#### CountVectorizer with KMeans Cluster Model

In [51]:
X = df['combined']

In [52]:
X_train, X_test = train_test_split(X, random_state=42)

In [53]:
# CountVectorizer
cvec = CountVectorizer(ngram_range=(1,2), max_features=500, stop_words='english')
cvec.fit(X)
X = cvec.transform(X)

In [54]:
X

<196x500 sparse matrix of type '<class 'numpy.int64'>'
	with 25750 stored elements in Compressed Sparse Row format>

In [55]:
km = KMeans()
X = km.fit_transform(X)

In [56]:
silhouette_score(X, km.labels_)

0.5822585544061012

In [57]:
df['labels'] = km.labels_

In [59]:
sorted(df['labels'].unique())

[0, 1, 2, 3, 4, 5, 6, 7]

In [63]:
df.loc[df['labels'] == 0, 'name'].count()

188

This is not a good model/transformer combination. Almost all musicals are in the same cluster.

#### TfidfVectorizer with KMeans Cluster Model

In [65]:
X = df['combined']

In [66]:
tvec = TfidfVectorizer(ngram_range=(1,2), max_features=500, stop_words='english')
tvec.fit(X)
X = tvec.transform(X)

In [67]:
km = KMeans()
X = km.fit_transform(X)

In [68]:
df['labels2'] = km.labels_

In [69]:
df.loc[df['labels2'] == 0, 'name'].count()

7

In [70]:
df.loc[df['labels2'] == 1, 'name'].count()

29

In [71]:
df.loc[df['labels2'] == 2, 'name'].count()

30

In [72]:
df.loc[df['labels2'] == 3, 'name'].count()

7

In [73]:
df.loc[df['labels2'] == 0, 'name']

25                 The Music Man
47                   Jersey Boys
75     Hedwig and the Angry Inch
109                    Brigadoon
111              The Who's Tommy
129            Shrek the Musical
172                    War Paint
Name: name, dtype: object

In [75]:
df.loc[df['labels2'] == 1, 'name']

2                                            Hamilton
6                                                Rent
9      Sweeney Todd: The Demon Barber of Fleet Street
14                                          Hairspray
19                                            Newsies
28                                           Avenue Q
29                                     In the Heights
36                                     Guys and Dolls
46                                      Hello, Dolly!
49                                               Hair
70                           Thoroughly Modern Millie
76                                     Come from Away
92                                     Porgy and Bess
98                                               1776
106                                              Once
110                                       On the Town
112                                     Be More Chill
115                               Catch Me If You Can
120                         

In [76]:
df.loc[df['labels2'] == 2, 'name']

0                      Les Misérables
7                       The Lion King
13                   Dear Evan Hansen
18                             Grease
30               Beauty and the Beast
31                     The King and I
40                        Miss Saigon
44             Gypsy: A Musical Fable
53                           Heathers
54                         Cinderella
58                           Spamalot
64                 The Little Mermaid
74                            Camelot
80     Sunday in the Park with George
87                            Aladdin
88                          Anastasia
90                    Victor/Victoria
107                   Jekyll and Hyde
113                           The Wiz
119                        Sister Act
121                           Candide
123              Once Upon a Mattress
125                            Frozen
169                          Dogfight
178                           Twisted
183                              Coco
186         

In [74]:
df.loc[df['labels2'] == 3, 'name']

39                          Kinky Boots
60                   Annie Get Your Gun
81     You're a Good Man, Charlie Brown
116                       Sweet Charity
132                         pajama game
147                              Parade
181                          Shenandoah
Name: name, dtype: object

In [77]:
df.loc[df['labels2'] == 4, 'name']

3                                 West Side Story
17                         Little Shop of Horrors
20                                   My Fair Lady
27                               Spring Awakening
37                                         Pippin
42                                 Next to Normal
45                                     Dreamgirls
51                                       Waitress
52                                        Ragtime
68                   School of Rock - the Musical
83                           A Little Night Music
85                                      Urinetown
86                               Sunset Boulevard
95     The 25th Annual Putnam County Spelling Bee
97                        The Pirates of Penzance
100                                  Damn Yankees
101                                American Idiot
108            Beautiful: the Carole King Musical
122                             Starlight Express
124                             A Christmas Carol


In [78]:
df.loc[df['labels2'] == 5, 'name']

4                                              Wicked
10                                 The Sound of Music
12                                     Into the Woods
23                                              Annie
26                                            Matilda
32                                       Mary Poppins
33       Joseph and the Amazing Technicolor Dreamcoat
35                                               Cats
48                                            Oliver!
56                                          Peter Pan
57                                      South Pacific
62                                   The Color Purple
89                                  The Secret Garden
94                            Chitty Chitty Bang Bang
96     A Funny Thing Happened on the Way to the Forum
105                                   The Fantasticks
114                                          Fun Home
118                   Seven Brides for Seven Brothers
126        Natasha Pierre an

In [304]:
X = df['combined']
X.head()

0    Act I In 1815 France, prisoners work at hard l...
1    Prologue In 1911 Paris, the Paris Opéra hosts ...
2    The musical details Hamilton's life in two act...
3    Act 1 Two rival teenage gangs, the Jets (White...
4    Act I In the Land of Oz, the Ozians are rejoic...
Name: combined, dtype: object

#### TfidfVectorizer with DBSCAN Model

In [305]:
tvec = TfidfVectorizer(ngram_range=(1,2), max_features=500, stop_words='english')
tvec.fit(X)
X = tvec.transform(X).toarray()

In [306]:
X.shape

(196, 500)

In [307]:
# ss = StandardScaler()
# X_scaled = ss.fit_transform(X)
# X_scaled.shape

In [308]:
db = DBSCAN(eps=1, min_samples=3)
db.fit(X);

In [309]:
db.components_

array([[0.        , 0.        , 0.        , ..., 0.01010462, 0.01148557,
        0.        ],
       [0.00639294, 0.00736011, 0.        , ..., 0.03321765, 0.04045431,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.02152187, 0.        ,
        0.        ]])

In [310]:
db.labels_

array([-1,  0,  0,  0, -1,  0,  0,  0, -1,  0,  0,  0, -1,  0,  0, -1,  0,
       -1,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,
       -1,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  0,
        1,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0, -1,  0, -1,  0,  0,
        0,  0, -1,  0,  0,  0, -1,  0,  0,  0, -1,  0,  0,  0, -1, -1,  0,
        0, -1,  0,  0,  0,  0,  0,  0, -1,  0, -1,  0, -1, -1,  0,  0,  0,
       -1,  0,  0,  0,  0, -1,  1,  0, -1, -1,  0,  0,  0,  0, -1, -1,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0, -1, -1,  0,  0,
       -1,  0,  0,  0,  0,  0,  0,  0, -1,  0, -1,  0,  0,  0,  0,  0,  0,
       -1, -1,  0, -1, -1, -1,  0, -1, -1])

In [311]:
df[df['labels3'] == -1].count()

name            176
plot_summary    176
synopsis        176
combined        176
labels          176
labels2         176
labels3         176
dtype: int64

In [312]:
df[df['labels3'] == 1].count()

name            4
plot_summary    4
synopsis        4
combined        4
labels          4
labels2         4
labels3         4
dtype: int64

In [313]:
df[df['labels3'] == 0].count()

name            4
plot_summary    4
synopsis        4
combined        4
labels          4
labels2         4
labels3         4
dtype: int64

In [285]:
df['labels3'] = db.labels_

In [286]:
df['labels3'].unique()

array([-1,  0,  1,  4,  2,  3,  5])

In [287]:
df[df['labels3'] == 1]

Unnamed: 0,name,plot_summary,synopsis,combined,labels,labels2,labels3
23,Annie,"Act 1 In 1933, eleven-year-old Annie is in the...",Act One Events take place during the Great De...,"Act 1 In 1933, eleven-year-old Annie is in the...",0,5,1
41,oklahoma,"Act I In Oklahoma Territory, in 1906, cowboy C...","1906, Oklahoma. Cowboy Curly was looking forwa...","Act I In Oklahoma Territory, in 1906, cowboy C...",0,7,1
60,Annie Get Your Gun,summary Act I When the traveling Buffalo Bill...,"Charlie Davenport, who is the organizer of the...",summary Act I When the traveling Buffalo Bill...,0,3,1
115,Catch Me If You Can,"Act I In the 1960s, Frank Abagnale Jr., a youn...",This work is a musical version of the movie wi...,"Act I In the 1960s, Frank Abagnale Jr., a youn...",0,1,1


In [314]:
silhouette_score(X_scaled, db.labels_)

-0.04716517540758456