# Capstone: Musical Recommender

Kelly Slatery | US-DSI-10

In [322]:
# Imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.cluster import DBSCAN, KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
import gensim
from gensim.models import Word2Vec
import time

import spacy

# Import Data

In [18]:
df = pd.read_csv('./data/musical_data.csv')
df.shape

(196, 3)

In [19]:
df.head()

Unnamed: 0,name,plot_summary,synopsis
0,Les Misérables,"Act I In 1815 France, prisoners work at hard...",The musical takes place at the beginning of th...
1,The Phantom of the Opera,"Prologue In 1911 Paris, the Paris Opéra host...",At the beginning of the XX century in the Pari...
2,Hamilton,The musical details Hamilton's life in two ac...,Can we call a thing more boring than the histo...
3,West Side Story,"Act 1 Two rival teenage gangs, the Jets (Whi...","The two gangs – Jets, consisting of white & Sh..."
4,Wicked,"Act I In the Land of Oz, the Ozians are rejo...",Good Witch Glinda of Oz country reported that ...


# Data Cleaning

In [20]:
# Strip all leading and trailing spaces
df['plot_summary'] = [summ.strip() for summ in df['plot_summary']]
df['synopsis'] = [synopsis.strip() for synopsis in df['synopsis']]
df.head()

Unnamed: 0,name,plot_summary,synopsis
0,Les Misérables,"Act I In 1815 France, prisoners work at hard l...",The musical takes place at the beginning of th...
1,The Phantom of the Opera,"Prologue In 1911 Paris, the Paris Opéra hosts ...",At the beginning of the XX century in the Pari...
2,Hamilton,The musical details Hamilton's life in two act...,Can we call a thing more boring than the histo...
3,West Side Story,"Act 1 Two rival teenage gangs, the Jets (White...","The two gangs – Jets, consisting of white & Sh..."
4,Wicked,"Act I In the Land of Oz, the Ozians are rejoic...",Good Witch Glinda of Oz country reported that ...


In [21]:
# Create a column combining both summaries/synopses
df['combined'] = df['plot_summary'] + df['synopsis']
df.head()

Unnamed: 0,name,plot_summary,synopsis,combined
0,Les Misérables,"Act I In 1815 France, prisoners work at hard l...",The musical takes place at the beginning of th...,"Act I In 1815 France, prisoners work at hard l..."
1,The Phantom of the Opera,"Prologue In 1911 Paris, the Paris Opéra hosts ...",At the beginning of the XX century in the Pari...,"Prologue In 1911 Paris, the Paris Opéra hosts ..."
2,Hamilton,The musical details Hamilton's life in two act...,Can we call a thing more boring than the histo...,The musical details Hamilton's life in two act...
3,West Side Story,"Act 1 Two rival teenage gangs, the Jets (White...","The two gangs – Jets, consisting of white & Sh...","Act 1 Two rival teenage gangs, the Jets (White..."
4,Wicked,"Act I In the Land of Oz, the Ozians are rejoic...",Good Witch Glinda of Oz country reported that ...,"Act I In the Land of Oz, the Ozians are rejoic..."


In [None]:
# Remove stop words
df['combined_no_sw'] = [summ.]

# Text Parsing

In [None]:
# Vectorize each doc w/ a word2vec (trained on that corpus from Matt I need to learn about)

In [317]:
# Credits: Code from GA 8.07-lesson-word-vectors by Matt Brems
# Start timer.
t0 = time.time()

# Import word vectors into "model"
model = gensim.models.KeyedVectors.load_word2vec_format('/Users/kelly/Code/data_files/lexvec.enwiki+newscrawl.300d.W.pos.vectors')

# Print results of timer.
print(time.time() - t0)

86.7591941356659


In [318]:
model.most_similar('john')

[('william', 0.6612670421600342),
 ('robert', 0.6352747678756714),
 ('richard', 0.6107084155082703),
 ('peter', 0.6051682233810425),
 ('james', 0.5959731340408325),
 ('charles', 0.5940901041030884),
 ('henry', 0.5927909016609192),
 ('edward', 0.5730682611465454),
 ('paul', 0.5618348717689514),
 ('george', 0.5575616359710693)]

In [327]:
# !python -m spacy download en_vectors_web_lg

Collecting en_vectors_web_lg==2.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_vectors_web_lg-2.1.0/en_vectors_web_lg-2.1.0.tar.gz (661.8 MB)
[K     |████████████████████████████████| 661.8 MB 12.2 MB/s eta 0:00:01     |██████████▏                     | 209.5 MB 8.9 MB/s eta 0:00:51
Building wheels for collected packages: en-vectors-web-lg
  Building wheel for en-vectors-web-lg (setup.py) ... [?25ldone
[?25h  Created wheel for en-vectors-web-lg: filename=en_vectors_web_lg-2.1.0-py3-none-any.whl size=663461747 sha256=5cd66ad62248bb51561d821e91ce5a28ce33e4c665b5bd4fd123b7c3379cd7c0
  Stored in directory: /private/var/folders/cj/k_q4ftbx1kx2ly31k5bth7880000gn/T/pip-ephem-wheel-cache-jtpufjxo/wheels/14/24/1b/61b40d65ed3ccd236ac714862d7ec3a999b99d495785563e09
Successfully built en-vectors-web-lg
Installing collected packages: en-vectors-web-lg
Successfully installed en-vectors-web-lg-2.1.0
[38;5;2m✔ Download and installation successful[0m
You can now l

In [337]:
import en_vectors_web_lg
nlp = en_vectors_web_lg.load()

In [338]:
# Credits: https://spacy.io/usage/vectors-similarity
first_summ = nlp(df['combined'][0])
second_summ = nlp(df['combined'][1])

In [339]:
first_summ.similarity(second_summ)

0.9832441745132257

# Word Mapping

In [None]:
# Do a bag of words FIRST to determine which words to map????????????

# Cluster Model: Alpha

#### CountVectorizer with KMeans Cluster Model

In [51]:
X = df['combined']

In [52]:
X_train, X_test = train_test_split(X, random_state=42)

In [53]:
# CountVectorizer
cvec = CountVectorizer(ngram_range=(1,2), max_features=500, stop_words='english')
cvec.fit(X)
X = cvec.transform(X)

In [54]:
X

<196x500 sparse matrix of type '<class 'numpy.int64'>'
	with 25750 stored elements in Compressed Sparse Row format>

In [55]:
km = KMeans()
X = km.fit_transform(X)

In [56]:
silhouette_score(X, km.labels_)

0.5822585544061012

In [57]:
df['labels'] = km.labels_

In [59]:
sorted(df['labels'].unique())

[0, 1, 2, 3, 4, 5, 6, 7]

In [63]:
df.loc[df['labels'] == 0, 'name'].count()

188

This is not a good model/transformer combination. Almost all musicals are in the same cluster.

#### TfidfVectorizer with KMeans Cluster Model

In [65]:
X = df['combined']

In [66]:
tvec = TfidfVectorizer(ngram_range=(1,2), max_features=500, stop_words='english')
tvec.fit(X)
X = tvec.transform(X)

In [67]:
km = KMeans()
X = km.fit_transform(X)

In [68]:
df['labels2'] = km.labels_

In [69]:
df.loc[df['labels2'] == 0, 'name'].count()

7

In [70]:
df.loc[df['labels2'] == 1, 'name'].count()

29

In [71]:
df.loc[df['labels2'] == 2, 'name'].count()

30

In [72]:
df.loc[df['labels2'] == 3, 'name'].count()

7

In [73]:
df.loc[df['labels2'] == 0, 'name']

25                 The Music Man
47                   Jersey Boys
75     Hedwig and the Angry Inch
109                    Brigadoon
111              The Who's Tommy
129            Shrek the Musical
172                    War Paint
Name: name, dtype: object

In [75]:
df.loc[df['labels2'] == 1, 'name']

2                                            Hamilton
6                                                Rent
9      Sweeney Todd: The Demon Barber of Fleet Street
14                                          Hairspray
19                                            Newsies
28                                           Avenue Q
29                                     In the Heights
36                                     Guys and Dolls
46                                      Hello, Dolly!
49                                               Hair
70                           Thoroughly Modern Millie
76                                     Come from Away
92                                     Porgy and Bess
98                                               1776
106                                              Once
110                                       On the Town
112                                     Be More Chill
115                               Catch Me If You Can
120                         

In [76]:
df.loc[df['labels2'] == 2, 'name']

0                      Les Misérables
7                       The Lion King
13                   Dear Evan Hansen
18                             Grease
30               Beauty and the Beast
31                     The King and I
40                        Miss Saigon
44             Gypsy: A Musical Fable
53                           Heathers
54                         Cinderella
58                           Spamalot
64                 The Little Mermaid
74                            Camelot
80     Sunday in the Park with George
87                            Aladdin
88                          Anastasia
90                    Victor/Victoria
107                   Jekyll and Hyde
113                           The Wiz
119                        Sister Act
121                           Candide
123              Once Upon a Mattress
125                            Frozen
169                          Dogfight
178                           Twisted
183                              Coco
186         

In [74]:
df.loc[df['labels2'] == 3, 'name']

39                          Kinky Boots
60                   Annie Get Your Gun
81     You're a Good Man, Charlie Brown
116                       Sweet Charity
132                         pajama game
147                              Parade
181                          Shenandoah
Name: name, dtype: object

In [77]:
df.loc[df['labels2'] == 4, 'name']

3                                 West Side Story
17                         Little Shop of Horrors
20                                   My Fair Lady
27                               Spring Awakening
37                                         Pippin
42                                 Next to Normal
45                                     Dreamgirls
51                                       Waitress
52                                        Ragtime
68                   School of Rock - the Musical
83                           A Little Night Music
85                                      Urinetown
86                               Sunset Boulevard
95     The 25th Annual Putnam County Spelling Bee
97                        The Pirates of Penzance
100                                  Damn Yankees
101                                American Idiot
108            Beautiful: the Carole King Musical
122                             Starlight Express
124                             A Christmas Carol


In [78]:
df.loc[df['labels2'] == 5, 'name']

4                                              Wicked
10                                 The Sound of Music
12                                     Into the Woods
23                                              Annie
26                                            Matilda
32                                       Mary Poppins
33       Joseph and the Amazing Technicolor Dreamcoat
35                                               Cats
48                                            Oliver!
56                                          Peter Pan
57                                      South Pacific
62                                   The Color Purple
89                                  The Secret Garden
94                            Chitty Chitty Bang Bang
96     A Funny Thing Happened on the Way to the Forum
105                                   The Fantasticks
114                                          Fun Home
118                   Seven Brides for Seven Brothers
126        Natasha Pierre an

In [304]:
X = df['combined']
X.head()

0    Act I In 1815 France, prisoners work at hard l...
1    Prologue In 1911 Paris, the Paris Opéra hosts ...
2    The musical details Hamilton's life in two act...
3    Act 1 Two rival teenage gangs, the Jets (White...
4    Act I In the Land of Oz, the Ozians are rejoic...
Name: combined, dtype: object

#### TfidfVectorizer with DBSCAN Model

In [305]:
tvec = TfidfVectorizer(ngram_range=(1,2), max_features=500, stop_words='english')
tvec.fit(X)
X = tvec.transform(X).toarray()

In [306]:
X.shape

(196, 500)

In [307]:
# ss = StandardScaler()
# X_scaled = ss.fit_transform(X)
# X_scaled.shape

In [308]:
db = DBSCAN(eps=1, min_samples=3)
db.fit(X);

In [309]:
db.components_

array([[0.        , 0.        , 0.        , ..., 0.01010462, 0.01148557,
        0.        ],
       [0.00639294, 0.00736011, 0.        , ..., 0.03321765, 0.04045431,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.02152187, 0.        ,
        0.        ]])

In [310]:
db.labels_

array([-1,  0,  0,  0, -1,  0,  0,  0, -1,  0,  0,  0, -1,  0,  0, -1,  0,
       -1,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,
       -1,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  0,
        1,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0, -1,  0, -1,  0,  0,
        0,  0, -1,  0,  0,  0, -1,  0,  0,  0, -1,  0,  0,  0, -1, -1,  0,
        0, -1,  0,  0,  0,  0,  0,  0, -1,  0, -1,  0, -1, -1,  0,  0,  0,
       -1,  0,  0,  0,  0, -1,  1,  0, -1, -1,  0,  0,  0,  0, -1, -1,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0, -1, -1,  0,  0,
       -1,  0,  0,  0,  0,  0,  0,  0, -1,  0, -1,  0,  0,  0,  0,  0,  0,
       -1, -1,  0, -1, -1, -1,  0, -1, -1])

In [311]:
df[df['labels3'] == -1].count()

name            176
plot_summary    176
synopsis        176
combined        176
labels          176
labels2         176
labels3         176
dtype: int64

In [312]:
df[df['labels3'] == 1].count()

name            4
plot_summary    4
synopsis        4
combined        4
labels          4
labels2         4
labels3         4
dtype: int64

In [313]:
df[df['labels3'] == 0].count()

name            4
plot_summary    4
synopsis        4
combined        4
labels          4
labels2         4
labels3         4
dtype: int64

In [285]:
df['labels3'] = db.labels_

In [286]:
df['labels3'].unique()

array([-1,  0,  1,  4,  2,  3,  5])

In [287]:
df[df['labels3'] == 1]

Unnamed: 0,name,plot_summary,synopsis,combined,labels,labels2,labels3
23,Annie,"Act 1 In 1933, eleven-year-old Annie is in the...",Act One Events take place during the Great De...,"Act 1 In 1933, eleven-year-old Annie is in the...",0,5,1
41,oklahoma,"Act I In Oklahoma Territory, in 1906, cowboy C...","1906, Oklahoma. Cowboy Curly was looking forwa...","Act I In Oklahoma Territory, in 1906, cowboy C...",0,7,1
60,Annie Get Your Gun,summary Act I When the traveling Buffalo Bill...,"Charlie Davenport, who is the organizer of the...",summary Act I When the traveling Buffalo Bill...,0,3,1
115,Catch Me If You Can,"Act I In the 1960s, Frank Abagnale Jr., a youn...",This work is a musical version of the movie wi...,"Act I In the 1960s, Frank Abagnale Jr., a youn...",0,1,1


In [314]:
silhouette_score(X_scaled, db.labels_)

-0.04716517540758456