In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
import sklearn
import boto3
from s3 import get_file
from sklearn.decomposition import LatentDirichletAllocation

import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolor

# Import Data from Amazon S3 into DataFrame

In [2]:
# connect to Amazon S3
s3 = boto3.resource('s3')
lyrics = get_file(s3,'s3ssp', download_file='NLP_Data/new_master_lyrics_audio_features.csv',rename_file='nlp.csv')

In [3]:
# create a pandas dataframe and drop 'na'
df = pd.read_csv(lyrics,sep='|',encoding='utf-8')
df_demo = df.copy().dropna()

In [4]:
# print top words in a topic
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

# Styling
def color_green(val):
 color = 'green' if val > .1 else 'black'
 return 'color: {col}'.format(col=color)
def make_bold(val):
 weight = 700 if val > .1 else 400
 return 'font-weight: {weight}'.format(weight=weight)

In [5]:
# Remove more stopwords identified in pyLDAvis
df_demo['new_lyrics'] = df_demo['lyrics'].apply(lambda x: x.replace("wan", "")
                                                .replace("chorus", "")
                                                .replace("verse", "")
                                                .replace("gon", ""))

In [6]:
#create samples
df_test_one = df_demo.sample(1000)
df_test_two = df_demo.sample(3000)
df_test_three = df_demo.sample(5000)

In [7]:
#CountVectorizer hyperparameters 
max_df = .5
min_df = .005 

#LDA hyperparameters
n_topics = 30
n_words=10

In [8]:
#Fit CountVectorizer on lyrics data
vectorizer = CountVectorizer(analyzer='word',
                             min_df=min_df, max_df=max_df,     
                             stop_words='english',             
                             lowercase=True,                   
                             token_pattern='[a-zA-Z0-9]{3,}')  # num chars > 3  

data_vectorized = vectorizer.fit_transform(df_test_two['new_lyrics'])

# Begin Topic Modeling

In [9]:
# lda model
lda_model = LatentDirichletAllocation(n_components=n_topics, learning_method="online",
                                max_iter=35, random_state=0, doc_topic_prior=.01)
lda_model.fit(data_vectorized)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=0.01,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=35, mean_change_tol=0.001,
             n_components=30, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

# Visualize Topics Spatially

In [10]:
# visualize n_topics spatially
pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='tsne')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [11]:
print("\nTopics in LDA model:")
cv_feature_names = vectorizer.get_feature_names()
print_top_words(lda_model, cv_feature_names, 30)


Topics in LDA model:
Topic #0: girl time talk bad sit guess friend watch road stay wrong pretty walk crazy day play reason care lot fine fun win leave woman eye remember read catch spend wear
Topic #1: fly settle apple snake leave stack ooh yeah singe rock glass worry touch silver roll ball sing hook plane guilty damn fuck dead teach feel king crowd stupid music life
Topic #2: night tonight dance ready rock party alive fight slow jump rhythm building wave floor revolution play thrill hit beat foot monkey hand house shut smile safe rescue spot time step
Topic #3: day life leave hard win bring late forget water wrong reach walk trouble fool climb hang meaning bell nightmare seek decide wise ready hold numb row boat shout ground meet
Topic #4: ooh hand breathe throw bang air shoot leave window woah play foot worth piece cross boom tall scared fake start truck game playin feel lean bare tired stand trigger finger
Topic #5: money shit nigga fuck bitch hit bout niggas baby light stick ass l

# Calculate probabilities and find dominant topics for each document

In [12]:
# Create Document — Topic Matrix

lda_output = lda_model.transform(data_vectorized)

# column names
topicnames = ["Topic" + str(i) for i in range(lda_model.n_components)]

# index names
docnames = [df_test_two['track_uri'].iloc[i] + str(i) for i in range(len(df_test_two))]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

# Apply Style
df_document_topics = df_document_topic.sample(50).style.applymap(color_green).applymap(make_bold)
df_document_topics

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,Topic10,Topic11,Topic12,Topic13,Topic14,Topic15,Topic16,Topic17,Topic18,Topic19,Topic20,Topic21,Topic22,Topic23,Topic24,Topic25,Topic26,Topic27,Topic28,Topic29,dominant_topic
4Dr8B9i6gjLdrSgRvlPXS7980,0.0,0.0,0.0,0.0,0.0,0.0,0.12,0.0,0.16,0.06,0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.11,0.03,0.0,0.0,0.03,0.23,0.12,0.0,0.03,0.0,0.0,0.04,23
46DFYMR7YxdJp5Q0EBkQkU244,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.16,0.51,0.0,0.0,0.32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10
4vw2uB56uXifTBnUDSMg7M2793,0.14,0.0,0.0,0.0,0.03,0.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.32,0.0,0.0,0.0,0.1,0.0,0.0,0.03,0.0,0.0,0.0,0.03,0.0,0.0,0.0,0.0,0.04,13
4mZoSnOIowHkMuE1Rwp5ib2503,0.0,0.0,0.0,0.0,0.0,0.0,0.06,0.0,0.0,0.43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07,0.0,0.07,0.0,0.19,0.0,0.06,0.0,0.0,0.0,0.02,0.08,9
602IvuunA8AQh0Myh590hH2734,0.13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.5,0.0,0.0,0.0,0.24,0.0,0.0,0.0,0.0,0.0,0.0,0.08,0.02,0.0,0.0,0.02,0.0,0.0,12
1tpuwDG1xSp8majndRFw2b1523,0.0,0.0,0.23,0.0,0.0,0.0,0.11,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.21,0.2,0.0,0.0,0.04,0.06,0.03,0.0,0.07,0.0,0.0,0.0,2
4UqNCygrRNbVgRyvITFRKv2786,0.0,0.0,0.0,0.09,0.0,0.0,0.11,0.0,0.03,0.26,0.0,0.12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03,0.31,29
4Wv7aG6kbHLbo2h218dChl978,0.0,0.0,0.08,0.08,0.0,0.33,0.0,0.03,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.09,0.0,0.0,0.03,0.0,0.0,0.05,0.0,0.0,0.24,0.05,0.0,0.0,5
7dSl3ckwDMJKSO9sfQPMz875,0.0,0.0,0.47,0.0,0.09,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,0.0,2
65wGpHrOOUQVqLSx4ZS34T870,0.17,0.02,0.0,0.02,0.0,0.0,0.08,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.06,0.23,0.1,0.2,0.0,0.0,0.0,0.0,0.04,0.06,0.0,0.0,0.0,0.0,0.0,16


In [13]:
# print documents by topic in order of probability
df_document_topic[['Topic1', 'dominant_topic']].sort_values(by='Topic1', ascending=False)

Unnamed: 0,Topic1,dominant_topic
7cg2YGhg1k6QoQdychuM3q1269,0.19,19
4fTg5dNKrfzuPs0oLku1Vl1755,0.19,5
659lYJRBHCq8cCKdxA2Z3f1409,0.15,9
4aUOG1Z1mDUajNQyIYUhfo2709,0.14,29
60tDcE1FnNDNgw0ykNYQQr58,0.14,4
7aokPpwjDPms1bKMWgKINz371,0.14,9
6kJeV9qTonZPGLijch4xzw2228,0.13,17
3TUzSZ8PZnLSiXMu5fsTNs783,0.13,6
2h02Nvz0YSL4usgTkgECVE161,0.12,6
5v2fnu64RUm3T4Ej17DeVk391,0.12,10
