In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
import sklearn
import boto3
from s3 import get_file
from sklearn.decomposition import LatentDirichletAllocation

import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolor

# Import Data from Amazon S3 into DataFrame

In [2]:
# Open an S3 resource handle, then fetch the lyrics/audio-features CSV from
# the 's3ssp' bucket via the project-local get_file helper.
# NOTE(review): the returned value is handed straight to pd.read_csv in the
# next cell, so it is presumably a local path or file-like object -- confirm.
s3 = boto3.resource('s3')
lyrics = get_file(
    s3,
    's3ssp',
    download_file='NLP_Data/new_master_lyrics_audio_features.csv',
    rename_file='nlp.csv',
)

In [3]:
# Load the pipe-delimited CSV, then build the working frame: every row that
# contains a missing value is discarded, and the result is an independent copy
# so later column assignments never touch `df`.
df = pd.read_csv(lyrics, sep='|', encoding='utf-8')
df_demo = df.dropna().copy()

In [4]:
# print top words in a topic
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic and return them as a table.

    Parameters
    ----------
    model : object
        Fitted topic model exposing ``components_``, iterated row by row
        (one row of per-word weights per topic).
    feature_names : sequence of str
        Vocabulary, indexable by the column positions of ``components_``.
    n_top_words : int
        Number of words to report per topic.

    Returns
    -------
    pandas.DataFrame
        One row per topic (index named ``topic``) with a single ``top_words``
        column holding the space-joined words, strongest first.
    """
    top_words_per_topic = []
    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending, so walk it backwards to get the largest weights first.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        top_words = " ".join([feature_names[i] for i in top_indices])
        print("Topic #%d: " % topic_idx + top_words)
        top_words_per_topic.append(top_words)
    print()
    # Build the frame once, after the loop, so every topic is included.
    return pd.DataFrame(
        {"top_words": top_words_per_topic},
        index=pd.RangeIndex(len(top_words_per_topic), name="topic"),
    )

# Styling
def color_green(val):
    """Return a CSS ``color`` rule: green when ``val`` is above 0.1, black otherwise."""
    if val > .1:
        shade = 'green'
    else:
        shade = 'black'
    return 'color: {col}'.format(col=shade)
def make_bold(val):
    """Return a CSS ``font-weight`` rule: 700 when ``val`` is above 0.1, 400 otherwise."""
    if val > .1:
        thickness = 700
    else:
        thickness = 400
    return 'font-weight: {weight}'.format(weight=thickness)

In [5]:
# Remove more stopwords identified in pyLDAvis.
# Match whole words only (\b ... \b): a plain substring replace would also
# chew up ordinary words ("want" -> "t", "gone" -> "e", "universe" -> "uni").
# Matching is case-insensitive because the vectorizer lowercases the text
# afterwards, so e.g. "Chorus" would otherwise survive as the token "chorus".
extra_stopwords = ["wan", "chorus", "verse", "gon"]
extra_stopwords_pattern = r"\b(?:" + "|".join(extra_stopwords) + r")\b"
df_demo['new_lyrics'] = df_demo['lyrics'].str.replace(
    extra_stopwords_pattern, "", case=False, regex=True
)

In [6]:
# Create samples of increasing size.
# A fixed seed makes the draws repeatable, so the fitted vocabulary and topics
# are the same on every Restart & Run All (the LDA model itself is already seeded).
sample_seed = 0
df_test_one = df_demo.sample(1000, random_state=sample_seed)
df_test_two = df_demo.sample(3000, random_state=sample_seed)
df_test_three = df_demo.sample(5000, random_state=sample_seed)

In [7]:
# --- CountVectorizer hyperparameters ---
# Both are passed as floats, which scikit-learn treats as proportions of documents.
max_df = 0.5    # upper document-frequency cutoff
min_df = 0.005  # lower document-frequency cutoff

# --- LDA hyperparameters ---
n_topics = 30  # number of topics to fit
n_words = 10   # words reported per topic

In [8]:
# Bag-of-words vectorizer for the cleaned lyrics: word-level tokens, lowercased,
# English stop words removed, document-frequency cutoffs taken from the
# hyperparameter cell.
vectorizer = CountVectorizer(
    analyzer='word',
    lowercase=True,
    stop_words='english',
    token_pattern='[a-zA-Z0-9]{3,}',  # alphanumeric runs of at least 3 characters
    min_df=min_df,
    max_df=max_df,
)

# Learn the vocabulary and build the document-term matrix in one pass.
data_vectorized = vectorizer.fit_transform(df_test_two['new_lyrics'])

# Begin Topic Modeling

In [9]:
# Fit LDA on the document-term matrix using the online learning method.
# random_state is pinned so repeated fits on the same matrix are comparable.
lda_model = LatentDirichletAllocation(
    n_components=n_topics,
    doc_topic_prior=.02,
    learning_method="online",
    max_iter=35,
    random_state=0,
)
lda_model.fit(data_vectorized)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=0.02,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=35, mean_change_tol=0.001,
             n_components=30, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

# Visualize Topics Spatially

In [None]:
# Interactive pyLDAvis view of the fitted topics; the 2-D layout is
# produced with the 'tsne' multidimensional-scaling option.
pyLDAvis.sklearn.prepare(
    lda_model,
    data_vectorized,
    vectorizer,
    mds='tsne',
)

In [None]:
print("\nTopics in LDA model:")
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older installs.
if hasattr(vectorizer, "get_feature_names_out"):
    cv_feature_names = vectorizer.get_feature_names_out()
else:
    cv_feature_names = vectorizer.get_feature_names()
topics = print_top_words(lda_model, cv_feature_names, 30)
topics

In [None]:
# scikit-learn's LatentDirichletAllocation has no gensim-style show_topics(),
# so build the same structure by hand: a list of
# (topic_id, [(word, probability), ...]) with the n_words strongest words each.
# Unlike gensim's default, every topic is included.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = vectorizer.get_feature_names_out()
else:
    vocab = vectorizer.get_feature_names()

# Normalising each row of components_ gives that topic's word distribution.
topic_word_probs = lda_model.components_ / lda_model.components_.sum(axis=1, keepdims=True)

topics = [
    (
        topic_idx,
        [(vocab[i], float(word_probs[i])) for i in word_probs.argsort()[::-1][:n_words]],
    )
    for topic_idx, word_probs in enumerate(topic_word_probs)
]

# Calculate probabilities and find dominant topics for each document

In [None]:
# Create Document — Topic Matrix

# Per-document topic weights from the fitted model (one row per lyric).
lda_output = lda_model.transform(data_vectorized)

# column names
topicnames = ["Topic" + str(i) for i in range(lda_model.n_components)]

# index names: track URI with the row position appended
# (presumably to keep labels distinct if a URI repeats -- confirm)
docnames = [uri + str(i) for i, uri in enumerate(df_test_two['track_uri'])]

# Make the pandas dataframe (weights rounded to 2 decimals for display)
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

# Get dominant topic for each document.
# Use the unrounded weights: after rounding, near-equal topics tie and
# argmax would silently pick the lower-numbered one.
dominant_topic = np.argmax(lda_output, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

# Apply Style to the weight columns only; 'dominant_topic' holds integer topic
# ids, which the > .1 rules would otherwise paint green/bold for every id >= 1.
# The preview sample is seeded so the displayed rows are repeatable.
df_document_topics = (
    df_document_topic.sample(50, random_state=0)
    .style
    .applymap(color_green, subset=topicnames)
    .applymap(make_bold, subset=topicnames)
)
df_document_topics

In [None]:
# Rank documents by their Topic24 weight, strongest first, alongside
# each document's dominant topic.
df_document_topic.loc[:, ['Topic24', 'dominant_topic']].sort_values('Topic24', ascending=False)