In [1]:
import pandas as pd
import numpy as np
import re
import nltk

In [2]:
songs = pd.read_csv('/Users/yanzihao/Downloads/finaldataset.csv')

# Song Title Preprocessing

In [3]:
song_title = songs['title']

In [4]:
list_rows = []
for index, row in song_title.iteritems():
    # split into words by white space
    words = row.split()
    
    # remove punctuation from each word
    import string
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in words]
    
    # convert to lower case
    stripped = [word.lower() for word in stripped]
    
    # Filter out stop words
    from nltk.corpus import stopwords
    stop_words = set(stopwords.words('english')) 
    removed_stop_words = [w for w in stripped if not w in stop_words]
    
    # remove remaining tokens that are not alphabetic
    words = [word for word in removed_stop_words if word.isalpha()]
    
    # Stemming of words
    from nltk.stem.porter import PorterStemmer
    porter = PorterStemmer()
    stemmed = [porter.stem(word) for word in words]
    list_rows.append(stemmed)

In [5]:
songs['clean_title'] = list_rows
songs

Unnamed: 0,duration,end_of_fade_in,key,key_confidence,loudness,mode,mode_confidence,start_of_fade_out,tempo,time_signature,...,artist_longitude,artist_name,album_name,album_id,song_hotness_num,song_id,title,year,song_hotness,clean_title
0,138.97098,0.000,7,0.418,-2.060,1,0.566,138.971,177.768,4,...,-4.24251,Hudson Mohawke,Butter,625706,0.617871,SOGTUKN12AB017F4F1,No One Could Ever,2006,1.0,"[one, could, ever]"
1,196.02240,0.000,8,0.195,-6.366,1,0.146,185.202,189.346,7,...,-75.92381,Dying Fetus,Descend Into Depravity,610151,0.614766,SOKOVRQ12A8C142811,Ethos of Coercion,2009,1.0,"[etho, coercion]"
2,217.57342,0.514,3,0.770,-4.867,0,0.417,214.309,88.423,4,...,,Emery,I'm Only A Man (Bonus Track Version),143873,0.717319,SOIMMJJ12AF72AD643,Rock-N-Rule,2007,2.0,[rocknrul]
3,155.19302,0.084,10,0.510,-13.588,1,0.586,146.332,67.118,5,...,0.48883,Frank Chacksfield,Classic Years,238993,0.460485,SOYKVON12A8C14097E,Cockleshell Heroes,2007,1.0,"[cockleshel, hero]"
4,181.81179,7.982,1,0.000,-6.844,0,0.559,176.385,119.919,4,...,-84.19444,Hawthorne Heights,Fragile Future,316457,0.358977,SOCPTIN12A8C14265F,Disaster [Demo Version],2008,1.0,"[disast, demo, version]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111751,266.63138,0.194,1,0.293,-4.077,1,0.000,264.191,120.119,4,...,12.56935,Hatesphere,Serpent Smiles And Killer Eyes,492330,0.501117,SOFGQKB12AB017D895,Feeding The Demons,2007,1.0,"[feed, demon]"
111752,180.66240,0.177,10,0.125,-5.159,0,0.339,168.107,87.997,4,...,,Stealing O'Neal,Collidescope,537458,0.481694,SOPCSBF12AB017B6ED,Chin Up Baby,2008,1.0,"[chin, babi]"
111753,195.97016,0.000,10,0.054,-4.211,0,0.432,195.970,206.169,4,...,,Captain Planet,Inselwissen,596766,0.642327,SOIQVTG12AC468E9C0,Parkhaus,2009,1.0,[parkhau]
111754,88.45016,5.608,9,0.051,-17.182,0,0.445,88.450,93.804,3,...,,Coronatus,Lux Noctis,208957,0.450762,SOIQNVK12A8C13B061,Interrotte Speranze,2007,1.0,"[interrott, speranz]"


# Topic Modeling - Latent Dirichlet Allocation (LDA)

In [6]:
# Create a function to pull out nouns or adj from a string of text
from nltk import word_tokenize, pos_tag

def nouns_adj(text):
    '''Given a string of text, tokenize the text and pull out only the nouns and adjectives.'''
    is_noun_adj = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'
    tokenized = word_tokenize(text)
    nouns_adj = [word for (word, pos) in pos_tag(tokenized) if is_noun_adj(pos)] 
    return ' '.join(nouns_adj)

In [9]:
songs['clean_title']=songs['clean_title'].apply(str)

In [10]:
# Apply the nouns_adj function to the transcripts to filter only on nouns
data_nouns_adj = songs['clean_title'].apply(nouns_adj)
data_nouns_adj

0                                  ]
1                [ 'etho 'coercion ]
2                      [ 'rocknrul ]
3                    [ 'cockleshel ]
4         [ 'disast 'demo 'version ]
                     ...            
111751                             ]
111752                             ]
111753                  [ 'parkhau ]
111754       [ 'interrott 'speranz ]
111755                      [ 'way ]
Name: clean_title, Length: 111756, dtype: object

In [11]:
# Import the necessary modules for LDA with gensim
# Terminal / Anaconda Navigator: conda install -c conda-forge gensim
from gensim import matutils, models
import scipy.sparse

In [12]:
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer

# Re-add the additional stop words since we are recreating the document-term matrix
add_stop_words = ['like', 'im', 'know', 'just', 'dont', 'thats', 'right', 'people',
                  'youre', 'got', 'gonna', 'time', 'think', 'yeah', 'said','new',
                  'say','says','said','first','second','will','year','years','now','one','may','just']
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

# Create a new document-term matrix using only nouns and adjectives, also remove common words with max_df
cvna = CountVectorizer(stop_words=stop_words, max_df=.8)
data_cvna = cvna.fit_transform(data_nouns_adj)
data_dtmna = pd.DataFrame(data_cvna.toarray(), columns=cvna.get_feature_names())
data_dtmna.index = data_nouns_adj.index
data_dtmna

# Create the gensim corpus
corpusna = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmna.transpose()))

# Create the vocabulary dictionary
id2wordna = dict((v, k) for k, v in cvna.vocabulary_.items())

In [13]:
ldana = models.LdaModel(corpus=corpusna, num_topics=10, id2word=id2wordna, passes=200)
ldana.print_topics()

[(0,
  '0.074*"la" + 0.066*"come" + 0.059*"man" + 0.058*"blue" + 0.022*"run" + 0.020*"tell" + 0.019*"na" + 0.018*"turn" + 0.018*"beauti" + 0.016*"bad"'),
 (1,
  '0.028*"intro" + 0.025*"danc" + 0.025*"angel" + 0.019*"word" + 0.017*"oh" + 0.016*"death" + 0.015*"end" + 0.011*"dirti" + 0.011*"togeth" + 0.010*"es"'),
 (2,
  '0.044*"good" + 0.034*"night" + 0.026*"citi" + 0.020*"sweet" + 0.017*"da" + 0.012*"forev" + 0.012*"young" + 0.012*"room" + 0.010*"del" + 0.010*"set"'),
 (3,
  '0.029*"edit" + 0.028*"radio" + 0.025*"hand" + 0.025*"star" + 0.025*"make" + 0.019*"lie" + 0.018*"happi" + 0.016*"place" + 0.016*"babi" + 0.013*"great"'),
 (4,
  '0.272*"version" + 0.022*"die" + 0.022*"explicit" + 0.015*"featur" + 0.015*"eye" + 0.011*"lp" + 0.011*"life" + 0.010*"stand" + 0.010*"noth" + 0.010*"du"'),
 (5,
  '0.075*"feat" + 0.029*"away" + 0.029*"le" + 0.025*"black" + 0.023*"girl" + 0.019*"big" + 0.018*"hous" + 0.015*"god" + 0.015*"high" + 0.012*"alway"'),
 (6,
  '0.040*"home" + 0.021*"face" + 0.021*"

# Visualizing the Models with pyLDAvis

In [14]:
from sklearn.decomposition import LatentDirichletAllocation
lda_tf = LatentDirichletAllocation(n_components=20,random_state=0)
lda_tf.fit(data_cvna)

import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
pyLDAvis.sklearn.prepare(lda_tf, data_cvna, cvna)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
