In [89]:
from pymongo import MongoClient
from rauth import OAuth1Service
import billboard
import lyricsgenius
from datetime import timedelta, datetime
from dateutil import rrule
import re
import requests
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
##Custom Modules I Created
from Billboard_Data import store_billboard_data
from Lyric_Data import *
from proc_modeling import *

# Initialize Mongo Databases

In [19]:
client = MongoClient("mongodb://**mongodb**)
song_db=client["Song_Database"]
top100 = song_db["Billboard_100_Col"]
song_lookup = song_db["songId_Lookup_Col"]
song_lyrics_col = song_db['Genius_Lyrics_Col']

# Collect Billboard Data 

In [20]:
for dt in rrule.rrule(rrule.WEEKLY, dtstart=datetime(1960,1,1), until=datetime.now()):
    ## Repeating this step twice because sometimes we hit a disconnect error the first time.
    try: 
        weekly_chart=billboard.ChartData('hot-100',dt.strftime('%Y-%m-%d'))
        store_billboard_data(song_lookup,top100,weekly_chart,dt)
        print(dt.strftime('%Y-%m-%d'))
    except requests.exceptions.Timeout:
        weekly_chart=billboard.ChartData('hot-100',dt.strftime('%Y-%m-%d'))
        store_billboard_data(song_lookup,top100,weekly_chart,dt)
        print(dt.strftime('%Y-%m-%d'))

1960-01-01
1960-01-08
1960-01-15
1960-01-22
1960-01-29


KeyboardInterrupt: 

# Collect Song Lyrics

## Initialize Genius Api

In [21]:
genius = lyricsgenius.Genius("**ClientKey**")
genius.remove_section_headers = True
genius.verbose = False

## Add Song Lyrics To Lyrics Database

In [23]:
count=0
no_song_count=0
for song in song_lookup.find({},{'title','artist'}):
    ##Check that we are adding new songs to the lyrics database
    if u_lyric_exists(song_lyrics_col,song['_id'])!=None:
        count+=1
        continue
    title,artist=reformat_song_artist(song['title'],song['artist'])
    try:
        song_lyrics=genius.search_song(title,artist)
        if song_lyrics==None:
            no_song_count+=1
            continue
        song_title,song_artist=reformat_song_artist(song_lyrics.title,song_lyrics.artist)
        if song_title!=title:
            if artist!=song_artist:
                no_song_count+=1
                continue

        count=count+1
        # Add our genius result to the lyrics database.
        
        add_song_to_lyric_db(song_lyrics,song_artist,song['_id'],song_lyrics_col)
        if count%100==0:
            print(f"Total Songs {count+starting_count}")
            print(f"Ratio Of Songs Not Read: {no_song_count/count}")
    #Handles potential errors for our script.
    except AttributeError or TypeError:
        print(f"{artist}:{title}")
        print(song['_id'])
        break

KeyboardInterrupt: 

# Lyric Processing

**Import Spacy And Load Spacy Packages**

In [24]:
import spacy
nlp=spacy.load('en', disable=['ner'])
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
#web_emb_nlp = spacy.load('en_vectors_web_lg')

**Lyric Cleaning and Lemmatizing**

In [61]:
lyric_list=[]
count=0
song_id_clean_lyrics_dict={}
for i in song_lyrics_col.find({},{'_id':0,'lyrics':1,'song_id':1}):
    temp_string=' '.join(str(word_processing(word)) for word in i['lyrics'].split())
    lem_doc=nlp(temp_string)
    temp_string=' '.join(str(token.lemma_) for token in lem_doc if re.match(r"\w+",token.lemma_)!=None)
    lyric_list.append(temp_string)
    #Create Song Lyrics Index
    song_id_clean_lyrics_dict[i['song_id']]=count
    song_id_clean_lyrics_dict[i['song_id']+'rank']=count
    if count%500==0:
        print(count)
    count+=1

0


# Topic Modeling

## Update Stop Words List

In [32]:
spacy_stopwords.update({'yeah','oh','ooh','hey','aingt','na','la','da','uh','nah','ohh','ooo',
                        'huh','ha','ayy','yo','yes','ah','doo','oo','ooh'})

## Count Vectorize Lyric List

In [33]:
count_vector=CountVectorizer(stop_words=spacy_stopwords,min_df=.005,max_df=.90
                             ,token_pattern="\\b[a-z][a-z]+\\b")

In [34]:
doc_word_cv=count_vector.fit_transform(lyric_list)

## LSA Modeling - Truncated SVD

In [35]:
lsa=TruncatedSVD(50)
doc_topic_cv = lsa.fit_transform(doc_word_cv)

In [36]:
display_topics(lsa,count_vector.get_feature_names(),40,5)


Topic  0
love, man, dog, hound, wanna, little, come, talk, day, white, dove, walk, run, baby, heart, eye, bear, kiss, number, know, initial, time, blue, dear, find, heartache, let, big, river, wear, life, feleena, like, boy, wait, tell, die, young, kind, danny

Topic  1
dog, hound, man, wanna, time, kind, miss, thing, life, slow, tear, pass, old, eye, morning, till, tough, trail, lead, loose, mighty, hunt, home, turn, rise, sun, look, right, meet, year, girl, think, stay, pretty, day, way, like, young, wait, kiss

Topic  2
initial, wear, loop, tood, great, big, ee, belong, ev, hop, joy, shop, slee, ridin, soda, wish, jeep, ryone, eep, gym, record, pride, day, movie, swim, know, boy, write, peak, afterbeat, everlasting, tcha, engraving, schroeder, rywhere, don, befit, chain, official, symbol

Topic  3
ave, ri, ma, ia, little, place, town, village, forget, st, lourde, bernadette, travel, beautiful, tell, like, far, night, happy, home, land, prayer, whisper, grotto, gaze, feeling, knee, 

In [54]:
word_cloud=create_topic_word_cloud(lsa,count_vector.get_feature_names(),40,5)


Topic  0

Topic  1

Topic  2

Topic  3

Topic  4


In [56]:
full_topic_list=[]
for topic in word_cloud:
    for word in topic:
        full_topic_list.append(word)

# Set Up Data For Visualization

## Create List Of Topic Vector Magnitude By Songs By Week
**Also will pull in the top 3 songs for each topic (by vector magnitude**

In [85]:
### Create the two lists to compare largest vector maginuted and song title
top_topic_score=[]
top_topic_song_id=[]
for i in range(0,5):
    zero_temp_list=[]
    str_temp_list=[]
    for x in range(0,3):
        zero_temp_list.append(0)
        str_temp_list.append('')
    top_topic_score.append(zero_temp_list)
    top_topic_song_id.append(str_temp_list)

In [84]:
plot_topic_over_time_list=[]
count=0
for document in top100.find({},{'Rank_List','date'}):
    temp_list=[]
    for song in document['Rank_List']:
        try:
            song_topic_vector_ind=song_id_clean_lyrics_dict[str(song['songId'])+'rank']
        except KeyError:
            continue
        temp_list.append(np.abs(lsa.transform(doc_word_cv[song_topic_vector_ind])[0][0:5]))
    if temp_list==[]:
        continue
    ## Get the weekly average of topic distribution for all songs in the weekly ranking.
    date_temp_list=list(np.average(temp_list,axis=0)/(np.sum(np.average(temp_list,axis=0))))
    date_temp_list.append(document['date'])
    plot_topic_over_time_list.append(date_temp_list)
    count+=1
    if(count%500==0):
        print(count)
    ## Check if any song in has a top 3 vector magnitude for any topics
    for ix,i in enumerate(temp_list[0][0:5]):
        for rnk_ix,rank_score in enumerate(top_topic_score[ix]):
            if ix==1:
                if song['songId']=='8daae54a498e483da9b91dae3b9b5c8b' or song['songId']=='0c6ef6f572ef4a38ba28429858728eb9' or song['songId']=='21ab5c540fb2429c9f2fbf4c111374c2':
                    break
            if i>rank_score:
                try:top_topic_score[ix][rnk_ix+1]=top_topic_score[ix][rnk_ix]
                except IndexError: pass
                try:top_topic_song_id[ix][rnk_ix+1]=top_topic_song_id[ix][rnk_ix]
                except IndexError: pass
                top_topic_score[ix][rnk_ix]=i
                top_topic_song_id[ix][rnk_ix]=song['songId']
                break

In [88]:
for ix,i in enumerate(top_topic_song_id):
    print(f"-----------------Topic {ix}---------------------")
    for rnk_ix,rank_song in enumerate(i):
        song_dict=song_lookup.find({'_id':rank_song},{'title','artist'})[0]
        print(song_dict["title"]+":"+song_dict['artist'])

-----------------Topic 0---------------------


IndexError: no such item for Cursor instance