# Lab 1 - Web Scraping Single Page

In [11]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from pandas import json_normalize

In [3]:
#request and make soup
url = "https://www.popvortex.com/music/charts/top-100-songs.php"
soup = BeautifulSoup(requests.get(url).content, 'html.parser')

In [4]:
#the page structure is strait forward, two for loops should create two lists of songs and artists in the correct and matching order
songs = []
artists = []

for artist in soup.select('em.artist'):
    artists.append(artist.text)
    
for song in soup.select('cite.title'):
    songs.append(song.text)

In [5]:
#turn to pandas dataframe and do a quick check

top100songs = pd.DataFrame({"song":songs,"artist":artists})

top100songs

Unnamed: 0,song,artist
0,TEXAS HOLD 'EM,Beyoncé
1,Beautiful Messes,Hillary Scott & The Scott Family
2,Lose Control,Teddy Swims
3,Beautiful Things,Benson Boone
4,Flowers,Miley Cyrus
...,...,...
95,Leave a Light On,Papa Roach
96,...Ready For It?,Taylor Swift
97,The Painter,Cody Johnson
98,Scared To Start,Michael Marcagi


In [6]:
#just create another dataframe with the songs' place in another column
t100 = top100songs.copy()
t100['place'] = range(1,len(t100)+1)
t100

Unnamed: 0,song,artist,place
0,TEXAS HOLD 'EM,Beyoncé,1
1,Beautiful Messes,Hillary Scott & The Scott Family,2
2,Lose Control,Teddy Swims,3
3,Beautiful Things,Benson Boone,4
4,Flowers,Miley Cyrus,5
...,...,...,...
95,Leave a Light On,Papa Roach,96
96,...Ready For It?,Taylor Swift,97
97,The Painter,Cody Johnson,98
98,Scared To Start,Michael Marcagi,99


# Lab 2 - GNOD Part 2

In [7]:
t100.sample()['song'].iloc[0]

'HISS'

In [8]:
def arrogant_recommender(song):
    song_name = str(song).lower() #This makes all the input a lowercase string value, as far as I am concerned, this should not cause any problem and makes it convenient for comparison
    if song_name in [name.lower() for name in list(t100['song'])]:
        recommend = t100.sample()
        vox = 'You may listen to ' + recommend['song'].iloc[0] + ' by ' + recommend['artist'].iloc[0] + '.'
        print(vox)
        return vox
    else:
        vox = 'We accept hot songs only, get lost.'
        print(vox)
        return vox

In [9]:
arrogant_recommender('kek')
arrogant_recommender('GREEDY')

We accept hot songs only, get lost.
We accept hot songs only, get lost.


'We accept hot songs only, get lost.'

# Lab 3 - Big Collection of Songs & Audio Features

In [10]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
secrets_file = open("spotifyclientsecret.txt","r")
string = secrets_file.read()
secrets_dict={}
for line in string.split('\n'):
    if len(line) > 0:
        #print(line.split(':'))
        secrets_dict[line.split(':')[0]]=line.split(':')[1].strip()
# establish connection to Spotify Web api
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=secrets_dict['clientid'],
                                                           client_secret=secrets_dict['clientsecret']))

In [13]:
from random import randint
from time import sleep

def get_playlist_tracks(playlist_id):
    results = sp.user_playlist_tracks("spotify",playlist_id)
    tracks = results['items']
    while results['next']!=None:
        results = sp.next(results)
        tracks = tracks + results['items']
        sleep(randint(1,1000)/1000) # respectful nap
    return tracks

In [15]:
all_tracks = get_playlist_tracks("6yPiKpy7evrwvZodByKvM9")
len(all_tracks)

10000

In [16]:
tracks = json_normalize(all_tracks)
artists_df = pd.DataFrame(columns=['href', 'id', 'name', 'type', 'uri', 'external_urls.spotify','song_id', 'song_name', 'popularity' ])
for i in tracks.index:
    artists_for_song = json_normalize(tracks.iloc[i]['track.artists'])
    artists_for_song['song_id']    = tracks.iloc[i]['track.id']         # we want to keep song_id, it is the sae for all artists
    artists_for_song['song_name']  = tracks.iloc[i]['track.name']       # we want to keep song_name, it is the sae for all artists
    artists_for_song['popularity'] = tracks.iloc[i]['track.popularity'] # same for popularity   
    artists_df = pd.concat([artists_df, artists_for_song], axis=0)

df_final = artists_df[['song_name', 'name', 'song_id', 'popularity']].reset_index(drop=True)
df_final

Unnamed: 0,song_name,name,song_id,popularity
0,2K,Nosaj Thing,33xMbeHzmWd6Od0BmLZEUs,0
1,4 Billion Souls,The Doors,3UnyplmZaq547hwsfOR5yy,26
2,4 Minute Warning,Radiohead,1w8QCSDH4QobcQeT4uMKLm,0
3,7 Element,Vitas,7J9mBHG4J2eIfDAv5BehKA,0
4,#9 Dream,R.E.M.,1VZedwJj1gyi88WFRhfThb,6
...,...,...,...,...
11848,London Calling - Remastered,The Clash,5jzma6gCzYtKB1DbEwFZKH,75
11849,Low Rider,War,2fmMPJb5EzZCx8BcNJvVk4,0
11850,Flower,Moby,60rIdEPDrzyLiLC0icp3xz,0
11851,Brighter Than Gold,The Cat Empire,0sEm1ld0V8YTCPcjPVfIsc,47


In [17]:
#Audio feature:

In [21]:
df_final['song_id'].isnull().sum() #??????

82

In [22]:
df_final.dropna(inplace=True) # can't be bothered to investigate, just drop na

chunks = [(i, i+100) for i in range(0, len(df_final), 100)]

audio_features_list = []
for chunk in chunks:
    id_list100 = df_final['song_id'][chunk[0]:chunk[1]]
    audio_features_list = audio_features_list + sp.audio_features(id_list100)
    sleep(randint(1,1000)/1000)
len(audio_features_list)

11771

In [25]:
audio_features_df = pd.DataFrame(audio_features_list)
audio_features_df.drop_duplicates(inplace=True)

df_w_audio_ft = pd.merge(left=df_final,
                        right=audio_features_df,
                        how='inner',
                        left_on='song_id',
                        right_on='id')
df_w_audio_ft

Unnamed: 0,song_name,name,song_id,popularity,danceability,energy,key,loudness,mode,speechiness,...,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,2K,Nosaj Thing,33xMbeHzmWd6Od0BmLZEUs,0,0.310,0.445,7,-13.355,0,0.0863,...,0.1130,0.122,95.360,audio_features,33xMbeHzmWd6Od0BmLZEUs,spotify:track:33xMbeHzmWd6Od0BmLZEUs,https://api.spotify.com/v1/tracks/33xMbeHzmWd6...,https://api.spotify.com/v1/audio-analysis/33xM...,152560,3
1,4 Billion Souls,The Doors,3UnyplmZaq547hwsfOR5yy,26,0.419,0.565,5,-11.565,1,0.0347,...,0.1280,0.648,151.277,audio_features,3UnyplmZaq547hwsfOR5yy,spotify:track:3UnyplmZaq547hwsfOR5yy,https://api.spotify.com/v1/tracks/3UnyplmZaq54...,https://api.spotify.com/v1/audio-analysis/3Uny...,197707,4
2,4 Minute Warning,Radiohead,1w8QCSDH4QobcQeT4uMKLm,0,0.354,0.302,9,-13.078,1,0.0326,...,0.1110,0.223,123.753,audio_features,1w8QCSDH4QobcQeT4uMKLm,spotify:track:1w8QCSDH4QobcQeT4uMKLm,https://api.spotify.com/v1/tracks/1w8QCSDH4Qob...,https://api.spotify.com/v1/audio-analysis/1w8Q...,244285,4
3,7 Element,Vitas,7J9mBHG4J2eIfDAv5BehKA,0,0.727,0.785,5,-6.707,0,0.0603,...,0.3100,0.960,129.649,audio_features,7J9mBHG4J2eIfDAv5BehKA,spotify:track:7J9mBHG4J2eIfDAv5BehKA,https://api.spotify.com/v1/tracks/7J9mBHG4J2eI...,https://api.spotify.com/v1/audio-analysis/7J9m...,249940,4
4,#9 Dream,R.E.M.,1VZedwJj1gyi88WFRhfThb,6,0.571,0.724,0,-5.967,1,0.0260,...,0.0919,0.385,116.755,audio_features,1VZedwJj1gyi88WFRhfThb,spotify:track:1VZedwJj1gyi88WFRhfThb,https://api.spotify.com/v1/tracks/1VZedwJj1gyi...,https://api.spotify.com/v1/audio-analysis/1VZe...,278320,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11766,London Calling - Remastered,The Clash,5jzma6gCzYtKB1DbEwFZKH,75,0.651,0.801,0,-7.340,1,0.0514,...,0.0825,0.776,133.769,audio_features,5jzma6gCzYtKB1DbEwFZKH,spotify:track:5jzma6gCzYtKB1DbEwFZKH,https://api.spotify.com/v1/tracks/5jzma6gCzYtK...,https://api.spotify.com/v1/audio-analysis/5jzm...,200480,4
11767,Low Rider,War,2fmMPJb5EzZCx8BcNJvVk4,0,0.811,0.647,0,-10.989,1,0.0498,...,0.0572,0.990,139.787,audio_features,2fmMPJb5EzZCx8BcNJvVk4,spotify:track:2fmMPJb5EzZCx8BcNJvVk4,https://api.spotify.com/v1/tracks/2fmMPJb5EzZC...,https://api.spotify.com/v1/audio-analysis/2fmM...,191560,4
11768,Flower,Moby,60rIdEPDrzyLiLC0icp3xz,0,0.686,0.610,7,-5.902,1,0.0262,...,0.0710,0.766,80.567,audio_features,60rIdEPDrzyLiLC0icp3xz,spotify:track:60rIdEPDrzyLiLC0icp3xz,https://api.spotify.com/v1/tracks/60rIdEPDrzyL...,https://api.spotify.com/v1/audio-analysis/60rI...,206293,4
11769,Brighter Than Gold,The Cat Empire,0sEm1ld0V8YTCPcjPVfIsc,47,0.711,0.718,6,-5.739,1,0.0380,...,0.0732,0.688,117.071,audio_features,0sEm1ld0V8YTCPcjPVfIsc,spotify:track:0sEm1ld0V8YTCPcjPVfIsc,https://api.spotify.com/v1/tracks/0sEm1ld0V8YT...,https://api.spotify.com/v1/audio-analysis/0sEm...,200293,4


In [26]:
#save to csv:
df_w_audio_ft.to_csv('curated_song_and_features')