In [1]:
import numpy as np
import ast

from spotify import *

In [2]:
# Load the listening history and derive the play time in hours and minutes
# from the millisecond column (3,600,000 ms per hour, 60,000 ms per minute).
history = getSpotifyHistory().assign(
    hrPlayed=lambda frame: frame['msPlayed'] / 3_600_000,
    mnPlayed=lambda frame: frame['msPlayed'] / 60_000,
)

# Persist the raw history (with the two derived columns) for Tableau.
history.to_csv('tableauData/rawSpotifyHistory.csv')

In [3]:
# Calling Spotify API to get details on tracks from history file
# tracks_df = None
# for index, row in history[['artistName', 'trackName']].drop_duplicates().iterrows():
#     raw_data = sp.search(
#             f'track:{row.trackName} artist:{row.artistName}',
#             type='track',
#             limit=1
#         )['tracks']
#     if not raw_data['items']:
#         continue
#     clean_data = cleanSpotifyData(raw_data)
#     try:
#         tracks_df = tracks_df.append(clean_data, ignore_index = True)
#     except AttributeError:
#         tracks_df = clean_data
        
# full_history = history.merge(
#     tracks_df,
#     'inner',
#     left_on = ['artistName', 'trackName'],
#     right_on = ['artists.name', 'name']
# )
# full_history.to_csv('data/fullSpotifyHistory.csv')

# Load the cached history/track join from disk, discard the saved index column
# and the artist fields that are not needed downstream, and expose the artist
# key under the name `artist.id`.
full_history = (
    pd.read_csv('data/fullSpotifyHistory.csv')
    .drop(
        columns=[
            'Unnamed: 0',
            'artists.href',
            'artists.name',
            'artists.type',
            'artists.uri',
            'artists.external_urls.spotify',
            'artistName',
        ]
    )
    .rename(columns={'artists.id': 'artist.id'})
)

In [4]:
# Distinct track and artist identifiers present in the history.
track_ids = full_history['id'].drop_duplicates().tolist()
artist_ids = full_history['artist.id'].drop_duplicates().tolist()

In [5]:
# artist_data = pd.DataFrame()
# for id in artist_ids:
#     raw_data = sp.artist(id)
#     clean_data = pd.json_normalize(raw_data, 'genres', 'id').rename(columns={0:'genres'})
#     # Appending the data to the DataFrame
#     artist_data = artist_data.append(clean_data)
# artist_data.to_csv('data/artistData.csv')

# Load the cached artist details. The `genres` column is stored in the CSV as
# the text of a Python list, so it is parsed back with `ast.literal_eval` and
# then exploded to one row per (artist, genre). Exploding repeats the original
# row label for every genre of the same artist.
artist_data = (
    pd.read_csv('data/artistData.csv')
    .assign(genres=lambda frame: frame['genres'].apply(ast.literal_eval))
    .explode('genres')
    .drop(
        columns=[
            'Unnamed: 0',
            'href',
            'popularity',
            'type',
            'uri',
            'external_urls.spotify',
            'followers.href',
        ]
    )
)

# Save the long-format artist table and the list of distinct genres.
artist_data.to_csv('data/longArtistData.csv')
artist_data['genres'].drop_duplicates().to_csv('data/allGenres.csv')

In [6]:
# Count how many artist rows carry each genre, most common genre first.
# After the drop only the `id` count is expected to remain; it becomes `count`.
genre_popularity = (
    artist_data
    .groupby('genres')
    .count()
    .drop(columns=['images', 'name', 'followers.total'])
    .rename(columns={'id': 'count', 'genres': 'genre'})
    .sort_values('count', ascending=False)
)

# Number of rows per original row label, i.e. rows per artist entry after the
# genre explode.
artistGenreCounts = artist_data.index.value_counts()

# Genres that belong to an artist entry with exactly one row: removing such a
# genre would leave that artist without any genre.
necessary_genres = artist_data.loc[
    artist_data.index.isin(artistGenreCounts[artistGenreCounts == 1].index),
    'genres',
].drop_duplicates()

# Flag each genre: True when it is one of the necessary genres, False otherwise.
genre_popularity['necessary'] = genre_popularity.index.isin(necessary_genres)

# Write the genre table to CSV so it can be inspected and analyzed.
genre_popularity.to_csv('data/genreData.csv')

In [7]:
# Attach each genre's overall popularity (`count`) and its `necessary` flag to
# every (artist, genre) row.
full_artist_data = artist_data.merge(
    genre_popularity,
    how='left',
    left_on='genres',
    right_index=True
)

# Rows without a genre cannot contribute a top genre, so they are removed.
full_artist_data = full_artist_data[full_artist_data['genres'].notna()]

# Pick ONE genre per artist: the one with the highest overall `count`.
# The previous `groupby('id').max()` took the maximum of every column
# independently, which for the text column `genres` is the alphabetically last
# genre and ignores `count` entirely. Sorting by `count` (descending) and
# keeping the first row per artist selects the most popular genre instead;
# the secondary sort on `genres` makes ties deterministic.
artistTopGenres = (
    full_artist_data
    .sort_values(['count', 'genres'], ascending=[False, True])
    .drop_duplicates(subset='id', keep='first')
    .loc[:, ['id', 'genres']]
    .sort_values('id')
    .reset_index(drop=True)
)

# Keep only each artist's top-genre row, drop exact duplicate rows and the
# columns that are not needed downstream, and rename the artist columns to the
# `artist.*` names used when joining with the listening history.
clean_artist_data = full_artist_data.merge(
    artistTopGenres,
    how='inner',
    on=['id', 'genres']
).drop_duplicates().drop(
    columns=['images', 'followers.total']
).rename(columns={
    'id':'artist.id',
    'name':'artist.name'
})

clean_artist_data.to_csv('data/cleanArtistData.csv')


In [8]:
# Enrich every history row with the artist's name and genre.
# No join key is given, so pandas joins on the columns the two frames share
# (presumably just `artist.id` -- verify against the CSV schema). Rows without
# a matching artist are discarded by the inner join.
detailed_history = full_history.merge(
    clean_artist_data.loc[:, ['artist.id', 'genres', 'artist.name']],
    how='inner',
)
detailed_history = detailed_history.drop_duplicates()

# Export for Tableau.
detailed_history.to_csv('tableauData/detailedSpotifyHistory.csv')

In [9]:
# Top 50 artists by total listening time.
# `sum(numeric_only=True)` replaces `aggregate(np.sum)`: older pandas silently
# skipped text columns when summing, but pandas 2.x tries to sum them as well
# (string concatenation or a TypeError) and deprecates passing `np.sum`.
# Restricting the sum to numeric/boolean columns keeps the result the same on
# both old and new pandas.
groupedArtist = detailed_history.groupby(
    ['artist.id', 'artist.name', 'genres']
).sum(
    numeric_only=True
).sort_values(
    'hrPlayed',
    ascending=False
).head(50).drop(
    ['duration_ms', 'msPlayed', 'album.total_tracks'],
    axis=1
)

# Number of plays per artist: count of non-null track ids in each group.
timesPlayed = detailed_history.groupby(
    ['artist.id', 'artist.name', 'genres']
)['id'].count().rename('timesPlayed')

# Add the play count to the top artists. The summed `explicit` flag has no
# meaning as a total, so it is dropped.
groupedArtist = groupedArtist.merge(
    timesPlayed,
    'inner',
    left_on = ['artist.id', 'artist.name', 'genres'],
    right_index = True
).drop(
    ['explicit'],
    axis=1
)


groupedArtist.to_csv('tableauData/topSpotifyArtist.csv')

In [10]:
# Top 60 tracks by total listening time.
# `sum(numeric_only=True)` replaces `aggregate(np.sum)`: older pandas silently
# skipped text columns when summing, but pandas 2.x tries to sum them as well
# (string concatenation or a TypeError) and deprecates passing `np.sum`.
# Restricting the sum to numeric/boolean columns keeps the result the same on
# both old and new pandas.
groupedTracks = detailed_history.groupby(
    ['trackName', 'artist.name', 'genres']
).sum(
    numeric_only=True
).sort_values(
    'hrPlayed',
    ascending=False
).head(60).drop(
    [
        'album.total_tracks'
    ],
    axis=1
)

# Number of plays per track: count of non-null track ids in each group.
timesPlayed = detailed_history.groupby(
    ['trackName', 'artist.name', 'genres']
)['id'].count().rename('timesPlayed')

# Add the play count to the top tracks. The summed `explicit` flag has no
# meaning as a total, so it is dropped.
groupedTracks = groupedTracks.merge(
    timesPlayed,
    'inner',
    left_on = ['trackName', 'artist.name', 'genres'],
    right_index = True
).drop(
    ['explicit'],
    axis=1
)


groupedTracks.to_csv('tableauData/topSpotifyTracks.csv')

In [11]:
# Current top Spotify tracks (Spotify's `short_term` window).
sp = spotify_authentication()
# Calling API
tracks = sp.current_user_top_tracks(limit=50, time_range='short_term')
# Cleaning data
df = cleanSpotifyData(tracks)

# NOTE(review): the cached history built from `cleanSpotifyData` output stores
# the artist key as `artists.id` and is renamed to `artist.id` before it is
# joined with the artist table. The same rename is applied here so the
# key-less merge below has a shared column; it is skipped when `artist.id`
# already exists. TODO confirm the column names returned by cleanSpotifyData.
if 'artists.id' in df.columns and 'artist.id' not in df.columns:
    df = df.rename(columns={'artists.id': 'artist.id'})

# Attach genre and artist name; tracks whose artist is not in the artist table
# are discarded by the inner join.
df = df.merge(
    clean_artist_data[['artist.id', 'genres', 'artist.name']],
    'inner'
)

df.to_csv('tableauData/topSpotifyTracksNow.csv')