In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from scipy.stats import entropy
import sys
import os
import plotly.express as px
sys.path.append(os.path.abspath("../src"))  # POINTS to src/

from spotify_analyzer.data import *
from spotify_analyzer.plotting import *


### Get Data

In [2]:
# Build the notebook-wide `df` that every later cell reads.
# Both helpers arrive via the star-import of spotify_analyzer.data.
# NOTE(review): presumably get_all_time_ranges() returns one row per top track
# tagged with its `time_range`, and enrich_with_genres() adds the `genres`
# list column seen in the output below — confirm in spotify_analyzer.data.
df = get_all_time_ranges()
df = enrich_with_genres(df)

# Show dataframe layout
df.head()


Unnamed: 0,name,artist,artist_id,track_id,popularity,duration_ms,time_range,genres
0,Zenith,Bleed From Within,5ZvwJikDgdP1PFU4PkAPVG,1PjNgNIwuFzvJzrxMB2DBk,51,258786,short_term,"[metalcore, deathcore, metal, djent]"
1,Look To Windward,Sleep Token,2n2RSaZqBuUUukhbLlpnE6,4Lojbtk7XNMdSKRHSFbdkm,83,466463,short_term,"[progressive metal, metalcore]"
2,Valhalla,ERRA,2UoOdQyBGyzrEfxcY77ce0,2x6HfJ4seoYTQy4Ixg4bbm,31,296946,short_term,"[djent, metalcore, progressive metal, deathcor..."
3,Slow Sour Bleed,ERRA,2UoOdQyBGyzrEfxcY77ce0,0xWRZPboXOm0QHKDaYic1w,53,247573,short_term,"[djent, metalcore, progressive metal, deathcor..."
4,Cast Down,Bleed From Within,5ZvwJikDgdP1PFU4PkAPVG,6R7hcuvAVhKg1U301WHS9q,31,242720,short_term,"[metalcore, deathcore, metal, djent]"


### Listening Behavior

In [5]:
def get_top_genres(df, time_range, top_n=None):
    """Count how often each genre occurs among the tracks of one time range.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``time_range`` column and a ``genres`` column whose cells are
        list-like collections of genre names. The frame is not modified.
    time_range : str
        Value of ``time_range`` to keep (e.g. ``'short_term'``).
    top_n : int or None, optional
        How many genres to return. ``None`` (or ``0``) returns every genre.

    Returns
    -------
    pandas.DataFrame
        Columns ``genre`` and ``count``, always ordered by descending count
        (genres with equal counts keep first-seen order). Empty, but with
        both columns, when no track matches ``time_range``.
    """
    # Boolean selection already produces a new object, so copying the whole
    # frame first is unnecessary.
    genre_cells = df.loc[df['time_range'] == time_range, 'genres']

    genre_counts = Counter()
    for cell in genre_cells:
        # A track without genre data can hold NaN/None instead of a list;
        # iterating such a scalar would raise TypeError, so skip it.
        if pd.api.types.is_list_like(cell):
            genre_counts.update(genre for genre in cell)

    # most_common(None) returns *all* entries sorted by count, so the result
    # is ranked whether or not a limit is given. `top_n or None` keeps the
    # historical "falsy top_n means no limit" contract.
    ranked = genre_counts.most_common(top_n or None)
    return pd.DataFrame(ranked, columns=['genre', 'count'])

def get_most_repeated_artists(df, time_range, top_n=None):
    """Rank artists by how many tracks they have in one time range.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``time_range`` and ``artist`` columns. The frame is not modified.
    time_range : str
        Value of ``time_range`` to keep (e.g. ``'short_term'``).
    top_n : int or None, optional
        How many artists to return. ``None`` (or ``0``) returns every artist.

    Returns
    -------
    pandas.DataFrame
        Columns ``artist`` and ``count``, ordered by descending count.
    """
    # Selecting the rows/column yields a new Series; no full-frame copy needed.
    artists_in_range = df.loc[df['time_range'] == time_range, 'artist']

    artist_counts = artists_in_range.value_counts()
    if top_n:
        artist_counts = artist_counts.head(top_n)

    # Name both output columns explicitly: a bare reset_index() yields
    # ('index', 'artist') on pandas < 2 but ('artist', 'count') on pandas >= 2.
    return artist_counts.rename_axis('artist').reset_index(name='count')

def get_avg_duration(df):
    """Return the mean track length in minutes for each time range.

    Reads the ``duration_ms`` and ``time_range`` columns of ``df`` and leaves
    the caller's frame untouched. The result has one row per time range with
    columns ``time_range`` and ``duration_min``.
    """
    # assign() works on a copy, so the input never gains the helper column.
    with_minutes = df.assign(duration_min=df['duration_ms'] / 60000)
    mean_minutes = with_minutes.groupby('time_range')['duration_min'].mean()
    return mean_minutes.reset_index()

### Time Range Comparison

### Listening Personality Profile

In [6]:
# Summaries for the 'short_term' time range: 10 most frequent genres and
# 10 most repeated artists. Average duration is computed across every
# time range present in `df`, not just 'short_term'.
genres_df = get_top_genres(df, 'short_term', top_n=10)
artists_df = get_most_repeated_artists(df, 'short_term', top_n=10)
duration_df = get_avg_duration(df)

# The plot_* helpers come from the star-import of spotify_analyzer.plotting.
# NOTE(review): they are used here as figure factories exposing .show() —
# confirm the expected input columns in spotify_analyzer.plotting.
fig1 = plot_top_genres(genres_df, 'short_term')
fig2 = plot_top_artists(artists_df, 'short_term')
fig3 = plot_avg_duration(duration_df)

# Render the three figures in order.
fig1.show()
fig2.show()
fig3.show()

In [7]:
def get_top_genres_over_time(df, top_n=10,
                             time_ranges=('short_term', 'medium_term', 'long_term')):
    """Collect per-time-range counts for the overall most frequent genres.

    Parameters
    ----------
    df : pandas.DataFrame
        Track-level frame accepted by ``get_top_genres``.
    top_n : int, optional
        Number of genres to keep, ranked by their count summed over all
        requested time ranges.
    time_ranges : iterable of str, optional
        Time ranges to compare. Defaults to the three ranges this function
        originally hard-coded.

    Returns
    -------
    pandas.DataFrame
        Columns ``genre``, ``count`` and ``time_range``: one row per
        (genre, time range) pair in which the genre occurs, limited to the
        ``top_n`` genres overall, with a fresh 0..n-1 index.
    """
    # assign() returns a new frame, so the helper's result is never mutated.
    per_range = [
        get_top_genres(df, time_range=tr, top_n=None).assign(time_range=tr)
        for tr in time_ranges
    ]
    if not per_range:
        # pd.concat raises on an empty list; return an empty, well-formed frame.
        return pd.DataFrame(columns=['genre', 'count', 'time_range'])

    # ignore_index avoids the duplicate row labels that stacking the
    # per-range frames would otherwise produce.
    all_genres = pd.concat(per_range, ignore_index=True)

    # Rank genres by their total count across every requested time range.
    top_genres = all_genres.groupby('genre')['count'].sum().nlargest(top_n).index
    keep = all_genres['genre'].isin(top_genres)
    return all_genres[keep].reset_index(drop=True)

In [8]:
import plotly.express as px

def plot_genre_evolution(df):
    """Build a grouped bar chart of genre counts, coloured by time range.

    ``df`` must provide ``genre``, ``count`` and ``time_range`` columns.
    The Plotly figure is returned, not shown; call ``.show()`` on it.
    """
    bar_options = dict(
        x='genre',
        y='count',
        color='time_range',
        barmode='group',  # side-by-side bars per genre, one per time range
        title='Top Genre Frequency Across Time Ranges',
        template='plotly_white',
    )
    return px.bar(df, **bar_options)


In [9]:
# Counts for the 15 overall most frequent genres, split by time range.
# Note: `genre_df` (this cell) is distinct from `genres_df` defined earlier,
# which holds only the 'short_term' top 10.
genre_df = get_top_genres_over_time(df, top_n=15)

# Grouped bar chart comparing those genres across time ranges.
plot_genre_evolution(genre_df).show()

