## Visualizations

In [None]:
import warnings
from ast import literal_eval

import altair as alt
import ipywidgets as widgets
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.subplots as sp
import wordcloud
from IPython.display import clear_output
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.options.mode.chained_assignment = None  # default='warn'

In [None]:
# Load the table that later cells read `year`, `isrc`, `artists`, `chart_power`
# and the audio-feature columns from.
by_isrc = pd.read_csv('by_isrc_oldest.csv')

In [None]:
by_isrc.head().T

In [None]:
# Genres excluded from the analysis: these columns are dropped from the genre
# crosstab and the matching rows are filtered out of the exploded genre frame.
remove_genres = ["classical", "jazz", "folk", "french", "turkish", "gospel", "samba", "piano", "mpb", "sertanejo", "pagode", "sleep", "forro", "malay", "anime", 
                 "j-idol", "comedy", "mandopop", "cantopop", "show-tunes", "emo", "romance", "j-dance", "chill", "world-music", "iranian", "idm", "metalcore", 
                 "hardstyle", "opera", "k-pop", "j-pop", "j-rock", "happy"]

## Find the biggest genres

In [None]:
# Read the raw table and discard exact duplicate rows in one step.
df = pd.read_csv('data.csv').drop_duplicates()

In [None]:
df.head(5).T

In [None]:
# One row per track (isrc), one column per genre; each cell counts how often
# that (track, genre) pair occurs in df.
ct = pd.crosstab(df['isrc'], df['genres'])
# Only membership matters, so cap every count at 1. clip() is vectorised and
# replaces the deprecated element-wise applymap(lambda ...).
ct = ct.clip(upper=1)
# Expose isrc as a regular column again for the following cell.
ct = ct.reset_index()

In [None]:
# Index the membership table by track id and drop the excluded genre columns.
# The guard keeps the cell re-runnable (isrc is already the index on a second
# run), and errors='ignore' tolerates entries of remove_genres that do not
# occur as a column in this dataset.
if 'isrc' in ct.columns:
    ct = ct.set_index('isrc')
ct = ct.drop(columns=remove_genres, errors='ignore')

In [None]:
# Tracks whose row is all zeros no longer belong to any remaining genre:
# remove them from the membership table and from the raw frame.
print(f'Shape before removing songs. ct: {ct.shape}; df: {df.shape}')
has_no_genre = ct.sum(axis=1) == 0
remove_songs = ct.index[has_no_genre.to_numpy()].to_list()
ct = ct.drop(index=remove_songs)
df = df.set_index('isrc').drop(index=remove_songs).reset_index()
print(f'Shape after removing songs. ct: {ct.shape}; df: {df.shape}')

In [None]:
# Column sums of the 0/1 membership table: one total per genre column.
ct_sum = ct.sum()

In [None]:
# Wrap the totals in a one-column frame (the column is renamed to 'count' below).
ct_sum_df = pd.DataFrame(ct_sum)

In [None]:
#ct_sum_df.drop(index='isrc', inplace=True)

In [None]:
# The single column is labelled 0 by default; give it a descriptive name.
ct_sum_df = ct_sum_df.rename(columns={0: 'count'})

In [None]:
# Genre names ordered from the largest to the smallest total.
sorted_genres = ct_sum_df.sort_values(by='count', ascending=False).index.to_list()

In [None]:
# Show the 50 largest genres with their totals.
ct_sum_df.sort_values(by='count', ascending=False)[0:50]

The biggest genre is pop.

Count the occurrences of the top genres per year.

In [None]:
# Attach the 0/1 genre flags to every track and index the result by year.
year_genre_flags = (
    by_isrc[['year', 'isrc']]
    .merge(ct, on=['isrc'], how='left')
    .set_index('year')
)

# Yearly totals for the 50 largest genres (used for wordcloud).
genres_by_year = year_genre_flags[sorted_genres[:50]].groupby('year').sum()
genres_by_year.index = genres_by_year.index.astype(str)

# Same aggregation restricted to the 10 largest genres.
by_isrc_year = year_genre_flags[sorted_genres[:10]].groupby('year').sum()
by_isrc_year.index = by_isrc_year.index.astype(str)

## Development of features over the years

In [None]:
# From here on `df` is a copy of by_isrc and no longer the frame read from data.csv.
df = by_isrc.copy()

In [None]:
df.head(2).T

In [None]:
# Yearly mean of every feature plus the number of rows per year.
mean_columns = ['popularity', 'danceability', 'energy',
                'key', 'loudness', 'mode', 'speechiness',
                'acousticness', 'instrumentalness', 'liveness',
                'valence', 'tempo', 'duration_ms',
                'time_signature']
songs_per_year = df[['year'] + mean_columns].copy()
# Constant helper column: counting it per group yields the group size.
songs_per_year['count'] = -1
aggregations = {column: 'mean' for column in mean_columns}
aggregations['count'] = 'count'
songs_per_year = songs_per_year.groupby("year").agg(aggregations).reset_index()

In [None]:
# Convert the mean duration from milliseconds to minutes (1000 ms/s * 60 s/min).
songs_per_year['duration_min'] = songs_per_year['duration_ms'] / (1000 * 60)

In [None]:
songs_per_year.head(10)

In [None]:
songs_per_year.columns

In [None]:
# Store the year as text so it matches the string year index of the genre tables.
songs_per_year['year'] = songs_per_year['year'].astype(str)

In [None]:
"""alt.Chart(songs_per_year).mark_bar().encode(
    alt.X("year"),
    alt.Y("count")
)"""

In [None]:
# Feature columns of songs_per_year (with duration in minutes instead of ms).
# Note: `features` is rebound to the top-10 genre names further down.
features = ['popularity', 'danceability', 'energy',
            'key', 'loudness', 'mode', 'speechiness',
            'acousticness', 'instrumentalness', 'liveness',
            'valence', 'tempo', 'duration_min',
            'time_signature']

In [None]:
songs_per_year.head()

In [None]:
"""# Create the initial scatter plot
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=songs_per_year['year'],
    y=songs_per_year['duration_ms'],
    #mode='markers',
    name='Duration (ms)'
))

# Define the dropdown menu options
dropdown_options = [
    {'label': 'Duration (ms)', 'value': 'duration_ms'},
    {'label': 'Popularity', 'value': 'popularity'},
    {'label': 'Danceability', 'value': 'danceability'},
    {'label': 'Energy', 'value': 'energy'},
    {'label': 'Key', 'value': 'key'},
    {'label': 'Loudness', 'value': 'loudness'},
    {'label': 'Mode', 'value': 'mode'},
    {'label': 'Speechiness', 'value': 'speechiness'},
    {'label': 'Acousticness', 'value': 'acousticness'},
    {'label': 'Instrumentalness', 'value': 'instrumentalness'},
    {'label': 'Liveness', 'value': 'liveness'},
    {'label': 'Valence', 'value': 'valence'},
    {'label': 'Tempo', 'value': 'tempo'},
    {'label': 'Time Signature', 'value': 'time_signature'},
    {'label': 'Count', 'value': 'count'},
    {'label': 'Duration (min)', 'value': 'duration_min'}
]

# Create the dropdown menu
dropdown_menu = go.layout.Updatemenu(
    buttons=list([
        dict(
            args=[
                {'y': [songs_per_year[option['value']]]}, 
                {'yaxis': {'title': option['label'].capitalize()}}
            ],
            label=option['label'],
            method='update'
        )
        for option in dropdown_options
    ]),
    direction='down',
    showactive=True,
)

# Update the layout with dropdown menu and initial y-axis title
fig.update_layout(
    height = 800,
    updatemenus=[dropdown_menu],
    yaxis_title='Duration (ms)',
    xaxis_title='year',
    title='Development of features',
    showlegend=True
)

fig.show()"""

In [None]:
# Overall yearly feature means joined with the yearly top-10 genre counts
# (kept so the name stays available to later cells).
songs_per_year_genres = songs_per_year.merge(by_isrc_year, on='year', how='right')

# Features that can be selected for the y-axis.
dropdown_options = ['duration_ms', 'popularity', 'danceability', 'energy', 'key', 'loudness',
                    'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness',
                    'valence', 'tempo', 'duration_min']

# Plot the ten largest remaining genres instead of a hard-coded list, which
# contained genres that were already removed via remove_genres.
genres = sorted_genres[:10]

# Per-track feature table; duration_min is derived exactly as for songs_per_year.
raw_features = [option for option in dropdown_options if option != 'duration_min']
track_features = by_isrc[['isrc', 'year'] + raw_features].copy()
track_features['duration_min'] = track_features['duration_ms'] / (1000 * 60)

# Common x-axis: every year that occurs in the data, in ascending order.
years = pd.Index(track_features['year'].dropna().unique()).sort_values()
year_labels = years.astype(str).tolist()

# Yearly feature means *per genre*. Previously every trace reused the overall
# yearly mean, so all genre lines were identical.
genre_means = {}
for genre in genres:
    genre_tracks = ct.index[(ct[genre] == 1).to_numpy()]
    in_genre = track_features['isrc'].isin(genre_tracks)
    genre_means[genre] = (
        track_features.loc[in_genre]
        .groupby('year')[dropdown_options]
        .mean()
        .reindex(years)  # years without tracks of this genre become gaps
    )

# One line per genre. Only the largest genre is drawn initially; the others
# start as 'legendonly' and are switched on by clicking their legend entry.
initial_feature = dropdown_options[0]
fig = go.Figure()
for position, genre in enumerate(genres):
    fig.add_trace(go.Scatter(
        x=year_labels,
        y=genre_means[genre][initial_feature].tolist(),
        name=genre,
        visible=True if position == 0 else 'legendonly'
    ))

# Dropdown for the y-axis feature: each button swaps the y data of every genre
# trace and updates the axis and figure titles accordingly.
dropdown_menu = go.layout.Updatemenu(
    buttons=[
        dict(
            label=option,
            method='update',
            args=[
                {'y': [genre_means[genre][option].tolist() for genre in genres]},
                {'yaxis.title.text': option, 'title.text': f'{option} by Genre'}
            ]
        )
        for option in dropdown_options
    ],
    direction='down',
    showactive=True,
)

# Horizontal legend above the plot; it doubles as the genre visibility toggle.
legend = dict(
    title='Genre',
    orientation='h',
    y=1.1,
    x=0.5,
    xanchor='center',
    bgcolor='rgba(255, 255, 255, 0.5)'
)

fig.update_layout(
    height=800,
    updatemenus=[dropdown_menu],
    legend=legend,
    yaxis_title='Duration (ms)',
    title='Song Duration by Genre',
    xaxis_title='Year',
    showlegend=True
)

fig.show()

## Development of genres over the years

In [None]:
genres_by_year

In [None]:
# Turn the year index into a regular column for the melt below.
# NOTE(review): re-running this cell on its own resets the index a second time.
genres_by_year = genres_by_year.reset_index()

In [None]:
# The first column (the former index) is the identifier column for the melt below.
column_list = list(genres_by_year.columns)
id_vars_list = column_list[:1]
print(id_vars_list)

In [None]:
# Long format: one row per (identifier, genre) pair with its count.
df_new = (
    genres_by_year
    .melt(id_vars=id_vars_list, value_name="count")
    .rename(columns={"variable": "genres"})
)
df_new

### Word Cloud for Genres with frequency

In [None]:
# Genre totals as {'count': {genre: total}}, largest first.
genres_count = ct_sum_df.sort_values(by='count', ascending=False).to_dict()

# Size every genre name by its total.
cloud = wordcloud.WordCloud(
    width=800,
    height=400,
    background_color='black',
    colormap='Set2',
).generate_from_frequencies(genres_count['count'])

cloud_fig, cloud_ax = plt.subplots(figsize=(10, 5))
cloud_ax.imshow(cloud, interpolation='bilinear')
cloud_ax.axis('off')
plt.show()

In [None]:
# Convert to {year: {genre: count}} for the dropdown below.
# NOTE(review): `genres_by_year` changes from a DataFrame to a dict here, so this
# cell (and the cells above that expect a DataFrame) cannot be re-run in isolation.
genres_by_year = genres_by_year.set_index('year')
genres_by_year = genres_by_year.to_dict('index')

In [None]:
def wc(df):
    """Render a word cloud from a mapping of word -> frequency.

    Parameters
    ----------
    df : Mapping[str, float]
        Word frequencies. Despite its name this is a dict, not a DataFrame;
        the parameter name is kept because callers pass it by keyword through
        ``widgets.interact(wc, df=...)``. Entries whose frequency is missing
        or not positive are ignored.

    Returns
    -------
    None
        The figure is displayed as a side effect. If no entry has a positive
        frequency, a short message is printed instead of a figure.
    """
    clear_output(wait=True)
    # WordCloud scales by the largest frequency, so an empty or all-zero
    # mapping cannot be drawn; drop unusable entries and bail out early.
    frequencies = {
        word: weight
        for word, weight in df.items()
        if pd.notna(weight) and weight > 0
    }
    if not frequencies:
        print("No words with a positive frequency to display.")
        return
    cloud = wordcloud.WordCloud(width=700, height=400, background_color='black').generate_from_frequencies(frequencies)
    plt.figure(figsize=(9, 9))
    plt.imshow(cloud)
    plt.tight_layout()
    plt.axis("off")
    # Show explicitly so each widget update renders and releases its figure
    # instead of leaving it open.
    plt.show()

### Top 50 Genres per Year in Word Cloud

In [None]:
# One dropdown entry per year; the value of an entry is that year's
# {genre: count} mapping, which is passed to wc().
# Prefer 1980 as the initial selection, but fall back to the first available
# year so the widget still builds when 1980 is not in the data.
default_year = '1980' if '1980' in genres_by_year else next(iter(genres_by_year))
dropdown = widgets.Dropdown(options=genres_by_year,
                            value=genres_by_year[default_year],
                            description="year")

widgets.interact(wc, df=dropdown);

### Top 10 Genres per Year in Line Chart

In [None]:
# Rebinds `features`: from here on it holds the ten largest genres, not feature names.
features = sorted_genres[:10]

In [None]:
# Yearly totals of the ten largest genres with `year` as a regular column.
dev_genres_df = by_isrc_year.reset_index()

In [None]:
dev_genres_df

In [None]:
"""# Create the initial scatter plot
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=dev_genres_df['year'],
    y=dev_genres_df['pop'],
    #mode='markers',
    name='Pop'
))

# Create the dropdown menu
dropdown_menu = go.layout.Updatemenu(
    buttons=list([
        dict(
            args=[
                {'y': [dev_genres_df[option]]}, 
                {'yaxis': {'title': option.capitalize()}}
            ],
            label=option,
            method='update'
        )
        for option in features
    ]),
    direction='down',
    showactive=True,
)

# Update the layout with dropdown menu and initial y-axis title
fig.update_layout(
    height = 800,
    updatemenus=[dropdown_menu],
    yaxis_title='pop',
    xaxis_title='year',
    title='Development of features'
)

fig.show()"""

## Add the genres to the features

In [None]:
df.head()

In [None]:
# Rescale every feature column with MinMaxScaler so they share a common range,
# keeping isrc as the row identifier.
audio_features = ['danceability', 'energy',
                  'key', 'loudness', 'mode', 'speechiness',
                  'acousticness', 'instrumentalness', 'liveness',
                  'valence', 'time_signature', 'tempo']
scaler = MinMaxScaler()
feature_values = by_isrc.set_index('isrc')[audio_features]
by_isrc_scaled = pd.DataFrame(
    scaler.fit_transform(feature_values),
    columns=feature_values.columns,
    index=feature_values.index,
).reset_index()

In [None]:
# Right join: keep every row of by_isrc_scaled and add the genre flags where
# ct has the same isrc (genre columns stay NaN otherwise).
by_isrc_genres = ct.merge(by_isrc_scaled, on=['isrc'], how='right')

In [None]:
by_isrc_genres.head()

## Create Radar Chart for the top 10 genres

In [None]:
"""sorted_genres[:10]"""

In [None]:
"""fig = go.Figure()
for genre in sorted_genres[:10]:
    df_copy = by_isrc_genres.copy()
    df_copy = df_copy[df_copy[genre] == 1]
    df_copy = df_copy[['danceability', 'energy',
                       'key', 'loudness', 'mode', 'speechiness',
                       'acousticness', 'instrumentalness', 'liveness',
                       'valence', 'time_signature', 'tempo', 'danceability']]
    df_radar = pd.DataFrame(df_copy.mean(), columns=['mean'])
    df_radar.reset_index(inplace=True)
    df_radar.rename(columns={"index":"feature"}, inplace=True)

    fig.add_trace(go.Scatterpolar(
        r = df_radar['mean'],
        theta = df_radar['feature'],
        mode = 'lines',
        fill = 'none',
        name = genre
    ))"""

In [None]:
"""fig.update_layout(
    height = 800
)
fig.show()"""

#### Popularity of Artists in Word Cloud

In [None]:
df.head()

In [None]:
# The genres column presumably holds the string form of a Python list; parse
# it into real lists. Values that are not strings (already parsed, or missing)
# pass through unchanged, so re-running this cell no longer raises.
df['genres'] = df['genres'].apply(
    lambda value: literal_eval(value) if isinstance(value, str) else value
)
# One row per (track, genre) pair.
df_split_genres = df.explode('genres')

In [None]:
# Drop the rows of excluded genres; copy so the assignment below writes to an
# independent frame rather than to a slice.
df_split_genres = df_split_genres[~df_split_genres['genres'].isin(remove_genres)].copy()
# Keep only the text before the first '-' of release_date. The .str accessor
# leaves missing dates as NaN instead of raising like x.split() would.
df_split_genres['release_date'] = df_split_genres['release_date'].str.split('-', n=1).str[0]

Split the artists column into individual artists.

In [None]:
df_split_genres["artists"].unique()

In [None]:
# Split the comma-separated artists string and create one row per artist.
# NOTE(review): splitting on ',' keeps any whitespace that follows the comma and
# also splits artist names that themselves contain a comma — confirm the format.
# NOTE(review): this overwrites the artists column with lists, so the cell
# cannot be re-run on its own.
df_split_genres['artists'] = df_split_genres['artists'].str.split(',')
df_split_artists = df_split_genres.explode('artists')

In [None]:
df_split_artists["artists"].unique()

In [None]:
# df_split_artists has one row per (track, genre, artist). Count every track
# only once per artist; otherwise its popularity is added once per genre.
artist_year_popularity = (
    df_split_artists
    .drop_duplicates(['isrc', 'artists'])
    .groupby(['year', 'artists'], as_index=False)['popularity']
    .sum()
)

# Ten most popular artists of every year.
top_artists_per_year = (
    artist_year_popularity
    .sort_values(['year', 'popularity'], ascending=[True, False])
    .groupby('year')
    .head(10)
)

# {year: {artist: popularity}}; artists outside a year's top ten get 0.
df_artist_popularity_per_year = (
    top_artists_per_year
    .pivot(index='year', columns='artists', values='popularity')
    .fillna(0)
    .astype(float)
    .to_dict(orient='index')
)

In [None]:
# One dropdown entry per year; the value of an entry is that year's
# {artist: popularity} mapping, which is passed to wc().
# Prefer 1980 as the initial selection, but fall back to the first available
# year so the widget still builds when 1980 is not in the data.
default_year = (
    1980 if 1980 in df_artist_popularity_per_year
    else next(iter(df_artist_popularity_per_year))
)
dropdown = widgets.Dropdown(options=df_artist_popularity_per_year,
                            value=df_artist_popularity_per_year[default_year],
                            description="year")

widgets.interact(wc, df=dropdown);

Popularity per Artists in Word Cloud

In [None]:
def wordcloud_artists(counts):
    """Draw a word cloud in which every key of ``counts`` is sized by its value.

    ``counts`` is a mapping of word -> frequency; the image is shown with
    matplotlib and nothing is returned.
    """
    generator = wordcloud.WordCloud(
        width=800,
        height=400,
        background_color='black',
        colormap=matplotlib.cm.inferno,
    )
    image = generator.generate_from_frequencies(counts)
    _, axis = plt.subplots(figsize=(18, 15))
    axis.imshow(image, interpolation='bilinear')
    axis.axis('off')
    plt.show()

In [None]:
# Remove the per-genre duplicates of each track. 'artists' must be part of the
# key: the frame is also exploded by artist, so de-duplicating on
# (name, release_date) alone kept a single artist per track and dropped the rest.
artists_popularity = df_split_artists.drop_duplicates(["name", "release_date", "artists"])

In [None]:
# Total popularity per artist, with `artists` kept as a regular column.
artists_popularity = artists_popularity.groupby("artists", as_index=False)["popularity"].sum()

In [None]:
artists_popularity

In [None]:
# Turn the two-column frame into (artist, popularity) pairs and plot them as a word cloud.
artist_popularity_tuples = [tuple(x) for x in artists_popularity.values]
wordcloud_artists(dict(artist_popularity_tuples))

### Chart Score

In [None]:
# Keep only the rows that have a chart_power value.
df_chart_power = by_isrc[by_isrc['chart_power'].notna()]
df_chart_power

Total chart score per artist (the code sums `chart_power` over each artist's songs; it is not an average).

In [None]:
# Despite the variable name, this is the per-artist *sum* of chart_power, not an average.
avg_chart = df_chart_power.groupby("artists")["chart_power"].sum().reset_index()
avg_chart

Chart Score per Artists in Word Cloud

In [None]:
# Turn the two-column frame into (artists, chart_power) pairs and plot them as a word cloud.
avg_chart_tuples = [tuple(x) for x in avg_chart.values]
wordcloud_artists(dict(avg_chart_tuples))

Merge data with popularity and chart score

In [None]:
# Outer join keeps artists that occur on only one side; the other metric is NaN for them.
# NOTE(review): artists_popularity is keyed by individual artists (after the comma
# split), while avg_chart groups by_isrc's unsplit `artists` strings — entries for
# multi-artist tracks may therefore not line up; confirm.
merge = artists_popularity.merge(avg_chart, on='artists',how="outer")
merge.sort_values('popularity',ascending=False)

In [None]:
# Use the name `chart_score` for the summed chart_power from here on.
merge.rename({"chart_power":"chart_score"},axis=1, inplace=True)
merge

In [None]:
merge.sort_values(['chart_score'],ascending=False).head(20)

Top 20 popular Artists from Spotify vs. from Chart Score

In [None]:
# The 20 rows with the highest value of each metric, each with a fresh index.
top20_popu = merge.nlargest(20, 'popularity').reset_index(drop=True)
top20_score = merge.nlargest(20, 'chart_score').reset_index(drop=True)

In [None]:
top20_popu

In [None]:
top20_score

In [None]:
# Two horizontal bar charts side by side: (frame, metric column, trace name).
panels = [
    (top20_popu, 'popularity', 'Popularity Artists'),
    (top20_score, 'chart_score', 'Chart Score Artists'),
]

fig = sp.make_subplots(rows=1, cols=2, subplot_titles=('Top 20 Popularity Artists', 'Top 20 Chart Score Artists'))

for panel_col, (panel_frame, metric, trace_name) in enumerate(panels, start=1):
    fig.add_trace(
        go.Bar(x=panel_frame[metric], y=panel_frame['artists'],
               orientation='h', name=trace_name),
        row=1, col=panel_col,
    )

# Reverse both y-axes so the first row of each table is drawn at the top.
fig.update_layout(
    yaxis=dict(autorange="reversed"),
    yaxis2=dict(autorange="reversed"),
    width=1600,  # Width in pixels
    height=600,
)

fig.show()

scale values of popularity and chart score

In [None]:
# Min-max scale both metrics so they are directly comparable.
# This overwrites the raw values in `merge`; the top-20 tables above were taken before scaling.
scaler = preprocessing.MinMaxScaler()
merge[['popularity', 'chart_score']] = scaler.fit_transform(merge[['popularity', 'chart_score']])

In [None]:
#merge=merge.dropna()

In [None]:
"""artists_filtered_1 = merge[(merge['chart_score'] > 0.5) & (merge['popularity'] < 0.5)]
artists_filtered_1"""

Top 10 artists who had a low chart score but now have high popularity

In [None]:
def barplot(col1,col2,title):
    """Show a horizontal Plotly bar chart.

    Parameters
    ----------
    col1 : sequence of numbers
        Bar lengths, plotted along the x-axis.
    col2 : sequence
        Bar labels, plotted along the y-axis.
    title : str
        Figure title.

    Returns
    -------
    None
        The figure is displayed as a side effect.
    """
    fig = go.Figure(data=go.Bar(
        x=col1,
        y=col2,
        orientation='h'
    ))

    xaxis = dict(dtick=0.2)  # Set the tick interval of the x-axis
    # NaN-safe extremes; the builtin max() is unreliable with NaN and raises
    # on empty input.
    values = pd.Series(col1, dtype='float64').dropna()
    if not values.empty:
        # Bars start at zero, so the axis must include zero. Starting at 0.1
        # cut off the base of every bar and hid bars shorter than 0.1.
        lower = min(0.0, float(values.min()))
        upper = float(values.max()) + 0.1
        xaxis['range'] = [lower, upper]

    fig.update_layout(title=title, xaxis=xaxis)

    # Show the bar chart
    fig.show()

In [None]:
# Positive values: scaled popularity exceeds scaled chart score.
merge['difference_1'] = merge['popularity'] - merge['chart_score']
top_diff_1 = merge.nlargest(10, 'difference_1')
top_diff_1

In [None]:
barplot(top_diff_1['difference_1'],top_diff_1['artists'],"Top10 Artists who had low chart score but has now high popularity")

Top 10 artists who had a high chart score but now have low popularity

In [None]:
# Positive values: scaled chart score exceeds scaled popularity.
merge['difference_2'] = merge['chart_score'] - merge['popularity']
top_diff_2 = merge.nlargest(10, 'difference_2')
top_diff_2

In [None]:
barplot(top_diff_2['difference_2'],top_diff_2['artists'],"Top10 Artists who had high chart score but has now low popularity")