# Visualizations

## Imports

In [None]:
import pandas as pd
import altair as alt
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import MinMaxScaler

## Preprocessing

In [None]:
# Load the per-ISRC checkpoint table; the yearly aggregations and the feature
# scaling further down are all derived from this frame.
by_isrc = pd.read_csv('data/checkpoint/by_isrc_oldest.csv')

In [None]:
# Preview the first rows, transposed so that every column is listed as a row.
by_isrc.head().T

### Find the biggest genres

In [None]:
# Genres excluded from the analysis; the matching indicator columns are dropped
# from the song x genre matrix further down.
remove_genres = [
    "classical", "jazz", "folk", "french", "turkish", "gospel",
    "samba", "piano", "mpb", "sertanejo", "pagode", "sleep",
    "forro", "malay", "anime", "j-idol", "comedy", "mandopop",
    "cantopop", "show-tunes", "emo", "romance", "j-dance", "chill",
    "world-music", "iranian", "idm", "metalcore", "hardstyle", "opera",
    "k-pop", "j-pop", "j-rock", "happy",
]

In [None]:
# Read the raw export and discard rows that are exact duplicates of an earlier row.
df = pd.read_csv('data/DE/data-neu.csv').drop_duplicates()

In [None]:
# Preview the first two rows, transposed so that every column is listed as a row.
df.head(2).T

In [None]:
# Build a binary song x genre indicator matrix: one row per ISRC, one column per genre.
# crosstab returns occurrence counts (always >= 0), so capping them at 1 with a
# vectorized clip yields the same 0/1 matrix as the former element-wise
# applymap(lambda x: 1 if x > 1 else x), without a Python call per cell and
# without relying on the deprecated DataFrame.applymap.
ct = pd.crosstab(df['isrc'], df['genres']).clip(upper=1)
# Expose 'isrc' as a regular column again.
ct = ct.reset_index()

In [None]:
# Index the indicator matrix by ISRC and discard the columns of the excluded genres.
ct = ct.set_index('isrc').drop(labels=remove_genres, axis='columns')

In [None]:
# Songs whose row in the indicator matrix is now all zeros no longer belong to
# any kept genre; remove them from both the matrix and the raw table.
print(f'Shape before removing songs. ct: {ct.shape}; df: {df.shape}')
has_no_genre = (ct.sum(axis=1) == 0).to_numpy()
remove_songs = ct.index[has_no_genre].to_list()
ct = ct.drop(index=remove_songs)
# Round-trip through an 'isrc' index so the rows can be dropped by label.
df = df.set_index('isrc').drop(index=remove_songs).reset_index()
print(f'Shape after removing songs. ct: {ct.shape}; df: {df.shape}')

In [None]:
# Songs per genre: column sums of the 0/1 indicator matrix.
ct_sum = ct.sum()
ct_sum_df = ct_sum.to_frame(name='count')
# Genre names ordered from the most to the fewest songs.
sorted_genres = ct_sum_df.sort_values('count', ascending=False).index.to_list()

In [None]:
# The ten genres with the most songs.
ct_sum_df.sort_values(by='count', ascending=False).head(10)

The biggest genre is pop.

Count occurrences of the top genres per year.

In [None]:
# Yearly song counts for the ten largest genres: attach each song's genre
# indicators to its year, keep only the top-ten genre columns and sum per year.
by_isrc_year = (
    by_isrc[['year', 'isrc']]
    .merge(ct, on=['isrc'], how='left')
    .set_index('year')[sorted_genres[:10]]
    .groupby('year')
    .sum()
)
# Year labels are stored as strings.
by_isrc_year.index = by_isrc_year.index.astype(str)

In [None]:
# Display the per-year counts of the ten largest genres.
by_isrc_year

## Development of features over the years

In [None]:
# Restrict the per-ISRC table to songs that still carry at least one kept genre.
# NOTE: from here on `df` refers to this filtered copy of `by_isrc`, no longer to
# the raw export loaded earlier.
df = by_isrc.copy()
print(df.shape)
df = df.set_index('isrc')

df_index = df.index.to_list()
# Set lookup keeps the membership test O(1) per song; testing against the list
# made this step quadratic in the number of songs.
known_isrcs = set(df_index)
remove_help = [song_isrc for song_isrc in remove_songs if song_isrc in known_isrcs]

# Every label in remove_help is present in the index, so drop() cannot fail on a
# missing label; any other error should surface instead of being printed and
# ignored, which would leave the table silently unfiltered.
df = df.drop(index=remove_help).reset_index()
print(df.shape)

In [None]:
# currently not used
save = df.copy()

# Fractional release year: the year itself, plus month / 12 for rows whose
# release date has month precision. The column is created as float and filled
# with vectorized operations; the previous row-by-row iterrows loop wrote floats
# into an int-initialised column and issued one .loc write per song.
df['year_month'] = df['year'].astype(float)
month_precision = df['release_date_precision'] == 'month'
# Characters 5:7 of the release date hold the two-digit month
# (assumes a 'YYYY-MM...' layout - TODO confirm against the data).
release_month = df.loc[month_precision, 'release_date'].str[5:7].astype(int)
# NOTE(review): month / 12 maps December to year + 1.0; (month - 1) / 12 may be
# what is intended. Kept as in the original because the column is unused.
df.loc[month_precision, 'year_month'] += release_month / 12


In [None]:
# Columns taken into the yearly aggregation; 'year' is the grouping key.
per_year_features = [
    'year', 'popularity', 'danceability', 'energy', 'key', 'loudness',
    'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness',
    'valence', 'tempo', 'duration_ms', 'time_signature',
]

# Every feature is averaged per year; the helper column 'count' is counted,
# which gives the number of songs in each year.
per_year_agg = {feature: 'mean' for feature in per_year_features if feature != 'year'}
per_year_agg['count'] = 'count'

songs_per_year = (
    df[per_year_features]
    .assign(count=-1)  # placeholder value; only the number of rows per year matters
    .groupby("year")
    .agg(per_year_agg)
    .reset_index()
)
# Mean duration in minutes.
songs_per_year['duration_min'] = songs_per_year['duration_ms'] / (1000 * 60)
# Year labels are stored as strings.
songs_per_year = songs_per_year.astype({'year': str})

In [None]:
# Yearly means of the audio features drawn on one shared y-axis fixed to [0, 1].
# 'loudness' is intentionally not part of this list: it is plotted elsewhere in
# this notebook with a [0, -20] dB range, so on a [0, 1] axis its trace would
# never be visible.
features = ["danceability", "energy", "mode", "speechiness", "acousticness",
            "instrumentalness", "liveness", "valence"]
fig = go.Figure()

for feature in features:
    fig.add_trace(go.Scatter(
        x=songs_per_year['year'],
        y=songs_per_year[feature],
        name=feature
    ))

fig.update_layout(
    height=1200,
    width=2000,
    xaxis_title='year',
    title='Development of features',
    template='plotly_dark',
    yaxis_range=[0, 1]
)

fig.show()

In [None]:
# Plot specifications for the features drawn in their own figure:
#   title - y-axis label, value - column of songs_per_year, range - fixed y-axis range.
features = [
    {
        "title": "Tempo (bpm)",
        "value": "tempo",
        "range": [110, 130]
    },
    {
        "title": "Duration (min)",
        "value": "duration_min",
        "range": [3, 5]
    },
    {
        # Fixed the misspelled axis label ("Loudnes").
        # NOTE(review): the range is given as [0, -20], i.e. a reversed axis with
        # 0 dB at the bottom - confirm that this orientation is intended.
        "title": "Loudness (dB)",
        "value": "loudness",
        "range": [0, -20]
    },
    {
        "title": "Songs released",
        "value": "count",
        "range": [0, 50000]
    }
]

In [None]:
# One standalone figure per plot specification in `features`, each with its own
# y-axis title and fixed y-axis range.
for spec in features:
    column = spec["value"]
    fig = go.Figure(
        data=[
            go.Scatter(
                x=songs_per_year['year'],
                y=songs_per_year[column],
                name=column,
            )
        ]
    )
    fig.update_layout(
        height=1200,
        width=2000,
        yaxis_title=spec["title"],
        xaxis_title='year',
        title='Development of features',
        template='plotly_dark',
        yaxis_range=spec["range"],
    )
    fig.show()

In [None]:
# Line chart of one yearly-aggregated feature at a time, switchable via a dropdown.

# Dropdown entries: button label, column of songs_per_year, fixed y-axis range.
dropdown_options = [
    {'label': 'Popularity', 'value': 'popularity', 'range': [0, 10]},
    {'label': 'Danceability', 'value': 'danceability', 'range': [0, 1]},
    {'label': 'Energy', 'value': 'energy', 'range': [0, 1]},
    {'label': 'Loudness', 'value': 'loudness', 'range': [0, -20]},
    {'label': 'Mode', 'value': 'mode', 'range': [0, 1]},
    {'label': 'Speechiness', 'value': 'speechiness', 'range': [0, 1]},
    {'label': 'Acousticness', 'value': 'acousticness', 'range': [0, 1]},
    {'label': 'Instrumentalness', 'value': 'instrumentalness', 'range': [0, 1]},
    {'label': 'Liveness', 'value': 'liveness', 'range': [0, 1]},
    {'label': 'Valence', 'value': 'valence', 'range': [0, 1]},
    {'label': 'Tempo', 'value': 'tempo', 'range': [100, 130]},
    {'label': 'Time Signature', 'value': 'time_signature', 'range': [3, 7]},
    {'label': 'Count', 'value': 'count', 'range': [0, 50000]},
    {'label': 'Duration (min)', 'value': 'duration_min', 'range': [3, 5]}
]

# The first entry is what the figure shows before any button is pressed, so the
# trace, its name, the axis title and the axis range are all taken from it. The
# trace used to be named 'Duration (ms)' although it displayed popularity.
initial_option = dropdown_options[0]

fig = go.Figure()
fig.add_trace(go.Scatter(
    x=songs_per_year['year'],
    y=songs_per_year[initial_option['value']],
    name=initial_option['label']
))

# Each button swaps the trace data and its name (first dict: trace update) and
# the y-axis title and range (second dict: layout update).
dropdown_menu = go.layout.Updatemenu(
    buttons=[
        dict(
            args=[
                {
                    'y': [songs_per_year[option['value']]],
                    'name': [option['label']]
                },
                {'yaxis': {
                    # Labels are already cased for display; str.capitalize()
                    # would turn 'Time Signature' into 'Time signature'.
                    'title': option['label'],
                    'range': option['range']
                }}
            ],
            label=option['label'],
            method='update'
        )
        for option in dropdown_options
    ],
    direction='down',
    showactive=True,
)

fig.update_layout(
    height=800,
    updatemenus=[dropdown_menu],
    yaxis_title=initial_option['label'],
    xaxis_title='year',
    title='Development of features',
    template='plotly_dark',
    yaxis_range=initial_option['range']
)

fig.show()

## Development of genres over the years

In [None]:
# Rebind `features` to the ten largest genres; the genre plots below iterate over it.
features = sorted_genres[:10]

In [None]:
# Turn the 'year' index into a regular column so it can serve as the x-axis.
dev_genres_df = by_isrc_year.reset_index()

In [None]:
# All genres listed in `features` in one figure: one line per genre showing its
# yearly song count.
fig = go.Figure(
    data=[
        go.Scatter(
            x=dev_genres_df['year'],
            y=dev_genres_df[genre],
            name=genre,
        )
        for genre in features
    ]
)

fig.update_layout(
    height=1200,
    yaxis_title='count',
    xaxis_title='year',
    title='Development of genres',
    template='plotly_dark',
    yaxis_rangemode='tozero',
)

fig.show()

In [None]:
# Single-genre line chart with a dropdown to switch between genres.

# The selectable genres are read from dev_genres_df itself (every column except
# 'year') rather than from the `features` global, which other cells rebind to
# unrelated lists.
genre_options = [column for column in dev_genres_df.columns if column != 'year']

# Start with the first genre, which is also the button the dropdown marks as
# active initially; the hard-coded 'pop' column only worked while pop was among
# the plotted genres.
initial_genre = genre_options[0]

fig = go.Figure()
fig.add_trace(go.Scatter(
    x=dev_genres_df['year'],
    y=dev_genres_df[initial_genre],
    name=initial_genre.capitalize()
))

# Each button swaps the trace data and its name (first dict: trace update) and
# the y-axis title (second dict: layout update).
dropdown_menu = go.layout.Updatemenu(
    buttons=[
        dict(
            args=[
                {
                    'y': [dev_genres_df[option]],
                    'name': [option.capitalize()]
                },
                {'yaxis': {
                    'title': option.capitalize(),
                    'rangemode': 'tozero'
                }}
            ],
            label=option,
            method='update'
        )
        for option in genre_options
    ],
    direction='down',
    showactive=True,
)

# Initial layout; the y-axis title uses the same casing as the button updates.
fig.update_layout(
    height=800,
    updatemenus=[dropdown_menu],
    yaxis_title=initial_genre.capitalize(),
    xaxis_title='year',
    title='Development of genres',
    template='plotly_dark',
    yaxis_rangemode='tozero'
)

fig.show()

## Add the genres to the features

In [None]:
# Preview the first rows of `df` (at this point the genre-filtered copy of by_isrc).
df.head()

In [None]:
# Rescale the audio features with a MinMaxScaler so that they share a common
# scale; the result keeps 'isrc' as a regular column.
scaled_columns = [
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence',
    'time_signature', 'tempo',
]
scaler = MinMaxScaler()
unscaled = by_isrc.set_index('isrc')[scaled_columns]
by_isrc_scaled = pd.DataFrame(
    scaler.fit_transform(unscaled),
    columns=unscaled.columns,
    index=unscaled.index,
).reset_index()

In [None]:
# Attach the genre indicator columns of `ct` to the scaled features. The right
# join keeps every row of by_isrc_scaled, whether or not its ISRC occurs in `ct`.
by_isrc_genres = ct.merge(by_isrc_scaled, on=['isrc'], how='right')

In [None]:
# Preview the combined genre-indicator / scaled-feature table.
by_isrc_genres.head()

## Create Radar Chart for the top 10 genres

In [None]:
# The ten largest genres, i.e. the ones drawn in the radar chart below.
sorted_genres[:10]

In [None]:
# Radar chart: one line per top-ten genre showing the mean of every scaled
# audio feature over the songs tagged with that genre.

# 'danceability' appears a second time at the end of the list - presumably so
# that each line returns to its starting point and the outline is closed.
radar_features = ['danceability', 'energy',
                  'key', 'loudness', 'mode', 'speechiness',
                  'acousticness', 'instrumentalness', 'liveness',
                  'valence', 'time_signature', 'tempo', 'danceability']

fig = go.Figure()
for genre in sorted_genres[:10]:
    # Select the genre's rows directly; copying the whole by_isrc_genres frame
    # on every iteration, as done before, was loop-invariant waste.
    in_genre = by_isrc_genres[genre] == 1
    df_radar = (
        by_isrc_genres.loc[in_genre, radar_features]
        .mean()
        .rename_axis('feature')
        .reset_index(name='mean')
    )

    fig.add_trace(go.Scatterpolar(
        r=df_radar['mean'],
        theta=df_radar['feature'],
        mode='lines',
        fill='none',
        name=genre
    ))

In [None]:
# Apply the figure height and dark template to the current `fig`, then render it.
fig.update_layout(
    height = 1200,
    template='plotly_dark'
)
fig.show()

In [None]:
# Show the radar table left over from the last loop iteration, i.e. the feature
# means of the final genre only.
df_radar

In [None]:
#!jupyter nbconvert --to slides viz-jannis.ipynb

## Word Cloud for Genres

In [None]:
# NOTE(review): the exploded frame is only displayed, not stored. explode() splits
# list-like cells only - confirm that df['genres'] exists here and holds lists
# rather than plain strings.
df.explode('genres')

In [None]:
# `df_split_genres` is not assigned in any earlier cell shown, so displaying it
# raised NameError on a fresh kernel. Build it here from the exploded 'genres'
# column and show only a preview.
# NOTE(review): assumes df has a 'genres' column with list-like cells - confirm.
df_split_genres = df.explode('genres')
df_split_genres.head()

In [None]:
# Frequency table of the 'genres' column: one row per distinct value with its
# number of occurrences, as the columns 'genres' and 'count'.
genre_counts = (
    df['genres']
    .value_counts()
    .rename_axis('genres')
    .reset_index(name='count')
)
genre_counts