# Imports and data preparation

In [2]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px

In [5]:
# Download the TidyTuesday (2020-01-21) Spotify songs CSV; wget saves it as 'spotify_songs.csv'.
!wget https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-01-21/spotify_songs.csv

--2022-06-19 16:58:19--  https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-01-21/spotify_songs.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7971379 (7.6M) [text/plain]
Saving to: ‘spotify_songs.csv’


2022-06-19 16:58:19 (87.6 MB/s) - ‘spotify_songs.csv’ saved [7971379/7971379]



In [6]:
# Load the CSV that the download cell saved as 'spotify_songs.csv'.
# A relative path replaces the Colab-only '/content/...' location, so the notebook
# reads the file from wherever wget actually wrote it (the working directory).
songs = pd.read_csv('spotify_songs.csv')

In [7]:
def shorter(string):
  """Return the first four characters of ``str(string)``.

  The argument is converted with ``str`` first, so non-string values are
  accepted; inputs shorter than four characters come back unchanged.
  """
  return str(string)[:4]

In [8]:
# Add a 'year' column: the first four characters of each album release date.
songs['year'] = songs['track_album_release_date'].map(shorter)

# General stats about the dataset

In [9]:
# Count of non-null track names per playlist genre, flattened back into columns.
basetable0 = (
    songs
    .groupby(['playlist_genre'])['track_name']
    .count()
    .reset_index()
)

In [None]:
# Display the per-genre track counts computed in the previous cell.
basetable0

In [11]:
# Pie chart: share of tracks per playlist genre.
base0 = px.pie(
    basetable0,
    values='track_name',
    names='playlist_genre',
    color_discrete_sequence=px.colors.sequential.Bluyl,
    labels={'playlist_genre': 'Genre', 'track_name': 'Quantity'},
)
base0.update_layout(
    title="Distribution of tracks between genres",  # title typo fixed (was "beteween")
    title_x=0.5,  # centre the title horizontally
    width=700,
    height=700,
    autosize=False,
)
base0.show()

In [16]:
# One row per distinct (genre, subgenre) pair; keep='last' retains the final occurrence.
basetable01 = songs.drop_duplicates(
    subset=['playlist_genre', 'playlist_subgenre'],
    keep='last',
)

In [17]:
# Stacked histogram: one bar per genre, coloured by subgenre.
base01 = px.histogram(
    basetable01,
    x='playlist_genre',
    color='playlist_subgenre',
    color_discrete_sequence=px.colors.sequential.Bluyl_r,
    labels={
        'playlist_subgenre': 'Subgenre',
        'playlist_genre': 'Genre',
        'count': 'Quantity',
    },
)
base01.update_xaxes(title_text='Genre', tickangle=315)
base01.update_yaxes(title_text='The number of subgenres', tickfont=dict(size=14))
base01.update_layout(title_text='Subgenres in each genre', title_x=0.5, title_y=0.95)
base01.show()

In [18]:
# Number of distinct artists per genre: de-duplicate (artist, genre) pairs,
# then count the remaining artist entries within each genre.
basetable1 = (
    songs
    .drop_duplicates(subset=['track_artist', 'playlist_genre'], keep='last')
    .groupby(['playlist_genre'])['track_artist']
    .count()
    .reset_index()
)

In [19]:
# Pie chart: share of distinct artists per playlist genre.
base1 = px.pie(
    basetable1,
    values='track_artist',
    names='playlist_genre',
    color_discrete_sequence=px.colors.sequential.Bluyl,
    labels={'track_artist': 'Quantity', 'playlist_genre': 'Genre'},
)
base1.update_layout(
    title="Distribution of artists between genres",  # title typo fixed (was "beteween")
    title_x=0.5,  # centre the title horizontally
    width=700,
    height=700,
    autosize=False,
)
base1.show()

In [20]:
# Number of distinct artists per year: de-duplicate (artist, year) pairs,
# then count the remaining artist entries within each year.
basetable2 = (
    songs
    .drop_duplicates(subset=['track_artist', 'year'], keep='last')
    .groupby(['year'])['track_artist']
    .count()
    .reset_index()
)

In [21]:
# Line chart (with markers) of the artist count for every year.
base2 = px.line(
    basetable2,
    x=basetable2['year'],
    y=basetable2['track_artist'],
    title="The number of artists per year",
    markers=True,
    color_discrete_sequence=px.colors.sequential.Bluyl_r,
    labels={'track_artist': 'Quantity', 'year': 'Year'},
)
base2.update_xaxes(title_text='Year', tickangle=270)
base2.update_yaxes(title_text='The number of artists', tickfont=dict(size=14))
base2.update_layout(title_x=0.5, title_y=0.9)
base2.show()

In [22]:
# Overlaid violins for three columns of `songs`, each with an inner box plot
# and only the outlier points drawn.
df = songs[['danceability', 'speechiness', 'energy']]
violin1 = px.violin(
    df,
    box=True,
    points='outliers',
    color_discrete_sequence=px.colors.sequential.Bluyl_r,
    labels={'value': 'Value', 'variable': 'Parameter'},
)
violin1.update_layout(
    title='Distribution of interesting parameters',
    violingap=0,
    violinmode='overlay',
    title_x=0.5,
    title_y=0.95,
)
violin1.show()

# Different stats

In [23]:
def tolist(pivot_columns):
  """Return a list with the element at index 1 of every entry in ``pivot_columns``.

  In this notebook it is called on the columns of a pivot table to extract
  the second level of each column label.
  """
  return [entry[1] for entry in pivot_columns]

In [24]:
# Mean track popularity for every (artist, genre) pair, as a flat frame.
svodnaya = (
    songs
    .groupby(['track_artist', 'playlist_genre'])['track_popularity']
    .mean()
    .reset_index()
)

In [25]:
# Reshape to one row per artist and one column per genre.
# Keyword arguments are required: DataFrame.pivot no longer accepts positional
# index/columns/values in pandas >= 2.0, and keywords also work on older versions.
svodnaya = svodnaya.pivot(
    index="track_artist",
    columns='playlist_genre',
    values="track_popularity",
)

In [26]:
# Replace missing cells (artist has no value for that genre) with 0.
svodnaya = svodnaya.fillna(value=0)

In [27]:
# Per-artist summary columns, computed from the genre columns only.
# Dropping any previously derived columns first keeps this cell idempotent:
# re-running it no longer folds 'sum'/'nonzeros'/'popularity' into the totals.
derived_cols = ['sum', 'nonzeros', 'popularity']
genre_scores = svodnaya.drop(columns=derived_cols, errors='ignore')

# Total of the per-genre mean popularities.
svodnaya['sum'] = genre_scores.sum(axis='columns')
# Number of genres with a non-zero value. Counting on the genre columns directly
# replaces the old "count every column, then subtract 1 for 'sum'" trick, which
# produced -1 whenever 'sum' itself was 0.
svodnaya['nonzeros'] = genre_scores.astype(bool).sum(axis=1)
# Average over the non-zero genres; artists with no non-zero genre get 0
# instead of the result of a division by a zero/negative count.
svodnaya['popularity'] = (svodnaya['sum'] / svodnaya['nonzeros']).fillna(0)

In [28]:
# Order artists from highest to lowest 'popularity'.
svodnaya = svodnaya.sort_values('popularity', ascending=False)

In [29]:
# First 30 rows of the sorted table, with the helper columns removed so that
# only the genre columns remain.
populars = (
    svodnaya
    .drop(columns=['sum', 'nonzeros', 'popularity'])
    .head(30)
)

In [None]:
# Display the 30-row artist-by-genre table built in the previous cell.
populars

In [31]:
# Heatmap of the `populars` table: artists on the y-axis, genres on the x-axis.
fig4 = px.imshow(
    populars,
    x=populars.columns,
    y=populars.index,
    color_continuous_scale='bluyl',
)
fig4.update_layout(
    title="Popularity regarding genre. Top 30",
    title_x=0.5,
    xaxis_title="Genre",
    yaxis_title="Artist",
    coloraxis_colorbar=dict(title="Popularity"),
    width=700,
    height=580,
    autosize=False,
)
fig4.update_xaxes(tickangle=315)
fig4.show()

In [32]:
# One violin per genre showing the tempo distribution, with an inner box plot
# and only the outlier points drawn.
violin2 = px.violin(
    songs,
    x='playlist_genre',
    y='tempo',
    color='playlist_genre',
    box=True,
    points='outliers',
    color_discrete_sequence=px.colors.sequential.Bluyl_r,
    labels={'playlist_genre': 'Genre', 'tempo': 'Tempo'},
)
violin2.update_layout(
    title='Distribution of tempo for different genres',
    violingap=0,
    violinmode='overlay',
    title_x=0.5,
    title_y=0.95,
)
violin2.show()

In [33]:
# Genre (rows) by year (columns) table of track popularity, aggregated with
# pivot_table's default aggregation function.
table1 = songs.pivot_table(
    values=['track_popularity'],
    index=['playlist_genre'],
    columns=['year'],
)

In [34]:
# Heatmap of popularity by genre (y) and year (x); tolist() extracts the year
# labels from the pivot table's two-level column index.
fig5 = px.imshow(
    table1,
    x=tolist(table1.columns),
    y=table1.index,
    color_continuous_scale='bluyl',
)
# Single layout call: title text and position, axis titles, colour-bar title.
fig5.update_layout(
    title_text='Popularity of genres during the given period',
    title_x=0.5,
    title_y=0.8,
    xaxis_title="Year",
    yaxis_title="Genre",
    coloraxis_colorbar=dict(title="Popularity"),
)
fig5.update_xaxes(tickangle=315)
fig5.show()


In [35]:
# One violin per genre showing the speechiness distribution, with an inner
# box plot and only the outlier points drawn.
violin3 = px.violin(
    songs,
    x='playlist_genre',
    y='speechiness',
    color='playlist_genre',
    box=True,
    points='outliers',
    color_discrete_sequence=px.colors.sequential.Bluyl_r,
    labels={'playlist_genre': 'Genre', 'speechiness': 'Speechiness'},
)
violin3.update_layout(
    title='Distribution of speechiness for different genres',  # title typo fixed (was "speechines")
    violingap=0,
    violinmode='overlay',
    title_x=0.5,
    title_y=0.95,
)
violin3.show()

# Scatter plots

In [36]:
# Scatter of speechiness against danceability, one colour per genre.
fig1 = px.scatter(
    songs,
    x="danceability",
    y="speechiness",
    color='playlist_genre',
    title="Correlation between danceability and speechiness",
    labels={
        "danceability": "Danceability",
        "speechiness": "Speechiness",
        "playlist_genre": "Genre",
    },
)
fig1.update_layout(title_x=0.5, title_y=0.9)
fig1.show()

In [38]:
# Scatter of energy against danceability, one colour per genre.
fig2 = px.scatter(
    songs,
    x="danceability",
    y="energy",
    color='playlist_genre',
    title="Correlation between danceability and energy",
    labels={
        "danceability": "Danceability",
        "energy": "Energy",
        "playlist_genre": "Genre",
    },
)
fig2.update_layout(title_x=0.5, title_y=0.9)
fig2.show()

In [39]:
# Scatter of track popularity against danceability, one colour per genre.
fig6 = px.scatter(
    songs,
    x="danceability",
    y="track_popularity",
    color='playlist_genre',
    title="Correlation between danceability and popularity",
    labels={
        "danceability": "Danceability",
        "track_popularity": "Popularity",
        "playlist_genre": "Genre",
    },
)
fig6.update_layout(title_x=0.5, title_y=0.9)
fig6.show()

In [40]:
# Scatter of instrumentalness against danceability, one colour per genre.
fig7 = px.scatter(
    songs,
    x="danceability",
    y="instrumentalness",
    color='playlist_genre',
    title="Correlation between instrumentalness and danceability",
    labels={
        "danceability": "Danceability",
        "instrumentalness": "Instrumentalness",  # axis label typo fixed (was "Insrumentalness")
        "playlist_genre": "Genre",
    },
)
fig7.update_layout(title_x=0.5, title_y=0.9)
fig7.show()

In [41]:
# Genre (rows) by year (columns) table of danceability, aggregated with
# pivot_table's default aggregation function.
table = songs.pivot_table(
    values=['danceability'],
    index=['playlist_genre'],
    columns=['year'],
)

In [42]:
# Second element of each column label of `table`, used as heatmap x labels.
columns_fig3 = [label[1] for label in table.columns]

In [43]:
# Heatmap of danceability by genre (y) and year (x).
fig3 = px.imshow(
    table,
    x=columns_fig3,
    y=table.index,
    color_continuous_scale='bluyl',
    height=400,
)
fig3.update_xaxes(title_text='Year', tickangle=315)
fig3.update_yaxes(title_text='Genre')
fig3.update_layout(
    title_text='Dependency between genre, year and danceability',  # title typo fixed (was "Dependecy")
    title_x=0.5,
    title_y=0.8,
)
fig3.show()

# Last two plots

In [44]:
# Keep only the rows whose artist is exactly 'Def Leppard'.
favourites = songs.loc[songs['track_artist'] == 'Def Leppard']

In [45]:
# Order the selected tracks by year, earliest first (ascending is the default).
favourites = favourites.sort_values('year')

In [49]:
# One violin per year for the tracks in `favourites`, with an inner box plot
# and only the outlier points drawn.
violin4 = px.violin(
    favourites,
    x='year',
    y='track_popularity',
    box=True,
    points='outliers',
    color_discrete_sequence=px.colors.sequential.Bluyl_r,
    labels={'track_popularity': 'Popularity', 'year': 'Year'},
    height=800,
    width=800,
)
violin4.update_layout(title='Popularity in different years', title_x=0.5, title_y=0.95)
violin4.show()

In [50]:
def remixer(track_name):
  """Tell whether the substring 'Remix' occurs in ``track_name``.

  The value is converted with ``str`` first, so non-string inputs (for
  example ``None`` or NaN) are accepted and simply yield ``False``.
  The match is case-sensitive: 'remix' or 'REMIX' do not count.

  Returns:
    bool: ``True`` if 'Remix' is found, ``False`` otherwise.
  """
  return 'Remix' in str(track_name)

In [51]:
# Flag every track whose name contains 'Remix' in a boolean 'remix' column.
songs['remix'] = songs['track_name'].map(remixer)

In [53]:
df = songs

# Split violins per genre: remixes are drawn on the negative (left) side and
# all other tracks on the positive (right) side of each violin.
fig22 = go.Figure()

fig22.add_trace(
    go.Violin(
        x=df.loc[df['remix'], 'playlist_genre'],
        y=df.loc[df['remix'], 'track_popularity'],
        legendgroup='Remix',
        scalegroup='Remix',
        name='Remix',
        side='negative',
        line_color='#3685C7',
    )
)
fig22.add_trace(
    go.Violin(
        x=df.loc[~df['remix'], 'playlist_genre'],
        y=df.loc[~df['remix'], 'track_popularity'],
        legendgroup='Not Remix',
        scalegroup='Not Remix',
        name='Not Remix',
        side='positive',
        line_color='#49A123',
    )
)
fig22.update_traces(meanline_visible=True)
fig22.update_layout(
    title='Distribution of popularity for different versions',
    violingap=0,
    violinmode='overlay',
    title_x=0.5,
    title_y=0.9,
)
fig22.show()

In [281]:
# Figures in the order they are written to the exported page.
report_figures = [
    base0, base01, base1, base2, violin1,
    # section break
    violin2, fig5, fig4,
    # section break
    violin3, fig1, fig2, fig6, fig7, fig3, violin4, fig22,
]

# Write every figure as an HTML fragment (no <html> wrapper) into one file.
# The encoding is set explicitly so the output does not depend on the platform's
# default locale encoding.
with open('p_graph.html', 'w', encoding='utf-8') as f:
    for figure in report_figures:
        f.write(figure.to_html(full_html=False, include_plotlyjs='cdn'))