In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [None]:
# Location of the Spotify/YouTube CSV inside the Kaggle input directory.
DATA_PATH = "/kaggle/input/spotify-and-youtube/Spotify_Youtube.csv"

# Read the whole file into the working DataFrame used by the rest of the notebook.
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows to get a feel for the columns and their values.
df.head()

In [None]:
# List every column name in the loaded frame.
df.columns

In [None]:
# Columns removed before the analysis: link/identifier fields, free-text
# fields, and 'Unnamed: 0' (presumably the index column saved into the CSV --
# TODO confirm against the raw file).
url_cols = ['Unnamed: 0', 'Url_spotify', 'Uri', 'Url_youtube', 'Title', 'Description']

# Rebind `df` instead of mutating it in place, and ignore columns that are
# already gone, so re-running this cell does not raise a KeyError.
df = df.drop(columns=url_cols, errors='ignore')

In [None]:
# Preview the frame again, now that the columns listed in `url_cols` were dropped.
df.head()

In [None]:
# Print dtypes and non-null counts per column to see where values are missing.
df.info()

In [None]:
# Discard every row that has at least one missing value, modifying `df` itself.
df.dropna(axis=0, how='any', inplace=True)

In [None]:
# Boolean mask: True for each row that exactly repeats an earlier row.
duplicates = df.duplicated(keep='first')

# Print the repeated rows (an empty frame means there are none).
print(df.loc[duplicates])

In [None]:
# Re-check dtypes and non-null counts after the rows with missing values were dropped.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# Count how many rows fall under each album type, most frequent first.
album_type_count = df.loc[:, 'Album_type'].value_counts()

print(album_type_count)

In [None]:
# Pie chart showing the share of each album type.
labels = list(album_type_count.index)
sizes = album_type_count.to_list()

# Draw on an explicit Axes rather than through the implicit pyplot state.
pie_fig, pie_ax = plt.subplots()
pie_ax.pie(x=sizes, labels=labels, autopct='%1.1f%%', startangle=90)
pie_ax.set_title('Album Types')
pie_ax.legend(labels, loc='best')

plt.show()

In [None]:
# Total YouTube views and Spotify streams per artist.
artist_grouped = df[['Artist', 'Views', 'Stream']].groupby('Artist').sum()

# Rank artists by total Views (descending); total Stream only breaks ties.
artist_sorted = artist_grouped.sort_values(by=['Views', 'Stream'], ascending=[False, False])

# Keep the ten highest-ranked artists under that ordering.
top_10 = artist_sorted.iloc[:10]

top_10

In [None]:
# Per-artist totals, ranked separately for each platform (both results are Series).
by_artist = df.groupby('Artist')
df_views = by_artist['Views'].sum().sort_values(ascending=False).head(10)
df_streams = by_artist['Stream'].sum().sort_values(ascending=False).head(10)

# One figure, two side-by-side panels.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

# Left panel: top 10 artists by YouTube views.
df_views.plot.bar(ax=ax1)
ax1.set(title='Top 10 Artists on YouTube', xlabel='Artist', ylabel='Views')

# Right panel: top 10 artists by Spotify streams.
df_streams.plot.bar(ax=ax2)
ax2.set(title='Top 10 Artists on Spotify', xlabel='Artist', ylabel='Streams')

fig.tight_layout()
plt.show()


In [None]:
# The ten rows with the largest value of each YouTube engagement metric:
# views, comments and likes, in that order.
top10_views, top10_comments, top10_likes = (
    df.nlargest(10, metric) for metric in ('Views', 'Comments', 'Likes')
)


In [None]:

fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(12, 4))

# One horizontal bar chart per metric: (data, column to plot, panel title).
panels = (
    (top10_views, 'Views', 'Top 10 songs with most views'),
    (top10_comments, 'Comments', 'Top 10 songs with most comments'),
    (top10_likes, 'Likes', 'Top 10 songs with most likes'),
)
for panel_ax, (panel_data, metric, panel_title) in zip(axs, panels):
    panel_data.plot(kind='barh', x='Track', y=metric, ax=panel_ax)
    panel_ax.set_title(panel_title)

fig.tight_layout()
plt.show()


**I will update this notebook by analysing the songs further based on technical parameters.**

In [None]:
# The ten rows with the highest Spotify stream counts.
top_songs = df.sort_values(by='Stream', ascending=False).iloc[:10]

# Show only the track name and the three audio features compared below.
top_songs.loc[:, ['Track', 'Energy', 'Danceability', 'Acousticness']]

In [None]:
# Bar colour for each audio feature; the key order also fixes the melt order.
attribute_colors = {'Energy': 'blue', 'Danceability': 'orange', 'Acousticness': 'green'}

# Reshape to long form: one row per (track, attribute) pair.
top_songs_melt = pd.melt(
    top_songs,
    id_vars=['Track'],
    value_vars=list(attribute_colors),
    var_name='Attribute',
    value_name='Value',
)

sns.set_style('whitegrid')

# Grouped bars per track; seaborn's own legend is disabled and a manually
# positioned one is added below.
sns.catplot(
    data=top_songs_melt,
    x='Track',
    y='Value',
    hue='Attribute',
    kind='bar',
    palette=attribute_colors,
    aspect=2,
    legend=False,
)

plt.legend(loc='upper right', bbox_to_anchor=(1.25, 1))
plt.title('Top 10 Songs by Stream')
plt.xlabel('Track')
plt.ylabel('Value')

# Rotate the track names so they do not overlap.
plt.xticks(rotation=90)
plt.show()


Songs with danceable beats and high energy appear to be more popular.

In [None]:
# Pull every row whose track name is the Spider-Verse "Sunflower" entry,
# to inspect why it appears more than once.
is_sunflower = df['Track'].eq('Sunflower - Spider-Man: Into the Spider-Verse')
dup_row = df.loc[is_sunflower]
dup_row

It looks like songs with multiple artists are not grouped together, so they show up as duplicates. There is no need to clean this up, since we are running the analysis on just the tracks.

In [None]:
# Cut-off separating "high" from "low" acousticness.
# NOTE(review): 0.6 is not derived anywhere in this notebook -- confirm the choice.
high_acousticness = 0.6

# Split the rows (one row per track) at the cut-off; the boundary value
# itself belongs to the "high" group.
high_acousticness_group = df[df['Acousticness'] >= high_acousticness]
low_acousticness_group = df[df['Acousticness'] < high_acousticness]

# Average Spotify stream count within each group.
high_acousticness_stream_mean = high_acousticness_group['Stream'].mean()
low_acousticness_stream_mean = low_acousticness_group['Stream'].mean()

# The groups are built from track rows, so the labels say "tracks"
# (the previous wording said "artists", which is not what is averaged).
print('Mean stream count for tracks with higher acousticness:', high_acousticness_stream_mean)
print('Mean stream count for tracks with lower acousticness:', low_acousticness_stream_mean)


In [None]:
# Cut-off separating "high" from "low" danceability.
# NOTE(review): 0.6 is not derived anywhere in this notebook -- confirm the choice.
high_dance = 0.6

# Split the rows (one row per track) at the cut-off; the boundary value
# itself belongs to the "high" group.
high_dance_group = df[df['Danceability'] >= high_dance]
low_dance_group = df[df['Danceability'] < high_dance]

# Average Spotify stream count within each group.
high_dance_stream_mean = high_dance_group['Stream'].mean()
low_dance_stream_mean = low_dance_group['Stream'].mean()

# The groups are built from track rows, so the labels say "tracks"
# (the previous wording said "artists", which is not what is averaged).
print('Mean stream count for tracks with higher Danceability:', high_dance_stream_mean)
print('Mean stream count for tracks with lower Danceability:', low_dance_stream_mean)