In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the Spotify 2023 CSV from the Kaggle input directory, decoding the file as Latin-1
df = pd.read_csv('/kaggle/input/top-spotify-songs-2023/spotify-2023.csv', encoding='latin-1')

In [None]:
# Preview the first ten rows
df.head(10)

In [None]:
# Column dtypes and non-null counts
df.info()

In [None]:
# Column names as loaded from the CSV
df.columns

In [None]:
# (rows, columns) of the raw frame
df.shape

In [None]:
# Numerical features: summary statistics, transposed so each column is one row
df.describe().T

In [None]:
# Categorical features: summary of the object-dtype columns, transposed so each column is one row
df.describe(include=['O']).T

# Feature Engineering

The columns ```streams, in_deezer_playlists, in_shazam_charts``` should be numeric features but show up as categorical.
Exploring this further:

In [None]:
# Flag each row by whether its value consists only of digits.
# NOTE: despite the column name, `is_non_numeric` is True for the *numeric* rows,
# so the problematic rows are the ones where the flag is False.
df['is_non_numeric'] = df['in_deezer_playlists'].str.isnumeric()
df.loc[df['is_non_numeric'] == False, 'in_deezer_playlists'].head(10)

In [None]:
# Remove every comma from the values, then look again at the rows that were flagged as not purely digits
df['in_deezer_playlists'] = df['in_deezer_playlists'].replace(to_replace=',', value='', regex=True)
df.loc[df['is_non_numeric'] == False, 'in_deezer_playlists'].head(10)

In [None]:
# Parse the cleaned strings into a numeric column
df['in_deezer_playlists'] = pd.to_numeric(df['in_deezer_playlists'])

In [None]:
# Re-check which columns are still object dtype after the conversion
df.describe(include=['O']).T

In [None]:
# Same check for the Shazam column. NOTE: `is_non_numeric` is True for digit-only values,
# so the rows of interest are those where the flag is False.
df['is_non_numeric'] = df['in_shazam_charts'].str.isnumeric()
df.loc[df['is_non_numeric'] == False, 'in_shazam_charts'].head(10)

In [None]:
# Remove every comma from the Shazam chart values
df['in_shazam_charts'] = df['in_shazam_charts'].replace(to_replace=',', value='', regex=True)

In [None]:
# Look again at the rows that were flagged as not purely digits
df.loc[df['is_non_numeric'] == False, 'in_shazam_charts'].head(10)

In [None]:
# Parse the cleaned strings into a numeric column
df['in_shazam_charts'] = pd.to_numeric(df['in_shazam_charts'])

#### It seems the commas were the problem for ```in_shazam_charts``` and ```in_deezer_playlists```; they have now been removed and both columns have been converted to numeric.

In [None]:
# Same check for `streams`; show the full rows whose value is not purely digits
# (the flag is True for digit-only values, so those rows have flag == False)
df['is_non_numeric'] = df['streams'].str.isnumeric()
df[df['is_non_numeric'] == False]

In [None]:
# Keep only the rows whose `streams` value parses as a number, then convert the column.
# Validating by parse result (instead of matching the one known bad string containing
# 'BPM') also removes any other malformed entry, and the cell can be re-run safely
# once the column is already numeric.
streams_is_valid = pd.to_numeric(df['streams'], errors='coerce').notna()
df = df.loc[streams_is_valid].copy()
df['streams'] = pd.to_numeric(df['streams'])

In [None]:
# The helper flag column is no longer needed
df = df.drop(columns='is_non_numeric')

# Nulls

In [None]:
# List every column that contains at least one missing value
print('Columns with Nulls : ', df.columns[df.isnull().any()].tolist())

In [None]:
# Fill missing values per dtype: 0 for numeric columns and the placeholder 'Unknown'
# for every other column, so text columns do not end up mixing strings with the integer 0.
numeric_cols = set(df.select_dtypes(include='number').columns)
fill_values = {col: 0 if col in numeric_cols else 'Unknown' for col in df.columns}
df = df.fillna(value=fill_values)
# Renumber the rows 0..n-1 (rows were dropped earlier in the notebook)
df = df.reset_index(drop=True)

In [None]:
# (rows, columns) after cleaning
df.shape

In [None]:
# Remaining missing values per column
df.isnull().sum()

# Unique Vals


In [None]:
# Report how many distinct values each column holds, as a fixed-width text table
total_rows = len(df)
unique_counts = df.nunique()
header_line = ' ' + '-' * 34

# One formatted table row per column
table_body = [
    f'|{col:<22s}|{count:4} of {total_rows}|'
    for col, count in unique_counts.items()
]

print(f'\nThere are a total of {total_rows} rows in the dataframe\n')
print('The following are the number of unique rows in each column')
print('\n'.join([
    header_line,
    f'|{"Column Name":^22s}|{"Count":^11s}|',
    header_line,
    *table_body,
    header_line,
]))


In [None]:
# Inspect the values of the 'mode' column
df['mode']

Mode seems to be a binary value, so we can replace 'Major' with 1 and 'Minor' with 0 to convert this column to numeric.

In [None]:
# Encode the musical mode as a binary flag: Major -> 1, Minor -> 0
mode_codes = {'Major': 1, 'Minor': 0}
df['mode'] = df['mode'].map(mode_codes)

In [None]:
# Object-dtype columns that remain after encoding 'mode'
df.describe(include=['O']).T

## Artist Analysis

Let's make sure each row has only one artist by splitting songs with multiple artists into multiple rows, with one artist per row.

In [None]:
# Turn the comma-separated artist string into a list of names (one list per row)
df['artist(s)_name'] = df['artist(s)_name'].str.split(',')

In [None]:
# Expand each artist list into one row per artist (the other columns are repeated),
# renumber the rows, and trim the whitespace left around names by the comma split
exploded = df.explode('artist(s)_name')
exploded = exploded.reset_index(drop=True)
exploded['artist(s)_name'] = exploded['artist(s)_name'].str.strip()
df = exploded
df

In [None]:
# Remove the `artist_count` column from the frame
df = df.drop(columns='artist_count')

In [None]:
# Display the frame after dropping `artist_count`
df

## Duplicates?

In [None]:
# Show every row that has an exact duplicate (keep=False marks all copies)
df[df.duplicated(keep=False)]

In [None]:
# Remove fully duplicated rows, keeping one copy of each
df = df.drop_duplicates()

In [None]:
# Sanity check: this should now display an empty frame
df[df.duplicated(keep=False)]

In [None]:
# Each row now holds a single artist, so give the column a simpler, singular name
df = df.rename(columns={'artist(s)_name': 'artist_name'})

In [None]:
# Split the column names into two lists by dtype:
# object-dtype columns are treated as categorical, everything else as numerical
numerical_features = list(df.select_dtypes(exclude='object').columns)
categorical_features = list(df.select_dtypes(include='object').columns)

print('The numerical features in the Spotify Dataset are:', numerical_features, sep='\n')
print('\nThe Categorical features in the Spotify Dataset are:', categorical_features, sep='\n')

# Plots

In [None]:
# Histogram grid over the frame's columns, with shared styling collected in one place
hist_style = dict(
    color='red',        # bar colour
    figsize=(20, 20),   # overall figure size in inches
    bins=10,            # number of bins per histogram
    rwidth=0.9,         # relative bar width, leaves a gap between bins
    grid=False,         # no grid lines, for a cleaner look
)
df.hist(column=df.columns, **hist_style)
plt.show()

In [None]:
# Horizontal bar chart of the number of rows per mode
mode_counts = df['mode'].value_counts()
axis = mode_counts.plot(kind='barh', figsize=(8,2), grid=False)
axis.bar_label(axis.containers[0], label_type='center', color='white', weight='bold')
plt.title('Spotify Track Modes')
# value_counts() orders the bars by frequency, so derive the tick labels from the
# plotted order; hard-coding ['Major', 'Minor'] mislabels the bars whenever Minor
# is the more common mode.
mode_names = {1: 'Major', 0: 'Minor'}
plt.yticks(range(len(mode_counts)),
           [mode_names.get(code, str(code)) for code in mode_counts.index])
plt.xlabel('Track Counts')
plt.show()

In [None]:
# Violin plot of every numerical feature (except `mode` itself), split by mode
feat = [col for col in numerical_features if col != 'mode']

# Size the grid from the number of features instead of assuming they fit in 5x4
n_cols = 4
n_rows = max(1, -(-len(feat) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 15), squeeze=False)
flat_axes = axes.ravel()

for ax, col in zip(flat_axes, feat):
    sns.violinplot(data=df, x='mode', y=col, ax=ax)
    ax.set_title(f"Distribution of {col}")

# Hide any grid cells left over after the last feature
for ax in flat_axes[len(feat):]:
    ax.set_visible(False)

# Lay out once after all panels are drawn, rather than on every iteration
fig.tight_layout()
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between all numerical features
corr_fig, corr_ax = plt.subplots(figsize=(18, 15))
sns.heatmap(df[numerical_features].corr(), annot=True, fmt='.2f', ax=corr_ax)
corr_ax.set_title('Pairwise Correlation Map', size=20)
plt.show()

This clearly shows some correlations. First, let's explicitly state them all:

Positive Correlations:
- In Spotify Playlist and
    - in Deezer playlists
    - in Apple playlists
    - Streams
- In Spotify Charts and
    - in Deezer Charts
    - in Shazam Charts
- Streams and 
    - In Deezer Playlists
    - In Apple Playlists
    
- Danceability and 
    - Valence
    - Energy


In [None]:
# Redraw the heatmap, hiding every cell whose absolute correlation is below 0.19
c = df[numerical_features].corr()
weak_cells = c.abs() < 0.19
plt.figure(figsize=(18,15))
sns.heatmap(c, annot=True, fmt='.2f', mask=weak_cells)
plt.show()

## Top Songs

In [None]:
# Rank tracks by stream count. The frame holds one row per (track, artist), so a
# collaborative track appears once per credited artist; collapse those repeats first,
# otherwise the same song occupies several places in the ranking.
track_streams = (
    df[['track_name', 'streams']]
    .drop_duplicates()
    .sort_values(by='streams', ascending=False)
)
top_10 = track_streams.head(10)

track_streams.head(20)

In [None]:
# Horizontal bar chart of the most-streamed tracks held in `top_10`
plt.figure(figsize=(12, 8))
sns.barplot(x=top_10['streams'], y=top_10['track_name'], palette='RdBu')
plt.xlabel('Streams (in billions)')
plt.ylabel('Track Name')
# Build the count in the title from the plotted data: the previous text said
# "Top 20" although only the rows of `top_10` are drawn.
plt.title(f'Top {top_10["track_name"].nunique()} Songs with Most Streams on Spotify')
plt.xticks(rotation=90)
plt.show()

## Top Artists

In [None]:
# Total streams per artist. Select the column and call sum() with no arguments:
# the previous `.sum('streams')` passed the string as the positional `numeric_only`
# flag rather than naming a column.
# Note: every credited artist on a track is attributed that track's full stream count.
artist_streams = (
    df.groupby('artist_name', as_index=False)['streams']
    .sum()
    .sort_values('streams', ascending=False)
)
grouped = artist_streams.head(15)
x = grouped['streams'].head(10)
y = grouped['artist_name'].head(10)

# Plot the ten artists with the highest totals
plt.figure(figsize=(12, 6))
sns.barplot(x=x, y=y, palette='crest')
plt.xlabel('Streams (in billions)')
plt.ylabel('Artists')
plt.title('Top 10 Artists with Most Streams on Spotify')
plt.xticks(rotation=0)

plt.show()

In [None]:
# Rows released in 2022, as an independent frame. Chaining the steps avoids
# in-place mutation of a filtered slice of `df` (the pattern pandas flags with
# SettingWithCopyWarning). The year column is constant here, so it is dropped.
df_2022 = (
    df.loc[df['released_year'] == 2022]
    .drop(columns='released_year')
    .reset_index(drop=True)
)
df_2022.head(10)

In [None]:
# Rows released in 2023, as an independent frame. Chaining the steps avoids
# in-place mutation of a filtered slice of `df` (the pattern pandas flags with
# SettingWithCopyWarning). The year column is constant here, so it is dropped.
df_2023 = (
    df.loc[df['released_year'] == 2023]
    .drop(columns='released_year')
    .reset_index(drop=True)
)
df_2023.head(10)

In [None]:
# Number of 2023 rows per release month, largest count first
x = df_2023.groupby('released_month')['released_month'].count().sort_values(ascending=False)

ax = x.plot(kind='bar', rot=0, figsize=(10,5), grid=False, color='Brown')
ax.bar_label(ax.containers[0], label_type='center', color='white', weight='bold')
plt.title('Count of Tracks in 2023 by Month')
# Vertical bars: months run along the x-axis and counts along the y-axis
# (the two labels were previously swapped)
plt.xlabel('Months')
plt.ylabel('Number of Tracks')
plt.show()

In [None]:
# Rows released in 2021, 2022 or 2023, renumbered from 0. Built in one expression
# so the result is a fresh frame rather than a filtered slice mutated in place.
df_21_23 = df.loc[df['released_year'].isin([2021, 2022, 2023])].reset_index(drop=True)

In [None]:
# Preview the 2021-2023 subset
df_21_23.head(5)

In [None]:
# Number of rows per (release year, release month) pair.
# size() yields a single count per group. Selecting the grouping keys themselves as
# value columns and calling count() produced one identical count column per key,
# which drew two bars per group while only the first set was labelled.
year_mon_pair = ['released_year','released_month']

x = df_21_23.groupby(year_mon_pair).size()

ax = x.plot(kind='bar', rot=90, figsize=(15,5), grid=False, color='Brown')
ax.bar_label(ax.containers[0], padding=1, label_type='edge',
             color='black',  rotation=0)
plt.title('Count of Tracks between 2021 and 2023 by Month')
plt.xlabel('Year and Months in (yyyy, mm)')
plt.ylabel('Number of Tracks')
plt.show()

In [None]:
# Count rows per artist in the 2021-2023 subset, most frequent first
artist_counts = (
    df_21_23.groupby('artist_name')['artist_name']
    .count()
    .sort_values(ascending=False)
)
# Take the ten largest and re-sort ascending so the biggest bar is drawn at the top
x = artist_counts.iloc[:10].sort_values()
ax = x.plot(kind='barh', rot=0, figsize=(15,9), grid=False, color='green')
ax.bar_label(ax.containers[0], padding=1, label_type='center', weight='bold',
             color='white', rotation=0)
ax.set_title('Count of Tracks by Artists between 2021 and 2023')
ax.set_xlabel('Number of Tracks', weight='bold')
ax.set_ylabel('Artist', weight='bold')
ax.legend('', frameon=False)
plt.show()

In [None]:
# Display the three audio-feature columns examined in the cells below
df_21_23[['danceability_%' , 'valence_%' , 'energy_%']]

In [None]:
# Pairwise correlations between danceability, valence and energy for 2021-2023 rows
audio_cols = ['danceability_%', 'valence_%', 'energy_%']
correlation_matrix = df_21_23[audio_cols].corr()
print(correlation_matrix)

In [None]:

# Heatmap of `correlation_matrix`, with the colour scale pinned to the full [-1, 1] range
audio_fig, audio_ax = plt.subplots(figsize=(10, 7))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=audio_ax)
audio_ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Scatter-plot matrix of the three audio features
pair_cols = ['danceability_%', 'valence_%', 'energy_%']
sns.pairplot(df_21_23[pair_cols])
# y > 1 places the figure title above the top row of panels
plt.gcf().suptitle('Pairwise Scatter plots', y=1.02)
plt.show()