In [None]:
import string

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
# Use seaborn's "darkgrid" style for the figures drawn below.
sns.set(style="darkgrid")

# Remove pandas' display limits on the number of columns and on cell width.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None

# Remove numpy's summarisation threshold so arrays are printed element by element.
np.set_printoptions(threshold=np.inf)

In [None]:
# Read the raw track data from the relative csvs/ directory.
dataset = pd.read_csv(filepath_or_buffer='csvs/dataset.csv')

In [None]:
# 1. Get a brief overview of the data.
# Name the first column 'index'. rename() builds a fresh column Index; writing
# into `dataset.columns.values` would mutate the Index's backing array behind
# pandas' back, which is not a supported way to rename a column and can leave
# the Index's cached lookups out of date.
dataset = dataset.rename(columns={dataset.columns[0]: 'index'})

print('\nThe columns are:  ')
print(dataset.columns)

print('\nNumber of X tuples = {}'.format(dataset.shape[0]))

print('\n')
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
dataset.info()
print('\n')
print(dataset.head())

In [None]:
# 2. Show how many distinct genre labels exist, then list them.
genre_column = dataset['track_genre']
print('number of genre : {}'.format(genre_column.nunique()))
print(genre_column.unique())

In [None]:
# 3. Begin data preprocessing

def minMax_normalization(column, frame=None):
    """Min-max scale one column to the range [0, 1], in place.

    Parameters
    ----------
    column : label
        Name of the column to rescale.
    frame : pandas.DataFrame, optional
        Frame that owns the column. When omitted, the module-level
        ``clean_dataset`` is used, so existing one-argument calls keep working.

    Returns
    -------
    pandas.Series
        The rescaled column, read back from the frame after the update.

    Notes
    -----
    A column whose maximum equals its minimum has no range to scale by.
    Dividing by that zero span would turn every value into NaN, so such a
    column is mapped to 0.0 instead (missing values stay missing).
    """
    target = clean_dataset if frame is None else frame
    values = target[column]
    lowest = values.min()
    span = values.max() - lowest
    shifted = values - lowest
    if span == 0:
        # Constant column: every non-missing entry is already at the minimum.
        target[column] = shifted.astype(float)
    else:
        target[column] = shifted / span
    return target[column]

# Drop the rows that have no track name.
dataset.drop(dataset.index[dataset['track_name'].isnull()], inplace=True)

# All further cleaning is done on a copy of the loaded data.
clean_dataset = dataset.copy()

# Sort by popularity (highest first) so that drop_duplicates, which keeps the
# first occurrence, retains the most popular copy of each duplicated track.
# Rows are duplicates when track_name, duration_ms, artists and track_genre
# are all the same.
clean_dataset.sort_values(by=['popularity'], ascending=False, inplace=True)
clean_dataset.drop_duplicates(subset=['track_name', 'duration_ms', 'artists', 'track_genre'], inplace=True)

# Rows that share track_name, duration_ms and artists but differ in genre are
# collapsed into a single row: the genres are joined into one comma-separated
# string, popularity takes the group maximum, and every other listed column
# takes the value from the group's first row. Columns not listed here are not
# carried over. The 'index' entry expects a column with that name to exist.
clean_dataset = clean_dataset.groupby(['track_name', 'duration_ms', 'artists'], as_index=False).agg({
    'track_genre': lambda x: ','.join(x),
    'index': 'first',
    'track_id': 'first',
    'popularity': 'max',
    'explicit': 'first',
    'danceability': 'first',
    'energy': 'first',
    'loudness': 'first',
    'speechiness': 'first',
    'acousticness': 'first',
    'instrumentalness': 'first',
    'liveness': 'first',
    'valence': 'first',
    'tempo': 'first',
    'key': 'first',
    'mode': 'first',
})


# Drop the songs whose only genre is "sleep", since those target ASMR audiences.
clean_dataset.drop(clean_dataset.index[clean_dataset['track_genre'] == 'sleep'], inplace=True)

# Remove rows where tempo is 0.
clean_dataset.drop(clean_dataset.index[clean_dataset['tempo'] == 0], inplace=True)

# Normalise the popularity column to [0, 1].
minMax_normalization('popularity')

# Binarise the explicit column to 1 / 0; any other value becomes null.
clean_dataset['explicit'] = clean_dataset['explicit'].apply(lambda x: 1 if x == 1 else (0 if x == 0 else None))

# Keep the analysis columns in a fixed order, then renumber the rows.
# drop=True discards the old row labels; without it reset_index would insert
# them as a new 'index' column, re-adding a column this selection leaves out
# and leaking it into the exported CSV.
clean_dataset = clean_dataset[[
    'track_id', 'track_name', 'artists', 'duration_ms', 'popularity', 'track_genre',
    'explicit', 'danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence', 'tempo', 'key', 'mode',
]].reset_index(drop=True)

# info() prints its own report and returns None, so it is not wrapped in print().
clean_dataset.info()

In [None]:
# Write the cleaned frame to CSV, leaving the row index out of the file.
clean_dataset.to_csv(path_or_buf='csvs/clean_data.csv', index=False)

In [None]:
# Summary statistics of the cleaned data, transposed so that each described
# column becomes a row. Printed explicitly: a bare expression that is not the
# last line of a cell is evaluated and then thrown away.
print(clean_dataset.describe().transpose())

# Songs whose popularity is at least 0.95 (popularity was min-max normalised
# to [0, 1] during preprocessing), listed from most to least popular.
popular_song = clean_dataset.query('popularity>=0.95').sort_values('popularity', ascending=False)
print(popular_song)

In [None]:
# Build a frame with one row per (track, genre) pair for the per-genre analysis.
# track_genre is a single comma-separated string per track here, so it is split
# into a list of genre names and exploded so that each genre gets its own row.
genre_popularity = clean_dataset.copy()
genre_popularity['track_genre'] = genre_popularity['track_genre'].apply(lambda genres: genres.split(','))
genre_popularity = genre_popularity.explode('track_genre')

# info() prints its own report and returns None; wrapping it in print() would
# add a stray "None" line to the output.
genre_popularity.info()

In [None]:
# Mean popularity of every genre, ranked from highest to lowest; the first ten
# entries of that ranking are kept as most_popular_genres.
avg_popularity_genre = genre_popularity['popularity'].groupby(genre_popularity['track_genre']).mean()
avg_popularity_genre_sorted = avg_popularity_genre.sort_values(ascending=False)
most_popular_genres = avg_popularity_genre_sorted.iloc[:10]

In [None]:
# Bar chart of the mean popularity of the genres in most_popular_genres.
plt.figure(figsize=(10, 6))
genre_axis = sns.barplot(
    x=most_popular_genres.index,
    y=most_popular_genres.values,
    palette="hls",
)
genre_axis.set_title("Average Popularity per Genre")
genre_axis.set_xlabel('Genre')
genre_axis.set_ylabel('Average Popularity')
# Genre names are long, so the tick labels are turned vertical.
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

In [None]:
# Energy distribution within the most popular genres.
# Keep only the rows whose genre is in most_popular_genres. The selection is
# bound to a name because a bare `.loc[...]` expression does not modify
# genre_popularity and its result would simply be discarded.
top_genre_rows = genre_popularity[genre_popularity['track_genre'].isin(most_popular_genres.index)]

plt.figure(figsize=(10, 6))

# One box per genre, drawn in the order of most_popular_genres (which is ranked
# from most to least popular where it is built) instead of the arbitrary order
# in which genres happen to appear in the data.
sns.boxplot(
    x='track_genre',
    y='energy',
    data=top_genre_rows,
    order=list(most_popular_genres.index),
    palette="hls",
)
# The genres shown are selected by popularity, not by energy.
plt.title("Distribution of Energy in the Most Popular Genres")
plt.xlabel("Genre")
plt.ylabel("Energy")
plt.xticks(rotation=90)

plt.show()

In [None]:
# Energy against popularity for the rows whose genre is in most_popular_genres,
# coloured by genre. The low alpha keeps overlapping points readable.
in_top_genres = genre_popularity['track_genre'].isin(most_popular_genres.index)
plt.figure(figsize=(10, 8))
sns.scatterplot(
    data=genre_popularity[in_top_genres],
    x='energy',
    y='popularity',
    hue='track_genre',
    palette='hls',
    alpha=0.1,
)
plt.title("Energy vs. Popularity by Genre")
plt.xlabel("Energy")
plt.ylabel("Popularity")
plt.show()

In [None]:
# Valence against danceability for the rows whose genre is in
# most_popular_genres, coloured by genre.
in_top_genres = genre_popularity['track_genre'].isin(most_popular_genres.index)
plt.figure(figsize=(10, 8))
sns.scatterplot(
    data=genre_popularity[in_top_genres],
    x='valence',
    y='danceability',
    hue='track_genre',
    palette='hls',
    alpha=0.1,
)
plt.title("Valence vs. Danceability")
plt.xlabel("Valence")
plt.ylabel("Danceability")
plt.show()

In [None]:
# Energy against tempo for the rows whose genre is in most_popular_genres,
# coloured by genre.
in_top_genres = genre_popularity['track_genre'].isin(most_popular_genres.index)
plt.figure(figsize=(10, 8))
sns.scatterplot(
    data=genre_popularity[in_top_genres],
    x='energy',
    y='tempo',
    hue='track_genre',
    palette='hls',
    alpha=0.1,
)
plt.title("Energy vs. Tempo")
plt.xlabel("Energy")
plt.ylabel("Tempo")
plt.show()

In [None]:
# Valence distribution for each value of mode.
# NOTE(review): the plot uses every row of genre_popularity, in which a track
# with several genres appears once per genre; it is not restricted to the
# genres in most_popular_genres. A bare `genre_popularity.loc[...]` statement
# that used to open this cell had no effect on the data and has been removed.
# Pass a filtered frame as `data=` if a restricted plot is wanted.
plt.figure(figsize=(10, 6))

# One box of valence values per mode value.
sns.boxplot(x='mode', y='valence', data=genre_popularity, palette="hls")
plt.title("Distribution of valence through mode")
plt.xlabel("Mode")
plt.ylabel("Valence")
plt.xticks(rotation=90)

plt.show()

In [None]:
# Key distribution for each genre.
# NOTE(review): the plot uses every genre present in genre_popularity, not just
# the ones in most_popular_genres. A bare `genre_popularity.loc[...]` statement
# that used to open this cell had no effect on the data and has been removed.
# To restrict the plot, pass
# genre_popularity[genre_popularity['track_genre'].isin(most_popular_genres.index)]
# as `data=` - confirm which of the two is intended.
plt.figure(figsize=(10, 6))

# 'key' is plotted on the x axis and the genre label on the y axis.
sns.boxplot(x='key', y='track_genre', data=genre_popularity, palette="hls")
plt.title("Distribution of key in genres")
plt.xlabel("Key")
plt.ylabel("Track Genre")
plt.xticks(rotation=90)

plt.show()

In [None]:
# Instrumentalness distribution for each genre.
# NOTE(review): the plot uses every genre present in genre_popularity, not just
# the ones in most_popular_genres. A bare `genre_popularity.loc[...]` statement
# that used to open this cell had no effect on the data and has been removed.
# To restrict the plot, pass
# genre_popularity[genre_popularity['track_genre'].isin(most_popular_genres.index)]
# as `data=` - confirm which of the two is intended.
plt.figure(figsize=(10, 6))

# One box of instrumentalness values per genre.
sns.boxplot(x='track_genre', y='instrumentalness', data=genre_popularity, palette="hls")
plt.title("Distribution of instrumentalness within genres")
plt.xlabel("Genre")
plt.ylabel("Instrumentalness")
plt.xticks(rotation=90)

plt.show()