Data can be found on https://www.kaggle.com/cihanoklap/top-songs-on-spotify-what-makes-them-popular/data

In this exercise, we’ll take a look at the audio features of the tracks in Spotify’s Top Songs of 2017 playlist and try to highlight the common patterns behind the audio features of these songs.


In [None]:
# Mount Google Drive at /content/gdrive so that files stored there
# (featuresdf.csv is read from this location further down) are reachable
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load spotify.csv from the workshop repository into the dataframe data
# and preview the first few records
path = 'https://raw.githubusercontent.com/HOGENT-Databases/DB3-Workshops/master/data/'
data = pd.read_csv(f"{path}spotify.csv", encoding="ISO-8859-1")
data.head()



In [None]:
# Show the general information about the data
# (describe() reports summary statistics; by default only for the numeric columns)
data.describe()

In [None]:
# The URL column is not needed for the analysis: remove it
# and preview the first few records again
data = data.drop(columns='URL')
data.head()


In [None]:
# Give the dimensions of the data: (number of rows, number of columns)
data.shape

In [None]:
# Give for each column the number of values available (count() skips missing values)
data.count()

In [None]:
# You can see that some records have no value for the Track Name and Artist
# Drop every record that has an empty value in any column
data = data.dropna()

In [None]:
# Give for each column the number of values available
# (incomplete records were dropped, so every column should now report the same count)
data.count()

In [None]:
# Give the datatype (dtype) of each column
data.dtypes

In [None]:
# List the distinct regions in the data. Is Belgium (be) one of them?
pd.unique(data['Region'])

In [None]:
# What is the minimum date?
# NOTE(review): Date is still text at this point (it is sliced with .str further down),
# so this is a lexicographic minimum; it is the earliest date only if the values are
# zero-padded, year-first strings - TODO confirm
data['Date'].min()

In [None]:
# What is the maximum date?
# NOTE(review): Date is still text at this point (it is sliced with .str further down),
# so this is a lexicographic maximum; it is the latest date only if the values are
# zero-padded, year-first strings - TODO confirm
data['Date'].max()

In [None]:
# Transform the track names into lower case, so that the comparisons further down
# (e.g. startswith('despacito')) do not depend on capitalisation
data['Track Name'] = data['Track Name'].str.lower()
data.head()


In [None]:
# Transform the artist names into lower case, so that the comparisons further down
# (e.g. == 'ed sheeran') do not depend on capitalisation
data['Artist'] = data['Artist'].str.lower()
data.head()

In [None]:
# Count the distinct artists in the dataset
data['Artist'].nunique(dropna=False)

In [None]:
# Count the distinct artists per track name and keep the names
# that are used by more than one artist
result = data.groupby('Track Name')['Artist'].nunique(dropna=False).reset_index()
result = result.loc[result['Artist'] > 1]
result.head()


In [None]:
# Total the streams per (Track Name, Artist) over all regions and dates
# and show the most often streamed songs first
(
    data.groupby(['Track Name', 'Artist'], as_index=False)['Streams']
    .sum()
    .sort_values(by='Streams', ascending=False)
    .head()
)

In [None]:
# It now looks like Despacito of Luis Fonsi was less popular than Shape of You of Ed Sheeran
# Show every record whose Track Name starts with 'despacito' to see the different versions
data.loc[data['Track Name'].str.startswith('despacito')]

In [None]:
# Total number of streams of Despacito worldwide, all versions taken together
data.loc[data['Track Name'].str.startswith('despacito'), 'Streams'].sum()

In [None]:
# Total the streams per version of despacito; the most popular version comes first
result = (
    data.loc[data['Track Name'].str.startswith('despacito')]
    .groupby('Track Name', as_index=False)['Streams']
    .sum()
    .sort_values(by='Streams', ascending=False)
)
result.head()


In [None]:
# Look for despacito records that list justin bieber as the artist
data.loc[
    (data['Artist'] == 'justin bieber')
    & data['Track Name'].str.startswith('despacito')
]

In [None]:
# Join the distinct regions of every track name into one string;
# a track that was only streamed in Belgium ends up with exactly 'be'
result = (
    data.groupby('Track Name')['Region']
    .agg(lambda regions: ', '.join(regions.unique()))
    .reset_index()
)
result.loc[result['Region'] == 'be'].head()


In [None]:
# Christmas songs: every (Track Name, Artist) whose Track Name contains the word christmas,
# together with the number of records per column
(
    data.loc[data['Track Name'].str.contains('christmas')]
    .groupby(['Track Name', 'Artist'])
    .count()
)

In [None]:
# Records of 'all i want for christmas is you' in july (07) or august (08);
# characters 5-6 of the Date text hold the month
data.loc[
    data['Date'].str[5:7].isin(['07', '08'])
    & (data['Track Name'] == 'all i want for christmas is you')
]

In [None]:
# First day after august (month text greater than '08') on which
# 'all i want for christmas is you' was streamed
data.loc[
    (data['Track Name'] == 'all i want for christmas is you')
    & (data['Date'].str[5:7] > '08'),
    'Date',
].min()

In [None]:
# Ed Sheeran in Belgium: count for each track the number of dated records
# (i.e. the days it was streamed in Belgium), most days first
(
    data.loc[(data['Artist'] == 'ed sheeran') & (data['Region'] == 'be')]
    .groupby('Track Name', as_index=False)['Date']
    .count()
    .sort_values(by='Date', ascending=False)
)

In [None]:
# Artists with at least two different tracks at position 100 or better in Belgium:
# describe() on the Track Name column yields the number of unique tracks per artist
result = (
    data.loc[(data['Position'] <= 100) & (data['Region'] == 'be')]
    .groupby('Artist')['Track Name']
    .describe()
)
result.loc[result['unique'] >= 2, 'unique'].head()


In [None]:
# Create a new dataframe plotData that contains the Date and Streams of despacito in belgium
# Transform the column Date to_datetime
# Add a new column Week to plotData that contains the ISO week number
# (use .dt.isocalendar().week; the older .dt.week accessor was removed in pandas 2.0)
# Drop the column Date because it's of no use anymore
# Calculate the average number of Streams per week. Divide the result by 1000 (to get smaller numbers)
# and transform the result to an integer

# .copy() makes plotData independent of data, so the column assignments below
# do not operate on a slice of data (no SettingWithCopyWarning)
plotData = data[(data['Track Name'].str.startswith('despacito')) & (data['Region'] == 'be')][['Date', 'Streams']].copy()
plotData['Date'] = pd.to_datetime(plotData['Date'])
# isocalendar() returns a nullable UInt32 column; cast it to a plain integer
# column, which is what .dt.week used to produce
plotData['Week'] = plotData['Date'].dt.isocalendar().week.astype(int)
plotData = plotData.drop('Date', axis = 1)
plotData = plotData.groupby('Week')['Streams'].mean().reset_index()
plotData['Streams'] /= 1000
plotData['Streams'] = plotData['Streams'].astype(int)
plotData.head(20)


In [None]:
# Plot the resulting data

# imports for plotting

import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt

%matplotlib inline

# Scatter plot: week number on the x-axis, streams (in thousands) on the y-axis
plt.scatter(x=plotData['Week'], y=plotData['Streams'])
plt.xlabel(xlabel='Week')
plt.ylabel(ylabel='Streams (x 1000)')
plt.show()


In [None]:
# Create a new dataframe extra that contains the total number of Streams per Track Name for each month
# for each of the following regions: au, be, de, fr, gb, it, nl (most often streamed first)
# Transform the column Date to_datetime
# Add a new column Month to extra that contains the month (use .dt.month)
# Drop the column Date because it's of no use anymore

# .copy() makes extra independent of data, so the column assignments below
# do not operate on a slice of data (no SettingWithCopyWarning)
extra = data[data['Region'].isin(['au', 'be', 'de', 'fr', 'gb', 'it', 'nl'])][['Region','Date', 'Streams','Track Name']].copy()
extra['Date'] = pd.to_datetime(extra['Date'])
extra['Month'] = extra['Date'].dt.month
# drop() returns a new dataframe: without the assignment the column is not removed
extra = extra.drop('Date', axis = 1)
extra = extra.groupby(['Region','Month','Track Name'])['Streams'].sum().reset_index().sort_values(by='Streams',ascending=False)
extra.head()

In [None]:
# extra2: the records of extra that belong to one of the 4 versions of despacito
# Pivot them into a table with one row per month and one column per region
# (au, be, de, fr, gb, it, nl) holding the total number of streams; missing combinations become 0

extra2 = extra.loc[extra['Track Name'].str.startswith('despacito')]
extra2.pivot_table(values='Streams', index=['Month'], columns=['Region'], aggfunc='sum').fillna(0)


In [None]:
# Create a new dataframe extra3 based on the dataframe extra that contains the most popular Track Name
# for each region (au, be, de, fr, gb, it, nl) for each month

# Sort by Streams (descending) first, so that keep='first' retains the most often streamed
# track of every (Region, Month) pair, whatever the current row order of extra is
extra3 = extra.sort_values(by='Streams', ascending=False)
extra3 = extra3.drop_duplicates(['Region','Month'], keep='first')
extra3 = extra3.drop('Streams', axis = 1)
# Sort the reduced dataframe itself (not extra), otherwise the steps above are thrown away
extra3 = extra3.sort_values(by=['Region','Month'], ascending=True)
extra3.head()


In [None]:
# extra4: the Track Name column of data with every duplicate name removed
# (the first occurrence of each name is kept)

extra4 = data[['Track Name']].drop_duplicates(keep='first')
extra4.head()


In [None]:
# Install the langdetect package that the next cell imports
# (this line is a notebook shell command, not a Python statement)
pip install langdetect

In [None]:
# Detect the language for each Track Name in the dataframe extra4
# detect() raises LangDetectException ("No features in text.") for weird titles like #1111,
# because it can't decide on the language; those titles get the label 'NoLang'

import nltk
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException

# langdetect is non-deterministic by default; a fixed seed makes the labels reproducible
DetectorFactory.seed = 0

def detect_language(text):
    """Return the language code detected for text, or 'NoLang' when none can be determined."""
    # values that are not text (e.g. NaN) cannot be analysed
    if not isinstance(text, str):
        return 'NoLang'
    try:
        return detect(text)
    except LangDetectException:
        # only the expected failure is swallowed; other errors and interrupts still surface
        return 'NoLang'

extra4['lang'] = extra4['Track Name'].map(detect_language)

In [None]:
# Keep only the track names that were labelled as english ('en')

extra4 = extra4.loc[extra4['lang'] == 'en']
extra4.head()


In [None]:
# extra5: the total number of streams in data for every (Track Name, Artist) combination
extra5 = data.groupby(['Track Name', 'Artist'], as_index=False)['Streams'].sum()
extra5.head()


In [None]:
# Read the file featuresdf.csv (stored on the Google Drive mounted at /content/gdrive)
# into the dataframe features
# Show the first few records
features = pd.read_csv('/content/gdrive/My Drive/featuresdf.csv')
features.head()


In [None]:
# Remove the id column from features and preview the result
features = features.drop(columns='id')
features.head()


In [None]:
# Lower-case the columns name and artists, so that they can be matched
# with the lower-cased Track Name and Artist values when merging
features[['name', 'artists']] = features[['name', 'artists']].apply(lambda column: column.str.lower())
features.head()


In [None]:
# new_df: every record of features, extended with the stream totals of extra5
# where name/artists match Track Name/Artist (left merge)
# Show the first few records
new_df = features.merge(
    extra5,
    how='left',
    left_on=['name', 'artists'],
    right_on=['Track Name', 'Artist'],
)
new_df.head()


In [None]:
# Express the number of streams in millions to get smaller numbers
new_df['Streams'] = new_df['Streams'] / 1_000_000
new_df.head()

In [None]:
# The streams will be predicted with a linear regression model,
# so the text columns are not needed any more: remove them
new_df = new_df.drop(columns=['Track Name', 'Artist', 'name', 'artists'])

In [None]:
# Remove records with empty values
# (e.g. features for which the left merge found no matching Streams total)
new_df = new_df.dropna()

In [None]:
# Create the training and test data
from sklearn.model_selection import train_test_split
# Features: every column except Streams; target: the Streams column
X = new_df.drop(columns='Streams')
y = new_df['Streams']
# Keep 30% of the records apart as test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)


In [None]:
from sklearn import metrics
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline


In [None]:
# Expand the features with the polynomial terms up to degree 3
poly = PolynomialFeatures(degree=3)
# Fit the transformer on the training data only ...
X_train_transform = poly.fit_transform(X_train)
# ... and reuse that fitted transformer for the test data: nothing may be fitted on the test set
X_test_transform = poly.transform(X_test)

# Linear regression on the expanded training features
model = LinearRegression()
model.fit(X_train_transform,y_train)


In [None]:
# Predict the test data with the polynomial regression and report the error metrics
y_predict = model.predict(X_test_transform)

MAE = metrics.mean_absolute_error(y_test, y_predict)
MSE = metrics.mean_squared_error(y_test, y_predict)
# the root of the mean squared error computed above
RMSE = np.sqrt(MSE)
r2 = metrics.r2_score(y_test, y_predict)

print(f'Mean Absolute Error: {MAE!s}')
print()

print(f'Mean Squared Error: {MSE!s}')
print()

print(f'Root Mean Squared Error: {RMSE!s}')
print()

print(f'R square: {r2!s}')
print()


In [None]:
# Create a RandomForestRegressor to predict the number of Streams
# What are the most important features to predict if a song will become a hit?

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor


In [None]:
# Fit a random forest of 300 trees on the (untransformed) training features
# and predict the test data
model = RandomForestRegressor(n_estimators=300)
model.fit(X_train,y_train)
y_predict = model.predict(X_test)

mae = mean_absolute_error(y_test,y_predict)

# Report the error of this model: mae, not MAE, which still holds the value
# that was computed for the polynomial regression
print('Mean Absolute Error: '+ str(mae))
r2 = r2_score(y_test,y_predict)
print('R square: ' + str(r2))


In [None]:
# Show the feature names and, on the next line, the importance the model assigned to each of them
print(X_train.columns, model.feature_importances_, sep='\n')
