In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from scipy.stats import pearsonr
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import os

#THEORETICAL FRAMEWORK --------------------------------
#https://www.kaggle.com/datasets/nelgiriyewithana/top-spotify-songs-2023

#DATA SELECTION ----------------------------------
# Folder that holds spotify-2023.csv. Set the SPOTIFY_DATA_DIR environment
# variable to run the notebook on another machine; the default is the
# location originally hard-coded here.
DATA_DIR = os.environ.get('SPOTIFY_DATA_DIR', 'C://Users//john8/Year 4/Data Analysis')

cwd = os.getcwd()
print(cwd)

# Only switch directory when the folder exists; otherwise the CSV is looked up
# in the current working directory instead of failing on a missing path.
if os.path.isdir(DATA_DIR):
    os.chdir(DATA_DIR)

# The file is read as latin-1 (same encoding as before).
data = pd.read_csv("spotify-2023.csv", encoding='latin-1')

# DataFrame.info() prints its own report and returns None, so it is called
# directly rather than wrapped in print() (which added a stray "None" line).
data.info()
print(data.head())
print(data.describe())
print("Number of rows:", data.shape[0])

#IMPUTATION OF MISSING DATA ----------------------------------------

#1. Handling Missing Values:
# Report the per-column count of missing entries before any rows are removed.
print("Missing values before handling:")
missing_before = data.isnull().sum()
print(missing_before)

# Reference copy of the counts printed above:
#track_name               0
#artist(s)_name           0
#artist_count             0
#released_year            0
#released_month           0
#released_day             0
#in_spotify_playlists     0
#in_spotify_charts        0
#streams                  0
#in_apple_playlists       0
#in_apple_charts          0
#in_deezer_playlists      0
#in_deezer_charts         0
#in_shazam_charts        50
#bpm                      0
#key                     95
#mode                     0
#danceability_%           0
#valence_%                0
#energy_%                 0
#acousticness_%           0
#instrumentalness_%       0
#liveness_%               0
#speechiness_%            0

# Rows missing either of the two incomplete columns are discarded.
columns_with_gaps = ['in_shazam_charts', 'key']
data = data.dropna(subset=columns_with_gaps).copy()

print("\nMissing values after handling:")
missing_after = data.isnull().sum()
print(missing_after)

# decide to drop rows where values are 0
#data = data[data != 0].dropna()
#data = data[(data != 0).all(axis=1)]

# remove rows with NaN or inf values
#data = data.replace([np.inf, -np.inf], np.nan).dropna()

#2. Outlier Detection and Treatment:

# Columns screened for outliers.
integer_columns = [
    'artist_count', 'released_year', 'released_month', 'released_day',
    'in_spotify_playlists', 'in_spotify_charts', 'in_apple_playlists',
    'in_apple_charts', 'in_deezer_charts', 'bpm', 'danceability_%',
    'valence_%', 'energy_%', 'acousticness_%', 'instrumentalness_%',
    'liveness_%', 'speechiness_%',
]

# Force a numeric dtype column by column; entries that cannot be parsed become NaN.
for column in integer_columns:
    data[column] = pd.to_numeric(data[column], errors='coerce')

# Absolute z-score of every value: distance from the column mean in units of
# the column standard deviation.
numeric_part = data[integer_columns]
centred = numeric_part - numeric_part.mean()
z_scores = (centred / numeric_part.std()).abs()

# A value further than this many standard deviations from the mean is an outlier.
threshold = 3

# True for every row that has at least one outlying value.
outliers_mask = z_scores.gt(threshold).any(axis=1)

rows_before = len(data)
print(f"Number of rows before dropping outliers: {rows_before}")

# Keep only the rows that were not flagged.
data = data.loc[~outliers_mask].copy()

print(f"Number of rows dropped due to outliers: {outliers_mask.sum()}")

# Show the cleaned DataFrame
print(data)

#3. Data Type Conversion:
# Show the dtypes as they stand before conversion.
data_types = data.dtypes
print(data_types)

# 'streams' becomes numeric; values that cannot be parsed turn into NaN.
data = data.assign(streams=pd.to_numeric(data['streams'], errors='coerce'))

# The remaining object columns are cast to str.
object_columns = ['track_name', 'artist(s)_name', 'in_deezer_playlists', 'in_shazam_charts', 'key', 'mode']
data = data.astype({column_name: str for column_name in object_columns})

# Show the dtypes after conversion.
print(data.dtypes)

#4. Handling Duplicates:
# Rows that exactly repeat an earlier row are listed for inspection only;
# nothing is removed here.
is_repeat = data.duplicated()
duplicate_rows = data.loc[is_repeat]
print("Duplicate Rows:")
print(duplicate_rows)

#5. Feature Engineering:
# Adds two derived columns:
#   total_playlists - in_spotify_playlists + in_apple_playlists + in_deezer_playlists
#   bpm_category    - bpm binned into 'low' / 'medium' / 'high'
playlist_columns = ['in_spotify_playlists', 'in_apple_playlists', 'in_deezer_playlists']

# check data types: '+' failed while in_deezer_playlists was still text
# (info() prints its own report and returns None, so it is not wrapped in print()).
print("Data Types using info() method:")
data[playlist_columns].info()

print("Data Types using dtype attribute:")
print("in_spotify_playlists:", data['in_spotify_playlists'].dtype)
print("in_apple_playlists:", data['in_apple_playlists'].dtype)
print("in_deezer_playlists:", data['in_deezer_playlists'].dtype)

# change 'in_deezer_playlists' from text to numeric.
# Thousands separators are stripped first: a value such as "1,234" cannot be
# parsed by to_numeric and would otherwise be coerced to NaN and silently lost.
# NOTE(review): assumes the column is text because large counts are written
# with commas - confirm against the raw CSV.
data['in_deezer_playlists'] = pd.to_numeric(
    data['in_deezer_playlists'].astype(str).str.replace(',', '', regex=False),
    errors='coerce',
)

# drop rows where any playlist count is 0.
# One combined mask plus an explicit copy, so the column assignments below act
# on an independent frame (no SettingWithCopyWarning). NaN != 0 is True, so
# rows with a missing count are kept here, exactly as before.
nonzero_playlists = (data[playlist_columns] != 0).all(axis=1)
data = data[nonzero_playlists].copy()

# verify data types after changing
print("Data Types after Conversion:")
data[['in_deezer_playlists']].info()

data['total_playlists'] = data['in_spotify_playlists'] + data['in_apple_playlists'] + data['in_deezer_playlists']

# turn 'bpm' into categories (low, medium, high)
print("Minimum BPM:", data['bpm'].min())
print("Maximum BPM:", data['bpm'].max())

bins = [65, 112, 159, 206]
labels = ['low', 'medium', 'high']

# include_lowest=True closes the first interval on the left, so a bpm of
# exactly 65 lands in 'low' instead of becoming NaN (and being dropped later).
data['bpm_category'] = pd.cut(data['bpm'], bins=bins, labels=labels, right=True, include_lowest=True)

print(data['bpm_category'])

#6. Encoding Categorical Variables:
# Translate the bpm category labels into integer codes.
label_mapping = {'low': 0, 'medium': 1, 'high': 2}

# Map the labels, discard rows whose bpm fell outside every bin (no code),
# then store the codes as plain integers.
encoded = data['bpm_category'].map(label_mapping)
data = (
    data.assign(bpm_category_encoded=encoded)
        .dropna(subset=['bpm_category_encoded'])
        .astype({'bpm_category_encoded': int})
)

# Quick look at the label next to its code.
print(data[['bpm_category', 'bpm_category_encoded']].head())

In [None]:
# Multivariate analysis & Feature Selection ------------------------------------------------
# Overview of the cleaned frame before the correlation analysis.
print("Number of rows:", data.shape[0])
# info() prints its own report and returns None; wrapping it in print()
# only added a stray "None" line to the output.
data.info()
print(data.head())
print(data.describe())

 #   Column                Non-Null Count  Dtype   
#---  ------                --------------  -----   
# 0   track_name            642 non-null    object  
# 1   artist(s)_name        642 non-null    object  
# 2   artist_count          642 non-null    int64   
# 3   released_year         642 non-null    int64   
# 4   released_month        642 non-null    int64   
# 5   released_day          642 non-null    int64   
# 6   in_spotify_playlists  642 non-null    int64   
# 7   in_spotify_charts     642 non-null    int64   
# 8   streams               642 non-null    int64   
# 9   in_apple_playlists    642 non-null    int64   
# 10  in_apple_charts       642 non-null    int64   
# 11  in_deezer_playlists   614 non-null    float64
# 12  in_deezer_charts      642 non-null    int64   
# 13  in_shazam_charts      642 non-null    object  
# 14  bpm                   642 non-null    int64   
# 15  key                   642 non-null    object  
# 16  mode                  642 non-null    object  
# 17  danceability_%        642 non-null    int64   
# 18  valence_%             642 non-null    int64   
# 19  energy_%              642 non-null    int64   
# 20  acousticness_%        642 non-null    int64   
# 21  instrumentalness_%    642 non-null    int64   
# 22  liveness_%            642 non-null    int64   
# 23  speechiness_%         642 non-null    int64   
# 24  total_playlists       614 non-null    float64 
# 25  bpm_category          641 non-null    category
# 26  bpm_category_encoded  641 non-null    int32
# dtypes: category(2), float64(2), int64(18), object(5)

# Correlation Coefficient of DV to IV variables
# NOTE: the two list names are inverted relative to their role: 'streams' is
# the dependent variable (DV) and the columns in response_variables are the
# candidate independent variables (IVs). The names are kept because later
# code reuses them.
predictor_variables = ['streams']
response_variables = ['in_spotify_playlists',
                      'in_spotify_charts', 'in_apple_playlists',
                      'in_apple_charts', 'in_deezer_playlists', 'in_deezer_charts', 'in_shazam_charts',
                      'artist_count', 'released_day', 'released_year', 'released_month', 'bpm', 'danceability_%', 'valence_%', 'energy_%', 'acousticness_%', 'instrumentalness_%',
                       'liveness_%', 'speechiness_%', 'total_playlists']

# change columns to integer type based on values above
# in_shazam_charts is converted from its OWN values. It used to be assigned
# from in_deezer_playlists, which made the two columns identical and produced
# a spurious correlation of 1.00 between them.
# NOTE(review): assumes the text values may carry thousands separators
# (e.g. "1,021"), which are stripped before parsing - confirm against the CSV.
shazam_numeric = pd.to_numeric(
    data['in_shazam_charts'].astype(str).str.replace(',', '', regex=False),
    errors='coerce',
)
data = (
    data.assign(in_shazam_charts=shazam_numeric)
        # rows without a usable count in either column cannot be cast to int
        .dropna(subset=['in_deezer_playlists', 'in_shazam_charts'])
        .astype({'in_deezer_playlists': int, 'in_shazam_charts': int})
)

# Pearson r between streams and every candidate variable.
for predictor_var in predictor_variables:
    for response_var in response_variables:
        corr_coef, p_value = pearsonr(data[predictor_var], data[response_var])
        print(f"Pearson correlation coefficient between '{predictor_var}' and '{response_var}': {corr_coef:.2f}")

# Scatterplot of DV to IV variables
# One figure per variable: streams on the x-axis, the variable on the y-axis,
# with a least-squares straight line drawn over the points.
predictor_var = 'streams'
x_values = data[predictor_var]

for response_var in response_variables:
    y_values = data[response_var]

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(x_values, y_values, alpha=0.5, label='Data points')
    ax.set_title(f"{response_var} vs {predictor_var}")
    ax.set_xlabel(predictor_var)
    ax.set_ylabel(response_var)

    # degree-1 polynomial fit: m is the slope, b the intercept
    m, b = np.polyfit(x_values, y_values, 1)
    ax.plot(x_values, m * x_values + b, color='red', label='Regression line')

    ax.legend()
    plt.show()

# Summary of Correlation Coefficient of DV(streams) to IV variables
# (grouped by the size of r; values as printed by the loop above)

#Strong positive correlations (r ≥ 0.5):
#'total_playlists': 0.78
#'in_spotify_playlists': 0.77
#'in_shazam_charts': 0.70
#'in_deezer_playlists': 0.70
#'in_apple_playlists': 0.63

#Moderate positive correlations (0.3 ≤ r < 0.5):
#(none of the variables listed here)

#Weak positive correlations (0.1 ≤ r < 0.3):
#(none of the variables listed here)

#Negligible correlations (-0.1 < r < 0.1):
#'released_day': 0.06
#'released_month': 0.02
#'acousticness_%': 0.01
#'instrumentalness_%': 0.01
#'speechiness_%': -0.04
#'energy_%': -0.07

#Weak negative correlations (-0.3 < r ≤ -0.1):
#'danceability_%': -0.10
#'valence_%': -0.10
#'artist_count': -0.11

#Moderate negative correlations (-0.5 < r ≤ -0.3):
#'released_year': -0.39

# IV to IV correlations: weak or low correlation between the IVs is what we want
correlation_matrix_all = data.loc[:, response_variables].corr()

# Annotated heatmap of the pairwise correlation matrix
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation_matrix_all, annot=True, cmap='coolwarm', fmt=".2f", square=True, ax=ax)
ax.set_title("Correlation Matrix of Response Variables")
plt.show()

# Summary of Correlation Coefficient of IV to IV variables
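# in_spotify_playlists is highly correlated (0.77) with in_deezer_playlists & in_shazam_charts - top 3 major song platforms ->
# a correlation coefficient is not a probability: 0.77 means that songs with higher in_spotify_playlists values tend to
# have higher in_deezer_playlists / in_shazam_charts values as well (a strong positive linear relationship), and vice versa;
# it does not mean there is a "77% chance" of a song being on the other platform
# UNEXPECTED: correlation of 1.00 between in_deezer_playlists & in_shazam_charts - the two columns are perfectly linearly related.
# A perfect correlation between two different measurements usually points to a data-preparation problem
# (e.g. one column having been copied from the other), so check the conversion step above before interpreting it.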

# Investigate 100% collinearity of in_deezer_playlists & in_shazam_charts
#1. Check values: verify whether the two columns hold the same value on every row.
#   This confirms (or rules out) that the observed collinearity comes from identical data.

unique_deezer = data['in_deezer_playlists'].unique()
unique_shazam = data['in_shazam_charts'].unique()

print("Unique values in 'in_deezer_playlists':", unique_deezer)
print("Unique values in 'in_shazam_charts':", unique_shazam)

# Row-by-row comparison of the two columns.
# The previous test (one unique value in each column, and the two equal) was
# only true when both columns were constant, so it reported "not identical"
# even for two columns that matched on every row.
columns_identical = bool((data['in_deezer_playlists'] == data['in_shazam_charts']).all())

if columns_identical:
    print("The unique values in 'in_deezer_playlists' and 'in_shazam_charts' are identical for all observations.")
else:
    print("The unique values in 'in_deezer_playlists' and 'in_shazam_charts' are not identical for all observations.")


#2. Cross-Tabulation: contingency table of the two columns, i.e. how often each
#   pair of (in_deezer_playlists, in_shazam_charts) values occurs together.
cross_tab = pd.crosstab(
    index=data['in_deezer_playlists'],
    columns=data['in_shazam_charts'],
)

print("Cross-Tabulation (Contingency Table):")
print(cross_tab)
#In this case, the contingency table reveals a one-to-one correspondence between the values of 'in_deezer_playlists' and
#'in_shazam_charts': each unique value of 'in_deezer_playlists' is paired with exactly one value of 'in_shazam_charts', and vice versa.
#From a data analysis perspective, this indicates a perfect association between the two variables. Both columns are numeric,
#so this describes how their values move together, not whether a song "appears" in one source or the other.
#This suggests the two columns are redundant (they convey the same information), or that one was derived from the other
#during data preparation - worth verifying before drawing conclusions.

# REVIEW AND CONCLUSION
# drop the IVs that correlate only weakly with the DV -> released_month, acousticness_%, instrumentalness_%, artist_count, released_day, released_year,
# danceability_%, valence_%, energy_%, speechiness_%
# drop in_shazam_charts because multicollinearity exists between the response variables
# so, although there is 88% collinearity between in_spotify_playlists & in_deezer_playlists, we decide to keep in_deezer_playlists as it makes sense and is needed for predicting streams

In [None]:
# Normalization or Standardization ----------------------------------

#normalise
#https://www.digitalocean.com/community/tutorials/normalize-data-in-python
from sklearn import preprocessing

# Platform columns to be rescaled.
response_variables = [
    'in_spotify_playlists', 'in_spotify_charts', 'in_apple_playlists',
    'in_apple_charts', 'in_deezer_playlists', 'in_deezer_charts', 'in_shazam_charts',
]
variables_to_normalize = response_variables

data_to_normalize = data.loc[:, variables_to_normalize]

# axis=0 rescales each column (feature) independently.
normalized_data = preprocessing.normalize(data_to_normalize, axis=0)
normalized_df = pd.DataFrame(normalized_data, columns=variables_to_normalize)

#check if data is normalised
# Column-wise vector norm of the result; for properly normalized data each
# magnitude should be close to 1.
magnitudes = np.linalg.norm(normalized_df, axis=0)
magnitude_by_variable = dict(zip(normalized_df.columns, magnitudes))

for var, magnitude in magnitude_by_variable.items():
    print(f"Magnitude of '{var}': {magnitude:.4f}")

In [None]:
# Regression Modelling ----------------------------
# Conduct simple linear regressions for each IV/DV pair
# Use the non-redundant independent variables in the analysis to find the best fitting model
# Use the best fitting model to make predictions about the dependent variable
import statsmodels.api as sm

# Simple linear regression: streams (DV) explained by in_spotify_playlists (IV).
# OLS takes the dependent variable first and the design matrix second. The two
# were previously swapped here (in_spotify_playlists regressed on streams),
# unlike every other model in this cell, which predicts streams.
all_data = sm.add_constant(data['in_spotify_playlists'])
model = sm.OLS(data['streams'], all_data)
result = model.fit()
print(result.summary())
# Line below is for the standard error (square root of the model's scale)
print(result.scale**0.5)

# Multiple regressions predicting 'streams': start with the first two
# predictors and add one more per model, in this order, up to all seven.
candidate_predictors = [
    'in_spotify_playlists', 'in_spotify_charts', 'in_apple_playlists',
    'in_apple_charts', 'in_deezer_playlists', 'in_deezer_charts', 'in_shazam_charts',
]

for n_predictors in range(2, len(candidate_predictors) + 1):
    # data2 / model2 / result2 end up holding the last (largest) model,
    # the same state the copy-pasted version left behind.
    data2 = sm.add_constant(data[candidate_predictors[:n_predictors]])
    model2 = sm.OLS(data['streams'], data2)
    result2 = model2.fit()
    print(result2.summary())
    # Standard error
    print(result2.scale**0.5)
    # Sum of squared residuals
    print(result2.ssr)

In [None]:
# PCA -------------------------------
from scipy import stats

predictor_variables = ['streams']
response_variables = ['in_spotify_playlists', 'in_spotify_charts', 'in_apple_playlists',
                      'in_apple_charts', 'in_deezer_playlists', 'in_deezer_charts', 'in_shazam_charts']

variables_to_scale = predictor_variables + response_variables

data_to_scale = data[variables_to_scale]

# Standardise every selected column.
# NOTE(review): scaled_df is not used in the visible part of this cell.
scaler = StandardScaler()

scaled_data = scaler.fit_transform(data_to_scale)

scaled_df = pd.DataFrame(scaled_data, columns=variables_to_scale)

in_spotify_playlistsaverage = np.average(data['in_spotify_playlists'])
in_deezer_playlistsaverage = np.average(data['in_deezer_playlists'])

# Figure 1: raw values, with the point of means marked by a red x.
# Each plot gets its own figure and plt.show(); without them both scatters
# were drawn onto the same axes and the second title replaced the first.
plt.figure()
plt.scatter(data['in_spotify_playlists'], data['in_deezer_playlists'])
plt.title("in_spotify_playlists vs in_deezer_playlists")
plt.xlabel("in_spotify_playlists")
plt.ylabel("in_deezer_playlists")
plt.plot(in_spotify_playlistsaverage, in_deezer_playlistsaverage, 'rx')
plt.show()

# Centre both variables on their mean.
gene1 = data['in_spotify_playlists'] - in_spotify_playlistsaverage
gene2 = data['in_deezer_playlists'] - in_deezer_playlistsaverage
print(gene1)
print(gene2)

# Figure 2: the mean-centred values.
plt.figure()
plt.scatter(gene1, gene2)
plt.title(" in_spotify_playlists Gene 1 vs in_deezer_playlists Gene 2")
plt.xlabel(" in_spotify_playlists Gene 1")
plt.ylabel("in_deezer_playlists Gene 2")
plt.show()

# Straight-line fit through the centred data.
slope, intercept, r_value, p_value, std_err = stats.linregress(gene1, gene2)
print(slope)

print(intercept)

In [None]:
# Cluster Analysis ------------------------------------