In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from scipy.stats import pearsonr
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import os

#THEORETICAL FRAMEWORK --------------------------------
#https://www.kaggle.com/datasets/nelgiriyewithana/top-spotify-songs-2023

#DATA SELECTION ----------------------------------
# Show the directory the notebook started in, then switch to the project
# folder so the relative CSV path below resolves.
cwd = os.getcwd()
print(cwd)
os.chdir('C://Users//john8/Year 4/Data Analysis')

# Load the dataset; the file is decoded as latin-1.
data = pd.read_csv("spotify-2023.csv", encoding='latin-1')

# First look at the frame: schema, leading rows, then summary statistics.
# info() writes its own report and returns None, so "None" is echoed after it.
for summarise in (data.info, data.head, data.describe):
    print(summarise())

#IMPUTATION OF MISSING DATA ----------------------------------------

#1. Handling Missing Values:
# Count the nulls in every column before anything is removed.
print("Missing values before handling:")
print(data.isna().sum())

# Null counts recorded by the original author for this dataset:
#track_name               0
#artist(s)_name           0
#artist_count             0
#released_year            0
#released_month           0
#released_day             0
#in_spotify_playlists     0
#in_spotify_charts        0
#streams                  0
#in_apple_playlists       0
#in_apple_charts          0
#in_deezer_playlists      0
#in_deezer_charts         0
#in_shazam_charts        50
#bpm                      0
#key                     95
#mode                     0
#danceability_%           0
#valence_%                0
#energy_%                 0
#acousticness_%           0
#instrumentalness_%       0
#liveness_%               0
#speechiness_%            0

# Only in_shazam_charts and key contain nulls, so rows missing either one are
# dropped rather than imputed. The drop happens in place on `data`.
columns_with_nulls = ['in_shazam_charts', 'key']
data.dropna(subset=columns_with_nulls, inplace=True)
print("\nMissing values after handling:")
print(data.isna().sum())

#2. Outlier Detection and Treatment:
# A row is removed when any of the columns listed below lies more than
# `threshold` standard deviations away from that column's mean.

integer_columns = [
    'artist_count', 'released_year', 'released_month', 'released_day',
    'in_spotify_playlists', 'in_spotify_charts', 'in_apple_playlists',
    'in_apple_charts', 'in_deezer_charts', 'bpm', 'danceability_%',
    'valence_%', 'energy_%', 'acousticness_%', 'instrumentalness_%',
    'liveness_%', 'speechiness_%',
]

# Force the selected columns to numeric; entries that cannot be parsed become NaN.
data[integer_columns] = data[integer_columns].apply(pd.to_numeric, errors='coerce')

# Absolute z-score of every value, computed column by column.
numeric_view = data[integer_columns]
z_scores = ((numeric_view - numeric_view.mean()) / numeric_view.std()).abs()

# Cut-off, in standard deviations, beyond which a value counts as an outlier.
threshold = 3

# True for rows with at least one z-score above the cut-off.
# NaN z-scores compare False, so they never flag a row.
outliers_mask = z_scores.gt(threshold).any(axis=1)

# Row count before removal, for comparison with the number dropped.
print(f"Number of rows before dropping outliers: {len(data)}")

# Remove the flagged rows in place.
data.drop(index=data.index[outliers_mask], inplace=True)

# Number of rows that were flagged and removed.
print(f"Number of rows dropped due to outliers: {outliers_mask.sum()}")

# Show the cleaned frame.
print(data)

#3. Data Type Conversion:
# Snapshot of the column dtypes before any conversion.
data_types = data.dtypes
print(data_types)

# Parse 'streams' as numbers; entries that cannot be parsed become NaN.
data['streams'] = pd.to_numeric(data['streams'], errors='coerce')

# Cast the remaining text-like columns to str, one explicit dtype per column.
object_columns = ['track_name', 'artist(s)_name', 'in_deezer_playlists', 'in_shazam_charts', 'key', 'mode']
data[object_columns] = data[object_columns].astype({name: str for name in object_columns})

# Column dtypes after the conversions above.
print(data.dtypes)

#4. Handling Duplicates:
# Report rows that repeat an earlier row in every column; nothing is removed here.
is_repeat = data.duplicated()
duplicate_rows = data.loc[is_repeat]
print("Duplicate Rows:", duplicate_rows, sep="\n")

#5. Feature Engineering:
# Adds two derived columns:
#   total_playlists - sum of the Spotify, Apple and Deezer playlist counts
#   bpm_category    - 'low' / 'medium' / 'high' tempo band derived from bpm
import numpy as np

playlist_columns = ['in_spotify_playlists', 'in_apple_playlists', 'in_deezer_playlists']

# check data types as we came across error when adding +
print("Data Types using info() method:")
print(data[playlist_columns].info())

print("Data Types using dtype attribute:")
for column in playlist_columns:
    print(f"{column}:", data[column].dtype)

# 'in_deezer_playlists' is stored as text, so it must be parsed before it can
# be added to the other counts. Thousands separators are removed first:
# pd.to_numeric(..., errors='coerce') would otherwise turn a value such as
# '1,234' into NaN without any warning, and that NaN would propagate into
# total_playlists.
# NOTE(review): commas are the presumed reason the column is not numeric -
# confirm against the CSV.
deezer_text = data['in_deezer_playlists'].astype(str).str.replace(',', '', regex=False)
data['in_deezer_playlists'] = pd.to_numeric(deezer_text, errors='coerce')

# Make any remaining parse failures visible instead of hiding them as NaN.
unparsed = int(data['in_deezer_playlists'].isna().sum())
if unparsed:
    print(f"in_deezer_playlists values that could not be parsed: {unparsed}")

# drop rows where any platform reports zero playlists (NaN != 0, so NaN rows
# are kept, exactly as with three separate filters). .copy() makes the result
# an independent frame before new columns are assigned to it.
has_playlists = (data[playlist_columns] != 0).all(axis=1)
data = data[has_playlists].copy()

# verify data types after changing
print("Data Types after Conversion:")
print(data[['in_deezer_playlists']].info())

# Plain addition on purpose: a NaN in any platform keeps the total NaN rather
# than silently under-counting it.
data['total_playlists'] = data['in_spotify_playlists'] + data['in_apple_playlists'] + data['in_deezer_playlists']

# turn 'bpm' into categories; right=False makes the bands
# [0, 100) -> low, [100, 150) -> medium, [150, inf) -> high
bins = [0, 100, 150, np.inf]
labels = ['low', 'medium', 'high']
data['bpm_category'] = pd.cut(data['bpm'], bins=bins, labels=labels, right=False)

print(data)


#6. Normalization or Standardization:
#Standardize or normalize numerical features if your models require it.
#Ensure that features are on similar scales to prevent models from being biased towards features with larger magnitudes.
#7. Encoding Categorical Variables:
#Encode categorical variables into numerical format using techniques like one-hot encoding or label encoding, depending on the nature of the data and the requirements of your models.
#8. Feature Selection:
#Perform feature selection to identify the most relevant features for your predictive modeling tasks. This can help reduce model complexity and improve model performance.

C:\Users\john8\Year 4\Data Analysis
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 953 entries, 0 to 952
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   track_name            953 non-null    object
 1   artist(s)_name        953 non-null    object
 2   artist_count          953 non-null    int64 
 3   released_year         953 non-null    int64 
 4   released_month        953 non-null    int64 
 5   released_day          953 non-null    int64 
 6   in_spotify_playlists  953 non-null    int64 
 7   in_spotify_charts     953 non-null    int64 
 8   streams               953 non-null    object
 9   in_apple_playlists    953 non-null    int64 
 10  in_apple_charts       953 non-null    int64 
 11  in_deezer_playlists   953 non-null    object
 12  in_deezer_charts      953 non-null    int64 
 13  in_shazam_charts      903 non-null    object
 14  bpm                   953 non-null    int64 
 15  key 

In [35]:
# Pearson correlation between 'streams' and every other numeric feature.
predictor_variables = ['streams']
response_variables = ['in_spotify_playlists',
                      'in_spotify_charts', 'in_apple_playlists',
                      'in_apple_charts', 'in_deezer_playlists', 'in_deezer_charts', 'in_shazam_charts',
                      'artist_count', 'released_year', 'released_month', 'bpm', 'danceability_%', 'valence_%', 'energy_%', 'acousticness_%', 'instrumentalness_%',
                       'liveness_%', 'speechiness_%']


def _to_number(column):
    """Parse *column* as numeric; entries that cannot be parsed become NaN.

    Non-numeric columns have thousands separators removed first, so a value
    such as '1,234' is read as 1234 instead of being coerced to NaN and then
    dropped with its whole row by the dropna() below.
    NOTE(review): commas are the presumed formatting of the text columns
    (e.g. in_shazam_charts) - confirm against the CSV.
    """
    if not pd.api.types.is_numeric_dtype(column):
        column = column.astype(str).str.replace(',', '', regex=False)
    return pd.to_numeric(column, errors='coerce')


# Convert columns to numeric and handle missing values
data[predictor_variables] = data[predictor_variables].apply(_to_number)
data[response_variables] = data[response_variables].apply(_to_number)

# Remove rows with NaN or inf values (pearsonr cannot handle them).
# dropna() looks at every column of `data`, not only the ones listed above.
data = data.replace([np.inf, -np.inf], np.nan).dropna()

# One coefficient per (predictor, response) pair; p_value is computed but not printed.
for predictor_var in predictor_variables:
    for response_var in response_variables:
        corr_coef, p_value = pearsonr(data[predictor_var], data[response_var])
        print(f"Pearson correlation coefficient between '{predictor_var}' and '{response_var}': {corr_coef:.2f}")

#==================================================

# One boxplot per platform playlist/chart column, each in its own figure,
# to check the column for outliers.
# Original author's note: in_shazam_charts looked ok.
boxplot_columns = [
    'in_spotify_playlists',
    'in_spotify_charts',
    'in_apple_playlists',
    'in_apple_charts',
    'in_deezer_playlists',
    'in_deezer_charts',
    'in_shazam_charts',
]

for column in boxplot_columns:
    figure(num=None, figsize=(8, 6), dpi=80, facecolor='w', edgecolor='k')
    plt.boxplot(x=data[column])
    plt.title(f"Boxplot of {column}")
    plt.ylabel(column)
    plt.show()

#------------------------------------------

# Scatter plot of streams (x) against each platform column (y), one figure per
# column, with a degree-1 least-squares fit drawn in red.
# in_apple_charts is not plotted here, as in the original sequence of plots.
stream_counts = data['streams']
# The fitted line is evaluated at the sorted distinct stream counts.
distinct_streams = np.unique(stream_counts)

scatter_columns = [
    'in_spotify_playlists',
    'in_spotify_charts',
    'in_apple_playlists',
    'in_deezer_playlists',
    'in_deezer_charts',
    'in_shazam_charts',
]

for column in scatter_columns:
    plt.figure(figsize=(8, 6))
    plt.title(f"streams vs {column}")
    plt.xlabel("streams")
    plt.ylabel(column)
    plt.scatter(stream_counts, data[column], alpha=0.5)
    trend_line = np.poly1d(np.polyfit(stream_counts, data[column], 1))
    plt.plot(distinct_streams, trend_line(distinct_streams), color='red')

#===========================================================================
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Columns whose pairwise relationships are inspected; all are read from `data`.
response_variables = ['in_spotify_playlists', 'in_spotify_charts', 'in_apple_playlists',
                      'in_apple_charts', 'in_deezer_playlists', 'in_deezer_charts', 'in_shazam_charts',
                      'artist_count', 'released_year', 'released_month', 'bpm', 'danceability_%',
                      'valence_%', 'energy_%', 'acousticness_%', 'instrumentalness_%', 'liveness_%',
                      'speechiness_%']

# The same list is used as the set of independent variables.
independent_variables = response_variables

# Pairwise correlation matrix of the selected columns.
correlation_matrix = data[independent_variables].corr()

# Heatmap of the matrix, each cell annotated to two decimals.
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title('Correlation Matrix of Independent Variables')
plt.show()

# Scatterplots between every pair of columns. pairplot builds a figure with a
# grid of subplots, and plt.title would label only the currently active
# subplot, so the heading is set at figure level with suptitle. y=1.02 places
# it just above the grid.
sns.pairplot(data[independent_variables])
plt.suptitle('Pairplot of Independent Variables', y=1.02)
plt.show()




In [None]:
#regression modelling

In [None]:
#normalise
#https://www.digitalocean.com/community/tutorials/normalize-data-in-python
from sklearn import preprocessing
import pandas as pd

# Platform playlist/chart columns to be rescaled.
response_variables = ['in_spotify_playlists', 'in_spotify_charts', 'in_apple_playlists',
                      'in_apple_charts', 'in_deezer_playlists', 'in_deezer_charts', 'in_shazam_charts']

variables_to_normalize = response_variables

data_to_normalize = data[variables_to_normalize]

# axis=0 rescales each column independently of the others.
normalized_data = preprocessing.normalize(data_to_normalize, axis=0)

# normalize() returns a bare array. Re-attach the row labels of `data` as well
# as the column names: rows were dropped earlier, so without index= the frame
# would get a fresh 0..n-1 index and no longer line up with `data`.
normalized_df = pd.DataFrame(
    normalized_data,
    columns=variables_to_normalize,
    index=data_to_normalize.index,
)

#check if data is normalised

import numpy as np

# Norm of every normalized column.
magnitudes = np.linalg.norm(normalized_df, axis=0)

# Print the magnitudes,  for properly normalized data, the magnitudes should be close to 1.
for var, magnitude in zip(normalized_df.columns, magnitudes):
    print(f"Magnitude of '{var}': {magnitude:.4f}")

#--------------------------------------------------
#scale data & centred scatter
# NOTE(review): the original heading mentions PCA, but no PCA is computed in
# this block - only standard scaling and mean-centring.
predictor_variables = ['streams']
response_variables = ['in_spotify_playlists', 'in_spotify_charts', 'in_apple_playlists',
                      'in_apple_charts', 'in_deezer_playlists', 'in_deezer_charts', 'in_shazam_charts']

variables_to_scale = predictor_variables + response_variables

data_to_scale = data[variables_to_scale]

scaler = StandardScaler()

scaled_data = scaler.fit_transform(data_to_scale)

# fit_transform returns a bare array. Keep the row labels of `data` so
# scaled_df stays aligned with it (rows were dropped earlier, so a default
# 0..n-1 index would not match).
scaled_df = pd.DataFrame(
    scaled_data,
    columns=variables_to_scale,
    index=data_to_scale.index,
)

# Raw Spotify vs Deezer playlist counts, with the point of means marked by a red x.
# Drawn in its own figure so the centred plot below does not land on the same axes.
plt.figure()
plt.scatter(data['in_spotify_playlists'], data['in_deezer_playlists'])
plt.title("in_spotify_playlists vs in_deezer_playlists")
plt.xlabel("in_spotify_playlists")
plt.ylabel("in_deezer_playlists")
in_spotify_playlistsaverage = np.average(data['in_spotify_playlists'])
in_deezer_playlistsaverage = np.average(data['in_deezer_playlists'])
plt.plot(in_spotify_playlistsaverage, in_deezer_playlistsaverage, 'rx')
plt.show()

# Mean-centred versions of the two columns. The names gene1/gene2 are kept
# because the regression cell further down reads them.
gene1 = data['in_spotify_playlists'] - in_spotify_playlistsaverage
gene2 = data['in_deezer_playlists'] - in_deezer_playlistsaverage
print(gene1)
print(gene2)

# Same relationship after centring, again in a separate figure.
plt.figure()
plt.scatter(gene1, gene2)
plt.title(" in_spotify_playlists Gene 1 vs in_deezer_playlists Gene 2")
plt.xlabel(" in_spotify_playlists Gene 1")
plt.ylabel("in_deezer_playlists Gene 2")
plt.show()



In [None]:
# Least-squares line through the mean-centred playlist counts
# (gene1 as x, gene2 as y, both defined in an earlier cell).
from scipy import stats

centred_fit = stats.linregress(gene1, gene2)
slope, intercept, r_value, p_value, std_err = centred_fit

# Slope first, then intercept, one per line.
print(slope, intercept, sep="\n")

In [None]:
#======================================
# Simple OLS regression: in_spotify_playlists explained by streams plus an intercept.
import statsmodels.api as sm

# add_constant adds the intercept column to the single regressor.
all_data = sm.add_constant(data['streams'])
model = sm.OLS(endog=data['in_spotify_playlists'], exog=all_data)
result = model.fit()
print(result.summary())
# Standard error of the regression: square root of the fitted scale.
print(result.scale**0.5)

In [None]:
# Multiple OLS regression: 'streams' explained by in_spotify_playlists and
# in_spotify_charts plus an intercept. Uses `sm` imported in an earlier cell.
data2 = sm.add_constant(
    data[['in_spotify_playlists', 'in_spotify_charts']]
)
model2 = sm.OLS(endog=data['streams'], exog=data2)
result2 = model2.fit()
print(result2.summary())
# Standard error (square root of the fitted scale), then the sum of squared residuals.
print(result2.scale**0.5, result2.ssr, sep="\n")

In [None]:
# Multiple OLS regression: 'streams' explained by in_spotify_playlists,
# in_spotify_charts and in_apple_playlists plus an intercept.
# Uses `sm` imported in an earlier cell.
data2 = sm.add_constant(
    data[['in_spotify_playlists', 'in_spotify_charts', 'in_apple_playlists']]
)
model2 = sm.OLS(endog=data['streams'], exog=data2)
result2 = model2.fit()
print(result2.summary())
# Standard error (square root of the fitted scale), then the sum of squared residuals.
print(result2.scale**0.5, result2.ssr, sep="\n")

In [None]:
# Multiple OLS regression: 'streams' explained by the Spotify and Apple
# playlist and chart columns plus an intercept.
# Uses `sm` imported in an earlier cell.
data2 = sm.add_constant(
    data[[
        'in_spotify_playlists', 'in_spotify_charts',
        'in_apple_playlists', 'in_apple_charts',
    ]]
)
model2 = sm.OLS(endog=data['streams'], exog=data2)
result2 = model2.fit()
print(result2.summary())
# Standard error (square root of the fitted scale), then the sum of squared residuals.
print(result2.scale**0.5, result2.ssr, sep="\n")

In [None]:
# Multiple OLS regression: 'streams' explained by the Spotify and Apple
# playlist and chart columns and in_deezer_playlists, plus an intercept.
# Uses `sm` imported in an earlier cell.
data2 = sm.add_constant(
    data[[
        'in_spotify_playlists', 'in_spotify_charts',
        'in_apple_playlists', 'in_apple_charts',
        'in_deezer_playlists',
    ]]
)
model2 = sm.OLS(endog=data['streams'], exog=data2)
result2 = model2.fit()
print(result2.summary())
# Standard error (square root of the fitted scale), then the sum of squared residuals.
print(result2.scale**0.5, result2.ssr, sep="\n")

In [None]:
# Multiple OLS regression: 'streams' explained by the Spotify, Apple and
# Deezer playlist and chart columns, plus an intercept.
# Uses `sm` imported in an earlier cell.
data2 = sm.add_constant(
    data[[
        'in_spotify_playlists', 'in_spotify_charts',
        'in_apple_playlists', 'in_apple_charts',
        'in_deezer_playlists', 'in_deezer_charts',
    ]]
)
model2 = sm.OLS(endog=data['streams'], exog=data2)
result2 = model2.fit()
print(result2.summary())
# Standard error (square root of the fitted scale), then the sum of squared residuals.
print(result2.scale**0.5, result2.ssr, sep="\n")

In [None]:
# Multiple OLS regression: 'streams' explained by the Spotify, Apple and
# Deezer playlist and chart columns and in_shazam_charts, plus an intercept.
# Uses `sm` imported in an earlier cell.
data2 = sm.add_constant(
    data[[
        'in_spotify_playlists', 'in_spotify_charts',
        'in_apple_playlists', 'in_apple_charts',
        'in_deezer_playlists', 'in_deezer_charts',
        'in_shazam_charts',
    ]]
)
model2 = sm.OLS(endog=data['streams'], exog=data2)
result2 = model2.fit()
print(result2.summary())
# Standard error (square root of the fitted scale), then the sum of squared residuals.
print(result2.scale**0.5, result2.ssr, sep="\n")