In [58]:
# Import the 'warnings' module to manage warnings in the code
import warnings
# Ignore all warnings to prevent them from being displayed during code execution
warnings.filterwarnings('ignore')
# Import the required libraries for data analysis and visualization
import numpy as np
import pandas as pd
import matplotlib
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
# Import the 'StandardScaler' class from the'sklearn.preprocessing' module
from sklearn.preprocessing import StandardScaler

In [59]:
# Open the "spotify-2023.csv" CSV file and read it into the Pandas DataFrame ('df_sportify').
# To address character encoding issues, specify the encoding as "cp775".
df_sportify = pd.read_csv("spotify-2023.csv", encoding = "cp775")

In [60]:
# To quickly examine the data, show the first few rows of the DataFrame "df_sportify."
df_sportify.head()

Unnamed: 0,track_name,artist(s)_name,artist_count,released_year,released_month,released_day,in_spotify_playlists,in_spotify_charts,streams,in_apple_playlists,...,bpm,key,mode,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%
0,Seven (feat. Latto) (Explicit Ver.),"Latto, Jung Kook",2,2023,7,14,553,147,141381703,43,...,125,B,Major,80,89,83,31,0,8,4
1,LALA,Myke Towers,1,2023,3,23,1474,48,133716286,48,...,92,C#,Major,71,61,74,7,0,10,4
2,vampire,Olivia Rodrigo,1,2023,6,30,1397,113,140003974,94,...,138,F,Major,51,32,53,17,0,31,6
3,Cruel Summer,Taylor Swift,1,2019,8,23,7858,100,800840817,116,...,170,A,Major,55,58,72,11,0,11,15
4,WHERE SHE GOES,Bad Bunny,1,2023,5,18,3133,50,303236322,84,...,144,A,Minor,65,23,80,14,63,11,6


In [61]:
# Provide a succinct overview of the DataFrame 'df_sportify'. This covers memory usage, non-null counts, and data types.
df_sportify.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 953 entries, 0 to 952
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   track_name            953 non-null    object
 1   artist(s)_name        953 non-null    object
 2   artist_count          953 non-null    int64 
 3   released_year         953 non-null    int64 
 4   released_month        953 non-null    int64 
 5   released_day          953 non-null    int64 
 6   in_spotify_playlists  953 non-null    int64 
 7   in_spotify_charts     953 non-null    int64 
 8   streams               953 non-null    int64 
 9   in_apple_playlists    953 non-null    int64 
 10  in_apple_charts       953 non-null    int64 
 11  in_deezer_playlists   953 non-null    object
 12  in_deezer_charts      953 non-null    int64 
 13  in_shazam_charts      903 non-null    object
 14  bpm                   953 non-null    int64 
 15  key                   858 non-null    ob

In [62]:
# This helps determine the existence and amount of missing data in the dataset by checking and displaying the sum of the missing values in each column of the DataFrame "df_sportify."
df_sportify.isnull().sum()

track_name               0
artist(s)_name           0
artist_count             0
released_year            0
released_month           0
released_day             0
in_spotify_playlists     0
in_spotify_charts        0
streams                  0
in_apple_playlists       0
in_apple_charts          0
in_deezer_playlists      0
in_deezer_charts         0
in_shazam_charts        50
bpm                      0
key                     95
mode                     0
danceability_%           0
valence_%                0
energy_%                 0
acousticness_%           0
instrumentalness_%       0
liveness_%               0
speechiness_%            0
dtype: int64

In [63]:
#Sort the DataFrame 'df_sportify' according to the 'key' column, then count how many times each group's 'in_spotify_playlists' appears.
count_sportify_play = df_sportify.groupby('key')['in_spotify_playlists'].count().reset_index()
# Use Plotly Express to create a bar plot that displays the total number of playlists according to the song keys.
plot1 = px.bar(count_sportify_play, x='key', y='in_spotify_playlists', title='Total playlist with respect to key of the songs')
# Use the previously defined Plotly Express object 'plot1' to display the generated plot.
plot1.show()

In [64]:
#Group the'mode' column of the DataFrame 'df_sportify', then add up the 'in_spotify_playlists' for every group.
sum_sportify_play = df_sportify.groupby('mode')['in_spotify_playlists'].sum().reset_index()
# Use Plotly Express to create a pie chart that displays the total number of playlists in relation to the song mode.
plot2 = px.pie(sum_sportify_play, values='in_spotify_playlists', names='mode', title='Total paylist with respect to mode')
# Display the pie chart
plot2.show()

In [65]:
# Compute the total of 'danceability_%' for each year by grouping the DataFrame 'df_sportify' by the'released_year' column.
#Using Plotly Express, create an area plot that shows the total amount of danceability relative to the year that the songs were released.
sum_dancebility = df_sportify.groupby('released_year')['danceability_%'].sum().reset_index()
plot3 = px.area(sum_dancebility, x='released_year', y='danceability_%', title='Total sum of the danceability of song with respect to year')
plot3.show()

In [66]:
#To find the total of'speechiness_%' for each key, group the DataFrame 'df_sportify' by the 'key' column.
# Using Plotly Express, create an area plot that shows the total sum of the speechiness percentage in relation to the song key.
sum_speechiness = df_sportify.groupby('key')['speechiness_%'].sum().reset_index()
plot4 = px.area(sum_speechiness, x='key', y='speechiness_%', title='Total sum of the speechiness percentage song with respect to key')
plot4.show()

In [67]:
# Count the instances of'streams' for each key by grouping the DataFrame 'df_sportify' by the 'key' column.
# 'Most Stream analysis': use Plotly Express to create a pie chart that shows the distribution of streams among various keys.
sum_sportify_stream = df_sportify.groupby('key')['streams'].count().reset_index()
plot5 = px.pie(sum_sportify_stream, values='streams', names='key', title='Most Stream analysis')
plot5.show()

In [68]:
# Group the DataFrame 'df_sportify' by the 'mode' column and count the occurrences of 'streams' for each mode.
# Create a pie chart using Plotly Express, displaying the distribution of streams among different modes in a 'Most stream by mode' analysis.
sum_sportify_stream1 = df_sportify.groupby('mode')['streams'].count().reset_index()
plot6 = px.pie(sum_sportify_stream1, values='streams', names='mode', hole=0.6, title='Most stream by mode')
plot6.show()

In [69]:
# Take specific columns out of the DataFrame 'df_sportify': 'artist(s)_name, track_name, in_deezer_playlists, in_shazam_charts, key'.
# Columns are dropped when the 'axis=1' parameter is used, and changes are applied directly to the original DataFrame when the 'inplace=True' option is selected.
df_sportify.drop(['artist(s)_name','track_name', 'in_deezer_playlists', 'in_shazam_charts', 'key'], axis=1, inplace=True)

In [70]:
# After removing certain columns, show the first few rows of the updated DataFrame "df_sportify"
df_sportify.head()

Unnamed: 0,artist_count,released_year,released_month,released_day,in_spotify_playlists,in_spotify_charts,streams,in_apple_playlists,in_apple_charts,in_deezer_charts,bpm,mode,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%
0,2,2023,7,14,553,147,141381703,43,263,10,125,Major,80,89,83,31,0,8,4
1,1,2023,3,23,1474,48,133716286,48,126,14,92,Major,71,61,74,7,0,10,4
2,1,2023,6,30,1397,113,140003974,94,207,14,138,Major,51,32,53,17,0,31,6
3,1,2019,8,23,7858,100,800840817,116,207,12,170,Major,55,58,72,11,0,11,15
4,1,2023,5,18,3133,50,303236322,84,133,15,144,Minor,65,23,80,14,63,11,6


In [71]:
# Show a brief summary of the updated DataFrame "df_sportify" # Data types, non-null counts, and memory usage following column removal are all included in this.
df_sportify.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 953 entries, 0 to 952
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   artist_count          953 non-null    int64 
 1   released_year         953 non-null    int64 
 2   released_month        953 non-null    int64 
 3   released_day          953 non-null    int64 
 4   in_spotify_playlists  953 non-null    int64 
 5   in_spotify_charts     953 non-null    int64 
 6   streams               953 non-null    int64 
 7   in_apple_playlists    953 non-null    int64 
 8   in_apple_charts       953 non-null    int64 
 9   in_deezer_charts      953 non-null    int64 
 10  bpm                   953 non-null    int64 
 11  mode                  953 non-null    object
 12  danceability_%        953 non-null    int64 
 13  valence_%             953 non-null    int64 
 14  energy_%              953 non-null    int64 
 15  acousticness_%        953 non-null    in

In [72]:
# Convert the values in DataFrame 'df_sportify's'mode' column to binary representation:
#'Major' corresponds to 1 and 'Minor' to 0.
df_sportify['mode'] = df_sportify['mode'].map({'Major': 1, 'Minor': 0})

In [73]:
#Present a brief description of the DataFrame 'df_sportify' following the application of binary representation to the'mode' column.
df_sportify.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 953 entries, 0 to 952
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   artist_count          953 non-null    int64
 1   released_year         953 non-null    int64
 2   released_month        953 non-null    int64
 3   released_day          953 non-null    int64
 4   in_spotify_playlists  953 non-null    int64
 5   in_spotify_charts     953 non-null    int64
 6   streams               953 non-null    int64
 7   in_apple_playlists    953 non-null    int64
 8   in_apple_charts       953 non-null    int64
 9   in_deezer_charts      953 non-null    int64
 10  bpm                   953 non-null    int64
 11  mode                  953 non-null    int64
 12  danceability_%        953 non-null    int64
 13  valence_%             953 non-null    int64
 14  energy_%              953 non-null    int64
 15  acousticness_%        953 non-null    int64
 16  instrume

In [74]:
# To evaluate the effects of earlier changes, check and show the total of missing values in each column of the DataFrame "df_sportify" .
df_sportify.isnull().sum()

artist_count            0
released_year           0
released_month          0
released_day            0
in_spotify_playlists    0
in_spotify_charts       0
streams                 0
in_apple_playlists      0
in_apple_charts         0
in_deezer_charts        0
bpm                     0
mode                    0
danceability_%          0
valence_%               0
energy_%                0
acousticness_%          0
instrumentalness_%      0
liveness_%              0
speechiness_%           0
dtype: int64

In [75]:
# Import the Scikit-Learn modules required for logistic regression modeling.
# Import functions to divide the dataset and assess the performance of the model
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [76]:
# For modeling, keep the target variable (y) and the features (X) apart.
# Y contains the'mode' column, which represents the target variable, while
# X contains all other columns.
X = df_sportify.drop('mode', axis=1)
y = df_sportify['mode']

In [77]:
# Use train_test_split to divide the dataset into training and testing sets.
# X_train and y_train stand for the training features and target variable, and X_test and y_test for the testing features and variable.
# For reproducibility, a random seed (random_state=42) is utilized, and the test set size is set to 20%.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [78]:
# Create an instance of a logistic regression model.
# Use the training data (X_train and y_train) to train the model.
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

In [79]:
# Using the test data (X_test), make predictions using the trained Logistic Regression model.
lr_y_pred = lr_model.predict(X_test)

In [80]:
# Compute the Logistic Regression model's accuracy by contrasting the test set's actual and predicted values.
# Output the accuracy score with two decimal places rounded.
accuracy_lr = accuracy_score(y_test, lr_y_pred)
print('Accuracy of Logistic Regression is ', round(accuracy_lr,2))

Accuracy of Logistic Regression is  0.57


In [81]:
# Produce a classification report detailing the effectiveness of the Logistic Regression model on the test set.
# Print the report on classification.
lr_report = classification_report(y_test, lr_y_pred)

print('Classification Report:\n', lr_report)

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        83
           1       0.57      1.00      0.72       108

    accuracy                           0.57       191
   macro avg       0.28      0.50      0.36       191
weighted avg       0.32      0.57      0.41       191

