In [1]:
import os
from pathlib import Path

import pandas as pd
import plotly.express as px
from plotly.offline import init_notebook_mode, iplot
# Enable Plotly's offline notebook rendering. With connected=True, plotly.js is
# loaded from the online CDN rather than embedded in the notebook, so viewing
# the figures needs network access.
init_notebook_mode(connected=True)

In [2]:
# Load the datasets.
# The data directory can be overridden with the SPOTIFY_DATA_DIR environment
# variable so the notebook runs on other machines; it falls back to the
# original location so existing setups keep working unchanged.
DATA_DIR = Path(os.environ.get("SPOTIFY_DATA_DIR", "/Users/user/project/github/spotify"))
features_df = pd.read_csv(DATA_DIR / "Features.csv")
streams_df = pd.read_csv(DATA_DIR / "Streams.csv")

In [3]:
# Preview the first five rows of the audio-feature table
features_df.head(n=5)

Unnamed: 0,id,name,duration,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,danceability
0,0VjIjW4GlUZAMYd2vXMi3b,Blinding Lights,3.33,0.73,1,-5.934,1,0.0598,0.00146,9.5e-05,0.0897,0.334,171.005,0.514
1,7qiZfU4dY1lWllzX7mPBI3,Shape of You,3.9,0.652,1,-3.183,0,0.0802,0.581,0.0,0.0931,0.931,95.977,0.825
2,2XU0oxnq2qxCpomAAuJY8K,Dance Monkey,3.49,0.588,6,-6.4,0,0.0924,0.692,0.000104,0.149,0.513,98.027,0.824
3,7qEHsqek33rTcFNT9PFqLf,Someone You Loved,3.04,0.405,1,-5.679,1,0.0319,0.751,0.0,0.105,0.446,109.891,0.501
4,0e7ipj03S05BNilyu5bRzt,Rockstar,3.64,0.52,5,-6.136,0,0.0712,0.124,7e-05,0.131,0.129,159.801,0.585


In [4]:
# Preview the first five rows of the streams table
streams_df.head(n=5)

Unnamed: 0,Song,Artist,Streams (Billions),Release Date
0,Blinding Lights,The Weeknd,3.449,29-Nov-19
1,Shape of You,Ed Sheeran,3.398,06-Jan-17
2,Dance Monkey,Tones And I,2.77,10-May-19
3,Someone You Loved,Lewis Capaldi,2.68,08-Nov-18
4,Rockstar,Post Malone featuring 21 Savage,2.62,15-Sep-17


In [5]:
# Count missing values per column in the audio-feature table
features_df.isna().sum(axis=0)

id                  0
name                0
duration            0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
danceability        0
dtype: int64

In [6]:
# Count missing values per column in the streams table
streams_df.isna().sum(axis=0)

Song                  0
Artist                0
Streams (Billions)    0
Release Date          0
dtype: int64

# **PLOTS**

In [7]:
# Loudness against energy, one point per row of features_df; hovering a point
# also shows the song's name and duration. The title and axis labels are set
# in a single chained layout update.
fig = px.scatter(
    features_df,
    x="loudness",
    y="energy",
    hover_data=["name", "duration"],
).update_layout(
    title='Relationship between loudness and energy',
    xaxis_title='Loudness',
    yaxis_title='Energy',
)
fig.show()




In [8]:
# Energy against speechiness; the hover box lists both plotted values plus the
# song name.
fig = px.scatter(
    features_df,
    x="energy",
    y="speechiness",
    title="Energy vs. Speechiness",
    hover_data=['energy', 'speechiness', 'name'],
)
fig.show()

In [9]:
# Pairwise scatter plots of duration, loudness and energy, coloured by `mode`.
# `mode` is numeric (0/1 in the rows previewed earlier), so Plotly would colour
# it on a continuous scale; casting it to string on a temporary copy yields two
# discrete legend entries instead. features_df itself is left untouched.
fig = px.scatter_matrix(
    features_df.assign(mode=features_df["mode"].astype(str)),
    dimensions=["duration", "loudness", "energy"],
    color="mode",
)

# Add the figure title
fig.update_layout(title='Relationship between loudness, energy, and duration')

fig.show()

In [10]:
# Pairwise scatter plots of speechiness, acousticness and instrumentalness,
# coloured by `mode`. `mode` is numeric (0/1 in the rows previewed earlier), so
# it is cast to string on a temporary copy to get discrete colours rather than
# a continuous colour bar. features_df itself is left untouched.
fig = px.scatter_matrix(
    features_df.assign(mode=features_df["mode"].astype(str)),
    dimensions=["speechiness", "acousticness", "instrumentalness"],
    color="mode",
)

# Add the figure title
fig.update_layout(title='Relationship between speechiness, acousticness, and instrumentalness')

fig.show()

In [11]:
# Pairwise scatter plots of valence, tempo and danceability, coloured by
# `mode`. `mode` is numeric (0/1 in the rows previewed earlier), so it is cast
# to string on a temporary copy to get discrete colours rather than a
# continuous colour bar. features_df itself is left untouched.
fig = px.scatter_matrix(
    features_df.assign(mode=features_df["mode"].astype(str)),
    dimensions=["valence", "tempo", "danceability"],
    color="mode",
)

# Add the figure title
fig.update_layout(title='Relationship between valence, tempo, and danceability')

fig.show()

In [12]:
import plotly.graph_objs as go

# Compute the correlation matrix over numeric columns only. `id` and `name`
# hold strings, and DataFrame.corr() raises on non-numeric columns in
# pandas >= 2.0 (older versions silently dropped them), so selecting numeric
# dtypes first gives the same matrix on every pandas version.
corr_matrix = features_df.select_dtypes(include="number").corr()

# Create a heatmap plot. Rows of `z` follow the matrix index (y axis) and
# columns of `z` follow the matrix columns (x axis). zmin/zmax pin the
# diverging colour scale to the full correlation range so zero sits at its
# midpoint.
fig = go.Figure(data=go.Heatmap(
                   z=corr_matrix.values,
                   x=corr_matrix.columns.values,
                   y=corr_matrix.index.values,
                   colorscale='RdBu',
                   zmin=-1,
                   zmax=1,
                   colorbar=dict(title='Correlation')
                 ))
fig.update_layout(title='Correlation Matrix of Audio Features')
fig.show()

In [13]:
# Bar chart of stream counts by artist, coloured per artist; hovering shows
# the song and its release date.
# NOTE(review): every row of streams_df is plotted, so the "Top 10" in the
# title only holds if the table has already been limited accordingly - confirm.
fig = px.bar(
    streams_df,
    x="Artist",
    y="Streams (Billions)",
    color="Artist",
    hover_data=["Song", "Release Date"],
    title="Top 10 Most Streamed Artists",
)
fig.show()

In [14]:
# Box plot of speechiness with every individual observation drawn alongside
fig = px.box(
    features_df,
    y="speechiness",
    points="all",
    title="Speechiness Distribution",
)
fig.show()

In [15]:
# Join audio features with stream counts on the song title.
# The rename is applied to a temporary copy instead of in place, so streams_df
# keeps its `Song` column and the earlier bar-chart cell (which uses "Song" in
# hover_data) still works when re-run after this cell.
# NOTE(review): song titles may not be unique; duplicated titles would yield
# extra rows in the merge - verify against the data.
df = pd.merge(features_df, streams_df.rename(columns={'Song': 'name'}), on='name')

# trendline='ols' fits a least-squares line; Plotly needs the optional
# statsmodels package installed for this.
fig = px.scatter(df, x='duration', y='Streams (Billions)', trendline='ols', title='Duration vs Streams')
fig.show()


In [16]:
# Streams per artist, split by `mode`. `mode` is numeric (0/1 in the rows
# previewed earlier), so it is cast to string on a temporary copy; Plotly then
# draws two discrete colour groups instead of a continuous colour bar. `df`
# itself is left untouched.
fig = px.bar(
    df.assign(mode=df["mode"].astype(str)),
    x='Artist',
    y='Streams (Billions)',
    color='mode',
    title='Streams by Artist and Mode',
)
fig.show()


In [17]:
# 2-D density of acousticness (x) against energy (y) over the merged table
fig = px.density_heatmap(
    df,
    x='acousticness',
    y='energy',
    title='Acousticness vs Energy Density',
)
fig.show()

In [18]:
# Box plot summarising the spread of song durations
fig = px.box(
    features_df,
    y="duration",
    title="Box plot of Song Durations",
)
fig.show()

# **Building Prediction Models**
# Linear Regression Analysis


Data preprocessing


In [25]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Target variable: the stream count column
y = df["Streams (Billions)"]

# Input variables: whatever remains after removing the identifier and text
# columns, `mode`, the release date, and the target itself
X = df.drop(columns=["id", "mode", "name", "Artist", "Streams (Billions)", "Release Date"])


In [26]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [27]:
# Fit a linear regression on the training split (fit() returns the estimator)
model = LinearRegression().fit(X_train, y_train)

# Predict the held-out rows and report the mean squared error against the
# true values
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 0.4081881758515456
