In [None]:

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)


<div class="alert alert-block alert-success" style='font-size:25px'>
<b>Hi!</b> If you like this notebook, an <font color="Blue"><b>Upvote</b></font> would be great! 😊 <br>
    Please leave a <font color="BLUE"><b>comment</b></font> with your <font color="BLUE"><b>feedback</b></font> to help me improve.  <br>  
     Thanks for your time.
</div>

In [None]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
%matplotlib inline 
import matplotlib as mpl
from matplotlib.ticker import NullFormatter

import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# Print the style names matplotlib offers, then apply ggplot to the figures below.
print(plt.style.available)
mpl.style.use('ggplot')

# <div style='color:blue'>Data Pre-Processing</div>

In [None]:
# Load the dataset, using the first CSV column as the row index.
spotify_youtube_csv = '/kaggle/input/spotify-and-youtube/Spotify_Youtube.csv'
df = pd.read_csv(spotify_youtube_csv, index_col=0)

# Preview the first five rows.
df.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

## Checking the dataset for missing values, unique values and data types

In [None]:
# Per-column profile: one entry per column of `df`, in column order.
variables = list(df.columns)
missing = [df[name].isnull().sum() for name in variables]   # number of NaN cells
unique = [df[name].nunique() for name in variables]         # distinct non-null values
types = [df[name].dtypes for name in variables]             # pandas dtype
count = [len(df[name]) for name in variables]               # total rows (NaN included)

output = pd.DataFrame({
    'variable': variables,
    'dtype': types,
    'count': count,
    'unique': unique,
    'missing': missing,
})

# Show the columns with the most missing values first.
output.sort_values("missing", ascending=False).reset_index(drop=True)

In [None]:
# Summary statistics of the numeric variables, transposed so each variable is a row.
df.describe().transpose()

In [None]:
# To keep the dataset reliable with respect to the real world,
# every row that contains at least one missing value is dropped.
df = df.dropna(axis=0)


In [None]:
# Also check for duplicates: count the rows that repeat an earlier row in every column.
df.duplicated().sum()

# Exploratory Data Analysis (EDA)

## Let's see the number of songs and the total views for each album type

In [None]:
def formatter(x, pos):
    """Return the tick value `x` expressed in billions, rounded to one decimal.

    `pos` is not used; the two-argument signature is what the axis
    `set_major_formatter` calls in this notebook pass a callable for.
    """
    in_billions = x / 1e9
    return str(round(in_billions, 1))
def millformatter(x, pos):
    """Return the tick value `x` expressed in millions, rounded to one decimal.

    `pos` is not used; it is accepted so the function can be passed as an
    axis tick formatter.
    """
    in_millions = x / 1e6
    return str(round(in_millions, 1))

# Total YouTube views per album type, largest first. Only the 'Views' column
# is summed; summing the whole frame would also aggregate every text column.
most_view_album = (
    df.groupby('Album_type')['Views']
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)

# The two panels share their x axis (sharex=True), so both must draw the
# categories in the same order; otherwise the tick labels of one panel may
# not match its bars.
album_order = most_view_album['Album_type'].tolist()

fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, sharex=True, figsize=(20, 10))

# Left panel: number of songs per album type.
sns.countplot(df, x='Album_type', order=album_order, ax=ax1)
ax1.set_title("Total of songs for each album types")
ax1.set_ylabel("Total of songs")
ax1.set_xlabel("Album types")

# Right panel: summed views per album type, tick labels shown in billions.
sns.barplot(most_view_album, y='Views', x='Album_type', order=album_order, ax=ax2)
ax2.set_title("Total view based on album types")
ax2.set_ylabel("Total Views in Billions")
ax2.set_xlabel("Album types")
ax2.yaxis.set_major_formatter(formatter)
ax2.yaxis.set_minor_formatter(NullFormatter())

fig.suptitle("Total album types")

## Which tracks have the most views and likes on YouTube, and the most streams on Spotify?

In [None]:
def _top_tracks(column, n=10):
    """Sum `column` per track title and return the `n` largest totals.

    Only the requested column is aggregated (instead of summing the whole
    frame, text columns included). The result is a DataFrame with the columns
    'Track' and `column`, sorted in descending order.
    """
    # NOTE(review): grouping by title alone merges rows that share a track
    # name (for example a title listed under several artists) - confirm this
    # is the intended definition of "track".
    return (
        df.groupby('Track')[column]
        .sum()
        .sort_values(ascending=False)
        .reset_index()
        .head(n)
    )


fig, (ax1, ax2, ax3) = plt.subplots(nrows=3, ncols=1, figsize=(20, 20))
fig.subplots_adjust(hspace=0.5)

# YouTube views (tick labels in billions).
most_view_track = _top_tracks('Views')
sns.barplot(most_view_track, x='Views', y='Track', ax=ax1)

ax1.set_title("Top 10 of most viewed track", fontsize=30)
ax1.set_xlabel("Views in Billion")
ax1.xaxis.set_major_formatter(formatter)
ax1.yaxis.set_minor_formatter(NullFormatter())

# YouTube likes (tick labels in millions).
most_likes_track = _top_tracks('Likes')
sns.barplot(most_likes_track, x='Likes', y='Track', palette='mako', ax=ax2)

ax2.set_title("Top 10 of most likes track", fontsize=30)
ax2.set_xlabel("Likes in Million")
ax2.xaxis.set_major_formatter(millformatter)
ax2.yaxis.set_minor_formatter(NullFormatter())

# Spotify streams (tick labels in billions).
most_stream_track = _top_tracks('Stream')
sns.barplot(most_stream_track, x='Stream', y='Track', palette='rocket', ax=ax3)

ax3.set_title("Top 10 of most Streams track", fontsize=30)
ax3.set_xlabel("Streams in Billion")
ax3.xaxis.set_major_formatter(formatter)
ax3.yaxis.set_minor_formatter(NullFormatter())

fig.suptitle("Top 10 of track", fontsize=40)

## Artists with the most streams and views

In [None]:
fig, (ax1, ax2, ax3) = plt.subplots(nrows=3, ncols=1, figsize=(20, 20))

# Per-artist totals, aggregated once and only for the two columns the charts
# use (summing the whole frame would also aggregate every text column).
artist_totals = df.groupby('Artist')[['Views', 'Stream']].sum()

# Both platforms combined: YouTube views + Spotify streams.
most_popular_artist = (
    artist_totals
    .assign(TotalListened=artist_totals['Views'] + artist_totals['Stream'])
    .sort_values('TotalListened', ascending=False)
    .reset_index()
    .head(10)
)

sns.barplot(most_popular_artist, y='Artist', x='TotalListened', ax=ax1)
ax1.xaxis.set_major_formatter(formatter)
ax1.xaxis.set_minor_formatter(NullFormatter())
ax1.set_xlabel("Total in Billions")
ax1.set_title("Total of viewed on both platform")

# Youtube

most_popular_artist_youtube = (
    artist_totals.sort_values('Views', ascending=False).reset_index().head(10)
)

sns.barplot(most_popular_artist_youtube, y='Artist', x='Views', palette='hls', ax=ax2)
ax2.xaxis.set_major_formatter(formatter)
ax2.xaxis.set_minor_formatter(NullFormatter())
ax2.set_xlabel("Total in Billions")
ax2.set_title("Total of viewed on Youtube")

# Spotify

most_popular_artist_spotify = (
    artist_totals.sort_values('Stream', ascending=False).reset_index().head(10)
)

sns.barplot(most_popular_artist_spotify, y='Artist', x='Stream', palette='mako', ax=ax3)
ax3.xaxis.set_major_formatter(formatter)
ax3.xaxis.set_minor_formatter(NullFormatter())
ax3.set_xlabel("Total in Billions")
ax3.set_title("Total of Stream on Spotify")

fig.suptitle("Artist with the most stream and views", fontsize=40)

We can see that Ed Sheeran is the most listened-to artist on both platforms.
## So let's see which of Ed Sheeran's tracks are the most popular

In [None]:
# Select the artist's rows first, then total views and streams per track.
# (Aggregating the whole dataset and filtering afterwards does the same work
# for every other artist and sums every text column as well.)
ed_sheeran = (
    df.loc[df['Artist'] == 'Ed Sheeran']
    .groupby(['Artist', 'Track'], as_index=False)[['Views', 'Stream']]
    .sum()
    # Combined audience: YouTube views + Spotify streams.
    .assign(TotalListened=lambda tracks: tracks['Views'] + tracks['Stream'])
    .sort_values('TotalListened', ascending=False)
)

# seaborn returns the Axes it drew on; the notebook stores it under the name `fig`.
fig = sns.barplot(ed_sheeran, x='TotalListened', y='Track')
fig.xaxis.set_major_formatter(formatter)
fig.set_xlabel("Total listened on both platform in Billions")
fig.set_title("Most popular track by Ed Sheeran", fontsize=20)

## The most popular videos with high danceability on YouTube
Danceability: describes how suitable a track is for dancing based on a combination of musical elements including tempo, rhythm stability, beat strength, and overall regularity. A value of 0.0 is least danceable and 1.0 is most danceable.

In [None]:
# Keep only highly danceable rows (Danceability > 0.9), total their YouTube
# views per (track, danceability) pair and take the ten largest. Only 'Views'
# is summed, rather than every column of the frame.
danceability_track = (
    df.loc[df['Danceability'] > 0.9]
    .groupby(['Track', 'Danceability'], as_index=False)['Views']
    .sum()
    .sort_values('Views', ascending=False)
    .head(10)
)

# seaborn returns the Axes it drew on; the notebook stores it under the name `fig`.
fig = sns.barplot(danceability_track, y='Track', x='Views')
fig.set_title("Most popular video with high of danceability on Youtube", fontsize=15)
fig.xaxis.set_major_formatter(formatter)
fig.set_xlabel("Total views in Billions")

## Let's see the most streamed cheerful and sad tracks on Spotify

Valence : a measure from 0.0 to 1.0 describing the musical positiveness conveyed by a track. Tracks with high valence sound more positive (e.g. happy, cheerful, euphoric), while tracks with low valence sound more negative (e.g. sad, depressed, angry).

In [None]:
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(30, 10))
fig.subplots_adjust(wspace=0.5)

# Spotify streams per (track, valence) pair, largest first. Computed once and
# shared by both panels; only the 'Stream' column is summed.
valence_streams = (
    df.groupby(['Track', 'Valence'], as_index=False)['Stream']
    .sum()
    .sort_values('Stream', ascending=False)
    .reset_index(drop=True)
)

# Valence with score above 0.8

happy_track = valence_streams[valence_streams['Valence'] > 0.8].head(10)

sns.barplot(happy_track, x='Stream', y='Track', ax=ax1)
ax1.set_title("Most viewed track with happy vibes", fontsize=20)
ax1.xaxis.set_major_formatter(formatter)
ax1.xaxis.set_minor_formatter(NullFormatter())
ax1.set_xlabel("Stream in Billion")

# Valence with score below 0.2

sad_track = valence_streams[valence_streams['Valence'] < 0.2].head(10)

sns.barplot(sad_track, x='Stream', y='Track', ax=ax2)
ax2.set_title("Most viewed track with sad vibes", fontsize=20)
ax2.xaxis.set_major_formatter(formatter)
ax2.xaxis.set_minor_formatter(NullFormatter())
ax2.set_xlabel("Stream in Billion")

fig.suptitle("Most viewed track on happy and sad vibes", fontsize=30)

## Let's see the relationship between views and likes on YouTube videos

In [None]:
# One point per row of `df`, coloured by album type.
# seaborn returns the Axes it drew on; the notebook stores it under the name `fig`.
fig = sns.scatterplot(data=df, x='Views', y='Likes', hue='Album_type')
fig.set_title("Relationship between views and likes on youtube videos", fontsize=20)

# x axis: views, tick labels in billions.
fig.set_xlabel("Views in billion")
fig.xaxis.set_major_formatter(formatter)
fig.xaxis.set_minor_formatter(NullFormatter())

# y axis: likes, tick labels in millions.
fig.set_ylabel("Likes in million")
fig.yaxis.set_major_formatter(millformatter)
fig.yaxis.set_minor_formatter(NullFormatter())

## Let's see the relationship between views on YouTube and streams on Spotify

In [None]:
# One point per row of `df`, coloured by album type.
# seaborn returns the Axes it drew on; the notebook stores it under the name `fig`.
fig = sns.scatterplot(data=df, x='Views', y='Stream', hue='Album_type')
fig.set_title("Relationship between views on both platforms", fontsize=20)

# x axis: views, tick labels in billions.
fig.set_xlabel("Views in billions")
fig.xaxis.set_major_formatter(formatter)
fig.xaxis.set_minor_formatter(NullFormatter())

# y axis: streams, tick labels in billions.
fig.set_ylabel("Stream in billions")
fig.yaxis.set_major_formatter(formatter)
fig.yaxis.set_minor_formatter(NullFormatter())

## Let's see the relationship between duration and views

In [None]:
# Work on a copy: `df_duration = df` would only create a second name for the
# same frame, so the conversion would overwrite `df['Duration_ms']` and
# re-running this cell would divide the values again.
df_duration = df.copy()

# Milliseconds -> minutes. The column keeps the name 'Duration_ms' because the
# plotting cell that follows reads it, but in `df_duration` it holds minutes.
df_duration['Duration_ms'] = (df_duration['Duration_ms'] / 1000) / 60

In [None]:
# seaborn returns the Axes it drew on; the notebook stores it under the name `fig`.
fig = sns.scatterplot(df_duration, y='Duration_ms', x='Views')
fig.set_title("Relationship between duration views", fontsize=15)

# `df_duration['Duration_ms']` was converted to minutes in the previous cell,
# so the default axis label (the column name) would state the wrong unit.
fig.set_ylabel("Duration in minutes")

# Same billions formatting for views as in the other scatter plots.
fig.xaxis.set_major_formatter(formatter)
fig.xaxis.set_minor_formatter(NullFormatter())
fig.set_xlabel("Views in billions")

# Model to predict views

## Let's see which features have a good relationship with the variables Stream, Views and Likes

In [None]:
# Columns used for the correlation analysis and the model:
# the audio features first, then the three engagement columns.
features = [
    'Danceability',
    'Energy',
    'Key',
    'Loudness',
    'Speechiness',
    'Acousticness',
    'Instrumentalness',
    'Liveness',
    'Valence',
    'Tempo',
    'Duration_ms',
    'Stream',
    'Views',
    'Likes',
]
features

In [None]:
# Restrict the frame to the selected columns and compute their pairwise correlations.
df_feat = df.loc[:, features]
corr_df = df_feat.corr()

# Wide figure so that the annotated cells stay readable.
fig = plt.figure(figsize=(20, 8))
sns.heatmap(data=corr_df, annot=True, fmt='.2f')

In [None]:
# Absolute correlation of each audio feature with Views, strongest first.
# The three engagement columns are removed by name; slicing off the first
# three sorted entries only works while they happen to rank highest.
corr_df['Views'].abs().drop(['Views', 'Likes', 'Stream']).sort_values(ascending=False)

In [None]:
# Absolute correlation of each audio feature with Likes, strongest first.
# The three engagement columns are removed by name; slicing off the first
# three sorted entries only works while they happen to rank highest.
corr_df['Likes'].abs().drop(['Views', 'Likes', 'Stream']).sort_values(ascending=False)

In [None]:
# Absolute correlation of each audio feature with Stream, strongest first.
# The three engagement columns are removed by name; slicing off the first
# three sorted entries only works while they happen to rank highest.
corr_df['Stream'].abs().drop(['Views', 'Likes', 'Stream']).sort_values(ascending=False)

## Here I'm just creating a simple XGBoost model, because the correlation matrix itself already shows that these features can't be used for this prediction

In [None]:
import xgboost as xgb
from sklearn.model_selection import cross_val_score
# Gradient-boosted regressor with a fixed seed; one hyper-parameter per line.
xgb_model = xgb.XGBRegressor(
    objective="reg:squarederror",
    random_state=42,
    subsample=1,
    min_child_weight=5,
    max_depth=5,
    gamma=5,
    colsample_bytree=0.6,
)

In [None]:
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline

# Predict YouTube views from the audio features only: the target and the two
# other engagement columns are removed from the inputs.
X = df_feat.drop(columns=['Views', 'Stream', 'Likes'])
y = df_feat['Views']

# The scaler is part of the pipeline, so every cross-validation split fits it
# on its own training fold. Fitting it on the full dataset beforehand lets the
# validation rows influence the preprocessing.
std_scaler = preprocessing.StandardScaler()
model_pipeline = make_pipeline(std_scaler, xgb_model)

# R^2 on each of the 5 folds.
scores = cross_val_score(model_pipeline, X, y, scoring="r2", cv=5)

scores