<a href="https://www.kaggle.com/code/klaidenx/spotify-yout-eda?scriptVersionId=130031003" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
#Data Manipulation
import numpy as np
import pandas as pd 
#Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

%matplotlib inline

In [None]:
class COLORS:
    """Named hex colour codes shared by the plots in this notebook."""

    # Reds / oranges
    RED = "#FF6969"
    RRED = "#F45050"
    ORANGE = "#FF6000"

    # Blues
    BLUE = "#A6D0DD"
    DARKB = "#3C486B"

    # Light / warm neutrals
    BEIGE = "#FFD3B0"
    BEI = "#FFF9DE"
    YELLOW = "#F9D949"

In [None]:
#Reading Data
# Load the Spotify/YouTube CSV from the Kaggle input directory. The CSV's
# "Unnamed: 0" column becomes the DataFrame index instead of a regular column.
df_ys=pd.read_csv("/kaggle/input/spotify-and-youtube/Spotify_Youtube.csv",index_col="Unnamed: 0")

## Information About The Data

In [None]:
#Counting Columns
# "Unnamed: 0" was already consumed as the index when the CSV was read, so every
# remaining entry of df_ys.columns is a real data column. Slicing with [1:] would
# silently leave the first data column out of both the count and the listing.
columns = df_ys.columns
print("There are {} Columns in The Dataset Which Are {}".format(len(columns), list(columns)))

In [None]:
#Drop The Url Columns
drop_cols = ["Url_spotify", "Uri", "Url_youtube", "Description", "Title"]
# errors="ignore" keeps the cell re-runnable: once the columns are gone, a second
# execution is a no-op instead of raising KeyError.
df_ys = df_ys.drop(columns=drop_cols, errors="ignore")
# Dropping columns does not touch the row index, so there is nothing to reset.
# Show a short preview of the remaining columns instead of the whole frame.
df_ys.head()

In [None]:
# Print the DataFrame summary produced by DataFrame.info() for the remaining columns.
df_ys.info()

In [None]:
#Highest Occurrence Values
high_cols = df_ys.select_dtypes(include="float64").columns.to_list()
# DataFrame.mode() works column by column, and its first row holds one most
# frequent value per column (the smallest one when several values tie).
# DataFrame.value_counts().idxmax() would instead return the single most common
# *combination* of values across all of these columns, which is not a per-column mode.
column_modes = df_ys[high_cols].mode()
# Guard against a frame with no float64 columns (mode() would have no first row).
high_occ = column_modes.iloc[0].to_list() if not column_modes.empty else []
for occ, high in tqdm(zip(high_cols, high_occ), total=len(high_occ)):
    print("Most Occuring Value In {} And It's {}".format(occ, high))

In [None]:
#Missing Values
# Per-column count of missing entries, in the same order as df_ys.columns.
missing_values = df_ys.isna().sum().to_list()
total_missing = sum(missing_values)
print("There Is A Total Of {} Missing Variables".format(total_missing))

#Only Count The Missing Variables
for val, cat in tqdm(zip(missing_values, df_ys.columns)):
    # Columns without any missing entries are not reported.
    if val <= 0:
        continue
    print("{} Has {} Missing Variables".format(cat, val))

In [None]:
#Drop The Rows That Are Missing A Value In Any Column With Exactly Two Missing Values
# (the columns themselves are kept; only the affected rows are removed)
num_cols_drop = [col for col in df_ys.columns if df_ys[col].isna().sum() == 2]
df_ys.dropna(subset=num_cols_drop, inplace=True)
#Fill Numerical Values
# DataFrame.ffill() is the supported spelling of forward fill;
# fillna(method="ffill") is deprecated in recent pandas releases.
num_cols_fill = ["Views", "Likes", "Comments", "Stream"]
df_ys[num_cols_fill] = df_ys[num_cols_fill].ffill()
#Fill Categorical Values
cat_cols_fill = ["Channel", "Licensed", "official_video"]
df_ys[cat_cols_fill] = df_ys[cat_cols_fill].ffill()

In [None]:
# Display the "Liveness" column as the cell output.
df_ys["Liveness"]

In [None]:
#Correcting The Data Type Of The Columns
# Convert each engagement column to a numeric dtype; entries that cannot be
# parsed become NaN because of errors='coerce'.
for numeric_col in ('Likes', 'Comments', 'Views', 'Stream'):
    df_ys[numeric_col] = pd.to_numeric(df_ys[numeric_col], errors='coerce')

In [None]:
# Summary statistics from describe(), transposed so each original column becomes a row,
# rendered with a 'YlOrRd' background gradient applied along axis 0 of that transposed table.
df_ys.describe().T.style.background_gradient(cmap='YlOrRd',axis=0,low=.25,high=.95)

In [None]:
# Positional 85/15 split: the first 85% of the rows form the training set and the
# remaining rows form the test set. Slicing the test set from the split point
# onward keeps the two sets disjoint; taking the *first* 15% of the rows would
# place every test row inside the training set as well.
split_at = int(np.round(df_ys.shape[0] * .85))
train_df = df_ys.iloc[:split_at, :]
test_df = df_ys.iloc[split_at:, :]
print(f"There are {train_df.shape[0]} Training Data And {test_df.shape[0]} Testing Data")

## EDA

In [None]:
fig, axs = plt.subplots(2, 3, figsize=(15, 5))

# One entry per panel, filled row by row: (column, title, x-axis label, colour).
# The Valence panel is labelled with its own column name; it used to be titled
# and labelled "Audience", which does not match the column being plotted.
distribution_panels = [
    ("Danceability", "Danceability Distribution", "Danceability Levels", "#FC4F00"),
    ("Energy", "Energy Distribution", "Intensity Levels", "#F79540"),
    ("Valence", "Valence Distribution", "Valence", "#FF6969"),
    ("Acousticness", "Acousticness Distribution", "Acousticness", "#A6D0DD"),
    ("Liveness", "Liveness Distribution", "Audience Presence", "#F6BA6F"),
    ("Loudness", "Loudness Distribution", "dB", "#05BFDB"),
]

for ax, (column, title, xlabel, colour) in zip(axs.flat, distribution_panels):
    sns.histplot(ax=ax, data=train_df, x=column, kde=True, color=colour)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Values Count")
    ax.grid(True)

plt.tight_layout(pad=.5)

In [None]:
fig, axs = plt.subplots(1, 3, figsize=(19, 4))

# Median views per channel, kept under its original name and layout
# (columns "Channel" and "Views").
channel_views = train_df.groupby(["Channel"]).agg({"Views": "median"})
channel_views.reset_index(inplace=True)

# One entry per panel: (metric column, title, x-axis label).
channel_panels = [
    ("Views", "Five Viewed Channels", "View Count"),
    ("Stream", "Five Streamed Channels", "Stream Count"),
    ("Comments", "Five Commented On Channels", "Comments Count"),
]

for ax, (metric, title, xlabel) in zip(axs, channel_panels):
    # Rank channels by their median metric and keep the five largest, so every
    # bar shows a channel next to its own value. value_counts().index[:5] would
    # instead give the five most *frequent* values, unrelated to the channel
    # names drawn on the y-axis.
    top_five = (
        train_df.groupby("Channel")[metric]
        .median()
        .nlargest(5)
        .reset_index()
    )
    sns.barplot(ax=ax, data=top_five, x=metric, y="Channel", width=.4)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Names")
    ax.grid(True)

plt.tight_layout(pad=0.8)

In [None]:
fig, axs = plt.subplots(3, 3, figsize=(15, 8), sharey=True)

# One entry per panel, filled row by row: (feature column, title, colour).
# A colour of None lets seaborn pick its default colour for that panel.
regression_panels = [
    ("Danceability", "Danceability On Views", COLORS.RED),
    ("Energy", "Energy On Views", COLORS.DARKB),
    ("Speechiness", "Speechiness On Views", COLORS.ORANGE),
    ("Instrumentalness", "Instrumentalness On Views", COLORS.RED),
    ("Liveness", "Liveness On Views", COLORS.DARKB),
    ("Valence", "Valence On Views", COLORS.ORANGE),
    ("Tempo", "Tempo On Views", COLORS.YELLOW),
    ("Duration_ms", "Duration On Views", COLORS.RRED),
    ("Acousticness", "Acousticness On Views", None),
]

for position, (ax, (feature, title, colour)) in enumerate(zip(axs.flat, regression_panels)):
    # Each regression is fitted on the first 150 rows of the training data.
    sns.regplot(
        ax=ax,
        data=train_df,
        x=train_df[feature][:150],
        y=train_df["Views"][:150],
        color=colour,
    )
    ax.set_title(title)
    # Only the top-left panel gets the explicit y-axis label.
    if position == 0:
        ax.set_ylabel("Views Count")
    ax.grid(True)

plt.tight_layout(pad=.8)

In [None]:
fig, axs = plt.subplots(3, 3, figsize=(20, 8))

# Each row of panels shows one metric; each column shows one consecutive
# 200-row window of the training data (rows 0-199, 200-399, 400-599).
# Windows are built as [start:start + 200] so they are contiguous; slices such
# as [201:400] and [401:600] would silently skip rows 200 and 400.
window_size = 200
boxplot_rows = [
    ("Comments", COLORS.RED),
    ("Likes", COLORS.DARKB),
    ("Views", COLORS.ORANGE),
]

for row_idx, (metric, colour) in enumerate(boxplot_rows):
    for col_idx in range(axs.shape[1]):
        start = col_idx * window_size
        stop = start + window_size
        ax = axs[row_idx, col_idx]
        sns.boxplot(ax=ax, x=train_df[metric].iloc[start:stop], width=.4, color=colour)
        # Title every panel so the figure can be read without the code.
        ax.set_title(f"{metric} (Rows {start}-{stop - 1})")

plt.tight_layout(pad=.8)