In [1]:
import kagglehub

# Download the latest version of the Spotify tracks dataset through kagglehub.
# `path` holds the value returned by dataset_download; it is printed below so
# the reader can see where the files were placed.
path = kagglehub.dataset_download("maharshipandya/-spotify-tracks-dataset")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/maharshipandya/-spotify-tracks-dataset?dataset_version_number=1...


100%|█████████████████████████████████████████████████████████████████████████████| 8.17M/8.17M [00:01<00:00, 6.40MB/s]

Extracting files...





Path to dataset files: C:\Users\wooll\.cache\kagglehub\datasets\maharshipandya\-spotify-tracks-dataset\versions\1


# 🎹 Spotify Tracks Dataset
A dataset of Spotify tracks spanning many genres, together with the audio features of each track.
> Dataset: https://www.kaggle.com/datasets/maharshipandya/-spotify-tracks-dataset/data  
> Reference notebook: https://www.kaggle.com/code/duongtruongbinh/spotify-eda-simple-songs-recommendation

## EDA

# 1. Basic information

In [2]:
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from scipy.spatial import distance 

In [None]:
# Load the data; the first CSV column becomes the row index.
# NOTE(review): this is an absolute, machine-specific path and will not
# resolve on another machine — consider a path relative to the repository.
csv_path = r"C:\Users\wooll\OneDrive\문서\GitHub\-\dataset\spotify-tracks-dataset.csv"
df = pd.read_csv(csv_path, index_col=0)

# Preview five randomly chosen rows.
df.sample(5)

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
34168,3upIaTAwzeD5ijPJe44ceD,Fossils,Fossils 4,Khnoro Aamar Fossil,45,332403,False,0.403,0.77,8,-6.787,0,0.0434,0.195,0.0,0.151,0.428,140.616,4,folk
7407,7qcrcJgbl5WCiLVpPNCPqd,Punch Brothers,Antifogmatic,Alex,23,299973,False,0.673,0.122,0,-16.406,1,0.0344,0.837,0.000108,0.115,0.35,84.289,4,bluegrass
81913,79hQvdTHNBkq2fp2ZrM8b2,Javed-Mohsin;Arijit Singh;Shreya Ghoshal,Jalebi (Original Motion Picture Soundtrack),Pal,65,247286,False,0.495,0.498,7,-7.023,0,0.0337,0.701,0.0,0.0706,0.32,133.992,4,pop
45989,7wlMBq2mjXlqxSmonLSk2i,Los Apus,No Debí Conocerte,No Me Vuelvo a Enamorar,21,243809,False,0.812,0.615,1,-5.769,0,0.0456,0.723,0.000894,0.106,0.815,102.05,4,guitar
31131,3Eo7rOfvXRzGq68b9PVfZE,Wisin & Yandel,Esto No Se Le Dedica A Cualquiera,Estoy Enamorado,2,271720,False,0.78,0.725,6,-6.032,1,0.0503,0.288,0.0,0.0851,0.431,100.92,4,electro


In [6]:
# Report the number of rows and columns.
# DataFrame.shape is (rows, columns), so unpack in that order; the previous
# version unpacked the pair into swapped names and swapped them again in the
# message, leaving `nrows`/`ncols` holding the wrong values afterwards.
nrows, ncols = df.shape
print(f'Dataset has {nrows} rows and {ncols} columns')

Dataset has 114000 rows and 20 columns


In [None]:
# Check for fully duplicated rows and drop them when any are present.
# Number of rows that duplicate an earlier row.
duplicated_rows = df.duplicated().sum()

if duplicated_rows > 0:
    # Duplicates exist: report how many, rebind df to the de-duplicated frame,
    # then report how many rows remain.
    print(f'There are {duplicated_rows} rows that are duplicated so we need to drop those {duplicated_rows} rows')
    df = df.drop_duplicates()
    print(f'After drop duplicated rows, there are {len(df)} rows left')
else:
    # No duplicates: nothing to remove.
    print('There are 0 rows that are duplicated, which means each row in the DataFrame is unique.')
    print('So that we do not need to continue processing duplicate lines')

There are 450 rows that are duplicated so we need to drop those 450 rows
After drop duplicated rows, there are 113550 rows left


In [8]:
# Show each column's dtype as a one-column table labelled 'Data Type'.
df.dtypes.rename('Data Type').to_frame()

Unnamed: 0,Data Type
track_id,object
artists,object
album_name,object
track_name,object
popularity,int64
duration_ms,int64
explicit,bool
danceability,float64
energy,float64
key,int64


In [9]:
def open_object_dtype(s):
    """Return the set of distinct Python types found among the values of ``s``.

    ``type`` is applied to every element of the Series and the resulting type
    objects are collected into a set, so each type appears exactly once.
    """
    return set(s.apply(type))

# Keep only the 'object'-dtype columns and remember their names.
object_frame = df.select_dtypes(include='object')
obj_cols = object_frame.columns

# For each of those columns, list the Python types actually present in it.
python_types_per_column = object_frame.apply(open_object_dtype, axis=0)
python_types_per_column.to_frame('Data Type')

Unnamed: 0,Data Type
track_id,{<class 'str'>}
artists,"{<class 'float'>, <class 'str'>}"
album_name,"{<class 'float'>, <class 'str'>}"
track_name,"{<class 'float'>, <class 'str'>}"
track_genre,{<class 'str'>}


In [10]:
# Check for missing values.
# Build the null mask once and count the missing cells in each row.
null_mask = df.isnull()
missing_values_per_row = null_mask.sum(axis=1)

# How many rows have 0, 1, 2, ... missing cells, ordered by that count.
count_per_missing_value = missing_values_per_row.value_counts().sort_index()

# Print the results
for n_missing, n_rows in count_per_missing_value.items():
    print(f'{n_rows} row(s) have {n_missing} missing values')

# A row has a missing value exactly when its per-row count is positive.
total_rows_with_missing_values = (missing_values_per_row > 0).sum()
print(f'Total number of rows with missing values: {total_rows_with_missing_values}')

113549 row(s) have 0 missing values
1 row(s) have 3 missing values
Total number of rows with missing values: 1


# 2. Data Distribution

## 2.1 Numerical columns

In [12]:
# Keep only the float64 and int64 columns (bool and object columns are left out).
numerical_cols = df.select_dtypes(include=['float64', 'int64'])
numerical_cols.shape

(113550, 14)

In [13]:
# Preview five randomly chosen rows of the numeric columns (no seed is set,
# so the rows shown differ between runs).
numerical_cols.sample(5)

Unnamed: 0,popularity,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
17818,33,142602,0.725,0.529,11,-10.434,0,0.0566,0.0154,0.0268,0.268,0.321,82.512,4
5730,50,156800,0.625,0.833,3,-9.027,1,0.0391,0.901,0.891,0.116,0.243,74.941,4
98216,29,450773,0.348,0.418,10,-8.86,1,0.0348,0.536,0.0,0.148,0.295,145.891,4
83871,49,201578,0.788,0.824,11,-4.85,0,0.0761,0.00507,0.00752,0.183,0.7,111.988,4
85647,37,255640,0.488,0.91,5,-3.052,1,0.0496,0.00657,0.0,0.212,0.428,130.133,4


In [14]:
# Print the non-null count, dtype and memory usage of the numeric columns.
numerical_cols.info()

<class 'pandas.core.frame.DataFrame'>
Index: 113550 entries, 0 to 113999
Data columns (total 14 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   popularity        113550 non-null  int64  
 1   duration_ms       113550 non-null  int64  
 2   danceability      113550 non-null  float64
 3   energy            113550 non-null  float64
 4   key               113550 non-null  int64  
 5   loudness          113550 non-null  float64
 6   mode              113550 non-null  int64  
 7   speechiness       113550 non-null  float64
 8   acousticness      113550 non-null  float64
 9   instrumentalness  113550 non-null  float64
 10  liveness          113550 non-null  float64
 11  valence           113550 non-null  float64
 12  tempo             113550 non-null  float64
 13  time_signature    113550 non-null  int64  
dtypes: float64(9), int64(5)
memory usage: 13.0 MB


In [15]:
# Summarise every numeric column: value range plus missing-value count and share.
dist_numerical_cols = numerical_cols.describe().T[['min', 'max']].copy()

# One boolean mask drives both missing-value columns so they can never disagree.
missing_mask = numerical_cols.isnull()
# In 'key', the value -1 is also counted as missing, as the original cell did.
# NOTE(review): presumably -1 is Spotify's "no key detected" marker — confirm
# against the dataset documentation.
missing_mask['key'] = missing_mask['key'] | (numerical_cols['key'] == -1)

dist_numerical_cols['Missing Values'] = missing_mask.sum()
# Previously the percentage ignored the -1 sentinel and could contradict the
# count for 'key'; it is now derived from the same mask.
dist_numerical_cols['Missing Percentage'] = (missing_mask.mean() * 100).round(2)
dist_numerical_cols

Unnamed: 0,min,max,Missing Values,Missing Percentage
popularity,0.0,100.0,0,0.0
duration_ms,0.0,5237295.0,0,0.0
danceability,0.0,0.985,0,0.0
energy,0.0,1.0,0,0.0
key,0.0,11.0,0,0.0
loudness,-49.531,4.532,0,0.0
mode,0.0,1.0,0,0.0
speechiness,0.0,0.965,0,0.0
acousticness,0.0,0.996,0,0.0
instrumentalness,0.0,1.0,0,0.0


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
numerical_cols.describe()

Unnamed: 0,popularity,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
count,113550.0,113550.0,113550.0,113550.0,113550.0,113550.0,113550.0,113550.0,113550.0,113550.0,113550.0,113550.0,113550.0,113550.0
mean,33.324139,228079.4,0.567031,0.64209,5.309467,-8.243419,0.63786,0.084674,0.314067,0.155702,0.213611,0.474207,122.175888,3.904218
std,22.283976,106414.8,0.173408,0.251052,3.560134,5.011401,0.480621,0.105761,0.331907,0.309216,0.190461,0.259204,29.972861,0.432115
min,0.0,0.0,0.0,0.0,0.0,-49.531,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,17.0,174180.2,0.456,0.473,2.0,-9.99775,0.0,0.0359,0.0168,0.0,0.098,0.26,99.2965,4.0
50%,35.0,213000.0,0.58,0.685,5.0,-6.997,1.0,0.0489,0.168,4.1e-05,0.132,0.464,122.02,4.0
75%,50.0,261587.8,0.695,0.854,8.0,-5.001,1.0,0.0845,0.596,0.048675,0.273,0.683,140.07375,4.0
max,100.0,5237295.0,0.985,1.0,11.0,4.532,1.0,0.965,0.996,1.0,1.0,0.995,243.372,5.0


In [1]:
# Plot a histogram for every numeric column.
# NOTE: this cell needs `sns`, `plt` and `numerical_cols` from the earlier
# cells; run the notebook top to bottom (Restart & Run All), otherwise it
# fails with a NameError.
#
# A single set_theme call replaces the former set_style + set pair: `sns.set`
# is the legacy alias of `set_theme` and re-applies the style on its own, so
# the separate set_style call had no effect.
sns.set_theme(
    style='darkgrid',
    rc={"axes.facecolor": "#F2EAC5", "figure.facecolor": "#F2EAC5"},
)
numerical_cols.hist(figsize=(20, 15), bins=30, xlabelsize=8, ylabelsize=8)
plt.tight_layout()
plt.show()

NameError: name 'sns' is not defined