In [1]:
# importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the Spotify track-features CSV into a DataFrame.
data = pd.read_csv(filepath_or_buffer="SpotifyFeatures.csv")

# Preview nine randomly drawn rows (unseeded, so the rows differ between runs).
data.sample(n=9)

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
98430,Children’s Music,Phoenix,Too young,2THkQauDWMvJgXFGPY4iKB,60,0.0625,0.622,197973,0.75,0.000133,A,0.127,-6.367,Minor,0.0445,141.324,4/4,0.905
145045,Indie,Majid Jordan,King City,1X1y6CQmAejhN8oeFtouje,52,0.193,0.55,201467,0.578,0.000104,F,0.0904,-8.568,Major,0.036,104.495,4/4,0.0659
194053,Movie,Fabien Nataf,Your Day,26rKtF0MTxADp59mVbxkyK,10,0.664,0.547,170382,0.882,1.4e-05,G#,0.0861,-5.606,Minor,0.0511,80.043,4/4,0.461
55245,R&B,Mac Ayres,Show Me,4F1a462vK4YzOd75GR3CX6,58,0.742,0.607,200454,0.471,0.0201,B,0.16,-9.612,Major,0.0465,171.704,4/4,0.455
219587,World,Michael W. Smith,Freedom Battle,44mIF71vUf7grnTmDMGyvx,25,0.75,0.306,273400,0.192,0.896,C,0.0975,-15.141,Major,0.0457,126.024,3/4,0.0386
91124,Hip-Hop,City Morgue,Caligula,1hflocGENeOhHRN2jsFTyS,54,0.000779,0.614,143987,0.738,0.0,A,0.338,-7.264,Major,0.0469,134.919,4/4,0.171
202108,Soundtrack,Hans Zimmer,Journey To The Line - Live / From The Thin Red...,1fh2q9aDCsaGwMJV0NKbAN,41,0.279,0.464,397533,0.2,0.883,C,0.697,-19.223,Major,0.0411,119.986,4/4,0.0305
65895,Folk,Linda Ronstadt,"Ooh Baby, Baby",4FLEKG82bHeT2olTM8E2Fy,40,0.771,0.46,201800,0.264,9.9e-05,G,0.0962,-9.492,Major,0.0329,166.446,3/4,0.337
191080,Ska,The Briggs,Blacklist,7cB4ePdhZYJh9dltyB3VeF,16,0.00712,0.635,194920,0.973,0.0,B,0.378,-3.032,Major,0.0525,103.162,4/4,0.719


#### Data Exploration

In [3]:
# Dimensionality of the data: (number of rows, number of columns).
data.shape

(232725, 18)

In [4]:
# Per-column dtype and non-null count summary of the frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232725 entries, 0 to 232724
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   genre             232725 non-null  object 
 1   artist_name       232725 non-null  object 
 2   track_name        232725 non-null  object 
 3   track_id          232725 non-null  object 
 4   popularity        232725 non-null  int64  
 5   acousticness      232725 non-null  float64
 6   danceability      232725 non-null  float64
 7   duration_ms       232725 non-null  int64  
 8   energy            232725 non-null  float64
 9   instrumentalness  232725 non-null  float64
 10  key               232725 non-null  object 
 11  liveness          232725 non-null  float64
 12  loudness          232725 non-null  float64
 13  mode              232725 non-null  object 
 14  speechiness       232725 non-null  float64
 15  tempo             232725 non-null  float64
 16  time_signature    23

In [5]:
# First five rows, transposed so each feature is shown on its own line.
data.head(5).transpose()

Unnamed: 0,0,1,2,3,4
genre,Movie,Movie,Movie,Movie,Movie
artist_name,Henri Salvador,Martin & les fées,Joseph Williams,Henri Salvador,Fabien Nataf
track_name,C'est beau de faire un Show,Perdu d'avance (par Gad Elmaleh),Don't Let Me Be Lonely Tonight,Dis-moi Monsieur Gordon Cooper,Ouverture
track_id,0BRjO6ga9RKCKjfDqeFgWV,0BjC1NfoEOOusryehmNudP,0CoSDzoNIKCRs124s9uTVy,0Gc6TVm52BwZD07Ki6tIvf,0IuslXpMROHdEPvSl1fTQK
popularity,0,1,3,0,4
acousticness,0.611,0.246,0.952,0.703,0.95
danceability,0.389,0.59,0.663,0.24,0.331
duration_ms,99373,137373,170267,152427,82625
energy,0.91,0.737,0.131,0.326,0.225
instrumentalness,0.0,0.0,0.0,0.0,0.123


In [6]:
# Percentage of missing values in each column.
data.isna().sum().mul(100).div(data.shape[0])

genre               0.0
artist_name         0.0
track_name          0.0
track_id            0.0
popularity          0.0
acousticness        0.0
danceability        0.0
duration_ms         0.0
energy              0.0
instrumentalness    0.0
key                 0.0
liveness            0.0
loudness            0.0
mode                0.0
speechiness         0.0
tempo               0.0
time_signature      0.0
valence             0.0
dtype: float64

In [7]:
# Exploring the presence of duplicates: number of rows that repeat an earlier row in every column.
data.duplicated().sum()

0

In [8]:
# Report the cardinality of every object (string) column.
unique_counts = data.select_dtypes(include="object").nunique()
for column, n_unique in unique_counts.items():
    print(f"The number of unique values in {column} column : {n_unique}")
    print("=" * 40)

The number of unique values in genre column : 27
The number of unique values in artist_name column : 14564
The number of unique values in track_name column : 148615
The number of unique values in track_id column : 176774
The number of unique values in key column : 12
The number of unique values in mode column : 2
The number of unique values in time_signature column : 5


In [9]:
# Column labels of the frame.
data.columns

Index(['genre', 'artist_name', 'track_name', 'track_id', 'popularity',
       'acousticness', 'danceability', 'duration_ms', 'energy',
       'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
       'speechiness', 'tempo', 'time_signature', 'valence'],
      dtype='object')

In [10]:
# Drop the track name and the track id columns.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
data = data.drop(columns=['track_name', 'track_id'], errors='ignore')

In [11]:
# Distinct raw time-signature values present in the data.
data["time_signature"].unique()

array(['4/4', '5/4', '3/4', '1/4', '0/4'], dtype=object)

In [12]:
# Lookup table collapsing the raw time-signature strings into two coarse labels.
time_signature_mapping = {
    **dict.fromkeys(('4/4', '3/4'), 'simple'),
    **dict.fromkeys(('5/4', '1/4', '0/4'), 'complex'),
}


def categorize_time_signature(signature):
    """Return the coarse category for a raw time-signature value.

    Parameters
    ----------
    signature : hashable
        Raw time-signature value, e.g. ``'4/4'``.

    Returns
    -------
    str
        The label stored in ``time_signature_mapping`` for ``signature``
        (``'simple'`` or ``'complex'``), or ``'unknown'`` when the value
        is not a key of the mapping.
    """
    try:
        return time_signature_mapping[signature]
    except KeyError:
        # Anything outside the lookup table falls back to a sentinel label.
        return 'unknown'

# Overwrite the raw time signatures with their coarse category.
# NOTE: the column is replaced in place, so running this cell a second time
# sends the already-categorised labels through the lookup again and turns
# every value into 'unknown'.
data['time_signature'] = data['time_signature'].map(categorize_time_signature)


In [13]:
# Show the 20 most frequent values of every object (string) column.
for column in data.select_dtypes(include="object").columns:
    top_counts = data[column].value_counts().iloc[:20]
    print(f"The Value Counts in {column} column :\n {top_counts}")
    print("=" * 40)

The Value Counts in genre column :
 Comedy              9681
Soundtrack          9646
Indie               9543
Jazz                9441
Pop                 9386
Electronic          9377
Children’s Music    9353
Folk                9299
Hip-Hop             9295
Rock                9272
Alternative         9263
Classical           9256
Rap                 9232
World               9096
Soul                9089
Blues               9023
R&B                 8992
Anime               8936
Reggaeton           8927
Ska                 8874
Name: genre, dtype: int64
The Value Counts in artist_name column :
 Giuseppe Verdi             1394
Giacomo Puccini            1137
Kimbo Children's Music      971
Nobuo Uematsu               825
Richard Wagner              804
Wolfgang Amadeus Mozart     800
Randy Newman                757
Georges Bizet               701
Juice Music                 684
Johann Sebastian Bach       632
Ludwig van Beethoven        596
Hans Zimmer                 559
Gioachino Ro

In [14]:
# Statistical summary of the data (count, mean, std, min, quartiles, max per numeric column).
data.describe()

Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence
count,232725.0,232725.0,232725.0,232725.0,232725.0,232725.0,232725.0,232725.0,232725.0,232725.0,232725.0
mean,41.127502,0.36856,0.554364,235122.3,0.570958,0.148301,0.215009,-9.569885,0.120765,117.666585,0.454917
std,18.189948,0.354768,0.185608,118935.9,0.263456,0.302768,0.198273,5.998204,0.185518,30.898907,0.260065
min,0.0,0.0,0.0569,15387.0,2e-05,0.0,0.00967,-52.457,0.0222,30.379,0.0
25%,29.0,0.0376,0.435,182857.0,0.385,0.0,0.0974,-11.771,0.0367,92.959,0.237
50%,43.0,0.232,0.571,220427.0,0.605,4.4e-05,0.128,-7.762,0.0501,115.778,0.444
75%,55.0,0.722,0.692,265768.0,0.787,0.0358,0.264,-5.501,0.105,139.054,0.66
max,100.0,0.996,0.989,5552917.0,0.999,0.999,1.0,3.744,0.967,242.903,1.0


In [15]:
# Spot-check five random rows after the column drop and time-signature recoding.
data.sample(n=5)

Unnamed: 0,genre,artist_name,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
12537,Alternative,Lucky Daye,43,0.229,0.799,257557,0.564,0.000127,G,0.0954,-7.197,Minor,0.296,97.072,simple,0.677
131827,Reggae,Steel Pulse,40,0.112,0.74,249093,0.695,0.0,G,0.126,-10.001,Major,0.15,164.436,simple,0.92
24326,Electronic,Getter,40,0.226,0.401,168312,0.77,2e-06,E,0.0749,-4.676,Minor,0.0537,76.913,simple,0.0355
148618,Jazz,BluntOne,42,0.374,0.803,50385,0.247,0.846,C,0.119,-9.447,Major,0.0907,76.639,complex,0.694
174642,Comedy,Sinbad,13,0.896,0.475,382147,0.939,0.0,F#,0.957,-6.555,Major,0.961,65.691,simple,0.159


#### Modifying the artist name column

In [42]:
# Work on an independent (deep) copy so the frame above stays unchanged.
data1 = data.copy(deep=True)

In [43]:
# Remove leading and trailing whitespace from the artist names.
# The vectorised .str accessor replaces the row-wise lambda and leaves missing
# values untouched instead of raising AttributeError on them.
data1["artist_name"] = data1["artist_name"].str.strip()

# Number of tracks per artist, most frequent first.
data1_value = data1["artist_name"].value_counts()

data1_value

Giuseppe Verdi            1394
Giacomo Puccini           1137
Kimbo Children's Music     971
Nobuo Uematsu              825
Richard Wagner             804
                          ... 
Freak Nasty                  1
Jackie DeShannon             1
The One After None           1
Mattias Bärjed               1
Candy Dulfer                 1
Name: artist_name, Length: 14564, dtype: int64

In [44]:
# Artists with at most 20 tracks (the threshold is inclusive, despite the name).
artist_less_than_twenty = data1_value.loc[data1_value.le(20)]

artist_less_than_twenty

SDIB                  20
FRENSHIP              20
Tom Waits             20
Gaullin               20
Alesso                20
                      ..
Freak Nasty            1
Jackie DeShannon       1
The One After None     1
Mattias Bärjed         1
Candy Dulfer           1
Name: artist_name, Length: 11656, dtype: int64

In [45]:
# Assign "others" to every artist with at most 20 tracks.
# `artist_less_than_twenty` is indexed by artist name, so membership is tested
# against its index. The previous version referenced an undefined name
# (`artist_less_than_thirty`) and failed with NameError on a fresh kernel.
is_rare_artist = data1["artist_name"].isin(artist_less_than_twenty.index)
data1["artist_name"] = data1["artist_name"].mask(is_rare_artist, "others")

In [48]:
# Observing the changes: artist frequencies after the replacement above.
data1["artist_name"].value_counts()

others                    55809
Giuseppe Verdi             1394
Giacomo Puccini            1137
Kimbo Children's Music      971
Nobuo Uematsu               825
                          ...  
Kai Wachi                    21
Brytiago                     21
Smile Empty Soul             21
Robby Benson                 21
Olivier Messiaen             21
Name: artist_name, Length: 2909, dtype: int64