In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans 
%matplotlib inline

## Step 1: Dataset Selection

In [50]:
df = pd.read_csv("train.csv")

In [51]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17996 entries, 0 to 17995
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Artist Name         17996 non-null  object 
 1   Track Name          17996 non-null  object 
 2   Popularity          17568 non-null  float64
 3   danceability        17996 non-null  float64
 4   energy              17996 non-null  float64
 5   key                 15982 non-null  float64
 6   loudness            17996 non-null  float64
 7   mode                17996 non-null  int64  
 8   speechiness         17996 non-null  float64
 9   acousticness        17996 non-null  float64
 10  instrumentalness    13619 non-null  float64
 11  liveness            17996 non-null  float64
 12  valence             17996 non-null  float64
 13  tempo               17996 non-null  float64
 14  duration_in min/ms  17996 non-null  float64
 15  time_signature      17996 non-null  int64  
 16  Clas

In [52]:
df.head()

Unnamed: 0,Artist Name,Track Name,Popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_in min/ms,time_signature,Class
0,Bruno Mars,That's What I Like (feat. Gucci Mane),60.0,0.854,0.564,1.0,-4.964,1,0.0485,0.0171,,0.0849,0.899,134.071,234596.0,4,5
1,Boston,Hitch a Ride,54.0,0.382,0.814,3.0,-7.23,1,0.0406,0.0011,0.00401,0.101,0.569,116.454,251733.0,4,10
2,The Raincoats,No Side to Fall In,35.0,0.434,0.614,6.0,-8.334,1,0.0525,0.486,0.000196,0.394,0.787,147.681,109667.0,4,6
3,Deno,Lingo (feat. J.I & Chunkz),66.0,0.853,0.597,10.0,-6.528,0,0.0555,0.0212,,0.122,0.569,107.033,173968.0,4,5
4,Red Hot Chili Peppers,Nobody Weird Like Me - Remastered,53.0,0.167,0.975,2.0,-4.279,1,0.216,0.000169,0.0161,0.172,0.0918,199.06,229960.0,4,10


## Step 2: Scenario/About Dataset

We are working in a famous music-streaming company and our goal is to determine the type of music the customer is listening to. In the future this could help to create algorithm of recommendations based on the likes of the auditor.

So based on the multiple features such as the popularity, acousticness or tempo, we have to be able to classify any other music in one of the genres. This is why this project is about classification because we have several genres and it can not be clustering cause we already know the genres and we do not want to discover knew under-genres. Moreover, this is not regression because genres are not numbers there are categories.

## Step 3: Data Loading


The dataset was taken from Purushottam Malgin on kaggle.com and is composed of 17,9965 rows and 178 columns, and all these musics are classified in11 genres : Rock, Indie, Alt, Pop, Metal, HipHop, Blues, Acoustic/Folk, Instrumental, Country, Bollywood.

The main attributes are numerical attributes such as energy, tempo or popularity and string attributes such as the artist n and the track nameode.

P.S : Valence is the fact that the music is positive, happy ...

## Step 4: Data Wrangling or Data Pre-processing

### Handle missing values

In [53]:
df.loc[:, :] = df.replace('?', np.nan)
missing_data = df.isnull()
missing_data[0:10]

Unnamed: 0,Artist Name,Track Name,Popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_in min/ms,time_signature,Class
0,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
6,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
7,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
9,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False


In [54]:
for column in missing_data.columns.values.tolist():
    print(column)
    print (missing_data[column].value_counts())
    print("")  

Artist Name
Artist Name
False    17996
Name: count, dtype: int64

Track Name
Track Name
False    17996
Name: count, dtype: int64

Popularity
Popularity
False    17568
True       428
Name: count, dtype: int64

danceability
danceability
False    17996
Name: count, dtype: int64

energy
energy
False    17996
Name: count, dtype: int64

key
key
False    15982
True      2014
Name: count, dtype: int64

loudness
loudness
False    17996
Name: count, dtype: int64

mode
mode
False    17996
Name: count, dtype: int64

speechiness
speechiness
False    17996
Name: count, dtype: int64

acousticness
acousticness
False    17996
Name: count, dtype: int64

instrumentalness
instrumentalness
False    13619
True      4377
Name: count, dtype: int64

liveness
liveness
False    17996
Name: count, dtype: int64

valence
valence
False    17996
Name: count, dtype: int64

tempo
tempo
False    17996
Name: count, dtype: int64

duration_in min/ms
duration_in min/ms
False    17996
Name: count, dtype: int64

time_signatur

Popularity : 428 
key : 2014
instrumentalness : 4377

In [55]:
avg_popularity = df["Popularity"].astype("float").mean(axis=0)
print(avg_popularity)

44.51212431693989


In [56]:
df.loc[df['Popularity'].isna(), 'Popularity'] = avg_popularity

In [57]:
df.loc[df['key'].isna(), 'key'] = df['key'].value_counts().idxmax()
df.loc[df['instrumentalness'].isna(), 'instrumentalness'] = df['instrumentalness'].value_counts().idxmax()

In [58]:
for column in missing_data.columns.values.tolist():
    print(column)
    print (missing_data[column].value_counts())
    print("")  

Artist Name
Artist Name
False    17996
Name: count, dtype: int64

Track Name
Track Name
False    17996
Name: count, dtype: int64

Popularity
Popularity
False    17568
True       428
Name: count, dtype: int64

danceability
danceability
False    17996
Name: count, dtype: int64

energy
energy
False    17996
Name: count, dtype: int64

key
key
False    15982
True      2014
Name: count, dtype: int64

loudness
loudness
False    17996
Name: count, dtype: int64

mode
mode
False    17996
Name: count, dtype: int64

speechiness
speechiness
False    17996
Name: count, dtype: int64

acousticness
acousticness
False    17996
Name: count, dtype: int64

instrumentalness
instrumentalness
False    13619
True      4377
Name: count, dtype: int64

liveness
liveness
False    17996
Name: count, dtype: int64

valence
valence
False    17996
Name: count, dtype: int64

tempo
tempo
False    17996
Name: count, dtype: int64

duration_in min/ms
duration_in min/ms
False    17996
Name: count, dtype: int64

time_signatur

### Change the data types

In [59]:
df.info();


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17996 entries, 0 to 17995
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Artist Name         17996 non-null  object 
 1   Track Name          17996 non-null  object 
 2   Popularity          17996 non-null  float64
 3   danceability        17996 non-null  float64
 4   energy              17996 non-null  float64
 5   key                 17996 non-null  float64
 6   loudness            17996 non-null  float64
 7   mode                17996 non-null  int64  
 8   speechiness         17996 non-null  float64
 9   acousticness        17996 non-null  float64
 10  instrumentalness    17996 non-null  float64
 11  liveness            17996 non-null  float64
 12  valence             17996 non-null  float64
 13  tempo               17996 non-null  float64
 14  duration_in min/ms  17996 non-null  float64
 15  time_signature      17996 non-null  int64  
 16  Clas

In [60]:
df['mode'] = df['mode'].astype(float)
df['time_signature'] = df['time_signature'].astype(float)
df['Class'] = df['Class'].astype(float)
df.info();

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17996 entries, 0 to 17995
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Artist Name         17996 non-null  object 
 1   Track Name          17996 non-null  object 
 2   Popularity          17996 non-null  float64
 3   danceability        17996 non-null  float64
 4   energy              17996 non-null  float64
 5   key                 17996 non-null  float64
 6   loudness            17996 non-null  float64
 7   mode                17996 non-null  float64
 8   speechiness         17996 non-null  float64
 9   acousticness        17996 non-null  float64
 10  instrumentalness    17996 non-null  float64
 11  liveness            17996 non-null  float64
 12  valence             17996 non-null  float64
 13  tempo               17996 non-null  float64
 14  duration_in min/ms  17996 non-null  float64
 15  time_signature      17996 non-null  float64
 16  Clas

### Encode Artist Name and Track Name

In [61]:
df["Artist Name"].value_counts()

Artist Name
Backstreet Boys                                                               69
Westlife                                                                      60
Britney Spears                                                                54
Eyal Golan                                                                    48
Omer Adam                                                                     48
                                                                              ..
Richard Hawley                                                                 1
Gabriel Fauré, Henri Büsser, Ina-Esther Joost Ben-Sasson, Allan Sternfield     1
Ilaiyaraaja, K. J. Yesudas                                                     1
Blaenavon                                                                      1
The Lazy Jesus                                                                 1
Name: count, Length: 9149, dtype: int64

In [62]:
obj_df = df.select_dtypes(include=['object']).copy()
obj_df.head()

Unnamed: 0,Artist Name,Track Name
0,Bruno Mars,That's What I Like (feat. Gucci Mane)
1,Boston,Hitch a Ride
2,The Raincoats,No Side to Fall In
3,Deno,Lingo (feat. J.I & Chunkz)
4,Red Hot Chili Peppers,Nobody Weird Like Me - Remastered


In [63]:
obj_df["Artist Name"] = obj_df["Artist Name"].astype('category')
obj_df.dtypes

obj_df["Artist Name_cat"] = obj_df["Artist Name"].cat.codes
obj_df.head()

obj_df.loc[obj_df['Artist Name'] == "Bruno Mars"]


Unnamed: 0,Artist Name,Track Name,Artist Name_cat
0,Bruno Mars,That's What I Like (feat. Gucci Mane),1182
2742,Bruno Mars,Talking to the Moon,1182
13085,Bruno Mars,Marry You,1182
16996,Bruno Mars,Just The Way You Are,1182


In [64]:
obj_df["Track Name"] = obj_df["Track Name"].astype('category')
obj_df.dtypes

obj_df["Track Name_cat"] = obj_df["Track Name"].cat.codes
obj_df.head()

obj_df.loc[obj_df['Track Name_cat'] == 0]

Unnamed: 0,Artist Name,Track Name,Artist Name_cat,Track Name_cat
2089,The Gaslight Anthem,"""45""",7716,0


In [65]:
obj_df.head()

Unnamed: 0,Artist Name,Track Name,Artist Name_cat,Track Name_cat
0,Bruno Mars,That's What I Like (feat. Gucci Mane),1182,11406
1,Boston,Hitch a Ride,1092,4988
2,The Raincoats,No Side to Fall In,7899,8144
3,Deno,Lingo (feat. J.I & Chunkz),1987,6753
4,Red Hot Chili Peppers,Nobody Weird Like Me - Remastered,6283,8167


In [66]:
df["Artist_Name_cat"] = obj_df["Artist Name_cat"]
df["Track_Name_cat"] = obj_df["Track Name_cat"]

df['Artist_Name_cat'] = df['Artist_Name_cat'].astype(float)
df['Track_Name_cat'] = df['Track_Name_cat'].astype(float)

df.head()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17996 entries, 0 to 17995
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Artist Name         17996 non-null  object 
 1   Track Name          17996 non-null  object 
 2   Popularity          17996 non-null  float64
 3   danceability        17996 non-null  float64
 4   energy              17996 non-null  float64
 5   key                 17996 non-null  float64
 6   loudness            17996 non-null  float64
 7   mode                17996 non-null  float64
 8   speechiness         17996 non-null  float64
 9   acousticness        17996 non-null  float64
 10  instrumentalness    17996 non-null  float64
 11  liveness            17996 non-null  float64
 12  valence             17996 non-null  float64
 13  tempo               17996 non-null  float64
 14  duration_in min/ms  17996 non-null  float64
 15  time_signature      17996 non-null  float64
 16  Clas

### Duplicated rows handling

In [67]:
duplicateRows = df[df.duplicated(subset = df.columns.difference(['Artist Name']))]

print(duplicateRows)

Empty DataFrame
Columns: [Artist Name, Track Name, Popularity, danceability, energy, key, loudness, mode, speechiness, acousticness, instrumentalness, liveness, valence, tempo, duration_in min/ms, time_signature, Class, Artist_Name_cat, Track_Name_cat]
Index: []


There is no duplicated value in the data. 

### Column name handling
One important thing is to edit column name to to pass from for example "Artist Name" to "Artiste_Name"

In [68]:
print("Original column names = ", df.columns)

df.columns= df.columns.str.replace(" ","_").str.lower()
print("Column names after conversion = ", df.columns)

Original column names =  Index(['Artist Name', 'Track Name', 'Popularity', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo',
       'duration_in min/ms', 'time_signature', 'Class', 'Artist_Name_cat',
       'Track_Name_cat'],
      dtype='object')
Column names after conversion =  Index(['artist_name', 'track_name', 'popularity', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo',
       'duration_in_min/ms', 'time_signature', 'class', 'artist_name_cat',
       'track_name_cat'],
      dtype='object')


### Music time handling

In the colum duration some data are given in minutes and some in millisecond. To have all the data in the same scale we pass every durations in minutes.


In [69]:
df.loc[(df['duration_in_min/ms'] < 30)]['duration_in_min/ms']

7        3.105783
10       4.330450
13       4.440250
25       4.015633
34       3.503783
           ...   
17952    5.407783
17959    3.686017
17974    3.408667
17986    4.392883
17988    3.787783
Name: duration_in_min/ms, Length: 2580, dtype: float64

We can see that we get 2580 data given in minutes.

In [None]:
condition = df['duration_in_min/ms'] < 30 
# If the value in duration_in_min/ms column is less than 30, then multiply the value with 60,000
df.loc[condition,'duration_in_min/ms'] = df.loc[condition,'duration_in_min/ms']*60000

## Step 5: Exploratory Data Analysis

In [70]:
df.head()

Unnamed: 0,artist_name,track_name,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_in_min/ms,time_signature,class,artist_name_cat,track_name_cat
0,Bruno Mars,That's What I Like (feat. Gucci Mane),60.0,0.854,0.564,1.0,-4.964,1.0,0.0485,0.0171,0.000109,0.0849,0.899,134.071,234596.0,4.0,5.0,1182.0,11406.0
1,Boston,Hitch a Ride,54.0,0.382,0.814,3.0,-7.23,1.0,0.0406,0.0011,0.00401,0.101,0.569,116.454,251733.0,4.0,10.0,1092.0,4988.0
2,The Raincoats,No Side to Fall In,35.0,0.434,0.614,6.0,-8.334,1.0,0.0525,0.486,0.000196,0.394,0.787,147.681,109667.0,4.0,6.0,7899.0,8144.0
3,Deno,Lingo (feat. J.I & Chunkz),66.0,0.853,0.597,10.0,-6.528,0.0,0.0555,0.0212,0.000109,0.122,0.569,107.033,173968.0,4.0,5.0,1987.0,6753.0
4,Red Hot Chili Peppers,Nobody Weird Like Me - Remastered,53.0,0.167,0.975,2.0,-4.279,1.0,0.216,0.000169,0.0161,0.172,0.0918,199.06,229960.0,4.0,10.0,6283.0,8167.0
