# Importing libraries

* Pandas is a library for data analysis. It is used to read and manipulate data.
* Numpy is a library for scientific computing. It is used to perform mathematical operations on data.
* Matplotlib is a library for data visualization. It is used to plot data.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read the data

In [2]:
df = pd.read_csv('music_genre.csv')
df

Unnamed: 0,instance_id,artist_name,track_name,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,obtained_date,valence,music_genre
0,32894.0,Röyksopp,Röyksopp's Night Out,27.0,0.00468,0.652,-1.0,0.941,0.79200,A#,0.115,-5.201,Minor,0.0748,100.889,4-Apr,0.759,Electronic
1,46652.0,Thievery Corporation,The Shining Path,31.0,0.01270,0.622,218293.0,0.890,0.95000,D,0.124,-7.043,Minor,0.0300,115.00200000000001,4-Apr,0.531,Electronic
2,30097.0,Dillon Francis,Hurricane,28.0,0.00306,0.620,215613.0,0.755,0.01180,G#,0.534,-4.617,Major,0.0345,127.994,4-Apr,0.333,Electronic
3,62177.0,Dubloadz,Nitro,34.0,0.02540,0.774,166875.0,0.700,0.00253,C#,0.157,-4.498,Major,0.2390,128.014,4-Apr,0.270,Electronic
4,24907.0,What So Not,Divide & Conquer,32.0,0.00465,0.638,222369.0,0.587,0.90900,F#,0.157,-6.266,Major,0.0413,145.036,4-Apr,0.323,Electronic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50000,58878.0,BEXEY,GO GETTA,59.0,0.03340,0.913,-1.0,0.574,0.00000,C#,0.119,-7.022,Major,0.2980,98.02799999999999,4-Apr,0.330,Hip-Hop
50001,43557.0,Roy Woods,Drama (feat. Drake),72.0,0.15700,0.709,251860.0,0.362,0.00000,B,0.109,-9.814,Major,0.0550,122.04299999999999,4-Apr,0.113,Hip-Hop
50002,39767.0,Berner,Lovin' Me (feat. Smiggz),51.0,0.00597,0.693,189483.0,0.763,0.00000,D,0.143,-5.443,Major,0.1460,131.079,4-Apr,0.395,Hip-Hop
50003,57944.0,The-Dream,Shawty Is Da Shit,65.0,0.08310,0.782,262773.0,0.472,0.00000,G,0.106,-5.016,Minor,0.0441,75.88600000000001,4-Apr,0.354,Hip-Hop


# Select important collumns

In [3]:
df = df[['popularity', 'acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence', 'music_genre' ]]
df.head()



Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,music_genre
0,27.0,0.00468,0.652,-1.0,0.941,0.792,0.115,-5.201,0.0748,100.889,0.759,Electronic
1,31.0,0.0127,0.622,218293.0,0.89,0.95,0.124,-7.043,0.03,115.002,0.531,Electronic
2,28.0,0.00306,0.62,215613.0,0.755,0.0118,0.534,-4.617,0.0345,127.994,0.333,Electronic
3,34.0,0.0254,0.774,166875.0,0.7,0.00253,0.157,-4.498,0.239,128.014,0.27,Electronic
4,32.0,0.00465,0.638,222369.0,0.587,0.909,0.157,-6.266,0.0413,145.036,0.323,Electronic


# remove the rows with missing values

In [4]:
df.isnull().sum()

popularity          5
acousticness        5
danceability        5
duration_ms         5
energy              5
instrumentalness    5
liveness            5
loudness            5
speechiness         5
tempo               5
valence             5
music_genre         5
dtype: int64

In [5]:
df = df.dropna()
df.isnull().sum()

popularity          0
acousticness        0
danceability        0
duration_ms         0
energy              0
instrumentalness    0
liveness            0
loudness            0
speechiness         0
tempo               0
valence             0
music_genre         0
dtype: int64

# Remove broken data and set the type

In [6]:
# drop values below 0 in collumn duration_ms
df = df[df['duration_ms'] >= 0]


In [7]:
df['duration_ms'] = df['duration_ms'].astype(int)
df['popularity'] = df['popularity'].astype(int)
df

Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,music_genre
1,31,0.01270,0.622,218293,0.890,0.950000,0.124,-7.043,0.0300,115.00200000000001,0.531,Electronic
2,28,0.00306,0.620,215613,0.755,0.011800,0.534,-4.617,0.0345,127.994,0.333,Electronic
3,34,0.02540,0.774,166875,0.700,0.002530,0.157,-4.498,0.2390,128.014,0.270,Electronic
4,32,0.00465,0.638,222369,0.587,0.909000,0.157,-6.266,0.0413,145.036,0.323,Electronic
5,47,0.00523,0.755,519468,0.731,0.854000,0.216,-10.517,0.0412,?,0.614,Electronic
...,...,...,...,...,...,...,...,...,...,...,...,...
49999,56,0.13300,0.849,237667,0.660,0.000008,0.296,-7.195,0.0516,99.988,0.629,Hip-Hop
50001,72,0.15700,0.709,251860,0.362,0.000000,0.109,-9.814,0.0550,122.04299999999999,0.113,Hip-Hop
50002,51,0.00597,0.693,189483,0.763,0.000000,0.143,-5.443,0.1460,131.079,0.395,Hip-Hop
50003,65,0.08310,0.782,262773,0.472,0.000000,0.106,-5.016,0.0441,75.88600000000001,0.354,Hip-Hop


# Categorise energy column

In [8]:
#energy from 0-0.5 is low, 0.5-0.75 is medium, 0.75-1 is high
df['energy'] = pd.cut(df['energy'], bins=[0, 0.5, 0.75, 1], labels=['low', 'medium', 'high'])
df

Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,music_genre
1,31,0.01270,0.622,218293,high,0.950000,0.124,-7.043,0.0300,115.00200000000001,0.531,Electronic
2,28,0.00306,0.620,215613,high,0.011800,0.534,-4.617,0.0345,127.994,0.333,Electronic
3,34,0.02540,0.774,166875,medium,0.002530,0.157,-4.498,0.2390,128.014,0.270,Electronic
4,32,0.00465,0.638,222369,medium,0.909000,0.157,-6.266,0.0413,145.036,0.323,Electronic
5,47,0.00523,0.755,519468,medium,0.854000,0.216,-10.517,0.0412,?,0.614,Electronic
...,...,...,...,...,...,...,...,...,...,...,...,...
49999,56,0.13300,0.849,237667,medium,0.000008,0.296,-7.195,0.0516,99.988,0.629,Hip-Hop
50001,72,0.15700,0.709,251860,low,0.000000,0.109,-9.814,0.0550,122.04299999999999,0.113,Hip-Hop
50002,51,0.00597,0.693,189483,high,0.000000,0.143,-5.443,0.1460,131.079,0.395,Hip-Hop
50003,65,0.08310,0.782,262773,low,0.000000,0.106,-5.016,0.0441,75.88600000000001,0.354,Hip-Hop


# Fix tempo

In [9]:
df = df[df['tempo'].str.contains('[^0-9.]', regex=True) == False]
df

Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,music_genre
1,31,0.01270,0.622,218293,high,0.950000,0.124,-7.043,0.0300,115.00200000000001,0.531,Electronic
2,28,0.00306,0.620,215613,high,0.011800,0.534,-4.617,0.0345,127.994,0.333,Electronic
3,34,0.02540,0.774,166875,medium,0.002530,0.157,-4.498,0.2390,128.014,0.270,Electronic
4,32,0.00465,0.638,222369,medium,0.909000,0.157,-6.266,0.0413,145.036,0.323,Electronic
6,46,0.02890,0.572,214408,high,0.000008,0.106,-4.294,0.3510,149.995,0.230,Electronic
...,...,...,...,...,...,...,...,...,...,...,...,...
49999,56,0.13300,0.849,237667,medium,0.000008,0.296,-7.195,0.0516,99.988,0.629,Hip-Hop
50001,72,0.15700,0.709,251860,low,0.000000,0.109,-9.814,0.0550,122.04299999999999,0.113,Hip-Hop
50002,51,0.00597,0.693,189483,high,0.000000,0.143,-5.443,0.1460,131.079,0.395,Hip-Hop
50003,65,0.08310,0.782,262773,low,0.000000,0.106,-5.016,0.0441,75.88600000000001,0.354,Hip-Hop


In [10]:
# round tempo and set to int
df['tempo'] = df['tempo'].astype(float)
df['tempo'] = df['tempo'].round()
df['tempo'] = df['tempo'].astype(int)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tempo'] = df['tempo'].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tempo'] = df['tempo'].round()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tempo'] = df['tempo'].astype(int)


Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,music_genre
1,31,0.01270,0.622,218293,high,0.950000,0.124,-7.043,0.0300,115,0.531,Electronic
2,28,0.00306,0.620,215613,high,0.011800,0.534,-4.617,0.0345,128,0.333,Electronic
3,34,0.02540,0.774,166875,medium,0.002530,0.157,-4.498,0.2390,128,0.270,Electronic
4,32,0.00465,0.638,222369,medium,0.909000,0.157,-6.266,0.0413,145,0.323,Electronic
6,46,0.02890,0.572,214408,high,0.000008,0.106,-4.294,0.3510,150,0.230,Electronic
...,...,...,...,...,...,...,...,...,...,...,...,...
49999,56,0.13300,0.849,237667,medium,0.000008,0.296,-7.195,0.0516,100,0.629,Hip-Hop
50001,72,0.15700,0.709,251860,low,0.000000,0.109,-9.814,0.0550,122,0.113,Hip-Hop
50002,51,0.00597,0.693,189483,high,0.000000,0.143,-5.443,0.1460,131,0.395,Hip-Hop
50003,65,0.08310,0.782,262773,low,0.000000,0.106,-5.016,0.0441,76,0.354,Hip-Hop


In [11]:
df['music_genre'].unique()

array(['Electronic', 'Anime', 'Jazz', 'Alternative', 'Country', 'Rap',
       'Blues', 'Rock', 'Classical', 'Hip-Hop'], dtype=object)

In [12]:
df['energy'].unique()
df.info()

df.to_csv('music_genre_cleaned.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40560 entries, 1 to 50004
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   popularity        40560 non-null  int64   
 1   acousticness      40560 non-null  float64 
 2   danceability      40560 non-null  float64 
 3   duration_ms       40560 non-null  int64   
 4   energy            40560 non-null  category
 5   instrumentalness  40560 non-null  float64 
 6   liveness          40560 non-null  float64 
 7   loudness          40560 non-null  float64 
 8   speechiness       40560 non-null  float64 
 9   tempo             40560 non-null  int64   
 10  valence           40560 non-null  float64 
 11  music_genre       40560 non-null  object  
dtypes: category(1), float64(7), int64(3), object(1)
memory usage: 3.8+ MB
