In [2]:
import pandas as pd
import numpy as np

In [4]:
# Load the test set from test.csv and preview its first rows.
df_test = pd.read_csv('test.csv')
df_test.head()

Unnamed: 0,instance_id,track_name,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,voice_gender,mode,speechiness,tempo,musician_category,valence
0,MSC_70753.0,"Bard, a Man of Lake-town - Extended Version",26.101,0.823765,0.11549,211462.287,0.125325,0.945272,Scale A Sharp,0.120392,-25.541,Both,Minor,0.038657,80.777,Duet,0.041238
1,MSC_24064.0,New Kings,66.325,0.015441,0.592838,249151.238,0.870798,0.000108,Scale E,0.406722,-3.905,Male,?,0.051368,76.00399999999999,Band,0.456309
2,MSC_22731.0,Silver Lining,71.871,0.001006,0.51034,215693.24,0.683077,0.0,Scale C,0.084608,-6.825,Female,Major,0.034303,144.458,Solo,0.193332
3,MSC_32095.0,A Tale That Wasn't Right,56.151,0.31565,0.338336,307056.179,0.608904,0.0,Scale A,0.276811,-8.382,Female,Minor,0.036934,118.145,Solo,0.214906
4,MSC_24198.0,Get Ready,43.687,0.038336,0.773904,352802.872,0.9555,7e-06,Scale D,0.301726,-4.733,Both,Major,0.079688,95.012,Duet,0.757347


In [5]:
# Count the missing (NaN) values in each column.
df_test.isna().sum()

instance_id            0
track_name             0
popularity           166
acousticness           0
danceability         139
duration_ms           21
energy                33
instrumentalness      23
key                    0
liveness               0
loudness               3
voice_gender         186
mode                  14
speechiness            7
tempo                  0
musician_category    312
valence              191
dtype: int64

In [8]:
# Impute missing values in the numeric columns with each column's mean.
# numeric_only=True restricts the mean to numeric columns; without it,
# pandas >= 2.0 raises TypeError on the string columns (instance_id,
# track_name, key, ...) instead of silently skipping them.
df_test = df_test.fillna(df_test.mean(numeric_only=True))

# Re-count the nulls; the non-numeric columns are not touched by the mean fill.
df_test.isna().sum()

instance_id            0
track_name             0
popularity             0
acousticness           0
danceability           0
duration_ms            0
energy                 0
instrumentalness       0
key                    0
liveness               0
loudness               0
voice_gender         186
mode                  14
speechiness            0
tempo                  0
musician_category    312
valence                0
dtype: int64

In [10]:
# The 'mode' column still contains the '?' placeholder, so it is handled in a
# later cell; fill the other two categorical columns with their most frequent
# value first.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: that chained in-place form is
# deprecated and does not update df_test under pandas Copy-on-Write.
df_test['voice_gender'] = df_test['voice_gender'].fillna(df_test['voice_gender'].mode()[0])
df_test['musician_category'] = df_test['musician_category'].fillna(df_test['musician_category'].mode()[0])

In [12]:
# 'mode' and 'tempo' use '?' as a missing-value placeholder; turn every '?'
# into NaN so the regular imputation steps can handle it.
df_test['mode'] = df_test['mode'].mask(df_test['mode'] == '?', np.nan)
df_test['tempo'] = df_test['tempo'].mask(df_test['tempo'] == '?', np.nan)

In [13]:
# 'tempo' is stored as object dtype although its values are numeric;
# cast the column to float64.
df_test = df_test.astype({'tempo': 'float64'})

In [14]:
# Fill the NaNs that replaced the '?' placeholders.
# Numeric columns get their column mean; numeric_only=True is required
# because pandas >= 2.0 raises TypeError when mean() meets string columns.
df_test = df_test.fillna(df_test.mean(numeric_only=True))

# 'mode' is categorical, so it is filled with its most frequent value. The
# result is assigned back rather than using fillna(inplace=True) on the
# column selection, which is deprecated and has no effect under Copy-on-Write.
df_test['mode'] = df_test['mode'].fillna(df_test['mode'].mode()[0])

In [15]:
# Confirm that no missing values remain in any column.
df_test.isna().sum()

instance_id          0
track_name           0
popularity           0
acousticness         0
danceability         0
duration_ms          0
energy               0
instrumentalness     0
key                  0
liveness             0
loudness             0
voice_gender         0
mode                 0
speechiness          0
tempo                0
musician_category    0
valence              0
dtype: int64

In [16]:
# Show the distinct values of every object-dtype column in the data.
def print_unique_col_values(df):
    """Print ``<column>: <unique values>`` for each object-dtype column of ``df``.

    Columns are visited in DataFrame order; non-object columns are skipped.
    Nothing is returned.
    """
    for name, dtype in df.dtypes.items():
        if dtype != 'object':
            continue
        print(f'{name}: {df[name].unique()}')

In [17]:
# Inspect the distinct values of each object-dtype column of the test data.
print_unique_col_values(df_test)

instance_id: ['MSC_70753.0' 'MSC_24064.0' 'MSC_22731.0' ... 'MSC_40192.0' 'MSC_56067.0'
 'MSC_90169.0']
track_name: ['Bard, a Man of Lake-town - Extended Version' 'New Kings' 'Silver Lining'
 ...
 'Aria mit 30 Veränderungen, BWV 988 "Goldberg Variations": Aria - Live'
 'Buy Me A Rose' 'Tankwa Town']
key: ['Scale A Sharp' 'Scale E' 'Scale C' 'Scale A' 'Scale D' 'Scale B'
 'Scale G' 'Scale F' 'Scale F Sharp' 'Scale D Sharp' 'Scale C Sharp'
 'Scale G Sharp']
voice_gender: ['Both' 'Male' 'Female']
mode: ['Minor' 'Major']
musician_category: ['Duet' 'Band' 'Solo']


In [18]:
from sklearn.preprocessing import LabelEncoder

# Single LabelEncoder instance; the next cell reuses it for both 'track_name' and 'key'.
le = LabelEncoder()

In [20]:
# Integer-encode 'track_name' and 'key'. Each fit_transform call refits `le`
# on the given column, so afterwards `le` only holds the classes of 'key'.
# NOTE(review): the codes are derived from test.csv alone. If the pickled model
# was trained on codes fitted to the training data, the same label can map to a
# different integer here -- confirm, and reuse the training encoders if so.
df_test['track_name'] = le.fit_transform(df_test['track_name'])
df_test['key'] = le.fit_transform(df_test['key'])

In [21]:
# The remaining categorical features have only a few distinct values, so they
# are dummy-encoded; drop_first=True omits the first category of each column.
dummy1, dummy2, dummy3 = (
    pd.get_dummies(df_test[feature], drop_first=True)
    for feature in ('voice_gender', 'mode', 'musician_category')
)

In [22]:
# Append the dummy columns to the right of the main dataframe.
df1 = pd.concat((df_test, dummy1, dummy2, dummy3), axis='columns')

In [23]:
# Remove the id column and the categorical originals that are now
# represented by their dummy columns.
new_test_data = df1.drop(
    columns=['instance_id', 'voice_gender', 'mode', 'musician_category']
)

In [24]:
# duration_ms contains -1, which is not a possible duration; set those entries to 0.
new_test_data['duration_ms'] = new_test_data['duration_ms'].replace(-1, 0)

In [25]:
# Display the final feature frame that is passed to the model.
# NOTE(review): the rendered output still shows energy == -1.0 and an
# acousticness value above 1; these look like unhandled placeholder or
# out-of-range values -- confirm they are treated the same way as in training.
new_test_data

Unnamed: 0,track_name,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,speechiness,tempo,valence,Female,Male,Minor,Duet,Solo
0,346,26.101,0.823765,0.115490,211462.287,0.125325,0.945272,1,0.120392,-25.541,0.038657,80.777000,0.041238,0,0,1,1,0
1,2144,66.325,0.015441,0.592838,249151.238,0.870798,0.000108,7,0.406722,-3.905,0.051368,76.004000,0.456309,0,1,0,0,0
2,2785,71.871,0.001006,0.510340,215693.240,0.683077,0.000000,3,0.084608,-6.825,0.034303,144.458000,0.193332,1,0,0,0,1
3,112,56.151,0.315650,0.338336,307056.179,0.608904,0.000000,0,0.276811,-8.382,0.036934,118.145000,0.214906,1,0,1,0,1
4,1171,43.687,0.038336,0.773904,352802.872,0.955500,0.000007,5,0.301726,-4.733,0.079688,95.012000,0.757347,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3916,3538,28.296,0.880373,0.239413,334617.187,0.046885,0.000008,2,0.149835,-26.465,0.044304,116.111768,0.159485,0,0,0,0,1
3917,1105,57.506,0.002973,0.658013,266627.734,-1.000000,0.000000,7,0.308676,-6.282,0.033955,120.048000,0.384031,1,0,1,0,0
3918,251,40.085,1.065965,0.199430,237468.634,0.066235,0.998231,10,0.074738,-30.092,0.051459,66.095000,0.333884,0,0,0,0,1
3919,538,52.468,0.607947,0.677778,237945.370,0.313435,0.000002,6,0.115475,-14.411,0.032643,97.999000,0.203295,0,0,0,0,0


## Loading the model and prediction

In [26]:
import pickle

# Load the trained model from disk. The context manager closes the file
# handle once unpickling is done (a bare open() call leaves it open).
# NOTE: pickle.load can execute arbitrary code while loading, so only use a
# .sav file that comes from a trusted source.
with open('music_genre.sav', 'rb') as model_file:
    model = pickle.load(model_file)

In [27]:
# Run the loaded model on the prepared feature frame.
# NOTE(review): presumably the model expects the same feature columns, in the
# same order, as during training -- verify against the training notebook.
prediction = model.predict(new_test_data)

In [28]:
# Wrap the predictions in a one-column dataframe named 'music_genre'.
result = pd.DataFrame(data=prediction, columns=['music_genre'])

In [36]:
# Replace the numeric class 0 with its genre name.
result['music_genre'] = result['music_genre'].replace(to_replace=0, value='Classical')

In [39]:
# Replace the numeric classes 1-6 with their genre names in a single pass.
result['music_genre'] = result['music_genre'].replace({
    1: 'Country',
    2: 'Electronic',
    3: 'Hip-Hop',
    4: 'Jazz',
    5: 'Rap',
    6: 'Rock',
})

In [40]:
# Re-read the raw test file; only its 'instance_id' column is used afterwards.
df2 = pd.read_csv('test.csv')

In [42]:
# Put each instance id next to its predicted genre (rows are paired by index).
df_subm = pd.concat((df2['instance_id'], result), axis='columns')

In [43]:
# Preview the submission table (instance_id, music_genre).
df_subm

Unnamed: 0,instance_id,music_genre
0,MSC_70753.0,Classical
1,MSC_24064.0,Rock
2,MSC_22731.0,Rock
3,MSC_32095.0,Rock
4,MSC_24198.0,Jazz
...,...,...
3916,MSC_80955.0,Classical
3917,MSC_72767.0,Rock
3918,MSC_40192.0,Classical
3919,MSC_56067.0,Country


In [44]:
# Store the submission as a CSV file. index=False limits the file to the two
# real columns (instance_id, music_genre); without it the RangeIndex is
# written as an extra unnamed first column.
df_subm.to_csv('music_genre_final_submission.csv', index=False)