In [5]:
# Import Libraries
import numpy as np 
import pandas as pd
from pandas import read_csv
import matplotlib.pyplot as plt
%matplotlib inline

In [7]:
# Read the music-preference dataset into a DataFrame and preview the first ten rows
filename = 'music.csv'
data = pd.read_csv(filename)
data.head(n=10)

Unnamed: 0,age,gender,genre
0,20,1,HipHop
1,23,1,HipHop
2,25,1,HipHop
3,26,1,Jazz
4,29,1,Jazz
5,30,1,Jazz
6,31,1,Classical
7,33,1,Classical
8,37,1,Classical
9,20,0,Dance


In [8]:
# Inspect the dtype pandas inferred for each column
data.dtypes

age        int64
gender     int64
genre     object
dtype: object

In [9]:
# (rows, columns) of the loaded frame
data.shape

(18, 3)

In [12]:
# Check for duplicates: count rows that exactly repeat an earlier row
# (this cell only reports the count; nothing is dropped)
data.duplicated().sum()

0

In [13]:
# Check for missing values: number of nulls in each column
data.isna().sum()

age       0
gender    0
genre     0
dtype: int64

In [14]:
# List the distinct genre labels, in order of first appearance
data['genre'].unique()

array(['HipHop', 'Jazz', 'Classical', 'Dance', 'Acoustic'], dtype=object)

In [15]:
# Encode each genre label as an integer class id for the classifier.
# The result is assigned back to the column rather than calling
# `data.genre.replace(..., inplace=True)`: an in-place replace through a column
# accessor is chained assignment, which pandas warns about and which stops
# updating `data` once Copy-on-Write is enabled.
genre_codes = {'HipHop': 0, 'Jazz': 1, 'Classical': 2, 'Dance': 3, 'Acoustic': 4}
# `.get(label, label)` leaves already-encoded values untouched so the cell can be
# re-run safely; `astype` fails loudly if an unmapped genre string remains.
data['genre'] = data['genre'].map(lambda label: genre_codes.get(label, label)).astype('int64')

In [16]:
# Preview the first ten rows again to see the genre column after encoding
data.head(10)

Unnamed: 0,age,gender,genre
0,20,1,0
1,23,1,0
2,25,1,0
3,26,1,1
4,29,1,1
5,30,1,1
6,31,1,2
7,33,1,2
8,37,1,2
9,20,0,3


In [17]:
# Re-check the column dtypes after the genre encoding step
data.dtypes

age       int64
gender    int64
genre     int64
dtype: object

In [20]:
# Feature selection: age and gender are the predictors, genre is the target
feature_columns = ['age', 'gender']
target_column = 'genre'

df_x = data[feature_columns]
df_y = data[target_column]

In [21]:
# Preview the first five rows of the feature frame
df_x.head(5)

Unnamed: 0,age,gender
0,20,1
1,23,1
2,25,1
3,26,1
4,29,1


In [22]:
# Preview the first five values of the target series
df_y.head(5)

0    0
1    0
2    0
3    1
4    1
Name: genre, dtype: int64

In [23]:
# Train/test split: hold out 30% of the rows for evaluation
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score, train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df_x,
    df_y,
    test_size=0.3,
    random_state=42,  # fixed seed so the same rows land in each subset on every run
)

In [24]:
# Preview the training features (row labels are the original row indices)
X_train.head()

Unnamed: 0,age,gender
16,34,0
15,31,0
11,25,0
2,25,1
9,20,0


In [25]:
# Preview the held-out test features
X_test.head()

Unnamed: 0,age,gender
0,20,1
1,23,1
8,37,1
5,30,1
3,26,1


In [27]:
# Preview the training targets (encoded genre ids)
y_train.head()

16    2
15    2
11    3
2     0
9     3
Name: genre, dtype: int64

In [28]:
# Preview the held-out test targets (encoded genre ids)
y_test.head()

0    0
1    0
8    2
5    1
3    1
Name: genre, dtype: int64

In [32]:
# Fit the model
from sklearn.tree import DecisionTreeClassifier

# Seed the estimator: scikit-learn permutes the candidate features at every split,
# so an unseeded tree (and therefore its score) can differ from run to run.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)
print("")
print("Below is the score of the model")
# Mean accuracy on the held-out test rows; displayed as the cell's output
clf.score(X_test, y_test)


Below is the score of the model


1.0

In [38]:
# Cross validation
from sklearn.model_selection import cross_val_score

# 3-fold cross-validation on the training rows; `v` holds one score per fold
v = cross_val_score(clf, X_train, y_train, cv=3)
for fold_accuracy in v:
    # `.2%` asks for two decimal places. The previous `2%` spec only set a minimum
    # field width, so scores printed with the default six decimals (66.666667%).
    print("The Accuracy of Decision Tree is : {0:.2%}".format(fold_accuracy))
# Report the mean in the same percentage format as the per-fold lines
print("The Average Accuracy of Decision Tree is : {0:.2%}".format(v.mean()))

The Accuracy of Decision Tree is : 66.666667%
The Accuracy of Decision Tree is : 100.000000%
The Accuracy of Decision Tree is : 100.000000%
The Average Accuracy of Decision Tree is  0.8888888888888888




In [41]:
# Dump the model to disk, then load it back to confirm the round trip works
from pathlib import Path

import joblib

model_path = Path('model/tree.pkl')
# joblib.dump does not create missing directories, so make sure `model/` exists
# before writing; otherwise a fresh checkout fails with FileNotFoundError.
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(clf, model_path)
print("Model dumped to the disk")
# NOTE: joblib files are pickles -- only load model files from a trusted source.
model = joblib.load(model_path)
print("Model is loaded from the disk")

Model dumped to the disk
Model is loaded from the disk


In [43]:
# Make a real prediction with the reloaded model
# Wrap the sample in a DataFrame that carries the training column names: the
# model was fitted on a DataFrame, and scikit-learn warns when it is later given
# an unnamed list/array because it cannot check that the features line up.
sample = pd.DataFrame([[7, 0]], columns=['age', 'gender'])
results = model.predict(sample)
# The printed value is the integer genre id, not the genre name
print(results[0])

3
