# Import Data

In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# joblib is used further down to save and load fitted models.
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed
# in 0.23, so import the standalone `joblib` package first and only fall
# back to the vendored copy on old scikit-learn installs that lack it.
# Models pickled through the old vendored copy may need to be re-serialized
# with scikit-learn 0.21+.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib


# Load the dataset from the CSV file next to this notebook and display it.
music_data = pd.read_csv('music.csv')
music_data




Unnamed: 0,age,gender,genre
0,20,1,HipHop
1,23,1,HipHop
2,25,1,Jazz
3,26,1,Jazz
4,29,1,Jazz
5,30,1,Jazz
6,31,1,Classical
7,33,1,Classical
8,37,1,Classical
9,20,0,Dance


# Prepare Data

In [2]:
# Input features: every column except the label we want to predict.
X = music_data.drop('genre', axis=1)
# Target: the genre label for each row.
y = music_data.loc[:, 'genre']

# Show both pieces, separated by a blank line.
print(X, '\n')
print(y)

    age  gender
0    20       1
1    23       1
2    25       1
3    26       1
4    29       1
5    30       1
6    31       1
7    33       1
8    37       1
9    20       0
10   21       0
11   25       0
12   26       0
13   27       0
14   30       0
15   31       0
16   34       0
17   35       0 

0        HipHop
1        HipHop
2          Jazz
3          Jazz
4          Jazz
5          Jazz
6     Classical
7     Classical
8     Classical
9         Dance
10        Dance
11        Dance
12     Acoustic
13     Acoustic
14     Acoustic
15    Classical
16    Classical
17    Classical
Name: genre, dtype: object


# Learning and Predicting
We use a decision tree classifier (`DecisionTreeClassifier` from `sklearn.tree`).

In [3]:
# Train a decision tree on the whole dataset.
# (If the predictions look wrong, inspect `music_data` before fitting.)
model = DecisionTreeClassifier()
model.fit(X, y)

# Predict for a 21-year-old male and a 22-year-old female.
# The model was fitted on a DataFrame, so the samples are passed as a
# DataFrame with the same columns: this makes the (age, gender) order
# explicit and avoids the feature-name mismatch warning that newer
# scikit-learn versions emit for bare lists.
predictions = model.predict(
    pd.DataFrame([[21, 1], [22, 0]], columns=X.columns)
)
predictions

array(['HipHop', 'Dance'], dtype=object)

# Calculating the Accuracy
Typically 70-80 percent of the data is used for training and the rest for testing; `train_test_split` from `sklearn.model_selection` performs the split.

In [4]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# random_state is fixed on both the split and the tree so that the score
# is reproducible across kernel restarts instead of changing on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)
predictions = model.predict(X_test)

# accuracy_score (sklearn.metrics) compares the held-out labels with the
# model's predictions for the same rows.
score = accuracy_score(y_test, predictions)
print(score)
# With a dataset this small the score depends heavily on which rows end up
# in the test set: an earlier, unseeded run scored only 0.25.
# The more data we give the model, and the cleaner that data is, the better
# and more stable the result.

1.0


# Persisting Models
Training can be expensive, so we train the model once in a while, save it to disk, and load the saved model whenever we need predictions -> `joblib`

In [5]:
# Fit a fresh classifier on the complete dataset (fit() returns the estimator).
model = DecisionTreeClassifier().fit(X, y)

# Persist the fitted model; the list of written filenames is the cell output.
joblib.dump(model, 'music-recommender.joblib')

['music-recommender.joblib']

In [6]:
# Load the model that was saved to disk earlier.
# NOTE: joblib files are pickles; only load files you created or trust.
trained_model = joblib.load('music-recommender.joblib')

# Predict with the *loaded* model rather than the in-memory `model`, so
# this cell actually checks that the saved file works.
# The sample (a 21-year-old male) is passed with named columns so the
# feature order is explicit.
predictions = trained_model.predict(
    pd.DataFrame([[21, 1]], columns=['age', 'gender'])
)
predictions

array(['HipHop'], dtype=object)

# Visualizing a Decision Tree

In [7]:
import pandas as pd
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier


# Reload the dataset and rebuild the features/target from scratch.
music_data = pd.read_csv('music.csv')
y = music_data['genre']
X = music_data.drop(columns=['genre'])

# Fit the tree that will be visualized.
model = DecisionTreeClassifier().fit(X, y)

# Write the fitted tree to a Graphviz .dot file.
# class_names must be in ascending order, hence the sort of the distinct
# genre labels.
tree.export_graphviz(
    model,
    out_file='music-recommender.dot',
    filled=True,
    rounded=True,
    label='all',
    class_names=sorted(set(y)),
    feature_names=['age', 'gender'],
)