In [14]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier #machine learning algorithm (Syntax: from package.module import Class)

music_data = pd.read_csv("music.csv")
X = music_data.drop(columns = ["genre"]) #drop a column from the dataset (won't modify original dataset, it will create a new one without the genre column, this line is the input set, Capital X is the conventional name for it
y = music_data["genre"] #how to get all the values in a given column, this line is the output set, small y is the conventional name for it

model = DecisionTreeClassifier() #creating an instance of this Class
model.fit(X, y) #training our model with fit() method, takes two datasets, input and output
predictions = model.predict([ [21, 1], [22, 0] ]) #ask the model to make a prediction, predict() method takes 2D arrays
predictions



array(['HipHop', 'Dance'], dtype=object)

In [5]:
music_data

Unnamed: 0,age,gender,genre
0,20,1,HipHop
1,23,1,HipHop
2,25,1,HipHop
3,26,1,Jazz
4,29,1,Jazz
5,30,1,Jazz
6,31,1,Classical
7,33,1,Classical
8,37,1,Classical
9,20,0,Dance


In [33]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split #importing a function to split the dataset into two, one for training the model 80% and one for testing 20%
from sklearn.metrics import accuracy_score #function used to mesaure the accuracy of our prediction

music_data = pd.read_csv("music.csv")
X = music_data.drop(columns = ["genre"])
y = music_data["genre"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2) #specifying the size of the test dataset as 20%, allocating 20% of the data for testing, this function returns a tuple, so we'll unpack it into four variables, train_test_split() function randomly picks data for training and testing, so the result will be different everytime, if we increase the testing dataset to 80% for example, the accuracy of the prediction will drop significantly, i.e the more training data, the better results

model = DecisionTreeClassifier() 
model.fit(X_train, y_train) #instead of passing the entire dataset for training, we pass only the training dataset
input_data = pd.DataFrame(X_test, columns = X.columns) #using two samples was not enough for predictions, now we're using the testing dataset
predictions = model.predict(input_data) 

score = accuracy_score(y_test, predictions) #comparing the output from our test dataset to the predicted output, returns an accuracy score between 0 to 1
score

1.0

In [38]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

music_data = pd.read_csv("music.csv")
X = music_data.drop(columns = ["genre"])
y = music_data["genre"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

model = DecisionTreeClassifier()
model.fit(X_train, y_train)
predictions = model.predict(X_test)

score = accuracy_score(y_test, predictions)
score

0.75

In [43]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
import joblib #object that has methods for saving and loading models, so that learned models get saved, and in the next time they get loaded before learning, to avoid relearning things

# music_data = pd.read_csv("music.csv")
# X = music_data.drop(columns = ["genre"])
# y = music_data["genre"]

# model = DecisionTreeClassifier() 
# model.fit(X, y)

# joblib.dump(model, 'music-recommender.joblib') #save the learned model, takes two arguments, the name of the model and the name of the file that the model will be stored into 
model = joblib.load('music-recommender.joblib')
predictions = model.predict([ [21, 1], [22, 0] ])
predictions



array(['HipHop', 'Dance'], dtype=object)

In [44]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
import joblib


model = joblib.load('music-recommender.joblib')

predictions = model.predict([ [21, 1], [22, 0] ])
predictions



array(['HipHop', 'Dance'], dtype=object)

In [46]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree # this object has a method for exporting the decision tree in a graphical format

music_data = pd.read_csv("music.csv")
X = music_data.drop(columns = ["genre"])
y = music_data["genre"]

model = DecisionTreeClassifier()
model.fit(X, y)

tree.export_graphviz(model, #the model
                     out_file = "music-recommender.dot", # name of the output file, this is using keyword arguments, to selectively pass an arguemt without worrying about its order, the .dot format is a graph description language
                     feature_names = ["age", "gender"], # feature_names will be the columns/properties/features of the data, this line is so we can see the rules for each node (whether it was male/female and their age etc)
                     class_names = sorted(y.unique()), #class_names = will be set to the list of classes/labels we have in our output dataset, the y dataset includes all the genres, but they're repeated, so we'll use .unique method to get the uniqe list without repitions, then sort it alphabetically using sorted() function, this line will display the class (genre) for each node
                     label = "all", # so every node has labels (text areas) so we can read them
                    rounded = True, #rounded corners for the square nodes
                    filled = True) #means each node will be filled with a color