# Decision Tree Algorithm on Binary Classification

In [1]:
# Import libraries
import pandas as pd

In [3]:
# Read the dataset from its CSV file into a DataFrame
data_path = '../datasets/mldata2.csv'
df = pd.read_csv(data_path)
# Preview the first rows
df.head()

Unnamed: 0,age,height,weight,gender,likeness
0,27,170.688,76.0,Male,Biryani
1,41,165.0,70.0,Male,Biryani
2,29,171.0,80.0,Male,Biryani
3,27,173.0,102.0,Male,Biryani
4,29,164.0,67.0,Male,Biryani


In [4]:
# Encode the 'gender' labels as integers: 'Male' -> 1, 'Female' -> 0
gender_codes = {'Male': 1, 'Female': 0}
df['gender'] = df['gender'].replace(gender_codes)

In [8]:
# Inspect the distinct values now present in the 'gender' column
df.loc[:, 'gender'].unique()

array([1, 0], dtype=int64)

In [12]:
# Choose the model inputs (features) and the output (target)
feature_columns = ['weight', 'gender']
X = df[feature_columns]
y = df['likeness']

In [13]:
# Preview the first five rows of the feature matrix
X.head(n=5)

Unnamed: 0,weight,gender
0,76.0,1
1,70.0,1
2,80.0,1
3,102.0,1
4,67.0,1


In [14]:
# Preview the first five target labels
y.head(n=5)

0    Biryani
1    Biryani
2    Biryani
3    Biryani
4    Biryani
Name: likeness, dtype: object

In [16]:
# ML algorithm
from sklearn.tree import DecisionTreeClassifier

# Create and fit the model on the full dataset (features: weight, gender)
model = DecisionTreeClassifier().fit(X, y)

# Prediction for one sample: weight=23, gender=0 (the code 'Female' was mapped to).
# The query is wrapped in a DataFrame carrying the training column names so that
# scikit-learn does not warn that X lacks the feature names seen during fit, and
# so that it is explicit which value belongs to which feature.
sample = pd.DataFrame([[23, 0]], columns=X.columns)
model.predict(sample)



array(['Biryani'], dtype=object)

In [20]:
# How to measure the accuracy of our model
# Split data into test and train (80/20)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# 80% training data and 20% testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Create a model. The tree is seeded as well: a decision tree permutes the
# features at each split, so without a fixed random_state the fitted tree
# (and therefore the reported accuracy) can change from run to run.
model = DecisionTreeClassifier(random_state=0)
# Fit the model on the training portion only
model.fit(X_train, y_train)

# Make predictions on the held-out rows
predicted_values = model.predict(X_test)

# Check score: fraction of held-out rows whose label was predicted correctly
score = accuracy_score(y_test, predicted_values)
score

0.6122448979591837

In [21]:
# Train and save model
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
import joblib

# Build the classifier, then train it on every available row
model = DecisionTreeClassifier()
model.fit(X, y)

# Persist the fitted model to disk (joblib file extension)
model_path = 'foodie.joblib'
joblib.dump(model, model_path)

['foodie.joblib']

In [23]:
# Make Graph
from sklearn import tree

# Create and fit model
model = DecisionTreeClassifier().fit(X, y)
# Graphic evaluation/look into what happened: write the fitted tree to a
# Graphviz ".dot" file.
# feature_names is derived from X so the node labels always match the columns
# the model was trained on ('weight', 'gender'); a hard-coded list that differs
# from X would mislabel every split on the affected feature.
tree.export_graphviz(model, out_file='foodie.dot',
                     feature_names=list(X.columns),
                     class_names=sorted(y.unique()),
                     label='all',
                     rounded=True,
                     filled=True)