# Load the data

In [None]:
# Import modules
from sklearn import datasets
import numpy as np
import pandas as pd

In [None]:
# Fetch the Iris dataset bundled with scikit-learn, then dump the raw container
# (its .data, .target, .feature_names and .target_names fields are used below).
iris = datasets.load_iris()
print(iris)

In [None]:
# Transform the dataset in dataframe: the four measurements plus a "target" column
feature_and_target = np.column_stack((iris.data, iris.target))
column_names = [*iris.feature_names, "target"]
dfIris = pd.DataFrame(feature_and_target, columns=column_names)

# Turn the numeric class codes into a categorical column, then relabel the
# categories (positionally) with the species names.
dfIris["target"] = (
    dfIris["target"]
    .astype("category")
    .cat.rename_categories(iris.target_names)
)

print(dfIris.to_markdown())

In [None]:
# Prepare data (training and testing sets)
from sklearn.model_selection import train_test_split

# Features are every column except the label. `columns=` is used because
# pandas >= 2.0 no longer accepts `axis` as a positional argument of drop().
X = dfIris.drop(columns="target")
y = dfIris["target"]

# Hold out 33% of the rows for testing.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.33, random_state=1) # random state for repeatability

print(Xtrain, type(Xtrain))
print(ytrain, type(ytrain))

# Function for visualizing the errors

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score

def show_errors(model, parameter, start_value, end_value, Xtrain, ytrain, Xtest, ytest):
  """Sweep one integer hyperparameter and report/plot train and test error.

  For every integer ``v`` in ``range(start_value, end_value)`` (``end_value``
  is excluded) the estimator is reconfigured with ``parameter=v``, refitted on
  the training data, and its error (``1 - accuracy``) is measured on both the
  training and the test set. The two average errors are printed, both error
  curves are plotted, and the value with the lowest test error is printed
  (the first one in case of ties).

  Args:
    model: Estimator exposing ``set_params``, ``fit`` and ``predict``. It is
      reconfigured and refitted in place (no copy is made), so after the call
      it is left fitted with the last value of the sweep.
    parameter: Name of the hyperparameter to vary.
    start_value: First integer value tried.
    end_value: Integer upper bound of the sweep, exclusive.
    Xtrain, ytrain: Training features and labels.
    Xtest, ytest: Test features and labels.

  Raises:
    ValueError: If the sweep is empty (``end_value <= start_value``).
  """
  n_values = end_value - start_value
  if n_values <= 0:
    # Fail early with a clear message instead of dividing by zero below.
    raise ValueError(
        'end_value (%s) must be greater than start_value (%s).' % (end_value, start_value))

  x = np.arange(start_value, end_value)
  all_train_errors = np.empty(n_values, dtype=float)
  all_test_errors = np.empty(n_values, dtype=float)
  total_train_error = 0
  total_test_error = 0

  for position, nb in enumerate(range(start_value, end_value)):
    # Same estimator object on every iteration: only the swept parameter changes.
    model.set_params(**{parameter: nb})
    model.fit(Xtrain, ytrain)

    train_error = 1-accuracy_score(ytrain, model.predict(Xtrain))
    total_train_error += train_error
    all_train_errors[position] = train_error

    test_error = 1-accuracy_score(ytest, model.predict(Xtest))
    total_test_error += test_error
    all_test_errors[position] = test_error

  print('The average train error is: %s.' % (total_train_error / n_values))
  print('The average test error is: %s.' % (total_test_error / n_values))
  plt.plot(x, all_train_errors, 'r')
  plt.plot(x, all_test_errors, 'b')
  plt.legend(['training error', 'test error'])
  plt.xlabel('value metaparameter')
  plt.ylabel('error')
  plt.show()
  # argmin is a position in the sweep; shift it back to the parameter value.
  print('The optimal metaparameter is %s for a test error of %s.' %(str(np.argmin(all_test_errors)+start_value), str(np.min(all_test_errors))))

# Naive Bayes

In [None]:
# Import the model
from sklearn.naive_bayes import GaussianNB

# Create the classifier and train it on the training split in one step
model_NB = GaussianNB().fit(Xtrain, ytrain)

# Predict the species of the held-out samples
ypredict_NB = model_NB.predict(Xtest)
print(ypredict_NB, type(ypredict_NB))

In [None]:
# Evaluate performances of model in terms of accuracy
from sklearn.metrics import accuracy_score

# Compare the Naive Bayes predictions with the true test labels
print(accuracy_score(y_true=ytest, y_pred=ypredict_NB))

In [None]:
# Investigate the results
from sklearn.metrics import confusion_matrix
labels = dfIris["target"].cat.categories

# Entry (i, j) counts the samples whose true label is the i-th class and whose
# predicted label is the j-th class, with classes ordered as in `labels`.
counts_NB = confusion_matrix(ytest, ypredict_NB, labels=labels)
cm_NB = pd.DataFrame(counts_NB, index=labels, columns=labels)

# Raw array first, then the same counts with species names on both axes
print(counts_NB)
print(cm_NB)

# KNN

In [None]:
# Import the model
from sklearn.neighbors import KNeighborsClassifier

# Create a 1-nearest-neighbour classifier and train it on the training split
model_KNN = KNeighborsClassifier(n_neighbors=1).fit(Xtrain, ytrain)

# Predict the species of the held-out samples
ypredict_KNN = model_KNN.predict(Xtest)
print(ypredict_KNN, type(ypredict_KNN))

In [None]:
# Evaluate performances of model in terms of accuracy
from sklearn.metrics import accuracy_score

# Compare the KNN predictions with the true test labels
print(accuracy_score(y_true=ytest, y_pred=ypredict_KNN))

In [None]:
# Investigate the results
from sklearn.metrics import confusion_matrix
labels = dfIris["target"].cat.categories

# Entry (i, j) counts the samples whose true label is the i-th class and whose
# predicted label is the j-th class, with classes ordered as in `labels`.
counts_KNN = confusion_matrix(ytest, ypredict_KNN, labels=labels)
cm_KNN = pd.DataFrame(counts_KNN, index=labels, columns=labels)

# Raw array first, then the same counts with species names on both axes
print(counts_KNN)
print(cm_KNN)

In [None]:
# One label at a time
# What is the percentage of true positive for a given category?
def precisionRate(confMatrix, label):
    """Return the fraction of samples of class `label` that were predicted as `label`.

    `confMatrix` is a square DataFrame whose rows are true classes and whose
    columns are predicted classes, as built in this notebook. The value is the
    diagonal entry divided by the row total, i.e. the true positive rate
    (recall) of the class, despite the function's name.
    """
    true_positives = confMatrix.at[label, label]
    actual_positives = confMatrix.loc[label].sum()
    return true_positives / actual_positives

# Share of the true virginica test samples that KNN labelled as virginica
print(precisionRate(confMatrix=cm_KNN, label="virginica"))  # 13/14

# What is the percentage of true negative for a given category?
def specificityRate(confMatrix, label):
    """Return the specificity (true negative rate) of class `label`.

    `confMatrix` is a square DataFrame whose rows are true classes and whose
    columns are predicted classes, as built in this notebook.

    Specificity is TN / (TN + FP):
      * TN: samples that neither are nor were predicted as `label`
        (matrix without the label's row and column);
      * TN + FP: every sample whose true class is not `label`
        (matrix without the label's row).
    Dropping the label's *column* for the denominator would count TN + FN and
    give the negative predictive value instead.
    """
    true_negatives = confMatrix.drop(index=label, columns=label).values.sum()
    actual_negatives = confMatrix.drop(index=label).values.sum()
    return true_negatives / actual_negatives

print(specificityRate(cm_KNN, "virginica")) # TN / (TN + FP) for virginica

# Decision tree

In [None]:
# Import the model
from sklearn.tree import DecisionTreeClassifier

# Create the decision tree (seeded for repeatability) and train it on the training split
model_tree = DecisionTreeClassifier(random_state=2).fit(Xtrain, ytrain)

# Predict the species of the held-out samples
ypredict_tree = model_tree.predict(Xtest)

In [None]:
# Evaluate performances of model in terms of accuracy
from sklearn.metrics import accuracy_score

# Compare the decision-tree predictions with the true test labels
print(accuracy_score(y_true=ytest, y_pred=ypredict_tree))

In [None]:
# Investigate the results
from sklearn.metrics import confusion_matrix
labels = dfIris["target"].cat.categories

# Rows are true classes, columns are predicted classes, both ordered as in `labels`.
cm_tree = pd.DataFrame(
    data=confusion_matrix(y_true=ytest, y_pred=ypredict_tree, labels=labels),
    index=labels,
    columns=labels,
)

print(cm_tree)

In [1]:
# Visualize the tree in a file
from sklearn import tree
import pydot

# Write the fitted tree as a Graphviz DOT file, read it back, and render it to PNG.
# NOTE(review): the /content/ paths look Colab-specific; confirm before running elsewhere.
dot_path = '/content/tree.dot'
tree.export_graphviz(model_tree, out_file=dot_path)
[graph] = pydot.graph_from_dot_file(dot_path)  # exactly one graph is expected
graph.write_png("/content/tree.png")

NameError: ignored

In [None]:
# Visualize the tree in python
# `sklearn.externals.six` is no longer shipped with scikit-learn; the standard
# library StringIO is the in-memory text buffer needed here.
from io import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus

# Export the fitted tree as DOT text into an in-memory buffer...
dot_data = StringIO()
export_graphviz(model_tree, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True)

# ...then render that DOT source to PNG and display it as the cell output.
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())

# Random forest

In [None]:
# Import the model
from sklearn.ensemble import RandomForestClassifier

# Create a 100-tree forest (seeded for repeatability) and train it on the training split
model_forest = RandomForestClassifier(n_estimators=100, random_state=3).fit(Xtrain, ytrain)

# Predict the species of the held-out samples
ypredict_forest = model_forest.predict(Xtest)

# One importance value per feature column of Xtrain
print(model_forest.feature_importances_)

In [None]:
# Evaluate performances of model in terms of accuracy
from sklearn.metrics import accuracy_score

# Compare the random-forest predictions with the true test labels
print(accuracy_score(y_true=ytest, y_pred=ypredict_forest))

In [None]:
# Investigate the results
from sklearn.metrics import confusion_matrix
labels = dfIris["target"].cat.categories

# Entry (i, j) counts the samples whose true label is the i-th class and whose
# predicted label is the j-th class, with classes ordered as in `labels`.
counts_forest = confusion_matrix(ytest, ypredict_forest, labels=labels)
cm_forest = pd.DataFrame(counts_forest, index=labels, columns=labels)

# Raw array first, then the same counts with species names on both axes
print(counts_forest)
print(cm_forest)

In [None]:
# One label at a time
# What is the percentage of true positive for a given category?
def precisionRate(confMatrix, label):
    """Return the fraction of samples of class `label` that were predicted as `label`.

    `confMatrix` is a square DataFrame whose rows are true classes and whose
    columns are predicted classes, as built in this notebook. The value is the
    diagonal entry divided by the row total, i.e. the true positive rate
    (recall) of the class, despite the function's name.
    """
    true_positives = confMatrix.at[label, label]
    actual_positives = confMatrix.loc[label].sum()
    return true_positives / actual_positives

# Share of the true virginica test samples that the forest labelled as virginica
print(precisionRate(confMatrix=cm_forest, label="virginica"))

# What is the percentage of true negative for a given category?
def specificityRate(confMatrix, label):
    """Return the specificity (true negative rate) of class `label`.

    `confMatrix` is a square DataFrame whose rows are true classes and whose
    columns are predicted classes, as built in this notebook.

    Specificity is TN / (TN + FP):
      * TN: samples that neither are nor were predicted as `label`
        (matrix without the label's row and column);
      * TN + FP: every sample whose true class is not `label`
        (matrix without the label's row).
    Dropping the label's *column* for the denominator would count TN + FN and
    give the negative predictive value instead.
    """
    true_negatives = confMatrix.drop(index=label, columns=label).values.sum()
    actual_negatives = confMatrix.drop(index=label).values.sum()
    return true_negatives / actual_negatives

# True-negative metric of the forest for the virginica class
print(specificityRate(confMatrix=cm_forest, label="virginica"))

# SVM

In [None]:
# Import the model
from sklearn.svm import SVC

# Create a linear-kernel support vector classifier and train it on the training split
model_SVM = SVC(kernel='linear').fit(Xtrain, ytrain)

# Predict the species of the held-out samples
ypredict_SVM = model_SVM.predict(Xtest)

In [None]:
# Evaluate performances of model in terms of accuracy
from sklearn.metrics import accuracy_score

# Compare the SVM predictions with the true test labels
print(accuracy_score(y_true=ytest, y_pred=ypredict_SVM))

In [None]:
# Investigate the results
from sklearn.metrics import confusion_matrix
labels = dfIris["target"].cat.categories

# Entry (i, j) counts the samples whose true label is the i-th class and whose
# predicted label is the j-th class, with classes ordered as in `labels`.
counts_SVM = confusion_matrix(ytest, ypredict_SVM, labels=labels)
cm_SVM = pd.DataFrame(counts_SVM, index=labels, columns=labels)

# Raw array first, then the same counts with species names on both axes
print(counts_SVM)
print(cm_SVM)

In [None]:
# One label at a time
# What is the percentage of true positive for a given category?
def precisionRate(confMatrix, label):
    """Return the fraction of samples of class `label` that were predicted as `label`.

    `confMatrix` is a square DataFrame whose rows are true classes and whose
    columns are predicted classes, as built in this notebook. The value is the
    diagonal entry divided by the row total, i.e. the true positive rate
    (recall) of the class, despite the function's name.
    """
    true_positives = confMatrix.at[label, label]
    actual_positives = confMatrix.loc[label].sum()
    return true_positives / actual_positives

# Share of the true virginica test samples that the SVM labelled as virginica
print(precisionRate(confMatrix=cm_SVM, label="virginica"))

# What is the percentage of true negative for a given category?
def specificityRate(confMatrix, label):
    """Return the specificity (true negative rate) of class `label`.

    `confMatrix` is a square DataFrame whose rows are true classes and whose
    columns are predicted classes, as built in this notebook.

    Specificity is TN / (TN + FP):
      * TN: samples that neither are nor were predicted as `label`
        (matrix without the label's row and column);
      * TN + FP: every sample whose true class is not `label`
        (matrix without the label's row).
    Dropping the label's *column* for the denominator would count TN + FN and
    give the negative predictive value instead.
    """
    true_negatives = confMatrix.drop(index=label, columns=label).values.sum()
    actual_negatives = confMatrix.drop(index=label).values.sum()
    return true_negatives / actual_negatives

# True-negative metric of the SVM for the virginica class
print(specificityRate(confMatrix=cm_SVM, label="virginica"))

# Neural network

In [None]:
# Import the model
from sklearn.neural_network import MLPClassifier

# Create an instance of the model: a single hidden layer of 5 units.
# `(5,)` is an explicit one-element tuple (`(5)` is just the integer 5), and the
# seed makes the random weight initialisation repeatable, like the other models.
model_NN = MLPClassifier(hidden_layer_sizes=(5,), solver='lbfgs', random_state=4) # random state for repeatability

# Train the model
model_NN.fit(Xtrain, ytrain)

# Predict test set
ypredict_NN = model_NN.predict(Xtest)

In [None]:
# Evaluate performances of model in terms of accuracy
from sklearn.metrics import accuracy_score

# Compare the neural-network predictions with the true test labels
print(accuracy_score(y_true=ytest, y_pred=ypredict_NN))

In [None]:
# Investigate the results
from sklearn.metrics import confusion_matrix
labels = dfIris["target"].cat.categories

# Entry (i, j) counts the samples whose true label is the i-th class and whose
# predicted label is the j-th class, with classes ordered as in `labels`.
counts_NN = confusion_matrix(ytest, ypredict_NN, labels=labels)
cm_NN = pd.DataFrame(counts_NN, index=labels, columns=labels)

# Raw array first, then the same counts with species names on both axes
print(counts_NN)
print(cm_NN)

In [None]:
# One label at a time
# What is the percentage of true positive for a given category?
def precisionRate(confMatrix, label):
    """Return the fraction of samples of class `label` that were predicted as `label`.

    `confMatrix` is a square DataFrame whose rows are true classes and whose
    columns are predicted classes, as built in this notebook. The value is the
    diagonal entry divided by the row total, i.e. the true positive rate
    (recall) of the class, despite the function's name.
    """
    true_positives = confMatrix.at[label, label]
    actual_positives = confMatrix.loc[label].sum()
    return true_positives / actual_positives

# Share of the true virginica test samples that the network labelled as virginica
print(precisionRate(confMatrix=cm_NN, label="virginica"))

# What is the percentage of true negative for a given category?
def specificityRate(confMatrix, label):
    """Return the specificity (true negative rate) of class `label`.

    `confMatrix` is a square DataFrame whose rows are true classes and whose
    columns are predicted classes, as built in this notebook.

    Specificity is TN / (TN + FP):
      * TN: samples that neither are nor were predicted as `label`
        (matrix without the label's row and column);
      * TN + FP: every sample whose true class is not `label`
        (matrix without the label's row).
    Dropping the label's *column* for the denominator would count TN + FN and
    give the negative predictive value instead.
    """
    true_negatives = confMatrix.drop(index=label, columns=label).values.sum()
    actual_negatives = confMatrix.drop(index=label).values.sum()
    return true_negatives / actual_negatives

# True-negative metric of the network for the virginica class
print(specificityRate(confMatrix=cm_NN, label="virginica"))

# Model optimization

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

In [None]:
# K-fold
# 3-fold cross-validation of the decision tree on the full dataset (X, y).
model_tree = DecisionTreeClassifier(random_state = 2)
scores = list()
# Seed the shuffle so the folds, and therefore the reported score, are repeatable.
kf = KFold(n_splits=3, shuffle=True, random_state=2) # random state for repeatability
for train_index, test_index in kf.split(X):
  # kf.split yields positional indices, hence the indexing on .values
  X_train, X_test = X.values[train_index], X.values[test_index]
  y_train, y_test = y.values[train_index], y.values[test_index]
  model_tree.fit(X_train, y_train)
  scores.append(model_tree.score(X_test, y_test))

# np.std is the standard deviation of the fold scores, so it is labelled as such.
print('The tree model gives a testing score of '+str(np.mean(scores))+' and a standard deviation of '+str(np.std(scores)))

The tree model gives a testing score of 0.94 and a variance of 0.016329931618554488


In [None]:
# Grid search
# Candidate settings: every max_depth is combined with every max_leaf_nodes.
parameters_tree = {
    "max_depth": (2, 3, 4, 5),
    "max_leaf_nodes": (3, 5, 7, 9, None),
}
model_tree = DecisionTreeClassifier(random_state=2)

# Train the model: 10-fold cross-validated search, fitted on the training split only
best_tree = GridSearchCV(estimator=model_tree, param_grid=parameters_tree, cv=10)
best_tree.fit(Xtrain, ytrain)
print(best_tree.best_params_)
print(f"The best parameters for the tree model give the following validation score: {best_tree.best_score_}")

# Test the model: score the selected configuration on the held-out test split
print(accuracy_score(y_true=ytest, y_pred=best_tree.predict(Xtest)))

{'max_depth': 2, 'max_leaf_nodes': 3}
The best parameters for the tree model give the following validation score: 0.9600000000000002
0.96
