In [None]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# This cell motivates "train_test_split": it shows what happens when a model
# is evaluated on the very same rows it was trained on.
iris = load_iris()
train_data, train_label = iris.data, iris.target

# Fit a decision tree on the complete data set and predict on exactly the rows it has already seen
dt_clf = DecisionTreeClassifier()
pred = dt_clf.fit(train_data, train_label).predict(train_data)
print("Prediction accuracy:", accuracy_score(train_label, pred))

# We get "1.0" (100%) because we train and predict with the same data set,
# so this score does not tell us how the model performs on unseen data.

In [None]:
# "test_size" sets the fraction of the data allocated to the test set (default is 25%, i.e. 0.25)
# "train_size" is not commonly used because we usually specify "test_size" instead
# "random_state": if we don't specify it, the data is split into train and test sets differently (at random) on every run
# "train_test_split()" returns a list (usually unpacked like a tuple) in this order: feature data set for training, feature data set for testing, label data set for training, and label data set for testing

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Seed the classifier as well as the split: scikit-learn's decision tree breaks ties between
# equally good splits at random, so an unseeded tree can report a different accuracy on each run.
dt_clf = DecisionTreeClassifier(random_state=156)
iris_data = load_iris()

# Hold out 30% of the rows for testing; "random_state" makes the split itself reproducible
X_train, X_test, y_train, y_test = train_test_split(
    iris_data.data, iris_data.target, test_size=0.3, random_state=121
)

# Train on the training rows only, then score on the held-out rows the model has not seen
dt_clf.fit(X_train, y_train)
pred = dt_clf.predict(X_test)
print("Prediction accuracy: {0:.4f}".format(accuracy_score(y_test, pred)))

In [None]:
# Cross validation is like taking several practice (pre-SAT) tests before the real SAT
# It also helps to avoid the overfitting issue, because the model is not tuned to one fixed test set
# k-fold cross validation divides the data set into "k" folds of equal size; "k-1" of the folds are used to train the model and the remaining fold is used to evaluate it.
# This is repeated "k" times, each time using a different fold as the test set and the other folds as the training set

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
import numpy as np

# Classifier and 5-fold splitter (k=5) used by the cross validation loop
dt_clf = DecisionTreeClassifier(random_state=156)
kfold = KFold(n_splits=5)

# Feature matrix and label vector of the iris data set
iris = load_iris()
features, label = iris.data, iris.target

# Collects the accuracy measured on each fold
cv_accuracy = []

# ".shape[n]" gives the length of axis n (0 = number of rows, 1 = number of columns)
print("Size of iris data set:", features.shape[0])

In [None]:
# Reset the per-fold scores in this cell so that re-running it starts from a clean slate
# instead of appending to the results left over from a previous execution.
cv_accuracy = []

# "kfold.split()" yields, for each of the "k" folds, an array of training row indices
# and an array of testing row indices; "enumerate" numbers the folds starting at 1.
for n_iter, (train_index, test_index) in enumerate(kfold.split(features), start=1):
    # Use the indices returned by "kfold.split()" to extract this fold's training and testing data
    X_train, X_test = features[train_index], features[test_index]
    y_train, y_test = label[train_index], label[test_index]

    # Train and predict
    dt_clf.fit(X_train, y_train)
    pred = dt_clf.predict(X_test)

    # Check accuracy per iteration
    accuracy = np.round(accuracy_score(y_test, pred), 4)
    train_size = X_train.shape[0]
    test_size = X_test.shape[0]
    print("\n#{0} Cross validation accuracy: {1}, Size of training data: {2}, Size of testing data: {3}".format(n_iter, accuracy, train_size, test_size))
    print("#{0} Testing data index:{1}".format(n_iter, test_index))
    cv_accuracy.append(accuracy)

# Calculate the average accuracy by using each iteration result
print("\n## Average validation accuracy:", np.mean(cv_accuracy))
print(cv_accuracy)


In [None]:
# Print the label vector followed by the feature matrix to inspect the raw data
print(label, features, sep="\n")

In [None]:
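# Stratified K fold, "StratifiedKFold", is used when the label distribution is imbalanced, e.g. when predicting an occurrence of something that rarely happens (within 100,000,000 data points there are only 2-5 phishing scams, so with the plain K fold approach most of the k data sets contain no scam at all and the model just says "not likely to happen")
# To avoid this, we introduce "StratifiedKFold", which keeps the label distribution of each fold close to that of the whole data set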

import pandas as pd

# Build a DataFrame with one column per iris feature plus a "label" column holding the target
iris = load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(label=iris.target)

# Number of rows per label value
iris_df["label"].value_counts()

In [None]:
# With plain KFold we cannot properly train and predict all label values (0-2):
# as the printed distributions show, a fold can be trained on labels 1 and 2 only
# and then be tested on label 0, which the model has never seen.
kfold = KFold(n_splits=3)
label_column = iris_df['label']

for n_iter, (train_index, test_index) in enumerate(kfold.split(iris_df), start=1):
    # Labels of the rows selected for training and for testing in this fold
    label_train = label_column.iloc[train_index]
    label_test = label_column.iloc[test_index]

    print("## Cross Validation: {0}".format(n_iter))
    print("Distribution of Training labeled data:\n", label_train.value_counts())
    print("Distribution of Testing labeled data:\n", label_test.value_counts())

In [None]:
from sklearn.model_selection import StratifiedKFold

# Same experiment as with KFold, but the splitter also receives the labels
# so that it can take them into account when building the folds.
skf = StratifiedKFold(n_splits=3)
label_column = iris_df["label"]

for n_iter, (train_index, test_index) in enumerate(skf.split(iris_df, label_column), start=1):
    # Labels of the rows selected for training and for testing in this fold
    label_train = label_column.iloc[train_index]
    label_test = label_column.iloc[test_index]

    print("## Cross Validation: {0}".format(n_iter))
    print("Distribution of Training labeled data:\n", label_train.value_counts())
    print("Distribution of Testing labeled data:\n", label_test.value_counts())

In [None]:
# Stratified 3-fold cross validation of a seeded decision tree on the iris data
dt_clf = DecisionTreeClassifier(random_state=156)
skfold = StratifiedKFold(n_splits=3)

# Accuracy measured on each fold; created here so the cell can be re-run safely
cv_accuracy = []

# Important: StratifiedKFold.split() needs the label data set in addition to the features
for n_iter, (train_index, test_index) in enumerate(skfold.split(features, label), start=1):
    # Based on the indices from split(), extract the training and testing data
    X_train, X_test = features[train_index], features[test_index]
    y_train, y_test = label[train_index], label[test_index]

    # Train and predict
    dt_clf.fit(X_train, y_train)
    pred = dt_clf.predict(X_test)

    # Accuracy per iteration (message spelling now matches the plain KFold cell)
    accuracy = np.round(accuracy_score(y_test, pred), 4)
    train_size = X_train.shape[0]
    test_size = X_test.shape[0]
    print("\n#{0} Cross validation accuracy: {1}, Size of training data: {2}, Size of testing data: {3}".format(n_iter, accuracy, train_size, test_size))
    print("#{0} Testing data index:{1}".format(n_iter, test_index))
    cv_accuracy.append(accuracy)

# Accuracy per cross validation fold and average accuracy over all folds
print("\n## Accuracy per cross validation:", np.round(cv_accuracy, 4))
print("## Average validation accuracy:", np.round(np.mean(cv_accuracy), 4))

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.datasets import load_iris

# Feature matrix and label vector of the iris data set
iris_data = load_iris()
data, label = iris_data.data, iris_data.target

dt_clf = DecisionTreeClassifier(random_state=156)

# cross_val_score runs the whole loop for us: accuracy as the metric, 3 cross validation folds
scores = cross_val_score(dt_clf, data, label, scoring='accuracy', cv=3)
print('Cross validation accuracy:', np.round(scores, 4))
print('Average validation accuracy:', np.round(scores.mean(), 4))

In [None]:
# Candidate hyperparameter values for a grid search: every "max_depth" is combined with every "min_samples_split"
grid_parameters = dict(max_depth=[1, 2, 3], min_samples_split=[2, 3])


In [None]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Load the data and split it into a training set (80%) and a testing set (20%)
iris_data = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris_data.data, iris_data.target, test_size=0.2, random_state=121
)

# Seed the tree (same seed as the other cells): an unseeded decision tree breaks ties between
# equally good splits at random, which would make the grid search ranking vary between runs.
dtree = DecisionTreeClassifier(random_state=156)

# Set the hyperparameter candidates as a dictionary: 3 x 2 = 6 combinations to evaluate
parameters = {'max_depth': [1, 2, 3], 'min_samples_split': [2, 3]}



In [None]:
import pandas as pd

# Evaluate every hyperparameter combination of param_grid with 3-fold cross validation (cv=3).
# refit=True is the default: when it is True, the estimator is re-trained with the best parameters.
grid_dtree = GridSearchCV(
    dtree,
    param_grid=parameters,
    cv=3,
    refit=True,
)

# Using the iris training data, train and evaluate the hyperparameter combinations one by one
grid_dtree.fit(X_train, y_train)

# Convert the GridSearchCV results to a DataFrame and show the most relevant columns
score_columns = [
    'params',
    'mean_test_score',
    'rank_test_score',
    'split0_test_score',
    'split1_test_score',
    'split2_test_score',
]
score_df = pd.DataFrame(grid_dtree.cv_results_)
score_df[score_columns]

In [None]:
# Best hyperparameter combination found by the search and the score it achieved
best_params = grid_dtree.best_params_
best_score = grid_dtree.best_score_
print('GridSearchCV optimal parameter:', best_params)
print('Best accuracy of GridSearchCV:{0:.4f}'.format(best_score))

In [None]:
# Because GridSearchCV was run with refit=True, it exposes the already trained best estimator
estimator = grid_dtree.best_estimator_

# best_estimator_ needs no further training, so it can predict on the held-out test set right away
pred = estimator.predict(X_test)
test_accuracy = accuracy_score(y_test, pred)
print('Test data set accuracy: {0:.4f}'.format(test_accuracy))