### step 1: <mark>Data Wrangling</mark>

Get data from API 
and shown as a dataframe 🚀 

In [58]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

iris = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data',header=None)
iris

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


### step 2: <mark>Data Cleaning</mark>

Save the original dataset before cleaning.

In [59]:
iris_clean = iris.copy()

In [60]:
iris_clean.duplicated().sum()

np.int64(3)

In [61]:
iris_unique = iris_clean.drop_duplicates()

In [62]:
iris_unique.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']
iris_unique

Unnamed: 0,sepal length,sepal width,petal length,petal width,label
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


Drop label for X.

In [63]:
X_iris = iris_unique.drop('label', axis=1)
X_iris

Unnamed: 0,sepal length,sepal width,petal length,petal width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


Transform label from text to number.

In [64]:
le = LabelEncoder()
iris_unique['class'] = le.fit_transform(iris_unique['label'])
y_iris = iris_unique['class']
y_iris

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iris_unique['class'] = le.fit_transform(iris_unique['label'])


0      0
1      0
2      0
3      0
4      0
      ..
145    2
146    2
147    2
148    2
149    2
Name: class, Length: 147, dtype: int64

### step 3: <mark>Create Model with Naïve Bayes and kNN</mark>

In this step, we are gonna create a model using Naïve Bayes and kNN for n times (default is 30 times). As well as that, we are gonna test each model from n times by 10-fold Cross-Validation method and get the avg of it.

> import used libs.

In [65]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
import numpy as np

- Naïve Bayes

In [66]:
fold = 10
iterations = 30
sum_score = 0.0

def get_dataset(seed):
    return train_test_split(X_iris, y_iris,random_state=seed) # default: 75 / 25 %

def get_cross_val_score(model):
    cv = KFold(n_splits=fold, shuffle=True)
    score = cross_val_score(model, X_iris, y_iris, cv=cv)
    return score

def NaiveBayes():
    scores = []
    for i in range(iterations):
        Xtrain, Xtest, ytrain, ytest = get_dataset(seed=i) 
        model = GaussianNB()
        model.fit(Xtrain, ytrain)
        y_model = model.predict(Xtest)
        score = get_cross_val_score(model)
        scores.append(score)
    return scores

scores = NaiveBayes()
avg_scores = np.mean(scores)

avg_score = sum_score / iterations
print("10-fold Croos-Validation AVG score (Bayes):", avg_scores)

10-fold Croos-Validation AVG score (Bayes): 0.9523174603174603


- kNN

In [67]:
fold = 10
iterations = 30
sum_score = 0.0


def get_dataset(seed):
    return train_test_split(X_iris, y_iris,random_state=seed) # default: 75 / 25 %

def get_cross_val_score(model):
    cv = KFold(n_splits=fold, shuffle=True)
    score = cross_val_score(model, X_iris, y_iris, cv=cv)
    return score

def kNN():
    scores = []
    for i in range(iterations):
        Xtrain, Xtest, ytrain, ytest = get_dataset(seed=i) 
        model = KNeighborsClassifier(n_neighbors=1)
        model.fit(Xtrain, ytrain)
        y_model = model.predict(Xtest)
        score = get_cross_val_score(model)
        scores.append(score)
    return scores

scores = kNN()
avg_scores = np.mean(scores)

print("10-fold Croos-Validation AVG score (kNN):", avg_scores)

10-fold Croos-Validation AVG score (kNN): 0.9587777777777777
