# Loading the Iris dataset from scikit-learn

In [1]:
# import load_iris function from datasets module
from sklearn.datasets import load_iris

In [2]:
# load the iris dataset and keep the returned "Bunch" object
# (Bunch is scikit-learn's container type for a dataset together with its attributes)
iris = load_iris()
# the bare last expression makes the notebook display the object's type
type(iris)

sklearn.utils.Bunch

In [3]:
# "data" is one of the Bunch attributes: the feature matrix
# 150 rows and 4 columns: each row represents one flower, and the four columns hold its four measurements
print(iris.data)

[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.4 3.7 1.5 0.2]
 [4.8 3.4 1.6 0.2]
 [4.8 3.  1.4 0.1]
 [4.3 3.  1.1 0.1]
 [5.8 4.  1.2 0.2]
 [5.7 4.4 1.5 0.4]
 [5.4 3.9 1.3 0.4]
 [5.1 3.5 1.4 0.3]
 [5.7 3.8 1.7 0.3]
 [5.1 3.8 1.5 0.3]
 [5.4 3.4 1.7 0.2]
 [5.1 3.7 1.5 0.4]
 [4.6 3.6 1.  0.2]
 [5.1 3.3 1.7 0.5]
 [4.8 3.4 1.9 0.2]
 [5.  3.  1.6 0.2]
 [5.  3.4 1.6 0.4]
 [5.2 3.5 1.5 0.2]
 [5.2 3.4 1.4 0.2]
 [4.7 3.2 1.6 0.2]
 [4.8 3.1 1.6 0.2]
 [5.4 3.4 1.5 0.4]
 [5.2 4.1 1.5 0.1]
 [5.5 4.2 1.4 0.2]
 [4.9 3.1 1.5 0.2]
 [5.  3.2 1.2 0.2]
 [5.5 3.5 1.3 0.2]
 [4.9 3.6 1.4 0.1]
 [4.4 3.  1.3 0.2]
 [5.1 3.4 1.5 0.2]
 [5.  3.5 1.3 0.3]
 [4.5 2.3 1.3 0.3]
 [4.4 3.2 1.3 0.2]
 [5.  3.5 1.6 0.6]
 [5.1 3.8 1.9 0.4]
 [4.8 3.  1.4 0.3]
 [5.1 3.8 1.6 0.2]
 [4.6 3.2 1.4 0.2]
 [5.3 3.7 1.5 0.2]
 [5.  3.3 1.4 0.2]
 [7.  3.2 4.7 1.4]
 [6.4 3.2 4.5 1.5]
 [6.9 3.1 4.

# ML terminology

- Each row is an observation (also known as: sample, example, instance, record).

- Each column is a feature (also known as: predictor, attribute, independent variable, input, regressor, covariate).

In [5]:
# print the names of the four features, i.e. the columns of iris.data
print(iris.feature_names)

['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [6]:
# print the integers that encode the species of each observation
# this vector is called the target (also: response, outcome, label, dependent variable)
print(iris.target)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


In [7]:
# print the species names; the position of each name is its integer code in iris.target:
# 0 = setosa, 1 = versicolor, 2 = virginica
print(iris.target_names)

['setosa' 'versicolor' 'virginica']


- Each value we are predicting is the response (also known as: target, outcome, label, dependent variable).

- Classification is supervised learning in which the response is categorical.

- Regression is supervised learning in which the response is ordered and continuous.

# Requirements for working with data in scikit-learn

1. Features and response are separate objects.

2. Features should always be numeric, and the response should be numeric for regression problems.

3. Features and response should be NumPy arrays.

4. Features and response should have specific shapes.

In [8]:
# confirm that both the features and the response are NumPy arrays
# (one print call; sep="\n" keeps each type on its own line)
print(type(iris.data), type(iris.target), sep="\n")

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [9]:
# check the shape of the features (first dimension = number of observations, second dimension = number of features)
print(iris.data.shape)

(150, 4)


In [10]:
# check the shape of the response: one-dimensional, with one entry per observation
print(iris.target.shape)

(150,)


In [11]:
# store the feature matrix in "X_train"
# (this is the full dataset -- no observations are held out for testing here)
X_train = iris.data

# store the response vector in "y_train"
y_train = iris.target

# Import our Model class

Step 1: Import the class you plan to use

In [14]:
from sklearn.neighbors import KNeighborsClassifier

Step 2: "Instantiate" the "estimator"

- "Estimator" is scikit-learn's term for a model.

- "Instantiate" means "make an instance of".

In [15]:
# n_neighbors is K, the number of nearest neighbors the classifier consults; here K=1
knn = KNeighborsClassifier(n_neighbors=1) # the variable name is arbitrary; "est", "clf" or "model" are common choices

In [16]:
# show the estimator's repr to confirm the hyperparameter we set
print(knn)

KNeighborsClassifier(n_neighbors=1)


Step 3: Fit the model with data (aka "model training")

In [18]:
# train the model on the features and responses; fit() updates knn in place
# and returns the estimator, which is why its repr is shown as the cell output
knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=1)

Step 4: Predict the response for a new observation

- New observations are called "out-of-sample" data.

- The model uses the information it learned during the training process to make the prediction.

In [33]:
# two made-up flowers, each given as [sepal length, sepal width, petal length, petal width] in cm
X_new = [[3, 5, 4, 2], [5, 4, 3, 2]]
# predict() returns one integer species code per row of X_new
predictions = knn.predict(X_new)
# the bare last expression displays the predicted codes
predictions

array([2, 1])

See the default parameters that we didn't specify

In [34]:
# list every hyperparameter of the estimator, including the defaults we did not set explicitly
knn.get_params(deep=True)

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 1,
 'p': 2,
 'weights': 'uniform'}

In [35]:
# translate the predicted integer codes into species names by indexing the names array with them
iris.target_names[predictions]

array(['virginica', 'versicolor'], dtype='<U10')

# Using a different value for K

In [37]:
# K=5: instantiate the model and train it in one expression
# (fit() returns the estimator itself, so knn is bound to the fitted model)
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# classify the same out-of-sample observations with the retrained model
knn.predict(X_new)

array([1, 1])