### Loading Libraries

In [23]:
# Load library with iris dataset
from sklearn.datasets import load_iris

# Load scikit's random forest classifier library
from sklearn.ensemble import RandomForestClassifier

# Load pandas
import pandas as pd

# Load numpy
import numpy as np

# Fix the state of NumPy's global random generator so that later np.random draws are repeatable
np.random.seed(seed=0)


### Load Dataset

In [9]:
# Fetch the iris dataset bundle (feature matrix, integer targets, and their names)
iris = load_iris()

# Build the feature table, then append a categorical 'species' column that
# translates each integer target code into its species name
df = pd.DataFrame(data=iris.data, columns=iris.feature_names).assign(
    species=pd.Categorical.from_codes(codes=iris.target, categories=iris.target_names)
)

df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa



### Creating Train & Test Data

In [14]:
# Flag each row as training data when a uniform draw on [0, 1) is <= 0.75,
# i.e. roughly three quarters of the rows end up in the training set.
# NOTE: the draw uses NumPy's global RNG, so re-running this cell without
# re-seeding first produces a different split.
df['is_train'] = np.random.uniform(0, 1, len(df)) <= .75

# Split on the boolean mask directly (no `== True` / `== False` comparison).
# `.copy()` makes the subsets independent frames, so adding columns to them later
# cannot raise SettingWithCopyWarning.
train_df = df[df['is_train']].copy()
test_df = df[~df['is_train']].copy()

# Sanity check: every row landed in exactly one of the two subsets
assert len(train_df) + len(test_df) == len(df)

print(f'Observations in training data: {len(train_df)}')
print(f'Observation in test data: {len(test_df)}')

Observations in training data: 114
Observation in test data: 36



### List of Features

In [16]:
# Feature column names, taken from the dataset itself instead of slicing the
# first 4 columns of `df` by position (which would silently pick the wrong
# columns if the frame's column order ever changed). `df` was built with
# columns=iris.feature_names, so these are exactly the measurement columns.
features = pd.Index(iris.feature_names)
print(features)

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')



### Converting Species to Factor

In [19]:
# Encode the training labels as integer codes. pd.factorize returns
# (codes, uniques) and numbers each distinct species in order of first
# appearance in train_df; only the codes are kept as the training target.
target = pd.factorize(train_df['species'])[0]

# Show the encoded target (the original printed an undefined name `y`)
print(target)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2]


### Creating Random Forest Classifier

In [26]:
# Random forest with a fixed random_state for repeatable fits; n_jobs=2 lets it use two parallel jobs
clf = RandomForestClassifier(random_state=0, n_jobs=2)

# Fit on the training features against the integer-encoded species target
clf.fit(X=train_df[features], y=target)

RandomForestClassifier(n_jobs=2, random_state=0)

### Testing Classifier on Test Data

In [37]:
# Predict the integer class code of every test observation (computed once and reused below)
pred_codes = clf.predict(test_df[features])

# Predicted class probabilities for the first 10 test observations
pred_proba_head = clf.predict_proba(test_df[features])[:10]

# Decode the predicted codes back to species names using the same code table that
# pd.factorize produced for the training target. factorize numbers labels in order
# of first appearance, which only coincides with iris.target_names while the training
# rows happen to be ordered setosa -> versicolor -> virginica.
label_names = np.asarray(pd.factorize(train_df['species'])[1])
preds = label_names[pred_codes]

# Confusion matrix: actual species (rows) vs. predicted species (columns)
pd.crosstab(test_df['species'], preds, rownames=['Actual'], colnames=['Predicted'])

Predicted,setosa,versicolor,virginica
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,9,0,0
versicolor,0,12,2
virginica,0,0,13
