In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/iris/Iris.csv
/kaggle/input/iris/database.sqlite


In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
# for setting random seed
np.random.seed(0)



## Data Preparation

In [3]:
df = pd.read_csv("/kaggle/input/iris/Iris.csv")
# df= pd.DataFrame(iris,columns=iris.columns)
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [4]:
# creating test and train data
df['Train_d']= np.random.uniform(0,1,len(df))<=.75
df.head()
                                

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species,Train_d
0,1,5.1,3.5,1.4,0.2,Iris-setosa,True
1,2,4.9,3.0,1.4,0.2,Iris-setosa,True
2,3,4.7,3.2,1.3,0.2,Iris-setosa,True
3,4,4.6,3.1,1.5,0.2,Iris-setosa,True
4,5,5.0,3.6,1.4,0.2,Iris-setosa,True


In [5]:
# creating dataframes with test and training rows
train,test = df[df['Train_d']==True],df[df['Train_d']==False]

print("Number of observations in training data: ",len(train))
print("Number of observations in test data: ",len(test))

Number of observations in training data:  118
Number of observations in test data:  32


In [6]:
# list of feature columns name for reading 
features = df.columns[1:5]
features

Index(['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm'], dtype='object')

In [7]:
# converting each species into digits
y = pd.factorize(train['Species'])[0]
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2])

# Training 

In [8]:
# Creating RAndom Forest Classifier and then training
Rclf = RandomForestClassifier(n_jobs=2,random_state = 0)

Rclf.fit(train[features],y)

In [9]:
predictions = Rclf.predict(test[features])
predictions


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [10]:
Rclf.predict_proba(test[features])[0:10]

array([[1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [0.95, 0.05, 0.  ],
       [1.  , 0.  , 0.  ],
       [0.99, 0.01, 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ]])

In [11]:
# mapping names for each predicted class 
target_names = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
preds =Rclf.predict(test[features])

species_preds = [target_names[code] for code in preds]
#predictions for first 10 observations
print(species_preds[:20])

['Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor', 'Iris-virginica', 'Iris-virginica', 'Iris-versicolor', 'Iris-versicolor']


In [12]:
# creating confusion matrix
pd.crosstab(test['Species'],preds,rownames=['Actual Species'],colnames=['Predicted Species'])

Predicted Species,0,1,2
Actual Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Iris-setosa,13,0,0
Iris-versicolor,0,5,2
Iris-virginica,0,0,12


In [13]:
print("From the above Prediction table we can see that\n")
print("Number of correct predictions are 13+5+12 = 30\n")

From the above Prediction table we can see that

Number of correct predictions are 13+5+12 = 30

