In [1]:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix

In [2]:
# Set random seed
np.random.seed(0)

# 1) Load the data
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)
print("Dataset head:")
print(df.head())

Dataset head:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

  species  
0  setosa  
1  setosa  
2  setosa  
3  setosa  
4  setosa  


In [3]:
# 2) Create training and test data
df['is_train'] = np.random.uniform(0, 1, len(df)) <= 0.75
train, test = df[df['is_train'] == True], df[df['is_train'] == False]
print(f"Number of observations in the training data: {len(train)}")
print(f"Number of observations in the test data: {len(test)}")

Number of observations in the training data: 118
Number of observations in the test data: 32


In [4]:
# 3) Preprocess the data
features = df.columns[:4]  # Feature columns
y = pd.factorize(train['species'])[0]  # Encode target labels

print("Features:")
print(features)
print("Encoded target:")
print(y)

Features:
Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')
Encoded target:
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2]


In [5]:
# 4) Train the Random Forest Classifier
clf = RandomForestClassifier(n_jobs=2, random_state=0)
clf.fit(train[features], y)
print("Random Forest Classifier trained.")

Random Forest Classifier trained.


In [6]:
# 5) Apply the classifier to the test data
preds = clf.predict(test[features])
print("Predictions for the first 10 test observations:")
print(preds[:10])

Predictions for the first 10 test observations:
[0 0 0 0 0 0 0 0 0 0]


In [7]:
# 6) Evaluate the classifier
print("Actual species for the first 5 test observations:")
print(test['species'].head().tolist())
print("Predicted species for the first 5 test observations:")
print(preds[:5])

Actual species for the first 5 test observations:
['setosa', 'setosa', 'setosa', 'setosa', 'setosa']
Predicted species for the first 5 test observations:
[0 0 0 0 0]


In [8]:
# 7) Create a confusion matrix
conf_matrix = pd.crosstab(test['species'], [iris.target_names[p] for p in preds],
                          rownames=['Actual Species'], colnames=['Predicted Species'])
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
Predicted Species  setosa  versicolor  virginica
Actual Species                                  
setosa                 13           0          0
versicolor              0           5          2
virginica               0           0         12


In [9]:
# 8) View feature importance scores
feature_importances = list(zip(features, clf.feature_importances_))
print("Feature importance scores:")
for feature, importance in feature_importances:
    print(f"{feature}: {importance:.4f}")

Feature importance scores:
sepal length (cm): 0.0847
sepal width (cm): 0.0225
petal length (cm): 0.4465
petal width (cm): 0.4463
