# Building a Model for Classifying Animal Type and Assessing Its Performance

In [1]:
# import package
import pandas as pd

In [2]:
# location of the CSV file in the workshop's GitHub repository
url_file = (
    'https://raw.githubusercontent.com/PacktWorkshops/'
    'The-Data-Science-Workshop/master/'
    'Chapter04/Dataset/openml_phpZNNasq.csv'
)

In [3]:
# read the remote CSV into a DataFrame and preview its first five rows
df = pd.read_csv(filepath_or_buffer=url_file)
df.head(n=5)

Unnamed: 0,animal,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,type
0,aardvark,True,False,False,True,False,False,True,True,True,True,False,False,4,False,False,True,mammal
1,antelope,True,False,False,True,False,False,False,True,True,True,False,False,4,True,False,True,mammal
2,bass,False,False,True,False,False,True,True,True,True,False,False,True,0,True,False,False,fish
3,bear,True,False,False,True,False,False,True,True,True,True,False,False,4,False,False,True,mammal
4,boar,True,False,False,True,False,False,True,True,True,True,False,False,4,True,False,True,mammal


In [4]:
# Remove the columns that must not be used as features:
#   - 'animal' is the animal's name (a label for the row, not an attribute);
#   - 'Unnamed: 0' appears to be the row index that was saved into the CSV
#     (see the df.head() output above); if left in, the forest would treat
#     the row position as a numeric feature.
# Rebinding `df` with errors='ignore' (instead of inplace=True) keeps this
# cell re-runnable: a second run no longer raises KeyError.
# NOTE: the saved outputs further down were produced with 'Unnamed: 0' still
# among the features; re-run the notebook to refresh them.
df = df.drop(columns=['Unnamed: 0', 'animal'], errors='ignore')

In [5]:
# separate the target: pop() removes 'type' from df and returns it,
# so df keeps only the remaining columns
y = df.pop(item='type')

In [6]:
# hold out 40% of the rows for testing; the fixed seed makes the split repeatable
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df,
    y,
    test_size=0.4,
    random_state=188,
)

In [7]:
# build a random forest of 10 trees with a fixed seed
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=10, random_state=42)

In [8]:
# train the forest on the training split
rf_model.fit(X=X_train, y=y_train)

RandomForestClassifier(n_estimators=10, random_state=42)

In [9]:
# predicted labels for the rows the model was trained on
train_preds = rf_model.predict(X=X_train)
train_preds

array(['mammal', 'mammal', 'mammal', 'fish', 'mammal', 'insect', 'fish',
       'bird', 'mammal', 'mammal', 'fish', 'bird', 'reptile', 'bird',
       'fish', 'mammal', 'mammal', 'bird', 'bird', 'mammal', 'bird',
       'bird', 'mammal', 'invertebrate', 'reptile', 'invertebrate',
       'fish', 'bird', 'mammal', 'mammal', 'amphibian', 'mammal',
       'invertebrate', 'mammal', 'mammal', 'insect', 'mammal', 'fish',
       'invertebrate', 'mammal', 'invertebrate', 'invertebrate', 'insect',
       'amphibian', 'mammal', 'reptile', 'amphibian', 'invertebrate',
       'mammal', 'fish', 'bird', 'mammal', 'mammal', 'bird', 'mammal',
       'mammal', 'fish', 'mammal', 'bird', 'fish'], dtype=object)

In [10]:
# share of training rows whose predicted label matches the true label
from sklearn.metrics import accuracy_score

train_acc = accuracy_score(y_true=y_train, y_pred=train_preds)
print(train_acc)

1.0


In [11]:
# predicted labels for the held-out test rows
test_preds = rf_model.predict(X=X_test)
test_preds

array(['insect', 'fish', 'bird', 'invertebrate', 'mammal', 'fish',
       'mammal', 'reptile', 'bird', 'mammal', 'invertebrate', 'mammal',
       'mammal', 'amphibian', 'mammal', 'mammal', 'invertebrate',
       'invertebrate', 'bird', 'mammal', 'mammal', 'mammal', 'mammal',
       'invertebrate', 'fish', 'bird', 'mammal', 'fish', 'mammal',
       'mammal', 'bird', 'bird', 'mammal', 'mammal', 'invertebrate',
       'mammal', 'bird', 'bird', 'invertebrate', 'bird', 'mammal'],
      dtype=object)

In [12]:
# share of test rows predicted correctly
test_acc = accuracy_score(y_true=y_test, y_pred=test_preds)
print(test_acc)

0.8780487804878049


# Tuning n_estimators to Reduce Overfitting

In [13]:
# rebuild the forest with a single tree (same seed) and train it
rf_model = RandomForestClassifier(n_estimators=1, random_state=42)
rf_model.fit(X=X_train, y=y_train)

RandomForestClassifier(n_estimators=1, random_state=42)

In [14]:
# predict on both splits with the single-tree forest
train_preds, test_preds = rf_model.predict(X_train), rf_model.predict(X_test)

In [15]:
# accuracy on the training split, then on the test split
train_acc = accuracy_score(y_true=y_train, y_pred=train_preds)
test_acc = accuracy_score(y_true=y_test, y_pred=test_preds)

In [16]:
# train accuracy on the first line, test accuracy on the second
print(train_acc, test_acc, sep='\n')

0.9166666666666666
0.8048780487804879


In [17]:
# second forest for comparison: 30 trees, same seed, trained on the same split
rf_model2 = RandomForestClassifier(n_estimators=30, random_state=42)
rf_model2.fit(X=X_train, y=y_train)

RandomForestClassifier(n_estimators=30, random_state=42)

In [18]:
# predict on both splits with the 30-tree forest
train_preds2, test_preds2 = rf_model2.predict(X_train), rf_model2.predict(X_test)

In [19]:
# accuracy of the 30-tree forest on each split
train_acc2 = accuracy_score(y_true=y_train, y_pred=train_preds2)
test_acc2 = accuracy_score(y_true=y_test, y_pred=test_preds2)

In [20]:
# train accuracy on the first line, test accuracy on the second
print(train_acc2, test_acc2, sep='\n')

1.0
0.9024390243902439


# Tuning max_depth to Reduce Overfitting

In [21]:
# 30-tree forest whose trees are limited to a depth of 5 (same seed as before)
rf_model = RandomForestClassifier(
    n_estimators=30,
    max_depth=5,
    random_state=42,
)

In [22]:
# train the depth-5 forest on the training split
rf_model.fit(X=X_train, y=y_train)

RandomForestClassifier(max_depth=5, n_estimators=30, random_state=42)

In [23]:
# predict on both splits with the depth-5 forest
train_preds, test_preds = rf_model.predict(X_train), rf_model.predict(X_test)

In [24]:
# accuracy of the depth-5 forest on each split
train_acc = accuracy_score(y_true=y_train, y_pred=train_preds)
test_acc = accuracy_score(y_true=y_test, y_pred=test_preds)

In [25]:
# train accuracy on the first line, test accuracy on the second
print(train_acc, test_acc, sep='\n')

1.0
0.9024390243902439


In [26]:
# comparison forest: 30 trees limited to a depth of 2 (same seed)
rf_model2 = RandomForestClassifier(
    n_estimators=30,
    max_depth=2,
    random_state=42,
)

In [27]:
# train the depth-2 forest on the training split
rf_model2.fit(X=X_train, y=y_train)

RandomForestClassifier(max_depth=2, n_estimators=30, random_state=42)

In [28]:
# predict on both splits with the depth-2 forest
train_preds2, test_preds2 = rf_model2.predict(X_train), rf_model2.predict(X_test)

In [29]:
# accuracy of the depth-2 forest on each split
train_acc2 = accuracy_score(y_true=y_train, y_pred=train_preds2)
test_acc2 = accuracy_score(y_true=y_test, y_pred=test_preds2)

In [30]:
# train accuracy on the first line, test accuracy on the second
print(train_acc2, test_acc2, sep='\n')

0.9
0.8292682926829268


# Tuning min_samples_leaf

In [31]:
# keep 30 trees and depth 2, and additionally require at least 3 samples per leaf
rf_model = RandomForestClassifier(
    n_estimators=30,
    max_depth=2,
    min_samples_leaf=3,
    random_state=42,
)

In [32]:
# train the min_samples_leaf=3 forest on the training split
rf_model.fit(X=X_train, y=y_train)

RandomForestClassifier(max_depth=2, min_samples_leaf=3, n_estimators=30,
                       random_state=42)

In [33]:
# predict on both splits with the min_samples_leaf=3 forest
train_preds, test_preds = rf_model.predict(X_train), rf_model.predict(X_test)

In [34]:
# accuracy of the min_samples_leaf=3 forest on each split
train_acc = accuracy_score(y_true=y_train, y_pred=train_preds)
test_acc = accuracy_score(y_true=y_test, y_pred=test_preds)

In [35]:
# train accuracy on the first line, test accuracy on the second
print(train_acc, test_acc, sep='\n')

0.8333333333333334
0.8048780487804879


In [36]:
# comparison forest: same settings but at least 7 samples per leaf
rf_model2 = RandomForestClassifier(
    n_estimators=30,
    max_depth=2,
    min_samples_leaf=7,
    random_state=42,
)

In [37]:
# train the min_samples_leaf=7 forest on the training split
rf_model2.fit(X=X_train, y=y_train)

RandomForestClassifier(max_depth=2, min_samples_leaf=7, n_estimators=30,
                       random_state=42)

In [38]:
# predict on both splits with the min_samples_leaf=7 forest
train_preds2, test_preds2 = rf_model2.predict(X_train), rf_model2.predict(X_test)

In [39]:
# accuracy of the min_samples_leaf=7 forest on each split
train_acc2 = accuracy_score(y_true=y_train, y_pred=train_preds2)
test_acc2 = accuracy_score(y_true=y_test, y_pred=test_preds2)

In [40]:
# train accuracy on the first line, test accuracy on the second
print(train_acc2, test_acc2, sep='\n')

0.8
0.8048780487804879


# Tuning max_features

In [41]:
# previous settings plus max_features=10 (an integer count of features per split)
rf_model = RandomForestClassifier(
    n_estimators=30,
    max_depth=2,
    min_samples_leaf=7,
    max_features=10,
    random_state=42,
)

In [42]:
# train the max_features=10 forest on the training split
rf_model.fit(X=X_train, y=y_train)

RandomForestClassifier(max_depth=2, max_features=10, min_samples_leaf=7,
                       n_estimators=30, random_state=42)

In [43]:
# predict on both splits with the max_features=10 forest
train_preds, test_preds = rf_model.predict(X_train), rf_model.predict(X_test)

In [44]:
# score the max_features=10 forest on each split and report both values
train_acc = accuracy_score(y_true=y_train, y_pred=train_preds)
test_acc = accuracy_score(y_true=y_test, y_pred=test_preds)

print(train_acc, test_acc, sep='\n')

0.85
0.8048780487804879


In [45]:
# comparison forest: same settings but max_features=0.2 (a fraction of the features)
rf_model2 = RandomForestClassifier(
    n_estimators=30,
    max_depth=2,
    min_samples_leaf=7,
    max_features=0.2,
    random_state=42,
)

In [46]:
# train the max_features=0.2 forest on the training split
rf_model2.fit(X=X_train, y=y_train)

RandomForestClassifier(max_depth=2, max_features=0.2, min_samples_leaf=7,
                       n_estimators=30, random_state=42)

In [47]:
# predict on both splits with the max_features=0.2 forest
train_preds2, test_preds2 = rf_model2.predict(X_train), rf_model2.predict(X_test)

In [48]:
# score the max_features=0.2 forest on each split and report both values
train_acc2 = accuracy_score(y_true=y_train, y_pred=train_preds2)
test_acc2 = accuracy_score(y_true=y_test, y_pred=test_preds2)

print(train_acc2, test_acc2, sep='\n')

0.8333333333333334
0.8048780487804879
