# Train a Random Forest Classifier on the ISOLET Dataset

<b> Download and load the dataset </b>

In [1]:
# import packages
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [2]:
# URL of the raw CSV file for the ISOLET dataset (hosted in the
# PacktWorkshops/The-Data-Science-Workshop GitHub repository)
file_url = 'https://raw.githubusercontent.com/PacktWorkshops/The-Data-Science-Workshop/master/Chapter04/Dataset/phpB0xrNj.csv'

In [3]:
# Read the CSV straight from the URL into a DataFrame, then preview the first rows
df = pd.read_csv(file_url)
df.head()

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f609,f610,f611,f612,f613,f614,f615,f616,f617,class
0,-0.4394,-0.093,0.1718,0.462,0.6226,0.4704,0.3578,0.0478,-0.1184,-0.231,...,0.4102,0.2052,0.3846,0.359,0.5898,0.3334,0.641,0.5898,-0.4872,'1'
1,-0.4348,-0.1198,0.2474,0.4036,0.5026,0.6328,0.4948,0.0338,-0.052,-0.1302,...,0.0,0.2954,0.2046,0.4772,0.0454,0.2046,0.4318,0.4546,-0.091,'1'
2,-0.233,0.2124,0.5014,0.5222,-0.3422,-0.584,-0.7168,-0.6342,-0.8614,-0.8318,...,-0.1112,-0.0476,-0.1746,0.0318,-0.0476,0.1112,0.254,0.1588,-0.4762,'2'
3,-0.3808,-0.0096,0.2602,0.2554,-0.429,-0.6746,-0.6868,-0.665,-0.841,-0.9614,...,-0.0504,-0.036,-0.1224,0.1366,0.295,0.0792,-0.0072,0.0936,-0.151,'2'
4,-0.3412,0.0946,0.6082,0.6216,-0.1622,-0.3784,-0.4324,-0.4358,-0.4966,-0.5406,...,0.1562,0.3124,0.25,-0.0938,0.1562,0.3124,0.3124,0.2188,-0.25,'3'


<b> Extract the response variable</b>

In [4]:
# pop() returns the 'class' column AND removes it from df in place, so after
# this cell df holds only the feature columns. Re-running this cell without
# reloading df raises KeyError because the column is already gone.
y = df.pop('class')

<b> Split the dataset into training and test sets </b>

In [5]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.3, random_state=888)

<b> Create a function that will instantiate and fit a RandomForestClassifier </b>

In [6]:
def rf_model(X, y, random_state=888, n_estimators=10, max_depth=None, min_samples_leaf=1, max_features='sqrt'):
    """Instantiate a RandomForestClassifier and fit it on the supplied data.

    Parameters
    ----------
    X : features to fit on.
    y : target values aligned with ``X``.
    random_state : seed forwarded to the classifier (default 888).
    n_estimators : number of trees in the forest (default 10).
    max_depth : maximum depth of each tree; ``None`` leaves it unlimited.
    min_samples_leaf : minimum number of samples required in a leaf (default 1).
    max_features : features considered per split (default ``'sqrt'``).

    Returns
    -------
    The fitted RandomForestClassifier.
    """
    # Use a distinct local name so the function name is not shadowed inside its own body.
    model = RandomForestClassifier(random_state=random_state,
                                   n_estimators=n_estimators,
                                   max_depth=max_depth,
                                   min_samples_leaf=min_samples_leaf,
                                   max_features=max_features)
    # Fit on the arguments that were passed in, not on the notebook-level
    # X_train / y_train globals, so the function works for any dataset.
    model.fit(X, y)
    return model

<b> Create a function that will predict the outcome for the training and testing sets </b>

In [7]:
def predictions(model, X_train, X_test):
    """Return the model's predictions for the training and testing features.

    ``model`` only needs a ``predict`` method. The result is a 2-tuple
    ``(train_preds, test_preds)``, with the training set predicted first.
    """
    return tuple(model.predict(features) for features in (X_train, X_test))

<b> Create a function that will print the accuracy score for the training and testing sets </b>

In [8]:
def accuracy_total(y_train_default, train_preds_default, y_test_default, test_preds_default):
    """Print the accuracy score of the training set, then of the testing set.

    Each score is computed with ``accuracy_score(true_labels, predictions)``
    and printed on its own line. Nothing is returned.
    """
    label_pairs = (
        (y_train_default, train_preds_default),
        (y_test_default, test_preds_default),
    )
    # Compute both scores before printing anything, so output is all-or-nothing.
    scores = [accuracy_score(actual, predicted) for actual, predicted in label_pairs]
    for score in scores:
        print(score)

<b> Train and get the accuracy score for a range of different hyperparameters </b>

Model 1

In [9]:
# Model 1 (baseline): rf_model defaults, i.e. n_estimators=10, max_depth=None,
# min_samples_leaf=1, max_features='sqrt'
rf_1 = rf_model(X_train, y_train)
# To inspect the full set of hyperparameters of the fitted model, run: rf_1.get_params()

In [10]:
# Predict on both splits so training and testing accuracy can be compared
train_preds, test_preds = predictions(rf_1, X_train, X_test)

In [11]:
# Prints the training accuracy first, then the testing accuracy
accuracy_total(y_train, train_preds, y_test, test_preds)

0.9981674912955837
0.8935897435897436


Model 2

In [12]:
# Model 2: double the number of trees (10 -> 20); everything else stays at the defaults
rf_2 = rf_model(X_train, y_train, n_estimators=20)

In [13]:
# Predictions of Model 2 on the training and testing sets
train_preds2, test_preds2 = predictions(rf_2, X_train, X_test)

In [14]:
# Training accuracy, then testing accuracy, for Model 2
accuracy_total(y_train, train_preds2, y_test, test_preds2)

0.9998167491295583
0.9192307692307692


Model 3

In [15]:
# Model 3: increase the number of trees to 50; depth is still unlimited
rf_3 = rf_model(X_train, y_train, n_estimators=50)

In [16]:
# Predictions of Model 3 on the training and testing sets
train_preds3, test_preds3 = predictions(rf_3, X_train, X_test)

In [17]:
# Training accuracy, then testing accuracy, for Model 3
accuracy_total(y_train, train_preds3, y_test, test_preds3)

1.0
0.9333333333333333


Model 4

In [18]:
# Model 4: keep 50 trees and cap each tree at depth 5 to limit model complexity
rf_4 = rf_model(X_train, y_train, n_estimators=50, max_depth=5)

In [19]:
# Predictions of Model 4 on the training and testing sets
train_preds4, test_preds4 = predictions(rf_4, X_train, X_test)

In [20]:
# Training accuracy, then testing accuracy, for Model 4
accuracy_total(y_train, train_preds4, y_test, test_preds4)

0.8552318123511087
0.8213675213675213


Model 5

In [21]:
# Model 5: relax the depth cap from 5 to 10, still with 50 trees
rf_5 = rf_model(X_train, y_train, n_estimators=50, max_depth=10)

In [22]:
# Predictions of Model 5 on the training and testing sets
train_preds5, test_preds5 = predictions(rf_5, X_train, X_test)

In [23]:
# Training accuracy, then testing accuracy, for Model 5
accuracy_total(y_train, train_preds5, y_test, test_preds5)

0.9844236760124611
0.9260683760683761


Model 6

In [24]:
# Model 6: same as Model 5, but require at least 10 samples in every leaf
rf_6 = rf_model(X_train, y_train, n_estimators=50, max_depth=10, min_samples_leaf=10)

In [25]:
# Predictions of Model 6 on the training and testing sets
train_preds6, test_preds6 = predictions(rf_6, X_train, X_test)

In [26]:
# Training accuracy, then testing accuracy, for Model 6
accuracy_total(y_train, train_preds6, y_test, test_preds6)

0.9622503206890233
0.9192307692307692


Model 7

In [27]:
# Model 7: raise the minimum leaf size from 10 to 50 samples
rf_7 = rf_model(X_train, y_train, n_estimators=50, max_depth=10, min_samples_leaf=50)

In [28]:
# Predictions of Model 7 on the training and testing sets
train_preds7, test_preds7 = predictions(rf_7, X_train, X_test)

In [29]:
# Training accuracy, then testing accuracy, for Model 7
accuracy_total(y_train, train_preds7, y_test, test_preds7)

0.9184533626534725
0.8940170940170941


Model 8

In [30]:
# Model 8: same as Model 7, but pass max_features=0.5 instead of the default 'sqrt'
rf_8 = rf_model(X_train, y_train, n_estimators=50, max_depth=10, min_samples_leaf=50, max_features=0.5)

In [31]:
# Predictions of Model 8 on the training and testing sets
train_preds8, test_preds8 = predictions(rf_8, X_train, X_test)

In [32]:
# Training accuracy, then testing accuracy, for Model 8
accuracy_total(y_train, train_preds8, y_test, test_preds8)

0.8926149899212021
0.867948717948718


Model 9

In [33]:
# Model 9: same as Model 8, but lower max_features from 0.5 to 0.3
rf_9 = rf_model(X_train, y_train, n_estimators=50, max_depth=10, min_samples_leaf=50, max_features=0.3)

In [34]:
# Predictions of Model 9 on the training and testing sets
train_preds9, test_preds9 = predictions(rf_9, X_train, X_test)

In [35]:
# Training accuracy, then testing accuracy, for Model 9
accuracy_total(y_train, train_preds9, y_test, test_preds9)

0.9008612790910757
0.8717948717948718


<b> Select the best hyperparameter value </b>

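Neither of the last two sets of hyperparameters (max_features=0.5 in Model 8 and max_features=0.3 in Model 9) achieves better results than the one we found with n_estimators=50, max_depth=10, min_samples_leaf=50 and the default max_features='sqrt' (Model 7): that model reaches a test accuracy of about 0.894 against a training accuracy of about 0.918, whereas Models 8 and 9 both score lower on the test set (about 0.868 and 0.872) without narrowing the gap between training and test accuracy.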