Random forest of decision trees to classify bird species

In [3]:
import pandas as pd

# Each line of the file is "<image_id> <attribute_id> <is_present> <certainty_id> <time>".
# The original note here says some lines have too many values; usecols keeps
# only the first three fields of every line, which is all the model needs.
# The separator is a raw string: '\s' is not a valid escape in a normal
# string literal and triggers a warning on recent Python versions.
imgatt = pd.read_csv(
    r"C:\Users\marti\Desktop\PythonProjects\MachineLearning-Python\RandomForest\CUB_200_2011\attributes\image_attribute_labels.txt",
    sep=r'\s+',
    header=None,
    usecols=[0, 1, 2],
    names=['imgid', 'attid', 'present'],
)

imgatt.head()

Unnamed: 0,imgid,attid,present
0,1,1,0
1,1,2,0
2,1,3,0
3,1,4,0
4,1,5,1



------- MTurk image attribute labels (attributes/image_attribute_labels.txt) ------
The set of attribute labels as perceived by MTurkers for each image is contained in the file attributes/image_attribute_labels.txt, with each line corresponding to one image/attribute/worker triplet:

<image_id> <attribute_id> <is_present> <certainty_id> <time>

where <image_id>, <attribute_id>, <certainty_id> correspond to the IDs in images.txt, attributes/attributes.txt, and attributes/certainties.txt respectively.  <is_present> is 0 or 1 (1 denotes that the attribute is present).  <time> denotes the time spent by the MTurker in seconds.

In [4]:
# Number of (rows, columns) in the long-format attribute table:
# one row per image/attribute pair, three columns.
imgatt.shape

(3677856, 3)

Reorganize imgatt to have one row per imgid and 312 columns (one column per attribute), with a 1/0 in each cell indicating whether that image has that attribute.

In [5]:
# Long -> wide: rows are indexed by imgid, columns by attid, and each cell
# holds the corresponding 'present' flag.
imgatt2 = imgatt.pivot(index='imgid', columns='attid', values='present')
imgatt2.head()

attid,1,2,3,4,5,6,7,8,9,10,...,303,304,305,306,307,308,309,310,311,312
imgid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
5,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


Loading the image classes

In [6]:
# image_class_labels.txt holds one "<image_id> <class_id>" pair per line.
# The labels are indexed by imgid so they can be joined onto the attribute table.
imglabels = pd.read_csv(
    r'C:\Users\marti\Desktop\PythonProjects\MachineLearning-Python\RandomForest\CUB_200_2011\image_class_labels.txt',
    sep=' ',
    header=None,
    names=['imgid', 'label'],
).set_index('imgid')
imglabels.head()

Unnamed: 0_level_0,label
imgid,Unnamed: 1_level_1
1,1
2,1
3,1
4,1
5,1


------- Image class labels (image_class_labels.txt) ------
The ground truth class labels (bird species labels) for each image are contained in the file image_class_labels.txt, with each line corresponding to one image:

<image_id> <class_id>

where <image_id> and <class_id> correspond to the IDs in images.txt and classes.txt, respectively.

Now we move the label column to the imgatt2 data frame, then we will shuffle it.

In [7]:
# Attach each image's class label to its attribute row (both frames are
# indexed by imgid), giving 312 attribute columns followed by 'label'.
df = imgatt2.join(imglabels)
# Shuffle the rows. The seed is fixed because the train/test split below is
# positional: without it every run trains and evaluates on different images
# and the reported accuracies are not reproducible.
df = df.sample(frac=1, random_state=0)
df.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,304,305,306,307,308,309,310,311,312,label
imgid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4053,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,70
3802,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,66
2219,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,1,0,39
947,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,17
3861,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,67


Separate labels from attributes

In [8]:
# Select the first 312 columns (the attribute columns); all rows are kept.
df_att = df.iloc[:, :312]
# Select every column after the first 312 (the joined 'label' column).
df_label = df.iloc[:, 312:]

df_att.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,303,304,305,306,307,308,309,310,311,312
imgid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4053,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3802,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
2219,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
947,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
3861,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0


Separate test set from train set

In [9]:
# The first 8000 shuffled rows train the models; the remaining rows are held
# out for testing.
df_train_att = df_att.iloc[:8000]
df_test_att = df_att.iloc[8000:]

# Labels are kept as 1-D Series (the 'label' column) rather than as
# single-column frames.
df_train_label = df_label['label'].iloc[:8000]
df_test_label = df_label['label'].iloc[8000:]

Prepare the RandomForestClassifier

In [10]:
from sklearn.ensemble import RandomForestClassifier

# max_features is the number of candidate columns considered when searching for
# each split (not a per-tree column budget); n_estimators is the number of
# trees; random_state fixes the forest's own randomness.
clf = RandomForestClassifier(max_features=50, random_state=0, n_estimators=100)

Fit our data to the Random Forest model

In [11]:
# Train the forest on the training attributes and their class labels.
clf.fit(df_train_att, df_train_label)

Let's use attributes from the first five rows of the training set

In [12]:
# Sanity check: predicted classes for five rows the model was trained on.
first_five_predictions = clf.predict(df_train_att.head())
print(first_five_predictions)

# Accuracy on the held-out test rows, truncated to a whole percentage.
test_accuracy_pct = int(float(clf.score(df_test_att, df_test_label)) * 100)
print('The predicted values ware ' + str(test_accuracy_pct) + r'% correct')

[70 66 39 17 67]
The predicted values ware 44% correct


Create confusion matrix

In [13]:
from sklearn.metrics import confusion_matrix
# Predict a class for every held-out row, then tabulate true labels (first
# argument) against predicted labels (second argument).
pred_labels = clf.predict(df_test_att)
cm = confusion_matrix(df_test_label, pred_labels)
cm

array([[ 7,  1,  3, ...,  0,  0,  0],
       [ 0, 12,  1, ...,  0,  1,  0],
       [ 1,  1,  8, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ...,  4,  0,  0],
       [ 0,  0,  0, ...,  1,  8,  0],
       [ 0,  0,  0, ...,  0,  0, 22]], dtype=int64)

Function copied from sklearn documentation to plot matrix

In [14]:
import matplotlib.pyplot as plt
import itertools
import numpy as np

#The following function was proudly stolen from the Wayback archive: https://web.archive.org/web/20180807180209/http://scikit-learn.org:80/stable/auto_examples/model_selection/plot_confusion_matrix.html [Edited]

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues,
                          annotate=False):
    """
    Print a confusion matrix and draw it on the current matplotlib figure.

    Parameters
    ----------
    cm : numpy.ndarray
        Square confusion matrix (rows are true labels, columns are predictions).
    classes : sequence
        Tick labels, one per row/column of ``cm``.
    normalize : bool, default False
        When True, every row is divided by its sum. Rows that sum to zero
        (a class with no true samples) are left as zeros instead of turning
        into NaN through a division by zero.
    title : str
        Title placed above the plot.
    cmap : matplotlib colormap
        Colormap used for the image.
    annotate : bool, default False
        When True, write every cell's value on top of the image. This is off
        by default because it is unreadable and slow for large matrices.

    Returns
    -------
    None
        The matrix is printed and drawn as a side effect.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Divide only where the row total is non-zero; other rows stay 0.0.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90)
    plt.yticks(tick_marks, classes)

    if annotate:
        fmt = '.2f' if normalize else 'd'
        # Cells darker than half of the maximum get white text for contrast.
        thresh = cm.max() / 2.
        for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
            plt.text(j, i, format(cm[i, j], fmt),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

Load the list of bird species names so that we can see which species are being confused with each other.

In [15]:
# Only the second field of each classes.txt line (the species name) is read.
# The separator is a raw string: '\s' is not a valid escape in a normal
# string literal and triggers a warning on recent Python versions.
birds = pd.read_csv(
    r"C:\Users\marti\Desktop\PythonProjects\MachineLearning-Python\RandomForest\CUB_200_2011\classes.txt",
    sep=r'\s+',
    header=None,
    usecols=[1],
    names=['birdname'],
)
# Keep the names as a Series rather than a single-column frame.
birds = birds['birdname']
birds

0      001.Black_footed_Albatross
1            002.Laysan_Albatross
2             003.Sooty_Albatross
3           004.Groove_billed_Ani
4              005.Crested_Auklet
                  ...            
195                196.House_Wren
196                197.Marsh_Wren
197                 198.Rock_Wren
198               199.Winter_Wren
199       200.Common_Yellowthroat
Name: birdname, Length: 200, dtype: object

Plot the matrix. 

In [32]:
from os import path
# Rendering is skipped when the image file already exists, so delete
# possibleOutput.png to regenerate it after the model or data changes.
if not path.exists('possibleOutput.png'):
    # Print the normalized matrix values with two decimals.
    np.set_printoptions(precision=2)
    # Very large canvas so that each class gets its own readable tick label.
    plt.figure(figsize=(60, 60), dpi=300)
    plot_confusion_matrix(cm, classes=birds, normalize=True)
    plt.savefig('possibleOutput.png')
    # Close the figure once it has been written to disk.
    plt.close()

Compare a single decision tree and an SVM with the random forest on our data

In [17]:
from sklearn import tree
# Baseline: a single decision tree with default settings.
clftree = tree.DecisionTreeClassifier()
clftree.fit(df_train_att, df_train_label)
# Held-out accuracy, truncated to a whole percentage.
tree_accuracy_pct = int(float(clftree.score(df_test_att, df_test_label)) * 100)
print('The predicted values of tree model ware ' + str(tree_accuracy_pct) + r'% correct')

The predicted values of tree model ware 25% correct


In [18]:
from sklearn import svm
# Support vector classifier with default settings, for comparison.
clfsvm = svm.SVC()
clfsvm.fit(df_train_att, df_train_label)
# Held-out accuracy, truncated to a whole percentage.
svm_accuracy_pct = int(float(clfsvm.score(df_test_att, df_test_label)) * 100)
print('The predicted values of SVM model ware ' + str(svm_accuracy_pct) + r'% correct')

The predicted values of SVM model ware 47% correct


In [19]:
# Random forest with the same settings as `clf`, refitted for the comparison.
clf2 = RandomForestClassifier(max_features=50, random_state=0, n_estimators=100)
clf2.fit(df_train_att, df_train_label)
# Held-out accuracy, truncated to a whole percentage.
forest_accuracy_pct = int(float(clf2.score(df_test_att, df_test_label)) * 100)
print('The predicted values of Random Forest model ware ' + str(forest_accuracy_pct) + r'% correct')

The predicted values of Random Forest model ware 44% correct


Let's perform cross-validation so that the training set is split in several different ways and the accuracy does not depend on a single split.

In [20]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation of the random forest on the training rows.
scores = cross_val_score(clf, df_train_att, df_train_label, cv=5)
rf_cv_mean = scores.mean()
rf_cv_spread = scores.std() * 2  # the reported +/- is two standard deviations
print("Accuracy of Random Forest: %0.2f (+/- %0.2f)" % (rf_cv_mean, rf_cv_spread))

Accuracy of Random Forest: 0.44 (+/- 0.03)


In [21]:
# 5-fold cross-validation of the single decision tree on the training rows.
scorestree = cross_val_score(clftree, df_train_att, df_train_label, cv=5)
tree_cv_mean = scorestree.mean()
tree_cv_spread = scorestree.std() * 2  # the reported +/- is two standard deviations
print("Accuracy of Tree: %0.2f (+/- %0.2f)" % (tree_cv_mean, tree_cv_spread))

Accuracy of Tree: 0.26 (+/- 0.03)


In [22]:
# 5-fold cross-validation of the SVM on the training rows.
scoressvm = cross_val_score(clfsvm, df_train_att, df_train_label, cv=5)
svm_cv_mean = scoressvm.mean()
svm_cv_spread = scoressvm.std() * 2  # the reported +/- is two standard deviations
print("Accuracy of SVM: %0.2f (+/- %0.2f)" % (svm_cv_mean, svm_cv_spread))

Accuracy of SVM: 0.47 (+/- 0.02)


Loop through many different parameters in Random Forest and print results

# Exhaustive sweep over max_features x n_estimators, scoring each combination
# with 5-fold cross-validation. This is slow: every combination fits five forests.
max_features_opts = range(5, 100, 5)
n_estimators_opts = range(10, 300, 20)
# One row per combination: [max_features, n_estimators, mean accuracy, 2 * std].
rf_params = np.empty((len(max_features_opts) * len(n_estimators_opts), 4), float)
row_idx = 0
for max_features in max_features_opts:
    for n_estimators in n_estimators_opts:
        # Dedicated names are used so the sweep does not overwrite the fitted
        # `clf` model or the `scores` array produced by earlier cells.
        grid_clf = RandomForestClassifier(max_features=max_features, n_estimators=n_estimators)
        grid_scores = cross_val_score(grid_clf, df_train_att, df_train_label, cv=5)
        grid_mean = grid_scores.mean()
        grid_spread = grid_scores.std() * 2
        rf_params[row_idx] = (max_features, n_estimators, grid_mean, grid_spread)
        row_idx += 1
        print("Max features: %d, num estimators: %d, accuracy: %0.2f (+/- %0.2f)" % (max_features, n_estimators, grid_mean, grid_spread))

Testing

In [23]:
max_features_opts = range(5, 100, 5)
n_estimators_opts = range(10, 300, 20)
# Flat list of [max_features, n_estimators] pairs stored as floats, ordered
# with max_features as the slow-moving value. Rows are plain lists so that
# later cells can append results to them.
rf_params = [
    [float(max_features_opt), float(n_estimators_opt)]
    for max_features_opt in max_features_opts
    for n_estimators_opt in n_estimators_opts
]

In [24]:
#rf_params[20].append(111)

In [25]:
# Show only the rows that already carry more than the two parameter values.
for someList in rf_params:
    if len(someList) > 2:
        print(someList)

In [26]:
# Split the flat parameter list into one chunk per max_features value. Each
# chunk holds one row per n_estimators option, so the chunk size is derived
# from n_estimators_opts instead of being hard-coded.
# The slices are new lists but contain the same row objects as rf_params, so
# results appended to a row are visible through both.
chunk_size = len(n_estimators_opts)
rf_params_lists = [
    rf_params[start:start + chunk_size]
    for start in range(0, len(max_features_opts) * chunk_size, chunk_size)
]

In [27]:
def fIt():
    """Append to every parameter row the index of the chunk that contains it.

    Mutates the rows of the global ``rf_params_lists`` in place, so every
    additional call appends one more index to each row.
    """
    for chunk_index, chunk in enumerate(rf_params_lists):
        for param_row in chunk:
            param_row.append(chunk_index)

fIt()

Define functions for Threading

In [28]:
import threading

# Guards the shared progress counter. `numberOfTestsDone += 1` is a
# read-modify-write on a global and is not atomic across threads; holding the
# lock while printing also keeps the two-part progress line in one piece.
_progress_lock = threading.Lock()


def _evaluate_param_rows(rfrf):
    """Cross-validate a random forest for every parameter row in ``rfrf``.

    Parameters
    ----------
    rfrf : list of list
        Parameter rows. ``row[0]`` is read as max_features and ``row[1]`` as
        n_estimators (both converted with ``int``).

    Side effects
    ------------
    * Appends max_features, n_estimators, the mean 5-fold accuracy and twice
      its standard deviation to each row, in that order.
    * Increments the global ``numberOfTestsDone`` once per row and prints the
      progress as a percentage of the global ``numberOfTests``.
    * Reads the globals ``df_train_att`` and ``df_train_label``.

    Returns
    -------
    None
    """
    global numberOfTestsDone
    for row in rfrf:
        max_features = int(row[0])
        n_estimators = int(row[1])
        print("Working on: max_featires={}, n_estimators={}".format(row[0], row[1]))
        forest = RandomForestClassifier(max_features=max_features, n_estimators=n_estimators)
        fold_scores = cross_val_score(forest, df_train_att, df_train_label, cv=5)
        mean_accuracy = fold_scores.mean()
        accuracy_spread = fold_scores.std() * 2
        row.append(max_features)
        row.append(n_estimators)
        row.append(mean_accuracy)
        row.append(accuracy_spread)
        print("Max features: %d, num estimators: %d, accuracy: %0.2f (+/- %0.2f)" % (max_features, n_estimators, mean_accuracy, accuracy_spread))
        with _progress_lock:
            numberOfTestsDone += 1
            print("%0.2f" % ((numberOfTestsDone/numberOfTests)*100), end='')
            print("%")


# The nineteen original workers were byte-for-byte copies that differed only
# in their local variable names. The names are kept as aliases of the single
# implementation so that existing references to them keep working.
get1 = get2 = get3 = get4 = get5 = _evaluate_param_rows
get6 = get7 = get8 = get9 = get10 = _evaluate_param_rows
get11 = get12 = get13 = get14 = get15 = _evaluate_param_rows
get16 = get17 = get18 = get19 = _evaluate_param_rows

#get1(rf_params_lists[0])

# Worker functions in order; entry i is started on rf_params_lists[i] by the
# cell that launches the threads.
listOfThreadingFunctions = [get1, get2, get3, get4, get5, get6, get7, get8, get9, get10, get11, get12, get13, get14, get15, get16, get17, get18, get19]

Testing threading

In [29]:
from time import sleep
import threading

def T1(xxx):
    """Toy worker: sleep ten times for 0.1 s, bumping a local counter each time."""
    ticks = 0
    while ticks < 10:
        sleep(0.1)
        xxx += 1
        ticks += 1

def T2(xxx):
    """Second toy worker with the same behavior as T1."""
    ticks = 0
    while ticks < 10:
        sleep(0.1)
        xxx += 1
        ticks += 1

# Start the two workers slightly staggered; neither thread is joined.
thr1 = threading.Thread(target=T1, args=(0, ))
thr2 = threading.Thread(target=T2, args=(0, ))
thr1.start()
sleep(0.12)
thr2.start()

In [30]:
def T3(xx):
    """Toy worker: wait two seconds, then print the value it was given."""
    sleep(2)
    print(xx)

# Keep a handle on every thread. Joining only the last one started does not
# guarantee that the others have finished when the cell returns.
demo_threads = []
for i in range(5):
    tttt = threading.Thread(target=T3, args=(2, ))
    tttt.start()
    demo_threads.append(tttt)

for tttt in demo_threads:
    tttt.join()


22
2
2
2



Running the whole threading

In [31]:
# Sanity check: the flat parameter list should hold 19 * 15 combinations.
print(len(rf_params))
print(19*15)

# Scratch check of the progress-percentage formatting using a made-up counter
# value; the real run resets numberOfTestsDone to 0 before starting.
numberOfTestsDone = 286
numberOfTests = len(rf_params)
numberOfTestsDone += 1
print("%0.2f" % ((numberOfTestsDone/numberOfTests)*100), end='')
print("%")

285
285
100.70%


In [47]:
# listOfThreadingFunctions is a list of worker functions.
# rf_params_lists is a list of chunks of parameters; worker i receives chunk i.
print(len(rf_params_lists))
print(len(listOfThreadingFunctions))
# Reset the shared progress counters that the workers read and update.
numberOfTestsDone = 0
numberOfTests = len(rf_params)
print(str((numberOfTestsDone/numberOfTests)*100)+"%")

workerThreads = []
for i in range(len(rf_params_lists)):
    lastThread = threading.Thread(target=listOfThreadingFunctions[i], args=(rf_params_lists[i], ))
    lastThread.start()
    workerThreads.append(lastThread)

# Wait for every worker. Joining only the last-started thread lets the cell
# finish while other workers are still appending results to their rows.
for workerThread in workerThreads:
    workerThread.join()

19
19
0.0%
Working on: max_featires=5.0, n_estimators=10.0
Working on: max_featires=10.0, n_estimators=10.0
Working on: max_featires=15.0, n_estimators=10.0
Working on: max_featires=20.0, n_estimators=10.0
Working on: max_featires=25.0, n_estimators=10.0
Working on: max_featires=30.0, n_estimators=10.0
Working on: max_featires=35.0, n_estimators=10.0
Working on: max_featires=40.0, n_estimators=10.0
Working on: max_featires=45.0, n_estimators=10.0
Working on: max_featires=50.0, n_estimators=10.0
Working on: max_featires=55.0, n_estimators=10.0
Working on: max_featires=60.0, n_estimators=10.0
Working on: max_featires=65.0, n_estimators=10.0
Working on: max_featires=70.0, n_estimators=10.0
Working on: max_featires=75.0, n_estimators=10.0
Working on: max_featires=80.0, n_estimators=10.0
Working on: max_featires=85.0, n_estimators=10.0
Working on: max_featires=90.0, n_estimators=10.0
Working on: max_featires=95.0, n_estimators=10.0
Max features: 5, num estimators: 10, accuracy: 0.26 (+/- 0.

In [33]:
# Alias, not a copy: rf_test and rf_params_lists refer to the same list object.
rf_test = rf_params_lists

In [37]:
print(rf_test[0][0])
n_estimators_list = []
max_features_list = []
# The two `break` statements mean only the first row of the first chunk is
# inspected; this cell is a quick peek at the row layout.
for Nrows in rf_test:
    print(Nrows)
    for oneRow in Nrows:
        # Rows start with [max_features, n_estimators, ...], so each value
        # goes to its own list. Previously both were appended to
        # n_estimators_list and max_features_list stayed empty.
        max_features_list.append(round(oneRow[0]))
        n_estimators_list.append(round(oneRow[1]))
        print(oneRow)
        break
    break

[5.0, 10.0, 0]
[[5.0, 10.0, 0], [5.0, 30.0, 0], [5.0, 50.0, 0], [5.0, 70.0, 0], [5.0, 90.0, 0], [5.0, 110.0, 0], [5.0, 130.0, 0], [5.0, 150.0, 0], [5.0, 170.0, 0], [5.0, 190.0, 0], [5.0, 210.0, 0], [5.0, 230.0, 0], [5.0, 250.0, 0], [5.0, 270.0, 0], [5.0, 290.0, 0]]
[5.0, 10.0, 0]
