In [39]:
import numpy as np
import random
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier


from AdjustedRandomForest import train

In [40]:
# CONTROL - NO MISLABELLING
scores_my = []
scores_std = []

for _ in range(10):
    X, y = make_classification(n_samples=3000, n_features=2, n_redundant=0, n_informative=2, n_clusters_per_class=1, n_classes=2,  random_state = 1, flip_y=0.0000001)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state = 1)
    
    rf, corrected_y = train(RandomForestClassifier, X_train, y_train, np.unique(y), 5)
    y_test_pred = rf.predict(X_test)
    scores_my.append(metrics.accuracy_score(y_test, y_test_pred))

    rf = RandomForestClassifier(n_estimators=5, criterion='entropy')
    rf.fit(X_train, y_train)
    y_test_pred = rf.predict(X_test)
    scores_std.append(metrics.accuracy_score(y_test, y_test_pred))

print(f"Test\nIndividual Accuracies: {scores_my}\nAverage Accuracy: {np.mean(scores_my)}\n")
print(f"Control\nIndividual Accuracies: {scores_std}\nAverage Accuracy: {np.mean(scores_std)}")

Test
Individual Accuracies: [0.8949494949494949, 0.8939393939393939, 0.9040404040404041, 0.9040404040404041, 0.9040404040404041, 0.907070707070707, 0.902020202020202, 0.8888888888888888, 0.901010101010101, 0.901010101010101]
Average Accuracy: 0.9001010101010101

Control
Individual Accuracies: [0.9040404040404041, 0.896969696969697, 0.907070707070707, 0.896969696969697, 0.9030303030303031, 0.9030303030303031, 0.897979797979798, 0.906060606060606, 0.897979797979798, 0.8929292929292929]
Average Accuracy: 0.9006060606060604


In [41]:
# MISLABELLING
scores_my = []
scores_std = []
N_CLASSES = 2
MISLABELLING = 0.4

for _ in range(10):
    X, y = make_classification(n_samples=3000, n_features=2, n_redundant=0, n_informative=2, n_clusters_per_class=1, n_classes=N_CLASSES, flip_y=0.00000001)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
    
    y_mislabelled = np.copy(y_train)

    for i in range(int(len(y_mislabelled) * MISLABELLING)):
        y_mislabelled[i] = (y_mislabelled[i] + random.randint(1, N_CLASSES - 1)) % N_CLASSES

    print(sum(i==j for i, j in zip(y_mislabelled, y_train)))
    np.random.shuffle(y_mislabelled)

    rf, corrected_y = train(RandomForestClassifier, X_train, y_mislabelled, np.unique(y), 5)
    y_test_pred = rf.predict(X_test)
    print(sum(i==j for i, j in zip(y_test_pred, y_train)))
    scores_my.append(metrics.accuracy_score(y_test, y_test_pred))

    rf = RandomForestClassifier(n_estimators=5, criterion='entropy')
    rf.fit(X_train, y_mislabelled)
    y_test_pred = rf.predict(X_test)
    scores_std.append(metrics.accuracy_score(y_test, y_test_pred))

print(f"Test\nIndividual Accuracies: {scores_my}\nAverage Accuracy: {np.mean(scores_my)}\n")
print(f"Control\nIndividual Accuracies: {scores_std}\nAverage Accuracy: {np.mean(scores_std)}")

1206
458
1206
506
1206
501
1206
507
1206
483
1206
511
1206
488
1206
507
1206
495
1206
500
Test
Individual Accuracies: [0.5151515151515151, 0.49292929292929294, 0.4818181818181818, 0.4696969696969697, 0.4676767676767677, 0.5212121212121212, 0.4727272727272727, 0.494949494949495, 0.48787878787878786, 0.4636363636363636]
Average Accuracy: 0.48676767676767685

Control
Individual Accuracies: [0.5040404040404041, 0.5373737373737374, 0.5, 0.4898989898989899, 0.501010101010101, 0.5080808080808081, 0.5080808080808081, 0.4676767676767677, 0.4868686868686869, 0.5222222222222223]
Average Accuracy: 0.5025252525252525
