In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from xgboost import XGBClassifier

from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import label_binarize

from data_repository import DataRepository
from confident_learning import MyConfidentLearning

# Import data

In [2]:
data_repo = DataRepository("../.env")
# Load unclean current data
X_train, y_train, X_validation, y_validation, X_test, y_test = data_repo.load_current_data(clean_data=False)

In [3]:
train_file_names, validation_file_names, test_file_names = data_repo.load_unclean_file_names()

In [4]:
print(X_train.shape, y_train.shape, X_validation.shape, y_validation.shape)
print(len(train_file_names), len(validation_file_names))

(8649, 9000) (8649,) (1191, 9000) (1191,)
8649 1191


In [5]:
data_repo.count_labels(y_train)

{'error': 1718,
 'normal': 1734,
 'overcurrent': 1709,
 'overheating': 1766,
 'zero': 1722}

__Label encoder__

In [6]:
# Chuyển đổi danh sách labels thành mã số
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_validation = label_encoder.transform(y_validation)
y_test = label_encoder.transform(y_test)

In [7]:
print(list(label_encoder.classes_))
print(label_encoder.transform(list(label_encoder.classes_)))

['error', 'normal', 'overcurrent', 'overheating', 'zero']
[0 1 2 3 4]


# First, we have to check the model's accuracy on the dataset

If the model's underfit, the model won't fit the data, therefore the out-of-sample probality won't describe correctly the data-points

In [None]:
model = XGBClassifier(tree_method="gpu_hist", enable_categorical=True)
train_acc = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=5, verbose=10)
validation_acc = cross_val_score(model, X_validation, y_validation, scoring='accuracy', cv=5, verbose=10)
test_acc = cross_val_score(model, X_test, y_test, scoring='accuracy', cv=5, verbose=10)

In [None]:
print(np.average(train_acc), np.average(test_acc), np.average(validation_acc))

0.9092418685664807 0.8958333333333333 0.9109841426110193


Since the accuracy of the cross validation are reasonable, confident learning can be applied

__Getting out of sample predicted probality__

In [None]:
train_cl = MyConfidentLearning(X=X_train, y=y_train)
train_pred_probs = train_cl.get_out_of_sample_proba()

validation_cl = MyConfidentLearning(X=X_validation, y=y_validation)
validation_pred_probs = validation_cl.get_out_of_sample_proba()

test_cl = MyConfidentLearning(X=X_test, y=y_test)
test_pred_probs = test_cl.get_out_of_sample_proba()

[+] Getting out of sample probality
[-] Finished getting out of sample probality with shape: (8649, 5)
[+] Getting out of sample probality
[-] Finished getting out of sample probality with shape: (1191, 5)
[+] Getting out of sample probality
[-] Finished getting out of sample probality with shape: (2160, 5)


# Computing class thresholds

In [9]:
# should be a numpy array of length 5
train_thresholds = train_cl.compute_class_thresholds()
validation_thresholds = validation_cl.compute_class_thresholds()
test_thresholds = test_cl.compute_class_thresholds()

[+] Computing thresholds
[-] Finished compute thresholds: [0.8616693  0.91322388 0.95346248 0.89964409 0.90142361]
[+] Computing thresholds
[-] Finished compute thresholds: [0.91641633 0.90305904 0.92666287 0.83581653 0.91424131]
[+] Computing thresholds
[-] Finished compute thresholds: [0.84152774 0.92196723 0.92664007 0.82998395 0.90748653]


# Constructing the confident joint

In [13]:
C_train = train_cl.compute_confident_joint()
C_validation = validation_cl.compute_confident_joint()
C_test = test_cl.compute_confident_joint()

[+] Computing confident joint
[-] Finished compute confident joint:
[[1302    0    0   33   63]
 [   4 1587   26   92    0]
 [   0   56 1615    0    0]
 [  45   33    0 1467    0]
 [ 109    0    0    0 1483]]
[+] Computing confident joint
[-] Finished compute confident joint:
[[193   0   0   2   4]
 [  0 223   0  18   0]
 [  0  17 220   0   0]
 [  7  16   0 173   0]
 [  4   0   0   0 212]]
[+] Computing confident joint
[-] Finished compute confident joint:
[[349   0   0  16  15]
 [  0 377   7  18   0]
 [  0  33 421   0   0]
 [ 16  21   0 305   0]
 [ 22   0   0   0 374]]


## Approach 1 - Methods 2: Estimate the labels errors

### Count the number of label issues

Now that we have the confident joint C, we can count the estimated number of label issues in our dataset. Recall that this is the sum of the off-diagonal entries (the cases where we estimate that a label has been flipped).

In [18]:
def caculate_noise_rate(C, no_of_samples):
    num_label_issues = np.sum(C - np.diag(np.diag(C)))
    print(f"Number of label issues: {num_label_issues}")
    print('Estimated noise rate: {:.1f}%'.format(100*num_label_issues / no_of_samples))

In [20]:
caculate_noise_rate(C_train, train_cl.X.shape[0])

Number of label issues: 461
Estimated noise rate: 5.3%


In [21]:
caculate_noise_rate(C_test, test_cl.X.shape[0])

Number of label issues: 148
Estimated noise rate: 6.9%


In [22]:
caculate_noise_rate(C_validation, validation_cl.X.shape[0])

Number of label issues: 68
Estimated noise rate: 5.7%


### Pruning label issues

First indentify the label issues, which are the off-diagonal elements

In [24]:
train_issue_indices = train_cl.find_label_issues()
validation_issue_indices = validation_cl.find_label_issues()
test_issue_indices = test_cl.find_label_issues()

[+] Finding labels issue indeces:
[-] Finished compute confident joint:
Issue indices: 461
[+] Finding labels issue indeces:
[-] Finished compute confident joint:
Issue indices: 68
[+] Finding labels issue indeces:
[-] Finished compute confident joint:
Issue indices: 148


__Check the confident joint again__

In [31]:
# Remove the label errors found by Confident Learning
clean_X_train = np.delete(X_train, train_issue_indices, axis=0) 
clean_y_train = np.delete(y_train, train_issue_indices)
clean_train_file_names = np.delete(train_file_names, train_issue_indices)
clean_train_pred_probs = np.delete(train_cl.pred_probs, train_issue_indices, axis=0)

clean_X_validation = np.delete(X_validation, validation_issue_indices, axis=0) 
clean_y_validation = np.delete(y_validation, validation_issue_indices)
clean_validation_file_names = np.delete(validation_file_names, validation_issue_indices)
clean_validation_pred_probs = np.delete(validation_cl.pred_probs, validation_issue_indices, axis=0)

clean_X_test = np.delete(X_test, test_issue_indices, axis=0) 
clean_y_test = np.delete(y_test, test_issue_indices)
clean_test_file_names = np.delete(test_file_names, test_issue_indices)
clean_test_pred_probs = np.delete(test_cl.pred_probs, test_issue_indices, axis=0)

In [57]:
def compute_confident_joint(pred_probs: np.ndarray, thresholds: np.ndarray, labels: np.ndarray) -> np.ndarray:
    print("[+] Computing confident joint")
    n_examples, n_classes = pred_probs.shape
    confident_joint = np.zeros((n_classes, n_classes), dtype=np.int64)
    for data_idx in range(n_examples):
        i = labels[data_idx]    #y_noise
        j = None                #y_true -> to find
        #Lưu ý điểm mình bị sai: vị trí của chúng không ứng với label
        p_j = -1
        for candidate_j in range(n_classes):
            p = pred_probs[data_idx, candidate_j]
            if p >= thresholds[candidate_j] and p > p_j:
                j = candidate_j
                p_j = p
        if j is not None:
            confident_joint[i][j] += 1
    print("[-] Finished compute confident joint:")
    print(confident_joint)
    return confident_joint

__Check the confident joint again__

In [60]:
print("Training dataset:")
clean_C_train = compute_confident_joint(pred_probs=clean_train_pred_probs, thresholds=train_thresholds, labels=clean_y_train)

print("Validation dataset")
clean_C_validation = compute_confident_joint(pred_probs=clean_validation_pred_probs, thresholds=validation_thresholds, labels=clean_y_validation)

print("Test dataset")
clean_C_test = compute_confident_joint(pred_probs=clean_test_pred_probs, thresholds=test_thresholds, labels=clean_y_test)

Training dataset:
[+] Computing confident joint
[-] Finished compute confident joint:
[[1302    0    0    0    0]
 [   0 1587    0    0    0]
 [   0    0 1615    0    0]
 [   0    0    0 1467    0]
 [   0    0    0    0 1483]]
Validation dataset
[+] Computing confident joint
[-] Finished compute confident joint:
[[193   0   0   0   0]
 [  0 223   0   0   0]
 [  0   0 220   0   0]
 [  0   0   0 173   0]
 [  0   0   0   0 212]]
Test dataset
[+] Computing confident joint
[-] Finished compute confident joint:
[[349   0   0   0   0]
 [  0 377   0   0   0]
 [  0   0 421   0   0]
 [  0   0   0 305   0]
 [  0   0   0   0 374]]


__The results: all the off-diagonal elements are removed!__

Check if their subtraction are corrected

In [37]:
print(clean_X_train.shape, len(clean_y_train), len(clean_train_file_names), clean_train_pred_probs.shape)
train_cl.compute_confident_joint()
print(data_repo.count_labels(train_cl.y))
print(data_repo.count_labels(clean_y_train))

(8188, 9000) 8188 8188 (8188, 5)
[+] Computing confident joint
[-] Finished compute confident joint:
[[1302    0    0   33   63]
 [   4 1587   26   92    0]
 [   0   56 1615    0    0]
 [  45   33    0 1467    0]
 [ 109    0    0    0 1483]]
{0: 1718, 1: 1734, 2: 1709, 3: 1766, 4: 1722}
{0: 1622, 1: 1612, 2: 1653, 3: 1688, 4: 1613}


In [35]:
print(clean_X_validation.shape, len(clean_y_validation), len(clean_validation_file_names), clean_validation_pred_probs.shape)
validation_cl.compute_confident_joint()
print(data_repo.count_labels(validation_cl.y))
print(data_repo.count_labels(clean_y_validation))

(1123, 9000) 1123 1123 (1123, 5)
[+] Computing confident joint
[-] Finished compute confident joint:
[[193   0   0   2   4]
 [  0 223   0  18   0]
 [  0  17 220   0   0]
 [  7  16   0 173   0]
 [  4   0   0   0 212]]
{0: 237, 1: 247, 2: 237, 3: 223, 4: 247}
{0: 231, 1: 229, 2: 220, 3: 200, 4: 243}


In [38]:
print(clean_X_test.shape, len(clean_X_test), len(clean_X_test), clean_test_file_names.shape)
test_cl.compute_confident_joint()
print(data_repo.count_labels(test_cl.y))
print(data_repo.count_labels(clean_y_test))

(2012, 9000) 2012 2012 (2012,)
[+] Computing confident joint
[-] Finished compute confident joint:
[[349   0   0  16  15]
 [  0 377   7  18   0]
 [  0  33 421   0   0]
 [ 16  21   0 305   0]
 [ 22   0   0   0 374]]
{0: 445, 1: 419, 2: 454, 3: 411, 4: 431}
{0: 414, 1: 394, 2: 421, 3: 374, 4: 409}


### Comparing models with different data

In [39]:
model = XGBClassifier(tree_method="hist", enable_categorical=True)
# Train model on original, possibly noisy data.
model.fit(train_cl.X, train_cl.y)
# Evaluate model on test split with ground truth labels.
preds = model.predict(X_validation)
acc_original = accuracy_score(preds, y_validation)
print(f"Accuracy with original data: {round(acc_original*100,1)}%")

Accuracy with original data: 94.0%


In [40]:
clean_model = XGBClassifier(tree_method="hist", enable_categorical=True)
# Train model on original, possibly noisy data.
clean_model.fit(clean_X_train, clean_y_train)
# Evaluate model on test split with ground truth labels.
clean_preds = clean_model.predict(clean_X_validation)
new_acc = accuracy_score(clean_preds, clean_y_validation)
print(f"Accuracy with clean data: {round(new_acc*100,1)}%")

Accuracy with clean data: 95.5%


In [41]:
# Evaluate model on test split with ground truth labels.
preds = model.predict(clean_X_validation)
test_acc = accuracy_score(clean_preds, clean_y_validation)
print(f"Accuracy with original data: {round(test_acc*100,1)}%")

Accuracy with original data: 95.5%


## Write to file

In [42]:
train_ap1 = pd.DataFrame({"files": clean_train_file_names})
validation_ap1 = pd.DataFrame({"files": clean_validation_file_names})
test_ap1 = pd.DataFrame({"files": clean_test_file_names})
print(len(clean_train_file_names), len(clean_test_file_names), len(clean_validation_file_names))

8188 2012 1123


In [43]:
# create a excel writer object
with pd.ExcelWriter("../../data/clean_data/approach1/20240105_clean_data_approach1_method2.xlsx") as writer:
    # use to_excel function and specify the sheet_name and index 
    # to store the dataframe in specified sheet
    train_ap1.to_excel(writer, sheet_name="train_dataset", index=False)
    validation_ap1.to_excel(writer, sheet_name="validation_dataset", index=False)
    test_ap1.to_excel(writer, sheet_name="test_dataset", index=False)

In [51]:
train_file_names = np.array(train_file_names)
test_file_names = np.array(test_file_names)
validation_file_names = np.array(validation_file_names)

In [54]:
dirty_train_files = train_file_names[train_issue_indices]
dirty_validation_files = validation_file_names[validation_issue_indices]
dirty_test_files = test_file_names[test_issue_indices]
print(len(dirty_train_files), len(dirty_validation_files), len(dirty_test_files))

461 68 148


In [55]:
dirty_train_ap1 = pd.DataFrame({"files": dirty_train_files})
dirty_validation_ap1 = pd.DataFrame({"files": dirty_validation_files})
dirty_test_ap1 = pd.DataFrame({"files": dirty_test_files})
print(len(dirty_train_files), len(dirty_validation_files), len(dirty_test_files))

461 68 148


In [56]:
# create a excel writer object
with pd.ExcelWriter("../../data/dirty_data/20240105_dirty_data_approach1_method2.xlsx") as writer:
    # use to_excel function and specify the sheet_name and index 
    # to store the dataframe in specified sheet
    dirty_train_ap1.to_excel(writer, sheet_name="train_dataset", index=False)
    dirty_validation_ap1.to_excel(writer, sheet_name="validation_dataset", index=False)
    dirty_test_ap1.to_excel(writer, sheet_name="test_dataset", index=False)