# Part-B, Active Learning

- used resources:

  1 - https://scikit-learn.org/stable/auto_examples/semi_supervised/plot_label_propagation_digits_active_learning.html#sphx-glr-auto-examples-semi-supervised-plot-label-propagation-digits-active-learning-py

  2 - https://medium.com/@hardik.dave/active-learning-sampling-strategies-f8d8ac7037c8-


- Read the data

In [1]:
# importing the libraries
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from scipy.stats import entropy as entropy_test

# Fix NumPy's global random state so repeated runs of the notebook agree
SEED = 485
np.random.seed(seed=SEED)

# Read the training pool; the "sample" column becomes the row index
train_data = pd.read_csv("Dataset-train-vf.csv", index_col="sample")

# Preview the first five rows (rendered as the cell output)
train_data.head(5)

Unnamed: 0_level_0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,y
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,1353,,1.142857,88,104.85,0.00727,0.443,7.997,6.99,8346.0,3.9,0.032695,0.05,C1,Low
2,1107,,4.857143,306,194.175,0.03778,0.363,34.002,12.945,376.64,11.1,0.210526,3.15,C3,Low
3,984,,6.571429,368,208.575,0.0575,0.356,46.0,13.905,451.54,13.1,0.27193,3.1,C1,Low
4,1107,,4.714286,297,175.725,0.03667,0.354,33.003,11.715,393.76,10.5,0.185008,2.85,C1,Low
5,123,,33.571429,235,225.0,2.35,0.923,235.0,15.0,5805.82,21.7,0.1874,0.4,C4,High


# Preprocess the data
- Remove the columns with more than 50% missing values and impute the rest with mean
- convert the categorical columns into numerical columns
- scale the data

In [2]:
# Training mean of x6, kept in a variable because the test cells reuse this
# exact value to impute their own missing x6 entries.
x6_mean = train_data["x6"].mean()

# 'x2' is missing for most rows (>89%), so the whole column is discarded
train_data = train_data.drop(columns=["x2"])

# Fill the gaps in x6 with the column mean computed above
train_data["x6"] = train_data["x6"].fillna(x6_mean)


# Standardise every feature except the target "y" and the categorical "x14".
# Index.difference returns the names sorted; the test cells build their
# selection the same way, so the scaler sees the columns in the same order.
scaler = StandardScaler()
numeric_columns = train_data.columns.difference(["y", "x14"])
train_data[numeric_columns] = scaler.fit_transform(train_data[numeric_columns])

# One-hot encode the categorical feature x14
train_data = pd.get_dummies(train_data, columns=["x14"])

# Encode the target: "Low" -> 0, "High" -> 1
train_data["y"] = train_data["y"].map({"Low": 0, "High": 1})

display(train_data.head())

Unnamed: 0_level_0,x1,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,y,x14_C1,x14_C2,x14_C3,x14_C4
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,-0.103775,-0.646737,-0.174571,-1.558357,-0.429737,-0.051848,-0.512699,-1.558357,0.135166,-0.22063,-0.268636,-0.379805,0,True,False,False,False
2,-0.15338,-0.405987,-0.152026,0.31884,-0.370081,-0.372019,-0.329673,0.31884,-0.098963,-0.183989,-0.189301,-0.095592,0,False,False,True,False
3,-0.178183,-0.294872,-0.145615,0.621461,-0.331523,-0.400034,-0.24523,0.621461,-0.096762,-0.173811,-0.161907,-0.100176,0,True,False,False,False
4,-0.15338,-0.415247,-0.152957,-0.068894,-0.372252,-0.408039,-0.336704,-0.068894,-0.09846,-0.187043,-0.200685,-0.123096,0,True,False,False,False
5,-0.351803,1.455192,-0.159369,0.966638,4.150955,1.869182,1.084974,0.966638,0.060539,-0.130046,-0.199618,-0.347717,1,False,False,False,True


- choose 50 random samples and keep them as labeled data, and the rest as unlabeled data

# Active learning using least confidence

In [3]:
def query_least_confident_samples(
    model: "LogisticRegression", X_unlabeled: pd.DataFrame, n_samples: int
) -> np.ndarray:
    """
    Query the least confident samples based on predicted probabilities.

    The uncertainty score of a sample is ``1 - max(class probabilities)``;
    the ``n_samples`` samples with the largest score are selected.

    Parameters:
        model: Trained classifier exposing ``predict_proba`` (a fitted
            LogisticRegression in this notebook).
        X_unlabeled: pd.DataFrame, unlabeled data.
        n_samples: int, number of samples to query. Values <= 0 select nothing;
            values larger than the pool select the whole pool.

    Returns:
        np.ndarray of positional indices into ``X_unlabeled`` (NOTE: not the
        sample ID but the row position, suitable for ``.iloc``), ordered by
        increasing uncertainty, i.e. the least confident sample comes last.
    """
    # A slice of the form [-0:] would return *every* index, so handle the
    # "nothing requested" case explicitly.
    if n_samples <= 0:
        return np.empty(0, dtype=np.intp)

    # Class probabilities, one row per unlabeled sample
    probas = model.predict_proba(X_unlabeled)

    # Least confidence: 1 - probability of the most likely class
    least_confidences = 1 - np.max(probas, axis=1)

    # argsort is ascending, so the tail holds the most uncertain samples
    return np.argsort(least_confidences)[-n_samples:]

In [4]:
# Draw 50 random samples in total: 40 seed the labeled training pool and
# 10 are held out for validation.

# 40 random rows that the model is initially trained on
labeled_data_train = train_data.sample(n=40, random_state=SEED)

# From the remaining rows, 10 random ones are kept aside purely for
# validation (never used for training)
not_in_train = train_data.drop(index=labeled_data_train.index)
labeled_data_validation = not_in_train.sample(n=10, random_state=SEED)


# Everything else is the unlabeled pool; its labels are hidden by dropping "y"
unlabeled_data = not_in_train.drop(index=labeled_data_validation.index).drop(
    columns="y"
)

display(unlabeled_data.head())

Unnamed: 0_level_0,x1,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14_C1,x14_C2,x14_C3,x14_C4
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,-0.103775,-0.646737,-0.174571,-1.558357,-0.429737,-0.051848,-0.512699,-1.558357,0.135166,-0.22063,-0.268636,-0.379805,True,False,False,False
2,-0.15338,-0.405987,-0.152026,0.31884,-0.370081,-0.372019,-0.329673,0.31884,-0.098963,-0.183989,-0.189301,-0.095592,False,False,True,False
3,-0.178183,-0.294872,-0.145615,0.621461,-0.331523,-0.400034,-0.24523,0.621461,-0.096762,-0.173811,-0.161907,-0.100176,True,False,False,False
4,-0.15338,-0.415247,-0.152957,-0.068894,-0.372252,-0.408039,-0.336704,-0.068894,-0.09846,-0.187043,-0.200685,-0.123096,True,False,False,False
5,-0.351803,1.455192,-0.159369,0.966638,4.150955,1.869182,1.084974,0.966638,0.060539,-0.130046,-0.199618,-0.347717,False,False,False,True


In [5]:
# Fit the initial logistic-regression model on the seed training samples
log_reg = LogisticRegression()
log_reg.fit(labeled_data_train.drop("y", axis=1), labeled_data_train["y"])

# The validation split never changes, so separate features and labels once
X_val = labeled_data_validation.drop("y", axis=1)
y_val = labeled_data_validation["y"]

# Baseline validation accuracy before any query
y_pred = log_reg.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f"Accuracy: {accuracy}")


# Active-learning loop: keep querying the least confident sample until the
# validation accuracy reaches 0.9 or the unlabeled pool is exhausted.
# NOTE (author's observation): the queried samples were all of class 1 (High)
# in the early rounds.
n_samples = 1  # number of samples queried per round
while accuracy < 0.9 and not unlabeled_data.empty:
    # Row positions (not sample IDs) of the most uncertain samples
    query_positions = query_least_confident_samples(
        log_reg, unlabeled_data, n_samples
    )

    # Translate the positions into sample IDs
    queried_ids = unlabeled_data.iloc[query_positions].index

    # "Ask the oracle": fetch the full rows, label included, from train_data
    # and append them to the labeled training pool
    newly_labeled = train_data.loc[queried_ids]
    labeled_data_train = pd.concat([labeled_data_train, newly_labeled])

    # The queried samples are no longer unlabeled
    unlabeled_data = unlabeled_data.drop(index=queried_ids)

    # Retrain from scratch on the enlarged pool
    log_reg = LogisticRegression()
    log_reg.fit(labeled_data_train.drop("y", axis=1), labeled_data_train["y"])

    # Re-evaluate on the fixed validation split
    y_pred = log_reg.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    print(
        f"Accuracy: {accuracy}, Number of training samples: {len(labeled_data_train)}"
    )

Accuracy: 0.8
Accuracy: 0.8, Number of training samples: 41
Accuracy: 0.8, Number of training samples: 42
Accuracy: 0.7, Number of training samples: 43
Accuracy: 0.7, Number of training samples: 44
Accuracy: 0.6, Number of training samples: 45
Accuracy: 0.5, Number of training samples: 46
Accuracy: 0.6, Number of training samples: 47
Accuracy: 0.6, Number of training samples: 48
Accuracy: 0.6, Number of training samples: 49
Accuracy: 0.6, Number of training samples: 50
Accuracy: 0.7, Number of training samples: 51
Accuracy: 0.7, Number of training samples: 52
Accuracy: 0.7, Number of training samples: 53
Accuracy: 0.8, Number of training samples: 54
Accuracy: 0.8, Number of training samples: 55
Accuracy: 0.8, Number of training samples: 56
Accuracy: 0.8, Number of training samples: 57
Accuracy: 0.8, Number of training samples: 58
Accuracy: 0.8, Number of training samples: 59
Accuracy: 0.7, Number of training samples: 60
Accuracy: 0.8, Number of training samples: 61
Accuracy: 0.8, Numbe

In [6]:
# Evaluation on the test set
test_data = pd.read_csv("Dataset-test-vf.csv", index_col="sample")

# drop 'x2' column because it has a lot of missing values >89%
test_data = test_data.drop(["x2"], axis=1)

# impute missing x6 values with the mean computed on the training data
test_data["x6"] = test_data["x6"].fillna(x6_mean)

# scale with the scaler fitted on the training data; the column selection is
# built exactly as at fit time so the column order matches
test_numeric_columns = test_data.columns.difference(["y", "x14"])
test_data[test_numeric_columns] = scaler.transform(test_data[test_numeric_columns])

# convert categorical data to numerical data
test_data = pd.get_dummies(test_data, columns=["x14"])

# map the target column to 0 and 1
test_data["y"] = test_data["y"].map({"Low": 0, "High": 1})

# get_dummies only creates columns for the categories present in *this* file.
# Align with the training schema so the model always receives the columns it
# was fitted on, in the same order: a category absent from the test set
# becomes an all-False column, and a category never seen in training is dropped.
test_data = test_data.reindex(columns=train_data.columns, fill_value=False)

# evaluate the model on the test set
X_test = test_data.drop("y", axis=1)
y_test = test_data["y"]
y_pred = log_reg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"{len(labeled_data_train)} samples were used for training.")
print(f"Accuracy on the test set: {accuracy}")
print(f"Confusion matrix:\n{confusion_matrix(y_test, y_pred)}")
print(f"Classification report:\n{classification_report(y_test, y_pred)}")

260 samples were used for training.
Accuracy on the test set: 0.9037037037037037
Confusion matrix:
[[244   8]
 [ 31 122]]
Classification report:
              precision    recall  f1-score   support

           0       0.89      0.97      0.93       252
           1       0.94      0.80      0.86       153

    accuracy                           0.90       405
   macro avg       0.91      0.88      0.89       405
weighted avg       0.91      0.90      0.90       405



# Active learning using entropy

In [7]:
def query_entropy_samples(
    model: "LogisticRegression", X_unlabeled: pd.DataFrame, n_samples: int
) -> np.ndarray:
    """
    Query the samples with the highest entropy in predictions.

    The Shannon entropy of each sample's predicted class distribution is
    computed with ``scipy.stats.entropy`` (imported in this notebook as
    ``entropy_test``), which treats ``0 * log(0)`` as 0, so no epsilon has to
    be added to the probabilities.

    Parameters:
        model: Trained classifier exposing ``predict_proba`` (a fitted
            LogisticRegression in this notebook).
        X_unlabeled: pd.DataFrame, feature matrix of unlabeled samples.
        n_samples: int, number of samples to query. Values <= 0 select nothing;
            values larger than the pool select the whole pool.

    Returns:
        np.ndarray of positional indices into ``X_unlabeled`` (NOTE: not the
        sample ID but the row position, suitable for ``.iloc``), ordered by
        increasing entropy, i.e. the most uncertain sample comes last.
    """
    # A slice of the form [-0:] would return *every* index, so handle the
    # "nothing requested" case explicitly.
    if n_samples <= 0:
        return np.empty(0, dtype=np.intp)

    # Predicted class probabilities, one row per unlabeled sample
    probs = model.predict_proba(X_unlabeled)

    # scipy reduces along axis 0 by default, hence the transpose: the result
    # holds one entropy value per sample.
    sample_entropy = entropy_test(probs.T)

    # argsort is ascending, so the tail holds the highest-entropy samples
    return np.argsort(sample_entropy)[-n_samples:]

In [8]:
# Draw 50 random samples in total: 40 seed the labeled training pool and
# 10 are held out for validation.

# 40 random rows that the model is initially trained on
labeled_data_train = train_data.sample(n=40, random_state=SEED)

# From the remaining rows, 10 random ones are kept aside purely for
# validation (never used for training)
not_in_train = train_data.drop(index=labeled_data_train.index)
labeled_data_validation = not_in_train.sample(n=10, random_state=SEED)

# Everything else is the unlabeled pool; its labels are hidden by dropping "y"
unlabeled_data = not_in_train.drop(index=labeled_data_validation.index).drop(
    columns="y"
)

display(unlabeled_data.head())

Unnamed: 0_level_0,x1,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14_C1,x14_C2,x14_C3,x14_C4
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,-0.103775,-0.646737,-0.174571,-1.558357,-0.429737,-0.051848,-0.512699,-1.558357,0.135166,-0.22063,-0.268636,-0.379805,True,False,False,False
2,-0.15338,-0.405987,-0.152026,0.31884,-0.370081,-0.372019,-0.329673,0.31884,-0.098963,-0.183989,-0.189301,-0.095592,False,False,True,False
3,-0.178183,-0.294872,-0.145615,0.621461,-0.331523,-0.400034,-0.24523,0.621461,-0.096762,-0.173811,-0.161907,-0.100176,True,False,False,False
4,-0.15338,-0.415247,-0.152957,-0.068894,-0.372252,-0.408039,-0.336704,-0.068894,-0.09846,-0.187043,-0.200685,-0.123096,True,False,False,False
5,-0.351803,1.455192,-0.159369,0.966638,4.150955,1.869182,1.084974,0.966638,0.060539,-0.130046,-0.199618,-0.347717,False,False,False,True


In [9]:
# Fit the initial logistic-regression model on the seed training samples
log_reg = LogisticRegression()
log_reg.fit(labeled_data_train.drop("y", axis=1), labeled_data_train["y"])

# The validation split never changes, so separate features and labels once
X_val = labeled_data_validation.drop("y", axis=1)
y_val = labeled_data_validation["y"]

# Baseline validation accuracy before any query
y_pred = log_reg.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f"Accuracy: {accuracy}")


# Active-learning loop: keep querying the highest-entropy sample and
# retraining until the validation accuracy reaches 0.9 or the unlabeled pool
# is exhausted.
# NOTE (author's observation): all the queried samples were of class 1 (High).
n_samples = 1  # number of samples queried per round
while accuracy < 0.9 and not unlabeled_data.empty:
    # Row positions (not sample IDs) of the most entropic samples
    query_positions = query_entropy_samples(log_reg, unlabeled_data, n_samples)

    # Translate the positions into sample IDs
    queried_ids = unlabeled_data.iloc[query_positions].index

    # "Ask the oracle": fetch the full rows, label included, from train_data
    # and append them to the labeled training pool
    newly_labeled = train_data.loc[queried_ids]
    labeled_data_train = pd.concat([labeled_data_train, newly_labeled])

    # The queried samples are no longer unlabeled
    unlabeled_data = unlabeled_data.drop(index=queried_ids)

    # Retrain from scratch on the enlarged pool
    log_reg = LogisticRegression()
    log_reg.fit(labeled_data_train.drop("y", axis=1), labeled_data_train["y"])

    # Re-evaluate on the fixed validation split
    y_pred = log_reg.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    print(
        f"Accuracy: {accuracy}, Number of training samples: {len(labeled_data_train)}"
    )

Accuracy: 0.8
Accuracy: 0.8, Number of training samples: 41
Accuracy: 0.8, Number of training samples: 42
Accuracy: 0.7, Number of training samples: 43
Accuracy: 0.7, Number of training samples: 44
Accuracy: 0.6, Number of training samples: 45
Accuracy: 0.5, Number of training samples: 46
Accuracy: 0.6, Number of training samples: 47
Accuracy: 0.6, Number of training samples: 48
Accuracy: 0.6, Number of training samples: 49
Accuracy: 0.6, Number of training samples: 50
Accuracy: 0.7, Number of training samples: 51
Accuracy: 0.7, Number of training samples: 52
Accuracy: 0.7, Number of training samples: 53
Accuracy: 0.8, Number of training samples: 54
Accuracy: 0.8, Number of training samples: 55
Accuracy: 0.8, Number of training samples: 56
Accuracy: 0.8, Number of training samples: 57
Accuracy: 0.8, Number of training samples: 58
Accuracy: 0.8, Number of training samples: 59
Accuracy: 0.7, Number of training samples: 60
Accuracy: 0.8, Number of training samples: 61
Accuracy: 0.8, Numbe

In [10]:
# Evaluation on the test set
from imblearn.metrics import geometric_mean_score

test_data = pd.read_csv("Dataset-test-vf.csv", index_col="sample")

# drop 'x2' column because it has a lot of missing values >89%
test_data = test_data.drop(["x2"], axis=1)

# impute missing x6 values with the mean computed on the training data
test_data["x6"] = test_data["x6"].fillna(x6_mean)

# scale with the scaler fitted on the training data; the column selection is
# built exactly as at fit time so the column order matches
test_numeric_columns = test_data.columns.difference(["y", "x14"])
test_data[test_numeric_columns] = scaler.transform(test_data[test_numeric_columns])

# convert categorical data to numerical data
test_data = pd.get_dummies(test_data, columns=["x14"])

# map the target column to 0 and 1
test_data["y"] = test_data["y"].map({"Low": 0, "High": 1})

# get_dummies only creates columns for the categories present in *this* file.
# Align with the training schema so the model always receives the columns it
# was fitted on, in the same order: a category absent from the test set
# becomes an all-False column, and a category never seen in training is dropped.
test_data = test_data.reindex(columns=train_data.columns, fill_value=False)

# evaluate the model on the test set
X_test = test_data.drop("y", axis=1)
y_test = test_data["y"]
y_pred = log_reg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"{len(labeled_data_train)} samples were used for training.")
print(f"Accuracy on the test set: {accuracy}")
print(f"Confusion matrix:\n{confusion_matrix(y_test, y_pred)}")

# recall, precision, f1-score
print(classification_report(y_test, y_pred))

# Gmean
gmean = geometric_mean_score(y_test, y_pred)
print(f"Gmean: {gmean}")

260 samples were used for training.
Accuracy on the test set: 0.9037037037037037
Confusion matrix:
[[244   8]
 [ 31 122]]
              precision    recall  f1-score   support

           0       0.89      0.97      0.93       252
           1       0.94      0.80      0.86       153

    accuracy                           0.90       405
   macro avg       0.91      0.88      0.89       405
weighted avg       0.91      0.90      0.90       405

Gmean: 0.878676158592934


# Using active learning, we trained the model with two query strategies: least-confidence sampling and entropy sampling. The test accuracy achieved with both methods was approximately 90%. In both approaches, the model reached a performance level similar to the fully supervised model with significantly less labeled data (only 260 training + 10 validation samples were labeled instead of 1000). Astonishing! :))


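# This demonstrates the effectiveness of active learning in significantly reducing the amount of labeled data needed to train a model. Additionally, both the entropy-based and the least-confidence strategies were effective in selecting the most informative samples for labeling, leading to a more efficient learning process. Note that for a binary classifier both criteria are monotone functions of |p − 0.5| (the distance of the predicted probability from the decision boundary), so they rank the unlabeled samples identically; this is why the two runs above query the same samples and report identical results.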