In [None]:
import pandas as pd
import numpy as np
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from scipy.stats import entropy
from collections import defaultdict
import argparse
from matplotlib import pyplot as plt

## ActiveLearningPipeline Class

In [13]:
class ActiveLearningPipeline:
    """
    Pool-based active learning loop.

    Every iteration fits the model on the current train set, evaluates it on the
    test set and then moves `budget_per_iter` samples from the available pool to
    the train set, chosen according to `selection_criterion`.
    """
    # Seed shared by the random sampler and the seedable estimators, so reruns are reproducible
    RANDOM_SEED = 42
    # Values accepted for `selection_criterion`
    SELECTION_CRITERIA = ('random', 'uncertainty')

    def __init__(self, model,
                 available_pool_indices,
                 train_indices,
                 test_indices,
                 selection_criterion,
                 iterations,
                 budget_per_iter,
                 nodes_df_path,
                 subject_mapping_path):
        """
        :param model: estimator name ('RandomForestClassifier', 'SVC', 'LogisticRegression')
                      or an object exposing fit / predict (and predict_proba for 'uncertainty')
        :param available_pool_indices: iterable of int, samples that may still be labeled
        :param train_indices: iterable of int, initially labeled samples
        :param test_indices: iterable of int, held-out evaluation samples
        :param selection_criterion: str, one of SELECTION_CRITERIA
        :param iterations: int, number of fit/evaluate rounds
        :param budget_per_iter: int, samples moved from the pool to the train set per round
        :param nodes_df_path: str, CSV file with 'features' and 'subject' columns
        :param subject_mapping_path: str, pickle file mapping subject -> label
        :raises ValueError: on an unknown selection criterion or model name
        """
        if selection_criterion not in self.SELECTION_CRITERIA:
            raise ValueError(f'Unknown selection criterion: {selection_criterion!r}')
        self.model = self._build_model(model)
        self.iterations = iterations
        self.budget_per_iter = budget_per_iter
        self.nodes_df_path = nodes_df_path
        # Private copies: acquisition rewrites these arrays, and the caller may reuse
        # the same containers to build another pipeline
        self.available_pool_indices = self._as_index_array(available_pool_indices)
        self.train_indices = self._as_index_array(train_indices)
        self.test_indices = self._as_index_array(test_indices)
        self.selection_criterion = selection_criterion
        self.rng = np.random.default_rng(self.RANDOM_SEED)
        self.nodes_df = pd.read_csv(nodes_df_path)
        self.feature_vectors = self._read_feature_vectors()
        self.labels = self._read_labels(subject_mapping_path)

    @staticmethod
    def _as_index_array(indices):
        """
        Copy an iterable of indices into a new integer numpy array
        :return:
        numpy array of int
        """
        return np.array(list(indices), dtype=int)

    def _build_model(self, model):
        """
        Turn a model name into an estimator instance; non-string objects are returned as they are
        :return:
        estimator object
        """
        if not isinstance(model, str):
            return model
        factories = {
            'RandomForestClassifier': lambda: RandomForestClassifier(random_state=self.RANDOM_SEED),
            # probability=True is required for predict_proba, which uncertainty sampling relies on
            'SVC': lambda: SVC(probability=True, random_state=self.RANDOM_SEED),
            # max_iter raised above the library default to reduce convergence warnings
            'LogisticRegression': lambda: LogisticRegression(max_iter=1000),
        }
        if model not in factories:
            raise ValueError(f'Unknown model: {model!r}')
        return factories[model]()

    def _read_feature_vectors(self):
        """
        Read feature vectors from the nodes dataframe
        :return:
        feature_vectors: numpy array, feature vectors
        """
        # Each cell holds a bracketed, ', '-separated list of numbers stored as text
        feature_vectors_raw = self.nodes_df['features'].apply((lambda x: x.strip('][').split(', ')))
        return np.array([[float(val) for val in feature_vector] for feature_vector in feature_vectors_raw])

    def _read_labels(self, subject_mapping_path):
        """
        Read subjects from the nodes dataframe, and convert them to labels (integers)
        :return:
        labels: numpy array, labels
        """
        # NOTE(review): pickle.load executes arbitrary code - only load mapping files you trust
        with open(subject_mapping_path, 'rb') as f:
            subject_mapping = pickle.load(f)
        labels = self.nodes_df['subject'].apply(lambda x: subject_mapping[x])
        return np.array(labels)

    def run_pipeline(self):
        """
        Run the active learning pipeline
        :return
        accuracy_scores: list, accuracy scores at each iteration
        """
        accuracy_scores = []
        for iteration in range(self.iterations):
            if len(self.train_indices) > 600:
                # raise error if the train set is larger than 600 samples
                raise ValueError('The train set is larger than 600 samples')
            self._fit_model()
            accuracy = self._evaluate_model(self.model)
            accuracy_scores.append(accuracy)
            # Samples acquired after the last evaluation would never be used
            if iteration < self.iterations - 1:
                self._acquire_samples()
        return accuracy_scores

    def _fit_model(self):
        """
        Fit the model on the samples that are currently in the train set
        """
        train = np.asarray(self.train_indices, dtype=int)
        self.model.fit(self.feature_vectors[train], self.labels[train])

    def _acquire_samples(self):
        """
        Move up to `budget_per_iter` samples from the available pool to the train set
        """
        pool = np.asarray(self.available_pool_indices, dtype=int)
        budget = min(self.budget_per_iter, len(pool))
        if budget <= 0:
            return
        positions = self._select_positions(pool, budget)
        train = np.asarray(self.train_indices, dtype=int)
        self.train_indices = np.concatenate([train, pool[positions]])
        self.available_pool_indices = np.delete(pool, positions)

    def _select_positions(self, pool, budget):
        """
        Choose which pool entries to label next
        :return:
        numpy array of `budget` positions into `pool` (not sample indices)
        """
        if self.selection_criterion == 'random':
            return self.rng.choice(len(pool), size=budget, replace=False)
        if self.selection_criterion == 'uncertainty':
            return self._most_uncertain_positions(pool, budget)
        raise ValueError(f'Unknown selection criterion: {self.selection_criterion!r}')

    def _most_uncertain_positions(self, pool, budget):
        """
        Rank pool samples by the entropy of the predicted class distribution
        :return:
        numpy array, positions into `pool` of the `budget` highest-entropy samples
        """
        probabilities = np.asarray(self.model.predict_proba(self.feature_vectors[pool]))
        # entropy() reduces over axis 0, so transpose to get one value per sample
        uncertainty = entropy(probabilities.T)
        # A stable sort keeps the choice deterministic when entropies tie
        return np.argsort(-uncertainty, kind='stable')[:budget]

    def _evaluate_model(self, trained_model):
        """
        Evaluate the model
        :return:
        accuracy: float, accuracy of the model
        :raises ValueError: if a test sample is also part of the train set
        """
        if np.intersect1d(self.train_indices, self.test_indices).size > 0:
            raise ValueError('Test samples must not appear in the train set')
        preds = trained_model.predict(self.feature_vectors[self.test_indices])
        return round(np.mean(preds == self.labels[self.test_indices]), 3)

In [14]:
def generate_plot(accuracy_scores_dict):
    """
    Draw one accuracy curve per selection criterion and show the figure
    :param accuracy_scores_dict: dict, maps a criterion name to its list of accuracy scores
    """
    for curve_label, scores in accuracy_scores_dict.items():
        # Iterations are numbered from 1 on the x axis
        iteration_numbers = range(1, len(scores) + 1)
        plt.plot(iteration_numbers, scores, label=curve_label)

    plt.xlabel('Iterations')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.show()

In [15]:
# Choose a model ('RandomForestClassifier', 'SVC', 'LogisticRegression')
model = 'RandomForestClassifier'

# Index split of the dataset, stored as a pickled dictionary
with open('indices_dict.pkl', 'rb') as f:
    indices_dict = pickle.load(f)
available_pool_indices, train_indices, test_indices = (
    indices_dict[key]
    for key in ('available_pool_indices', 'train_indices', 'test_indices')
)

# Number of active learning rounds, and samples acquired in each of them
iterations, budget_per_iter = 30, 20

# Input files read by the pipeline
nodes_df_path, subject_mapping_path = 'nodes.csv', 'subject_mapping.pkl'

# Selection criteria to compare; the accuracy curves are collected per criterion
selection_criteria = ['uncertainty', 'random']
accuracy_scores_dict = defaultdict(list)

In [None]:
# Settings shared by every run; only the selection criterion differs between runs
shared_settings = dict(
    model=model,
    available_pool_indices=available_pool_indices,
    train_indices=train_indices,
    test_indices=test_indices,
    iterations=iterations,
    budget_per_iter=budget_per_iter,
    nodes_df_path=nodes_df_path,
    subject_mapping_path=subject_mapping_path,
)

for criterion in selection_criteria:
    AL_class = ActiveLearningPipeline(selection_criterion=criterion, **shared_settings)
    accuracy_scores_dict[criterion] = AL_class.run_pipeline()

# Compare the accuracy curves of all criteria in a single figure
generate_plot(accuracy_scores_dict)