In [None]:
#ALL IMPORTS
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import math
import statistics as stat
import numpy as np
import scipy.stats
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.gridspec as gridspec
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from functools import partial
from modAL.batch import uncertainty_batch_sampling, ranked_batch
from modAL.models import ActiveLearner
from sklearn.metrics import classification_report

from numpy import quantile, where, random
from sklearn.metrics import confusion_matrix
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import matplotlib as mpl
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
import math
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

In [None]:
# Load the raw movie metadata; the first CSV row (header=0) supplies the column names.
data = pd.read_csv("movie_metadata.csv", header=0)

In [None]:
# Preview the first rows of the loaded frame.
data.head()

In [None]:
# (rows, columns) of the frame as loaded.
data.shape

# PREPROCESSING DATASET

Dropping Duplicate Records

In [None]:
# True if at least one row is an exact duplicate of an earlier row.
data.duplicated().any()

In [None]:
# Remove exact duplicate rows, then display the resulting (rows, columns).
data=data.drop_duplicates()
data.shape

Dropping Text/Identifier Columns and Correlated Columns

In [None]:
# Columns excluded from modelling: names, titles, links, other text fields
# and the aggregate cast likes counter.
columns_to_drop = [
    "director_name",
    "actor_2_name",
    "genres",
    "movie_title",
    "actor_1_name",
    "actor_3_name",
    "language",
    "country",
    "plot_keywords",
    "movie_imdb_link",
    "cast_total_facebook_likes",
]
data = data.drop(columns=columns_to_drop)

In [None]:
# (rows, columns) at this point of the preprocessing.
data.shape

HANDLING NULL VALUES

In [None]:
# Number of missing values per column.
data.isnull().sum()

In [None]:
# Discard every row that contains a missing value.
data = data.dropna()

In [None]:
# (rows, columns) at this point of the preprocessing.
data.shape

In [None]:
# Missing values per column; expected to be all zeros once rows with NaNs are gone.
data.isnull().sum()

Converting Categorical Values

In [None]:
# One-hot encode "color": the column is replaced by one indicator column per distinct value.
data = pd.get_dummies(data, columns=["color"])

In [None]:
# Frequency of each distinct value in the "gross" column.
data["gross"].value_counts()

Preparing Target Variables for Classification

Converting IMDB SCORE into 3 classes

In [None]:
# Distinct values currently present in the "imdb_score" column.
data["imdb_score"].unique()

In [None]:
# Make sure the score column is floating point before it is compared against bin edges.
data["imdb_score"] = data["imdb_score"].astype(float)

In [None]:
# Bin the numeric IMDB score into three string classes:
#   score >= 8      -> "BEST"
#   5 <= score < 8  -> "AVERAGE"
#   score < 5       -> "BAD"
# The labels are computed in one pass from the original scores. The previous
# version overwrote the column with sentinel numbers (100/50/30) and left the
# gaps 4.992-5 and 7.99-8 unlabelled.
# The dtype guard makes the cell safe to re-run: once the column holds the
# string labels it is left untouched.
# NOTE(review): rows with NaN scores would fall into "BAD"; none are expected
# because rows with missing values are dropped earlier in the notebook.
imdb_numeric = data["imdb_score"]
if pd.api.types.is_numeric_dtype(imdb_numeric):
    data["imdb_score"] = np.select(
        [(imdb_numeric >= 8).to_numpy(), (imdb_numeric >= 5).to_numpy()],
        ["BEST", "AVERAGE"],
        default="BAD",
    )

In [None]:
# Number of rows per distinct "imdb_score" value.
data["imdb_score"].value_counts()

Merging equivalent Content Ratings (M/GP → PG, Unrated → Not Rated, Passed → Approved, X → NC-17)

In [None]:
# Distinct content-rating labels present in the data (also displayed as the cell output).
ratings = data["content_rating"].unique()
ratings

In [None]:
# Fold alias rating labels into a single label each; every label that is
# not a key of this mapping is left unchanged.
rating_aliases = {
    "M": "PG",
    "GP": "PG",
    "Unrated": "Not Rated",
    "Passed": "Approved",
    "X": "NC-17",
}
data["content_rating"] = data["content_rating"].replace(rating_aliases)

In [None]:
# Number of rows per distinct "content_rating" label.
data["content_rating"].value_counts()

Converting gross into 2 classes

In [None]:
# Frequency of each distinct value in the "gross" column.
data["gross"].value_counts()

In [None]:
# Binarise gross revenue: 0.0 for gross <= 30,000,000 and 1.0 above it.
# The previous two-step overwrite used a mistyped lower bound (3000000.01)
# that only worked because the first step had already zeroed the low values,
# and it left any gross above 762,000,000 unlabelled.
# The guard keeps the cell idempotent: if the column already holds only
# 0.0/1.0 it is not thresholded again (which would set every row to 0.0).
GROSS_THRESHOLD = 30000000.0
if not data["gross"].isin([0.0, 1.0]).all():
    data["gross"] = (data["gross"] > GROSS_THRESHOLD).astype(float)

In [None]:
# Frequency of each distinct value in the "gross" column.
data["gross"].value_counts()

In [None]:
#data.loc[data['gross'] == 100.0, 'gross'] = "Above Average"
#data.loc[data['gross'] == 30.0, 'gross'] = "Below Average"

In [None]:
#data["gross"].value_counts()

In [None]:
from sklearn import preprocessing

# One encoder per target, so that each mapping (encoder.classes_) stays
# available for decoding predictions later. Refitting a single shared encoder
# discarded the mappings of the first two targets.
le_imdb = preprocessing.LabelEncoder()
le_rating = preprocessing.LabelEncoder()
le_gross = preprocessing.LabelEncoder()

# Integer-encoded targets: 1 = IMDB score class, 2 = content rating, 3 = gross class.
target1 = le_imdb.fit_transform(data["imdb_score"])
target2 = le_rating.fit_transform(data["content_rating"])
target3 = le_gross.fit_transform(data["gross"])

# Backward compatibility: `le` used to end up fitted on "gross".
le = le_gross

In [None]:
# Feature matrix: every remaining column except the three prediction targets.
target_columns = ["imdb_score", "content_rating", "gross"]
X_final = data.drop(columns=target_columns)

In [None]:
# Fit a StandardScaler on the full feature frame and display it.
# NOTE(review): in the cells visible here the fitted scaler is never used to
# transform X_final, so the models below are trained on unscaled features.
# Confirm whether that is intended.
scaler = StandardScaler().fit(X_final)
scaler

In [None]:
def plotGraph(performance,type = 'Accuracy'):
    """Draw a metric's value per query iteration as a line with point markers.

    Parameters
    ----------
    performance : sequence of numbers
        Metric values, one per iteration (position 0 is the first entry).
    type : str, default 'Accuracy'
        Metric name inserted into the title and the y-axis label.
    """
    metric_name = str(type)
    iterations = range(len(performance))

    fig, ax = plt.subplots(figsize=(5, 3), dpi=130)

    # Line for the trend, scatter for the individual measurements.
    ax.plot(performance)
    ax.scatter(iterations, performance, s=13)

    ax.set(
        title='Incremental classification ' + metric_name,
        xlabel='Query iteration',
        ylabel='Classification ' + metric_name,
    )
    plt.show()

In [None]:
def _first_per_class(labels, per_class):
    """Return the sorted positions of the first ``per_class`` rows of every class in ``labels``."""
    labels = np.asarray(labels)
    hits = [np.flatnonzero(labels == cls)[:per_class] for cls in np.unique(labels)]
    if not hits:
        return np.empty(0, dtype=np.intp)
    return np.sort(np.concatenate(hits))


def stratify_samples(X_raw_a, y_raw_1_a, y_raw_2_a, y_raw_3_a, per_class=10):
    """Pull a small class-balanced seed set out of the data, one target at a time.

    Three passes are made, in the order target 3, target 2, target 1. In each
    pass the first ``per_class`` remaining rows of every class of that target
    (in row order) are moved from the raw arrays into the seed set, so a row
    can be selected at most once.

    Fixes over the previous implementation:
    * the target-1 pass now looks at ``y_raw_1_a`` (it used to test
      ``y_raw_2_a`` because of a copy-paste slip);
    * classes are discovered from the labels instead of being hard-coded
      (0-1, 0-6, 0-2), so any number of classes is handled.

    Parameters
    ----------
    X_raw_a : array-like, 2-D
        Feature rows.
    y_raw_1_a, y_raw_2_a, y_raw_3_a : array-like, 1-D
        Labels of the three targets, aligned row-by-row with ``X_raw_a``.
    per_class : int, default 10
        Maximum number of rows taken per class in each pass.

    Returns
    -------
    tuple
        ``(train_instances, n_train, train_y1, train_y2, train_y3,
        X_remaining, y1_remaining, y2_remaining, y3_remaining)`` where the
        first five describe the seed set (lists, plus its length) and the
        last four are the arrays with the seed rows removed.
    """
    X_rest = np.asarray(X_raw_a)
    y_rest = {
        1: np.asarray(y_raw_1_a),
        2: np.asarray(y_raw_2_a),
        3: np.asarray(y_raw_3_a),
    }

    train_instances = []
    train_y = {1: [], 2: [], 3: []}

    # Same pass order as before: target 3, then target 2, then target 1.
    for target in (3, 2, 1):
        picked = _first_per_class(y_rest[target], per_class)

        train_instances.extend(X_rest[picked])
        for key in train_y:
            train_y[key].extend(y_rest[key][picked])

        # Remove the selected rows from every array in one step.
        keep = np.ones(len(X_rest), dtype=bool)
        keep[picked] = False
        X_rest = X_rest[keep]
        y_rest = {key: labels[keep] for key, labels in y_rest.items()}

    return (
        train_instances,
        len(train_instances),
        train_y[1],
        train_y[2],
        train_y[3],
        X_rest,
        y_rest[1],
        y_rest[2],
        y_rest[3],
    )

# ACTIVE LEARNING TRAINING

In [None]:
#Declaring batch size and initial raw samples
# NOTE(review): BATCH_SIZE and N_RAW_SAMPLES are not read by the code in this cell.
BATCH_SIZE = 50
N_RAW_SAMPLES= 1000

# Split sizes (these were the hard-coded slice bounds 884 and 2884):
# rows taken after the stratified seed to fill up the training set, and
# rows that form the active-learning pool. Everything after that is the test set.
N_TRAIN_FILL = 884
N_POOL = 2000

# Fixed seed so the decision trees (and therefore every metric) are reproducible.
RANDOM_STATE = 42

#Declaring Features and Target variables
y_raw_1 = target1
y_raw_2 = target2
y_raw_3 = target3
X_raw = X_final

print("---------------------------------------")
print("Shape of data set :" + str(X_raw.shape))

# Convert the feature frame to a float ndarray. An explicit dtype avoids an
# object array when the frame mixes float and boolean (dummy) columns.
X_raw = X_raw.to_numpy(dtype=float)

# batch_data_continous_X,y stores data and adds to it after every iteration of batch
batch_data_continous_X = np.array([]).reshape(0,X_raw.shape[1])
batch_data_continous_y_1 = np.array([]).reshape(0,1)
batch_data_continous_y_2 = np.array([]).reshape(0,1)
batch_data_continous_y_3 = np.array([]).reshape(0,1)

# Spliting dataset into Train, Pool(for batch) and Test.
print("Preparing Initial Training, Pool and Test sets for initial AL")
print("")

# Class-balanced seed rows; X_raw / y_raw_* come back with those rows removed.
X_train, length_train, y_train_1, y_train_2, y_train_3, X_raw, y_raw_1, y_raw_2, y_raw_3 = stratify_samples(X_raw, y_raw_1, y_raw_2, y_raw_3)
X_train = np.array(X_train)
y_train_1 = np.array(y_train_1)
y_train_2 = np.array(y_train_2)
y_train_3 = np.array(y_train_3)

pool_start = N_TRAIN_FILL
pool_end = N_TRAIN_FILL + N_POOL

X_train_temp = X_raw[:pool_start]
y_train_1_temp = y_raw_1[:pool_start]
y_train_2_temp = y_raw_2[:pool_start]
y_train_3_temp = y_raw_3[:pool_start]

#Joining Training Set
X_train = np.concatenate((X_train, X_train_temp), axis=0)
y_train_1 = np.concatenate((y_train_1, y_train_1_temp), axis=0)
y_train_2 = np.concatenate((y_train_2, y_train_2_temp), axis=0)
y_train_3 = np.concatenate((y_train_3, y_train_3_temp), axis=0)

X_pool = X_raw[pool_start:pool_end]
y_pool_1 = y_raw_1[pool_start:pool_end]
y_pool_2 = y_raw_2[pool_start:pool_end]
y_pool_3 = y_raw_3[pool_start:pool_end]

X_test = X_raw[pool_end:]
y_test_1 = y_raw_1[pool_end:]
y_test_2 = y_raw_2[pool_end:]
y_test_3 = y_raw_3[pool_end:]

# Report the size of every split (features first, then the three targets).
for split_title, split_tag, split_parts in (
    ("Training", "train", (X_train, y_train_1, y_train_2, y_train_3)),
    ("Pool", "pool", (X_pool, y_pool_1, y_pool_2, y_pool_3)),
    ("Test", "test", (X_test, y_test_1, y_test_2, y_test_3)),
):
    print(split_title + " set size: ")
    print("---------------------------")
    print("X_" + split_tag + ": " + str(len(split_parts[0])))
    for target_no, y_part in enumerate(split_parts[1:], start=1):
        print("Y_" + split_tag + "_" + str(target_no) + ": " + str(len(y_part)))
    print("")

# lists to store all metric values after each iteration
# (auc_history is declared but nothing is appended to it in this cell)
accuracy_history = {1:[], 2:[], 3:[]}
auc_history = {1:[], 2:[], 3:[]}
precision_history = {1:[], 2:[], 3:[]}
recall_history = {1:[], 2:[], 3:[]}
f1_history = {1:[], 2:[], 3:[]}

# One independent tree per target.
clf_1 = DecisionTreeClassifier(criterion= 'gini', max_depth= 7, splitter= 'best', random_state=RANDOM_STATE)
clf_2 = DecisionTreeClassifier(criterion= 'gini', max_depth= 7, splitter= 'best', random_state=RANDOM_STATE)
clf_3 = DecisionTreeClassifier(criterion= 'gini', max_depth= 7, splitter= 'best', random_state=RANDOM_STATE)

# Seed the cumulative training buffers with the initial training set.
batch_data_continous_X = np.vstack([batch_data_continous_X, X_train])
batch_data_continous_y_1 = np.append(batch_data_continous_y_1, y_train_1)
batch_data_continous_y_2 = np.append(batch_data_continous_y_2, y_train_2)
batch_data_continous_y_3 = np.append(batch_data_continous_y_3, y_train_3)

# Initial training with trainset
print("========================")
print("Initial Training Results")

#Fitting the model for each label
clf1 = clf_1.fit(X_train, y_train_1)
clf2 = clf_2.fit(X_train, y_train_2)
clf3 = clf_3.fit(X_train, y_train_3)

#Generating predictions for each label
predictions_1 = clf1.predict(X_test)
predictions_2 = clf2.predict(X_test)
predictions_3 = clf3.predict(X_test)

#Generating accuracies for each label
model_accuracy_1 = accuracy_score(y_test_1,predictions_1)
model_accuracy_2 = accuracy_score(y_test_2,predictions_2)
model_accuracy_3 = accuracy_score(y_test_3,predictions_3)

print("========================")
print('Accuracy after Query {n} for Target 1: {acc:0.4f}'.format(n=0, acc=model_accuracy_1))
print('Accuracy after Query {n} for Target 2: {acc:0.4f}'.format(n=0, acc=model_accuracy_2))
print('Accuracy after Query {n} for Target 3: {acc:0.4f}'.format(n=0, acc=model_accuracy_3))

# Record the query-0 metrics (accuracy plus macro-averaged F1 / recall / precision).
for target_no, accuracy, y_true, y_pred in (
    (1, model_accuracy_1, y_test_1, predictions_1),
    (2, model_accuracy_2, y_test_2, predictions_2),
    (3, model_accuracy_3, y_test_3, predictions_3),
):
    accuracy_history[target_no].append(accuracy)
    f1_history[target_no].append(f1_score(y_true, y_pred, average="macro"))
    recall_history[target_no].append(recall_score(y_true, y_pred, average="macro"))
    precision_history[target_no].append(precision_score(y_true, y_pred, average="macro"))


In [None]:
def high_probabilities(target_prob):
    """Summarise each row of a class-probability matrix by its most likely class.

    Parameters
    ----------
    target_prob : array-like of shape (n_rows, n_classes)
        One probability per class for every row.

    Returns
    -------
    list of [max_probability, class_index, row_index]
        One entry per input row, in row order. When several classes tie for
        the maximum, the lowest class index is reported.

    Notes
    -----
    Works for any number of classes; the previous implementation copied each
    row into a fixed 7-slot scratch list and raised IndexError beyond 7
    classes (it also shadowed the builtin ``max``).
    """
    probs = np.asarray(target_prob)
    if probs.shape[0] == 0:
        return []

    top_prob = probs.max(axis=1)
    top_class = probs.argmax(axis=1)  # argmax returns the first maximum
    return [[top_prob[row], top_class[row], row] for row in range(probs.shape[0])]

In [None]:
def find_min(tar_probs, n_select=15, n_candidates=50):
    """Pick the row indices of the least confident predictions.

    Parameters
    ----------
    tar_probs : sequence of [max_probability, class_index, row_index]
        One entry per pool row.
    n_select : int, default 15
        Maximum number of indices returned.
    n_candidates : int, default 50
        Only the ``n_candidates`` entries with the lowest probability are
        considered.

    Returns
    -------
    list of int
        Stored row indices (third field) of the selected entries, ordered
        from least to most confident. Entries whose stored index is not
        smaller than ``len(tar_probs)`` are skipped.

    Notes
    -----
    Inputs with fewer than ``n_candidates`` entries are handled; the previous
    fixed ``range(50)`` loop raised IndexError in that case.
    """
    ranked = sorted(tar_probs, key=lambda entry: entry[0])
    upper_bound = len(ranked)

    que_instances = []
    for entry in ranked[:n_candidates]:
        if len(que_instances) >= n_select:
            break
        row_index = int(entry[2])
        if row_index < upper_bound:
            que_instances.append(row_index)

    return que_instances

In [None]:
#Obtaining Query Samples
def obt_samples(tar_probs, pool_x, pool_y_1, pool_y_2, pool_y_3, query_indexs, query_samples_xs, query_samples_y_1s, query_samples_y_2s, query_samples_y_3s, tar_probs_2, tar_probs_3):
    """Move the least confident rows (according to ``tar_probs``) out of the pool.

    ``tar_probs``, ``tar_probs_2`` and ``tar_probs_3`` each hold one
    ``[max_probability, class_index, row_index]`` entry per pool row and must
    be aligned position-by-position with ``pool_x`` / ``pool_y_*``.

    The rows chosen by ``find_min(tar_probs)`` are appended to the
    ``query_*`` lists (which are modified in place) and removed from the pool
    arrays and from all three probability tables.

    Fix: the index column of ``tar_probs`` is re-numbered against the current
    pool before selecting. That column used to keep the positions from before
    earlier removals, so after the first call it pointed at the wrong rows.

    Returns
    -------
    tuple
        ``(pool_x, pool_y_1, pool_y_2, pool_y_3, tar_probs, query_indexs,
        query_samples_xs, query_samples_y_1s, query_samples_y_2s,
        query_samples_y_3s, tar_probs_2, tar_probs_3)`` after the removal.
    """
    # Work on a float copy and make the stored index equal to the row position.
    tar_probs = np.array(tar_probs, dtype=float).reshape(-1, 3)
    tar_probs[:, 2] = np.arange(len(tar_probs))

    # Highest position first, matching the order used previously.
    q_indic = sorted(find_min(tar_probs), reverse=True)

    for q_ind in q_indic:
        query_indexs.append(q_ind)
        query_samples_xs.append(pool_x[q_ind])
        query_samples_y_1s.append(pool_y_1[q_ind])
        query_samples_y_2s.append(pool_y_2[q_ind])
        query_samples_y_3s.append(pool_y_3[q_ind])

    # Remove all selected positions at once from every aligned array.
    drop = np.asarray(q_indic, dtype=np.intp)
    pool_x = np.delete(pool_x, drop, axis=0)
    pool_y_1 = np.delete(pool_y_1, drop, axis=0)
    pool_y_2 = np.delete(pool_y_2, drop, axis=0)
    pool_y_3 = np.delete(pool_y_3, drop, axis=0)
    tar_probs = np.delete(tar_probs, drop, axis=0)
    tar_probs_2 = np.delete(tar_probs_2, drop, axis=0)
    tar_probs_3 = np.delete(tar_probs_3, drop, axis=0)

    return pool_x, pool_y_1, pool_y_2, pool_y_3, tar_probs, query_indexs, query_samples_xs, query_samples_y_1s, query_samples_y_2s, query_samples_y_3s, tar_probs_2, tar_probs_3

In [None]:
def find_samples(X_pools, Y_pools_1, Y_pools_2, Y_pools_3, tar_1_proba, tar_2_proba, tar_3_proba):
    """Select the next query batch from the pool, using each target's uncertainty in turn.

    For every target the per-row top class probability is computed with
    ``high_probabilities``; ``obt_samples`` is then called once per target
    (1, 2, 3) to move that target's least confident rows from the pool into
    the batch. Progress is printed before and after the selection.

    Fix: the "before" report now prints the length of the ``X_pools``
    argument; it used to read the global ``X_pool``.

    Parameters
    ----------
    X_pools : ndarray
        Pool feature rows.
    Y_pools_1, Y_pools_2, Y_pools_3 : ndarray
        Pool labels of the three targets, aligned with ``X_pools``.
    tar_1_proba, tar_2_proba, tar_3_proba : ndarray
        Class-probability matrices for the pool rows, one per target.

    Returns
    -------
    tuple
        ``(X_pools, Y_pools_1, Y_pools_2, Y_pools_3, query_samples_x,
        query_samples_y_1, query_samples_y_2, query_samples_y_3)``: the
        shrunken pool followed by the selected rows and their labels (lists).
    """
    tar_1_high_probs = high_probabilities(tar_1_proba)
    tar_2_high_probs = high_probabilities(tar_2_proba)
    tar_3_high_probs = high_probabilities(tar_3_proba)

    query_index = []
    query_samples_x = []
    query_samples_y_1 = []
    query_samples_y_2 = []
    query_samples_y_3 = []

    separator = "----------------------------------------------------"

    def _report(stage):
        # Lengths of the pool arrays and probability tables at this stage.
        print("X_pool length " + stage, len(X_pools))
        print(separator)
        print("Y_pools_1 length " + stage, len(Y_pools_1))
        print("Y_pools_2 length " + stage, len(Y_pools_2))
        print("Y_pools_3 length " + stage, len(Y_pools_3))
        print(separator)
        print("tar_1_high_probs length " + stage, len(tar_1_high_probs))
        print("tar_2_high_probs length " + stage, len(tar_2_high_probs))
        print("tar_3_high_probs length " + stage, len(tar_3_high_probs))

    _report("before")
    print()
    print(separator)
    print(separator)
    print()

    # Target 1 drives the selection; tables 2 and 3 are kept aligned.
    (X_pools, Y_pools_1, Y_pools_2, Y_pools_3, tar_1_high_probs,
     query_index, query_samples_x, query_samples_y_1, query_samples_y_2, query_samples_y_3,
     tar_2_high_probs, tar_3_high_probs) = obt_samples(
        tar_1_high_probs, X_pools, Y_pools_1, Y_pools_2, Y_pools_3,
        query_index, query_samples_x, query_samples_y_1, query_samples_y_2, query_samples_y_3,
        tar_2_high_probs, tar_3_high_probs)

    # Target 2 drives the selection; tables 1 and 3 are kept aligned.
    (X_pools, Y_pools_1, Y_pools_2, Y_pools_3, tar_2_high_probs,
     query_index, query_samples_x, query_samples_y_1, query_samples_y_2, query_samples_y_3,
     tar_1_high_probs, tar_3_high_probs) = obt_samples(
        tar_2_high_probs, X_pools, Y_pools_1, Y_pools_2, Y_pools_3,
        query_index, query_samples_x, query_samples_y_1, query_samples_y_2, query_samples_y_3,
        tar_1_high_probs, tar_3_high_probs)

    # Target 3 drives the selection; tables 1 and 2 are kept aligned.
    (X_pools, Y_pools_1, Y_pools_2, Y_pools_3, tar_3_high_probs,
     query_index, query_samples_x, query_samples_y_1, query_samples_y_2, query_samples_y_3,
     tar_1_high_probs, tar_2_high_probs) = obt_samples(
        tar_3_high_probs, X_pools, Y_pools_1, Y_pools_2, Y_pools_3,
        query_index, query_samples_x, query_samples_y_1, query_samples_y_2, query_samples_y_3,
        tar_1_high_probs, tar_2_high_probs)

    _report("after")
    print(separator)
    print("Query_Index length",len(query_index))
    print("Query_Samples_Y1 length",len(query_samples_y_1))
    print("Query_Samples_Y2 length",len(query_samples_y_2))
    print("Query_Samples_Y3 length",len(query_samples_y_3))

    return X_pools, Y_pools_1, Y_pools_2, Y_pools_3, query_samples_x, query_samples_y_1, query_samples_y_2, query_samples_y_3

In [None]:
# Number of active-learning rounds. Each round: score the pool with the
# current models, move the selected rows into the cumulative training
# buffers, refit the three trees on everything gathered so far and record
# the test-set metrics.
N_QUERIES = 20
print("========================")
for index in range(N_QUERIES):
    query_no = index + 1

    # Pool class probabilities from the models fitted in the previous round.
    tar_1_probas = clf1.predict_proba(X_pool)
    tar_2_probas = clf2.predict_proba(X_pool)
    tar_3_probas = clf3.predict_proba(X_pool)

    print("")
    print("")
    print("---------------------------------------")
    print("Learning phase Query: " + str(query_no))

    print("Getting pool of data for learner")

    # The pool shrinks by the rows that are returned as this round's batch.
    (X_pool, y_pool_1, y_pool_2, y_pool_3,
     X_batch, y_batch_1, y_batch_2, y_batch_3) = find_samples(
        X_pool, y_pool_1, y_pool_2, y_pool_3,
        tar_1_probas, tar_2_probas, tar_3_probas)

    # Grow the cumulative training buffers with the new batch.
    batch_data_continous_X = np.vstack([batch_data_continous_X, X_batch])
    batch_data_continous_y_1 = np.append(batch_data_continous_y_1, y_batch_1)
    batch_data_continous_y_2 = np.append(batch_data_continous_y_2, y_batch_2)
    batch_data_continous_y_3 = np.append(batch_data_continous_y_3, y_batch_3)

    # Refit one model per target on all data collected so far.
    clf1 = clf_1.fit(batch_data_continous_X, batch_data_continous_y_1)
    clf2 = clf_2.fit(batch_data_continous_X, batch_data_continous_y_2)
    clf3 = clf_3.fit(batch_data_continous_X, batch_data_continous_y_3)

    # Test-set predictions and accuracies.
    predictions_1 = clf1.predict(X_test)
    predictions_2 = clf2.predict(X_test)
    predictions_3 = clf3.predict(X_test)

    model_accuracy_1 = accuracy_score(y_test_1,predictions_1)
    model_accuracy_2 = accuracy_score(y_test_2,predictions_2)
    model_accuracy_3 = accuracy_score(y_test_3,predictions_3)

    print("========================")
    print('Accuracy after Query {n} for Target 1: {acc:0.4f}'.format(n=query_no, acc=model_accuracy_1))
    print('Accuracy after Query {n} for Target 2: {acc:0.4f}'.format(n=query_no, acc=model_accuracy_2))
    print('Accuracy after Query {n} for Target 3: {acc:0.4f}'.format(n=query_no, acc=model_accuracy_3))

    # Record accuracy plus macro-averaged F1 / recall / precision per target.
    round_results = (
        (1, model_accuracy_1, y_test_1, predictions_1),
        (2, model_accuracy_2, y_test_2, predictions_2),
        (3, model_accuracy_3, y_test_3, predictions_3),
    )
    for target_no, accuracy, y_true, y_pred in round_results:
        accuracy_history[target_no].append(accuracy)
        f1_history[target_no].append(f1_score(y_true, y_pred, average="macro"))
        recall_history[target_no].append(recall_score(y_true, y_pred, average="macro"))
        precision_history[target_no].append(precision_score(y_true, y_pred, average="macro"))

In [None]:
# Accuracy per query iteration, one figure per target.
for target_no in (1, 2, 3):
    plotGraph(accuracy_history[target_no], 'Accuracy for Target ' + str(target_no))

In [None]:
# Macro F1 score per query iteration, one figure per target.
for target_no in (1, 2, 3):
    plotGraph(f1_history[target_no], 'F1 Score for Target ' + str(target_no))