In [1]:
import sklearn
import numpy as np
import pickle
import pandas as pd

In [2]:
# Restore the pickled pool object from disk.
# pickle.load can execute arbitrary code, so only load files you produced yourself.
with open("./df_pickle/df_pool.pkl", mode="rb") as pool_file:
    df_pool = pickle.load(pool_file)

In [3]:
# Restore the pickled training object from disk.
# pickle.load can execute arbitrary code, so only load files you produced yourself.
with open("./df_pickle/df_train.pkl", mode="rb") as train_file:
    df_train = pickle.load(train_file)

In [4]:
# Restore the pickled test object from disk.
# pickle.load can execute arbitrary code, so only load files you produced yourself.
with open("./df_pickle/df_test.pkl", mode="rb") as test_file:
    df_test = pickle.load(test_file)

In [5]:
# Restore the pickled "gen" object from disk.
# pickle.load can execute arbitrary code, so only load files you produced yourself.
with open("./df_pickle/df_gen.pkl", mode="rb") as gen_file:
    df_gen = pickle.load(gen_file)

In [6]:
from keras.models import Sequential
from keras.layers import Dense
import openai
import time
import os
import re

from dotenv import load_dotenv
# Pull variables from a local .env file into the process environment, then
# hand the OpenAI key to the SDK. The key is None when the variable is unset.
load_dotenv()
openai.api_key = os.environ.get('OPENAI_API_KEY')

In [7]:
class CommitteeMember:
    """A small feed-forward Keras classifier meant to act as one committee voter.

    The network takes 768*2 input features, has one hidden ReLU layer of 128
    units and a 2-way softmax output.

    NOTE(review): the loss is 'binary_crossentropy' on a 2-unit softmax, which
    only makes sense if the targets are one-hot rows of length 2 — confirm the
    label encoding.
    """

    def __init__(self):
        """Build and compile the (untrained) model."""
        hidden_layer = Dense(128, input_dim=768*2, activation='relu')
        output_layer = Dense(2, activation='softmax')
        self.model = Sequential([hidden_layer, output_layer])

        # Adam optimiser, accuracy reported during fit/evaluate.
        self.model.compile(
            optimizer='adam',
            loss='binary_crossentropy',
            metrics=['accuracy'],
        )

    def train(self, X_train, y_train):
        """Fit the model on the given data for 25 epochs with batches of 64."""
        self.model.fit(X_train, y_train, batch_size=64, epochs=25)

In [8]:
# Larger network that is refit on all labelled data and scored on the test set:
# 768*2 inputs -> 256 ReLU -> 256 ReLU -> 2-way softmax.
neural_net = Sequential([
    Dense(256, input_dim=768*2, activation='relu'),
    Dense(256, activation='relu'),
    Dense(2, activation='softmax'),
])

neural_net.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [9]:
# Rows of the pool that already carry a label form the initial training set.
labeled_rows = df_pool[df_pool['label'].notna()]
trainset = labeled_rows['combined_embedding'].values
labels = labeled_rows['label'].values
assert len(trainset) == len(labels)

In [10]:
# Build the test features by concatenating each row's question and context
# embeddings, in that order.
df_test['combined_embedding'] = df_test.apply(
    lambda row: np.concatenate((row['question_embedding'], row['context_embedding'])),
    axis=1,
)
# `.values` on a column of arrays yields a 1-D object array (one ndarray per
# row), which Keras cannot convert to a tensor. Stack the rows into a single
# 2-D (n_rows, n_features) matrix instead.
Xtest = np.stack(df_test['combined_embedding'].to_numpy())
# NOTE(review): the label encoding is not visible here — confirm it matches
# what the 2-unit softmax models expect.
ytest = df_test['label'].values

In [11]:
# Show the first pool row that has no label yet.
unlabeled_mask = df_pool['label'].isna()
first_unlabeled_index = df_pool.loc[unlabeled_mask, 'combined_embedding'].iloc[:1].index
df_pool.loc[first_unlabeled_index]

Unnamed: 0,question,context,question_embedding,context_embedding,combined_embedding,combined_embedding_str,label
3698,What are ensemble methods and how do they work?,"A subspace of a vector space RM is a line, a ...","[tensor(0.2155), tensor(0.4005), tensor(-0.070...","[tensor(-0.0080), tensor(0.1167), tensor(0.482...","[0.21554105, 0.40045273, -0.070689894, 0.01691...",[ 0.21554105 0.40045273 -0.07068989 ... 0.02...,


In [12]:
# Request budget for the labelling oracle: at most this many requests are sent
# per window. The state lives at module level so that consecutive oracle()
# calls share one budget.
_ORACLE_MAX_REQUESTS = 18
_ORACLE_WINDOW_SECONDS = 60
_ORACLE_RATE_STATE = {"window_start": None, "count": 0}


def _oracle_wait_for_slot():
    """Block until one more request fits into the current rate-limit window."""
    state = _ORACLE_RATE_STATE
    now = time.monotonic()
    elapsed = None if state["window_start"] is None else now - state["window_start"]
    if elapsed is None or elapsed >= _ORACLE_WINDOW_SECONDS:
        # First request ever, or the previous window has expired: start afresh.
        state["window_start"], state["count"] = now, 0
    elif state["count"] >= _ORACLE_MAX_REQUESTS:
        # Budget used up: sleep out the remainder of the window, then reset.
        time.sleep(_ORACLE_WINDOW_SECONDS - elapsed)
        state["window_start"], state["count"] = time.monotonic(), 0
    state["count"] += 1


def oracle(idxs, df_pool):
    """Ask the chat model whether each question is answerable from its context.

    Parameters
    ----------
    idxs : iterable
        Index *labels* of the rows in ``df_pool`` to label (they are looked up
        with ``df_pool.loc``).
    df_pool : pandas.DataFrame
        Frame with at least the columns ``'context'`` and ``'question'``.

    Returns
    -------
    list
        One entry per element of ``idxs``, in the same order: ``'y'`` or
        ``'n'`` when the model answered in the expected format, ``None`` when
        the request failed or the reply could not be parsed. Keeping the
        failures as ``None`` lets the caller align the result with ``idxs``.
    """
    new_labels = []
    for idx in idxs:
        row = df_pool.loc[idx]
        ctx = row['context']
        q = row['question']

        # Stay under the request budget before contacting the API.
        _oracle_wait_for_slot()

        # Ask the oracle for a label for this context/question pair.
        try:
            completion = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                temperature=0,
                messages=[
                    {"role":"system", "content":"You are a system designed to label if a list of provided questions can be answered using ANY part of a provided context. You will always only reply in the following format for each question: `label: LABEL. LABEL should be 'y' if a question can be answered using the context and else 'n'"},
                    {"role":"user", "content": "CONTEXT: ```The man in the house has a boy named Bob and a red car. He loves ice cream``` QUESTION: ```Is the boy named Jim?``` QUESTION: ```Does the man have a red book?```"},
                    {"role":"assistant", "content":"label: y\nlabel: n"},
                    # `q` is already the question text, so interpolate it directly.
                    {"role": "user", "content": f'CONTEXT: ```{ctx}``` QUESTION: ```{q}```'}
                ]
            )
        except Exception as e:
            # Best effort: report the failure and leave this row unlabelled.
            print("OPENAI_ERROR:",str(e))
            new_labels.append(None)
            continue

        # Parse the response to get the label.
        res = completion.choices[0].message.content
        labels = re.findall(r'label: ([yn])', res)

        # Exactly one label is expected because exactly one question was asked.
        if len(labels) != 1:
            print('[Error] labels not found in response:', res)
            new_labels.append(None)
            continue

        new_labels.append(labels[0])

    return new_labels

In [13]:
#reset training set and pool
from multiprocessing import Pool
import worker
if __name__ ==  '__main__':
    # Query-by-committee active learning: every round the committee votes on
    # the unlabelled pool, the evaluation network is refit and scored, and the
    # pool items the committee is least sure about are sent to the oracle.
    committee = [worker.CommitteeMember() for _ in range(10)]
    testacc_qbc = []  # (training-set size, neural_net.evaluate result) per round
    n_samples = 50    # number of active-learning rounds
    n_add = 10        # pool items labelled by the oracle per round

    # Keras needs one numeric 2-D matrix, not an object array of row vectors.
    # np.stack leaves an already 2-D matrix unchanged.
    Xtest_matrix = np.stack(Xtest)

    for i in range(n_samples):
        labeled_mask = df_pool['label'].notna()
        Xtrain = df_pool.loc[labeled_mask, 'combined_embedding'].values
        ytrain = df_pool.loc[labeled_mask, 'label'].values
        Xpool = df_pool.loc[~labeled_mask, 'combined_embedding']
        if Xpool.empty:
            # Nothing left to query.
            break
        ypool_lab = []

        p = Pool(processes=len(committee))
        try:
            results = p.map(
                worker.task,
                [(i, Xtrain, ytrain, Xpool, committee_member, ypool_lab)
                 for committee_member in committee],
            )
        finally:
            # Release the worker processes even if a task raised.
            p.close()
            p.join()

        # Arguments given to Pool.map are pickled into the worker processes, so
        # anything a worker appends to ypool_lab never reaches this process.
        # Use the values the workers return instead.
        # NOTE(review): worker.task is not visible here — confirm it returns the
        # member's per-sample class probabilities for Xpool.
        votes = [r for r in results if r is not None] or ypool_lab
        if not votes:
            raise RuntimeError(
                "worker.task produced no committee predictions; it has to return "
                "them because lists passed through Pool.map are copied into the workers"
            )

        # Probability of each class for each pool item, averaged over the committee.
        ypool_p = np.mean(np.array(votes), 0)

        # Refit the evaluation model on all labelled data and score it.
        # NOTE(review): ytrain is passed as stored in df_pool['label']; confirm
        # that encoding is what the 2-unit softmax model expects.
        neural_net.fit(np.stack(Xtrain), ytrain)
        testacc_qbc.append((len(Xtrain), neural_net.evaluate(Xtest_matrix, ytest)))

        # Least confident items: smallest maximum class probability.
        ypool_p_sort_idx = np.argsort(-ypool_p.max(1))[-n_add:]
        # argsort yields positions within Xpool; oracle() and the write-back
        # below both address rows by index label, so translate first.
        query_index = Xpool.index[ypool_p_sort_idx]

        new_labels = oracle(query_index, df_pool)
        if len(new_labels) != len(query_index):
            raise ValueError(
                "oracle returned %i labels for %i queried rows"
                % (len(new_labels), len(query_index))
            )
        # A single .loc[rows, column] assignment writes into df_pool itself;
        # the chained form df_pool.loc[rows]['label'] = ... only modifies a copy.
        # Rows the oracle could not label stay missing and remain in the pool.
        df_pool.loc[query_index, 'label'] = new_labels

        # Report the number of labelled samples the models were trained on.
        print('Model: LR, %i samples (QBC)'%(len(Xtrain)))

INFO:tensorflow:Assets written to: ram://24518fa5-846a-410e-b6d6-bec7dd323500/assets
INFO:tensorflow:Assets written to: ram://e8e1f1ec-b268-40e7-828c-1e380ad5c76c/assets
INFO:tensorflow:Assets written to: ram://cdef18f2-f0bb-4f77-86c6-210dec3378cc/assets
INFO:tensorflow:Assets written to: ram://0f8579c5-c234-4a40-81f6-5e973d4a6224/assets
INFO:tensorflow:Assets written to: ram://d0d58bf5-7670-4f97-a998-60f10af5faa9/assets
INFO:tensorflow:Assets written to: ram://b122e3e5-a1e3-4869-bb4f-1964880d0270/assets
INFO:tensorflow:Assets written to: ram://5c5c3202-266c-4c9e-aed2-7226037346e2/assets
INFO:tensorflow:Assets written to: ram://913bfd03-9d77-4eda-9e41-9f039172e58a/assets
INFO:tensorflow:Assets written to: ram://4881ee33-33e7-4d06-bd4e-d59f80302584/assets
INFO:tensorflow:Assets written to: ram://587e5328-4d1a-4823-a38f-f485b53b5dc1/assets
