# config

In [1]:
# Settings handed to ActiveLearner as its ``dConfig`` argument in the
# preparation cell. Their exact semantics are defined inside
# domain_classifier.active_learner, which is not visible in this notebook.
dConfigActiveLearning = { 'init_oversampling_rate': 8,
                          'init_weak_label_ratio': 0.1,
                              'backup_threshold': 0.2 }

# Notebook-level settings read by the cells below.
dConfig = { 
            #'skipCells': [''], 
            # Names of cells that runCell() should report as "do not run".
            'skipCells': [ '' ], 
            # Keyword list; get_true_annotations() only looks at the first entry.
            'keywords': ['Deep Learning'],
            #'threshold': 0.15, #0.3 is better but then no active learning is required
            # Number of samples requested from the learner per query.
            'query_count': 10,
            # Annotation budget for the active learning loop.
            # NOTE: the key is misspelled ("annoatations") but the loop reads it
            # under exactly this name, so both places must change together.
            'max_annoatations': 100,
            # Fraction of the dataset reserved as test data (see get_is_test()).
            'test_size': 0.2,
            #'datasets': [],
            #'metrics': [] 
            }

def runCell(name):
    """Return True when the notebook cell called ``name`` should be executed.

    A cell is skipped only if its name is listed in ``dConfig['skipCells']``;
    an empty list therefore runs every cell.

    Args:
        name: the cell name to look up.

    Returns:
        bool: False if ``name`` is listed in ``dConfig['skipCells']``, else True.
    """
    # A plain membership test is enough here and avoids depending on numpy,
    # which is only imported in a later cell than the one defining this helper.
    return name not in dConfig['skipCells']

# imports 

In [2]:
import collections.abc
collections.Iterable = collections.abc.Iterable

from sentence_transformers import SentenceTransformer

# import collections.abc
# collections.Iterable = collections.abc.Iterable
from domain_classifier.classifier import CorpusClassifier
from domain_classifier.active_learner import ActiveLearner
from domain_classifier.query_strategy import *

from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
# 
import pandas as pd
import numpy as np

import pickle
import os
from pathlib import Path
import json
import time
import sys

import matplotlib.pyplot as plt

#USE ENTIRE SCREEN
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# functions

In [3]:
def read_corpus():
    """Load the corpus and merge title and description into one ``text`` column.

    Reads ``data/given/corpus.feather``, builds ``text`` as
    ``title + '. ' + description`` and removes the ``acronym``, ``title`` and
    ``description`` columns.

    Returns:
        pandas.DataFrame: the corpus with the combined ``text`` column.
    """
    df_corpus = pd.read_feather('data/given/corpus.feather')
    # Direct column assignment creates (or overwrites) ``text`` without going
    # through a list-key ``.loc`` enlargement.
    df_corpus['text'] = df_corpus['title'] + '. ' + df_corpus['description']
    # Non-mutating drop: returns a new frame instead of editing in place.
    return df_corpus.drop(columns=['acronym', 'title', 'description'])
def get_true_annotations(df_dataset,keywords):
    """Derive labels from a keyword match on the ``text`` column.

    Only the first entry of ``keywords`` is used; it is handed to
    ``Series.str.contains`` with that method's default settings.

    Args:
        df_dataset: frame with a ``text`` column.
        keywords: sequence of keywords; only ``keywords[0]`` is evaluated.

    Returns:
        pandas.Series: the match mask multiplied by 1 (1 for a match, 0 otherwise).
    """
    first_keyword = keywords[0]
    contains_keyword = df_dataset['text'].str.contains(first_keyword)
    # Multiplying the boolean mask by 1 turns it into integer labels.
    return contains_keyword * 1
def get_weak_soft_labels(df_dataset,keywords):
    """Score every document by its mean cosine similarity to the keywords.

    Document embeddings are computed with the ``all-MiniLM-L6-v2`` sentence
    transformer and cached in ``data/activeLearning/embeddings/embeddings.pkl``.
    A cached file is reused only when it holds one embedding per row of
    ``df_dataset``; otherwise the embeddings are recomputed and the cache is
    overwritten.

    Args:
        df_dataset: frame with a ``text`` column.
        keywords: list of keyword strings to compare each document against.

    Returns:
        numpy.ndarray: one score per row of ``df_dataset`` (mean over keywords).
    """
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings_fname = Path('data/activeLearning/embeddings/embeddings.pkl')
    embeddings_fname.parent.mkdir(parents=True, exist_ok=True)

    texts = df_dataset['text'].values
    doc_embeddings = None
    if embeddings_fname.exists():
        # NOTE(security): pickle.load can execute arbitrary code; only use
        # cache files that were written by this notebook.
        with open(embeddings_fname, "rb") as f_in:
            doc_embeddings = pickle.load(f_in)
        # A cache built from a different dataset would silently yield scores
        # that do not line up with the rows of df_dataset.
        if len(doc_embeddings) != len(texts):
            doc_embeddings = None

    if doc_embeddings is None:
        doc_embeddings = model.encode(texts, batch_size=32, show_progress_bar=True)
        with open(embeddings_fname, 'wb') as f_out:
            pickle.dump(doc_embeddings, f_out)

    keyword_embeddings = model.encode(keywords)
    similarities = cosine_similarity(doc_embeddings, keyword_embeddings)
    return np.mean(similarities, axis=1)
def get_weak_labels(df_dataset,keywords,threshold):
    """Turn the keyword-similarity scores into hard 0/1 weak labels.

    The scores come from ``get_weak_soft_labels``; this function previously
    carried a verbatim copy of that computation.

    Args:
        df_dataset: frame with a ``text`` column.
        keywords: list of keyword strings.
        threshold: scores strictly greater than this value are labelled 1.

    Returns:
        numpy.ndarray: integer array with 1 where score > threshold, else 0.
    """
    scores = get_weak_soft_labels(df_dataset, keywords)
    return (scores>threshold)*1
def get_is_test(df_dataset,test_size=0.2):
    """Build a boolean mask marking the rows that belong to the test split.

    The split is stratified on ``true_annotation`` and uses a fixed
    ``random_state`` of 42.

    Args:
        df_dataset: frame with a ``true_annotation`` column.
        test_size: forwarded to ``train_test_split``.

    Returns:
        numpy.ndarray: boolean array, one entry per row in row order; True for
        test rows.
    """
    # Split row positions rather than the frame itself: the mask below is
    # positional, so indexing it with index labels is only correct when the
    # frame happens to carry a default 0..n-1 index.
    positions = np.arange(len(df_dataset))
    train_positions, _ = train_test_split(
        positions,
        test_size=test_size,
        random_state=42,
        stratify=df_dataset['true_annotation'].to_numpy(),
    )
    is_test = np.ones(len(df_dataset), dtype=bool)
    is_test[train_positions] = False
    return is_test
def reduce_data(df_dataset,factor = 0.1):
    """Return a stratified subsample of ``df_dataset``.

    The frame is split with ``train_test_split`` (stratified on
    ``true_annotation``, ``random_state`` 42) and only the "test" part, whose
    size is controlled by ``factor``, is kept.

    Returns:
        pandas.DataFrame: the sampled rows with a fresh 0..n-1 index.
    """
    strata = df_dataset['true_annotation'].to_numpy()
    _, df_reduced = train_test_split(
        df_dataset,
        test_size=factor,
        random_state=42,
        stratify=strata,
    )
    return df_reduced.reset_index(drop=True)
def oversample_minority_class(df_dataset,oversampling_rate=10,col_label='labels'):
    """Resample a binary-labelled frame so both classes have the same size.

    The per-class target size is ``oversampling_rate`` times the size of the
    smaller class, capped at the size of the larger class. Each class is then
    brought to that size by repeating its rows in order (and, for the larger
    class, by keeping only its first rows when the target is smaller).

    Rows whose label is neither 0 nor 1 are not part of the result. The frame
    is expected to have a unique index, since rows are selected by label.

    Args:
        df_dataset: frame holding the label column.
        oversampling_rate: desired multiple of the minority class size; a
            negative value means "as much as the cap allows".
        col_label: name of the 0/1 label column.

    Returns:
        list: ``[resampled_frame, effective_oversampling_rate]`` where the
        frame lists all positive rows first, then all negative rows, with a
        fresh 0..n-1 index.

    Raises:
        ValueError: if one of the two classes has no rows.
    """
    def _cycle_to_count(class_index, target_count):
        # Repeat every label of the class n_full times, then top up with the
        # first n_rest labels so that exactly target_count labels result.
        n_full, n_rest = divmod(target_count, len(class_index))
        return np.concatenate([class_index.repeat(n_full), class_index[:n_rest]])

    if oversampling_rate < 0:
        # Negative rate: request "unbounded" oversampling; the cap below
        # limits it to the size of the larger class.
        oversampling_rate = 10**10

    labels = df_dataset[col_label]
    idx_positive_all = df_dataset.index[(labels == 1).to_numpy()]
    idx_negative_all = df_dataset.index[(labels == 0).to_numpy()]
    if len(idx_positive_all) == 0 or len(idx_negative_all) == 0:
        raise ValueError(
            f"oversample_minority_class needs rows of both classes in '{col_label}' "
            f"(positive: {len(idx_positive_all)}, negative: {len(idx_negative_all)})"
        )

    iPositiveCount = labels.sum()
    iNegativeCount = len(df_dataset)-iPositiveCount
    iClassCount = int(oversampling_rate * np.min([iPositiveCount,iNegativeCount]))
    iClassCount = int(np.min([iClassCount,np.max([iPositiveCount,iNegativeCount])]))
    # Report the rate that was actually applied after capping.
    oversampling_rate = iClassCount/np.min([iPositiveCount,iNegativeCount])

    idx_positive = _cycle_to_count(idx_positive_all, iClassCount)
    idx_negative = _cycle_to_count(idx_negative_all, iClassCount)

    indices = np.hstack([idx_positive,idx_negative])
    return [df_dataset.loc[indices].reset_index(drop=True),oversampling_rate]
def save_dataset(df_dataset,path):
    """Write ``df_dataset`` as CSV to ``path``, creating missing parent folders."""
    target = Path(path)
    # Make sure the destination folder exists before pandas opens the file.
    target.parent.mkdir(parents=True, exist_ok=True)
    df_dataset.to_csv(target)
def print_positive_negative_labels(ds):
    """Print the number of positive and negative labels in ``ds``.

    The positive count is the sum of the first column of ``ds``; the negative
    count is the remaining number of rows.
    """
    column_sums = ds.sum().to_numpy()
    positive = column_sums[0]
    negative = len(ds) - positive
    print(f'Positive/Negative: { positive }/{ negative }')
def read_results():
    """Load all stored active learning protocols.

    Every file in ``data/activeLearning/protocols/`` is parsed as JSON and its
    ``'data'`` entry is collected. Files that cannot be opened, are not valid
    JSON or lack a ``'data'`` entry are skipped with a short notice.

    Returns:
        list: the ``'data'`` entries of all readable protocol files, ordered by
        file name; empty if the folder does not exist.
    """
    cProtocols = []
    pPath = Path('data/activeLearning/protocols/')
    if not pPath.is_dir():
        return cProtocols
    # Sorted so that the order of the protocols does not depend on the OS.
    for fileName in sorted(os.listdir(pPath)):
        # Join with pathlib instead of a hard-coded backslash so the path is
        # valid on every platform.
        path = pPath / fileName
        try:
            with open(path) as file:
                dData = json.load(file)
            cProtocols.append(dData['data'])
        except (OSError, ValueError, KeyError, TypeError) as error:
            # Best effort: one unreadable file must not hide the others.
            print(f'Skipping { path }: { error }')
    return cProtocols

def plot_results(cProtocol=None):
    """Plot the F1 score over the number of annotations for each protocol.

    Args:
        cProtocol: a single protocol (a sequence of entries that each provide
            ``'annotations'`` and ``'f1_score'``). When omitted, all protocols
            stored on disk are loaded through ``read_results()`` and plotted
            one figure per protocol.
    """
    # ``is None`` instead of ``== None``: an identity check cannot be hijacked
    # by element-wise comparison of array-like arguments.
    if cProtocol is None:
        cProtocols = read_results()
    else:
        cProtocols = [cProtocol]

    for protocol in cProtocols:
        # The annotation values are plotted exactly as stored in the protocol.
        annotations = [entry['annotations'] for entry in protocol]
        f1_scores = [float(entry['f1_score']) for entry in protocol]
        plt.figure(figsize=(30,6))
        plt.ylabel('F1 Score', color='blue')
        plt.xlabel('Annotations')
        plt.ylim([0,1.1])
        plt.plot(annotations,f1_scores)
        plt.xticks(rotation=45)
        plt.legend(['F1 Score'])
        plt.title('Annotations vs. F1 Score')
        plt.show()

# Shortcut (can be ignored)

In [4]:
# if runCell('Short cut'):
#     path = Path('data/activeLearning/dataset/active_learning.csv')
#     df_dataset = pd.read_csv(path)
#     clf = CorpusClassifier(path2transformers=Path('data/activeLearning/models'))
#     clf.load('pretrained_model.pt')

# prepare data for active learning

In [5]:
# if runCell('prepare data for active learning'):
#     df_corpus = read_corpus()
#     df_dataset = df_corpus
#     df_dataset.insert(2,'true_annotation',get_true_annotations(df_dataset,dConfig['keywords']))    
#     df_dataset.insert(3,'weak_soft_label',get_weak_soft_labels(df_dataset,dConfig['keywords']))
#     df_dataset.insert(4,'is_test',get_is_test(df_dataset,dConfig['test_size']))
#     df_dataset.insert(5,'annotation_idx',-1)
#     save_dataset(df_dataset,'data/activeLearning/dataset/active_learning.csv')
# Load the dataset from the CSV path that the disabled preparation steps
# above write to, instead of rebuilding it on every run.
df_dataset = pd.read_csv('data/activeLearning/dataset/active_learning.csv')

# pu training with weak labels (commented out because it is now integrated into the active learning class)

In [6]:
# if runCell('pu training with weak labels'):
#     clf = CorpusClassifier(path2transformers=Path('data/activeLearning/models'))
#     df_train_org = pd.DataFrame({ 'id': df_dataset['id'],'text': df_dataset['text'],'labels': df_dataset['weak_label'] } )
#     df_train = oversample_minority_class(df_train_org,max_oversampling=1)
#     clf.train_loop(df_train,epochs=1) #trainloader
#     clf.save('pretrained_model.pt')

# preparation for active learning

In [7]:
if runCell('preparation for active learning'):
    # Split the prepared dataset by its boolean test flag.
    is_test = df_dataset['is_test']
    df_test = df_dataset[is_test]
    df_active_learning = df_dataset[~is_test]
    # The evaluation frame uses the true annotations as labels.
    df_test = pd.DataFrame({ 'id': df_test['id'],'text': df_test['text'],'labels': df_test['true_annotation'] } )
    #df_test = oversample_minority_class(df_test,oversampling_rate=1)

    queryStrategy = WeakSoftLabelTrustSampling()
    clf = CorpusClassifier(path2transformers=Path('data/activeLearning/models'))
    # Hand the learner an explicit copy of the selected columns: passing the
    # bare slice is what triggers pandas' SettingWithCopyWarning as soon as
    # values are written to the frame.
    df_pool = df_active_learning[['id','text','weak_soft_label','annotation_idx']].copy()
    active_learner = ActiveLearner(clf,queryStrategy,df_pool, dConfig = dConfigActiveLearning )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


# active learning loop

In [8]:
if runCell('active learning loop'):
    annotation_count = 0
    # Check the annotation budget before asking for new samples, so that no
    # query is issued whose result would be thrown away.
    while annotation_count < dConfig['max_annoatations']:

        #query new samples by sample strategy
        indices_queried = active_learner.query(dConfig['query_count'])

        # abort condition: the query strategy returned no samples
        if len(indices_queried) == 0:
            break
        # Count what was actually returned, which may be fewer than requested.
        annotation_count += len(indices_queried)

        # Simulate user interaction here. Replace this for real-world usage.
        y = df_dataset.loc[indices_queried, ['true_annotation']].to_numpy().flatten()

        active_learner.update(y)

        # Evaluate against the held-out test data; the outcome is read back
        # from the learner's protocol below.
        active_learner.eval(df_test)

        lastProtocol = active_learner.getProtocol()[-1]
        print(f'Evaluation against all test data{lastProtocol}')

    #save_results(cProtocol)
    active_learner.saveProtocol()

Queried new indices[23808 26766 35638 54601 23671 52287 27450 54834 46686 55226] with proba [ 0.6231013   0.6032666   0.6014756   0.5839272   0.5707051  -0.16808182
 -0.17075399 -0.17469488 -0.17804113 -0.19903731]


Train batch: 100%|███████████████████████████████████████████████████████████████████████| 6/6 [00:02<00:00,  2.32it/s]
Eval batch: 100%|████████████████████████████████████████████████████████████████████| 168/168 [00:03<00:00, 48.46it/s]


evaluation: {'train_loss': 4.914363773074001, 'eval_loss': 0.4681639354676008, 'tp_cont': 1.9845320582389832, 'tn_cont': 165.54798328876495, 'fp_cont': 0.4520168364979327, 'fn_cont': 0.015467941761016846, 'precision_cont': 0.8144848078053638, 'recall_cont': 0.9922660291189954, 'f1_cont': 0.8946287329736152, 'accuracy_cont': 0.9972173525124381}


Train batch: 100%|███████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 14.01it/s]
Eval batch: 100%|████████████████████████████████████████████████████████████████████| 168/168 [00:03<00:00, 48.77it/s]


evaluation: {'train_loss': 5.2516579371877015, 'eval_loss': 0.3746318514458835, 'tp_cont': 1.9883952140808105, 'tn_cont': 165.6374011039734, 'fp_cont': 0.36260068230330944, 'fn_cont': 0.011604785919189453, 'precision_cont': 0.8457671989722132, 'recall_cont': 0.9941976070399081, 'f1_cont': 0.9139954444596303, 'accuracy_cont': 0.9977725865223588}
Current F1 score (0.9139954444596303) > then old f1 score (0.8946287329736152)=>continue


Train batch: 100%|███████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 14.15it/s]
Eval batch: 100%|████████████████████████████████████████████████████████████████████| 168/168 [00:03<00:00, 49.60it/s]


evaluation: {'train_loss': 5.700887253973633, 'eval_loss': 0.30145555222406983, 'tp_cont': 1.9911029934883118, 'tn_cont': 165.7077178955078, 'fp_cont': 0.292284834664315, 'fn_cont': 0.008897006511688232, 'precision_cont': 0.871995098221374, 'recall_cont': 0.9955514967436581, 'f1_cont': 0.9296860678361942, 'accuracy_cont': 0.9982072509745149}
Current F1 score (0.9296860678361942) > then old f1 score (0.9139954444596303)=>continue


Train batch: 100%|███████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 14.18it/s]
Eval batch: 100%|████████████████████████████████████████████████████████████████████| 168/168 [00:03<00:00, 50.27it/s]


evaluation: {'train_loss': 5.904570879531093, 'eval_loss': 0.2411187607795, 'tp_cont': 1.9932582378387451, 'tn_cont': 165.76579535007477, 'fp_cont': 0.23420622292906046, 'fn_cont': 0.006741762161254883, 'precision_cont': 0.8948552369498974, 'recall_cont': 0.9966291189188742, 'f1_cont': 0.9430041370352413, 'accuracy_cont': 0.9985657858164626}
Current F1 score (0.9430041370352413) > then old f1 score (0.9296860678361942)=>continue


Train batch: 100%|███████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 14.16it/s]
Eval batch: 100%|████████████████████████████████████████████████████████████████████| 168/168 [00:03<00:00, 50.00it/s]


evaluation: {'train_loss': 6.2802941562840715, 'eval_loss': 0.19135062652640045, 'tp_cont': 1.9948895573616028, 'tn_cont': 165.81386590003967, 'fp_cont': 0.18613531021401286, 'fn_cont': 0.005110442638397217, 'precision_cont': 0.9146569518843534, 'recall_cont': 0.9974447786803027, 'f1_cont': 0.9542586425783924, 'accuracy_cont': 0.9988616324316982}
Current F1 score (0.9542586425783924) > then old f1 score (0.9430041370352413)=>continue


Train batch: 100%|███████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 14.11it/s]
Eval batch: 100%|████████████████████████████████████████████████████████████████████| 168/168 [00:03<00:00, 50.01it/s]


evaluation: {'train_loss': 6.547715876367874, 'eval_loss': 0.154168083332479, 'tp_cont': 1.9960238337516785, 'tn_cont': 165.84987890720367, 'fp_cont': 0.15011897089425474, 'fn_cont': 0.003976166248321533, 'precision_cont': 0.9300517325453787, 'recall_cont': 0.9980119168753402, 'f1_cont': 0.9628340980021473, 'accuracy_cont': 0.999082767029233}
Current F1 score (0.9628340980021473) > then old f1 score (0.9542586425783924)=>continue


Train batch: 100%|███████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 14.18it/s]
Eval batch: 100%|████████████████████████████████████████████████████████████████████| 168/168 [00:03<00:00, 50.28it/s]


evaluation: {'train_loss': 6.740542922518216, 'eval_loss': 0.12302752491086721, 'tp_cont': 1.9969501495361328, 'tn_cont': 165.8800666332245, 'fp_cont': 0.11993146676104516, 'fn_cont': 0.0030498504638671875, 'precision_cont': 0.9433452178720456, 'recall_cont': 0.9984750747675671, 'f1_cont': 0.9701275555891447, 'accuracy_cont': 0.9992679683415727}
Current F1 score (0.9701275555891447) > then old f1 score (0.9628340980021473)=>continue


Train batch: 100%|███████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 14.12it/s]
Eval batch: 100%|████████████████████████████████████████████████████████████████████| 168/168 [00:03<00:00, 49.51it/s]


evaluation: {'train_loss': 6.940957748156507, 'eval_loss': 0.09748528071213514, 'tp_cont': 1.9976726770401, 'tn_cont': 165.90487551689148, 'fp_cont': 0.09512576006818563, 'fn_cont': 0.0023273229598999023, 'precision_cont': 0.9545461433922993, 'recall_cont': 0.9988363385195506, 'f1_cont': 0.9761891320744008, 'accuracy_cont': 0.999419922129242}
Current F1 score (0.9761891320744008) > then old f1 score (0.9701275555891447)=>continue


Train batch: 100%|███████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 14.09it/s]
Eval batch: 100%|████████████████████████████████████████████████████████████████████| 168/168 [00:03<00:00, 49.11it/s]


evaluation: {'train_loss': 7.190737975470256, 'eval_loss': 0.07801323168678209, 'tp_cont': 1.9981963634490967, 'tn_cont': 165.9238133430481, 'fp_cont': 0.07618373824516311, 'fn_cont': 0.0018036365509033203, 'precision_cont': 0.9632739736637933, 'recall_cont': 0.9990981817240487, 'f1_cont': 0.9808590821539612, 'accuracy_cont': 0.9995357894276728}
Current F1 score (0.9808590821539612) > then old f1 score (0.9761891320744008)=>continue


Train batch: 100%|███████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 14.18it/s]
Eval batch: 100%|████████████████████████████████████████████████████████████████████| 168/168 [00:03<00:00, 49.10it/s]


evaluation: {'train_loss': 7.525473408109974, 'eval_loss': 0.06313866865821183, 'tp_cont': 1.9985911846160889, 'tn_cont': 165.9382756948471, 'fp_cont': 0.061725691077299416, 'fn_cont': 0.0014088153839111328, 'precision_cont': 0.970040680729028, 'recall_cont': 0.9992955923075447, 'f1_cont': 0.9844508425332129, 'accuracy_cont': 0.9996241993694025}
Current F1 score (0.9844508425332129) > then old f1 score (0.9808590821539612)=>continue


Train batch: 100%|███████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 14.09it/s]
Eval batch: 100%|████████████████████████████████████████████████████████████████████| 168/168 [00:03<00:00, 50.16it/s]


evaluation: {'train_loss': 7.63700733454607, 'eval_loss': 0.050032495870254934, 'tp_cont': 1.9989179372787476, 'tn_cont': 165.95105254650116, 'fp_cont': 0.04894946370040998, 'fn_cont': 0.0010820627212524414, 'precision_cont': 0.976097347085079, 'recall_cont': 0.999458968638874, 'f1_cont': 0.9876400283236652, 'accuracy_cont': 0.9997021932986726}
Current F1 score (0.9876400283236652) > then old f1 score (0.9844508425332129)=>continue


Train batch: 100%|███████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 14.15it/s]
Eval batch: 100%|████████████████████████████████████████████████████████████████████| 168/168 [00:03<00:00, 50.19it/s]


evaluation: {'train_loss': 7.852389930412755, 'eval_loss': 0.0396315477846656, 'tp_cont': 1.9991698265075684, 'tn_cont': 165.96120417118073, 'fp_cont': 0.038794060121290386, 'fn_cont': 0.0008301734924316406, 'precision_cont': 0.9809643044330666, 'recall_cont': 0.9995849132532844, 'f1_cont': 0.9901870757812046, 'accuracy_cont': 0.9997641414641019}
Current F1 score (0.9901870757812046) > then old f1 score (0.9876400283236652)=>continue


Train batch: 100%|███████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 14.16it/s]
Eval batch: 100%|████████████████████████████████████████████████████████████████████| 168/168 [00:03<00:00, 50.12it/s]


evaluation: {'train_loss': 7.7479667080478976, 'eval_loss': 0.031350530101917684, 'tp_cont': 1.999358892440796, 'tn_cont': 165.96928787231445, 'fp_cont': 0.03071233059745282, 'fn_cont': 0.0006411075592041016, 'precision_cont': 0.9848713038981592, 'recall_cont': 0.999679446219898, 'f1_cont': 0.9922201280261693, 'accuracy_cont': 0.9998133723921501}
Current F1 score (0.9922201280261693) > then old f1 score (0.9901870757812046)=>continue


Train batch: 100%|███████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 14.13it/s]
Eval batch: 100%|████████████████████████████████████████████████████████████████████| 168/168 [00:03<00:00, 49.58it/s]


evaluation: {'train_loss': 8.22477787223761, 'eval_loss': 0.02533219987526536, 'tp_cont': 1.9995008707046509, 'tn_cont': 165.97517502307892, 'fp_cont': 0.024825322529068217, 'fn_cont': 0.0004991292953491211, 'precision_cont': 0.9877365008598741, 'recall_cont': 0.9997504353518255, 'f1_cont': 0.9937071572699558, 'accuracy_cont': 0.9998492592156409}
Current F1 score (0.9937071572699558) > then old f1 score (0.9922201280261693)=>continue


Train batch: 100%|███████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 14.16it/s]
Eval batch: 100%|████████████████████████████████████████████████████████████████████| 168/168 [00:03<00:00, 49.95it/s]


evaluation: {'train_loss': 8.36656883428077, 'eval_loss': 0.020172691365587525, 'tp_cont': 1.999614953994751, 'tn_cont': 165.98021125793457, 'fp_cont': 0.019790836857282557, 'fn_cont': 0.00038504600524902344, 'precision_cont': 0.9901996731177429, 'recall_cont': 0.9998074769968756, 'f1_cont': 0.9949803816990933, 'accuracy_cont': 0.9998799054606491}
Current F1 score (0.9949803816990933) > then old f1 score (0.9937071572699558)=>continue


Train batch: 100%|███████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 14.15it/s]
Eval batch: 100%|████████████████████████████████████████████████████████████████████| 168/168 [00:03<00:00, 49.96it/s]


evaluation: {'train_loss': 8.55318361488753, 'eval_loss': 0.01600796873390209, 'tp_cont': 1.9997035264968872, 'tn_cont': 165.98427784442902, 'fp_cont': 0.015719167378847487, 'fn_cont': 0.00029647350311279297, 'precision_cont': 0.9922005604940315, 'recall_cont': 0.9998517632479437, 'f1_cont': 0.9960114682545409, 'accuracy_cont': 0.9999046688025784}
Current F1 score (0.9960114682545409) > then old f1 score (0.9949803816990933)=>continue


Train batch: 100%|███████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 14.00it/s]
Eval batch: 100%|████████████████████████████████████████████████████████████████████| 168/168 [00:03<00:00, 50.16it/s]


evaluation: {'train_loss': 6.558876961651549, 'eval_loss': 0.01229062327183783, 'tp_cont': 1.9997800588607788, 'tn_cont': 165.98793876171112, 'fp_cont': 0.012061560788424686, 'fn_cont': 0.0002199411392211914, 'precision_cont': 0.9940047165385106, 'recall_cont': 0.9998900294298894, 'f1_cont': 0.9969386872424141, 'accuracy_cont': 0.9999268958219996}
Current F1 score ({'train_loss': 6.558876961651549, 'eval_loss': 0.01229062327183783, 'tp': 2.0, 'tn': 166.0, 'fp': 0.0, 'fn': 0.0, 'precision': 0.9999999999995, 'recall': 0.9999999999995, 'f1': 0.9999999999989999, 'accuracy': 1.0, 'tp_cont': 1.9997800588607788, 'tn_cont': 165.98793876171112, 'fp_cont': 0.012061560788424686, 'fn_cont': 0.0002199411392211914, 'precision_cont': 0.9940047165385106, 'recall_cont': 0.9998900294298894, 'f1_cont': 0.9969386872424141, 'accuracy_cont': 0.9999268958219996}) is not ( or just slighty ) better than old f1 score (0.9960114682545409)=>stop


Eval batch: 100%|████████████████████████████████████████████████████████████████| 12203/12203 [04:12<00:00, 48.25it/s]


Evaluation against all test data{'annotations': '10', 'f1_score': '0.08241390183267334', 'classifiers': [{'weak_label_threshold': '0.27345593916666666', 'f1_score': '0.9960114682545409'}]}


NameError: name 'dClf' is not defined

# plot it

In [None]:
if runCell('plot it'):
    # Result handling: called without an argument, plot_results() loads the
    # stored protocols from data/activeLearning/protocols/ and plots each one.
    plot_results() 

In [None]:
# protocol = read_results()[0]
# clf_idx = 0
# active_learning_iterration_idx = 0
# active_learning_iterration = protocol[active_learning_iterration_idx]

# clf_shorts = [ {'f1_score': clf['f1_score'],'is_query_lead': clf['is_query_lead'],'weak_label_threshold': clf['weak_label_threshold'],'weak_label_ratio': clf['weak_label_ratio'],'oversampling_rate': clf['oversampling_rate']} for clf in active_learning_iterration['classifiers']]
# print(active_learning_iterration['f1_score'])
# [print(clf_short) for clf_short in  clf_shorts]

In [None]:
# true_idxs = active_learning_iterration['classifiers'][clf_idx]['true_idxs']
# weak_idxs = active_learning_iterration['classifiers'][clf_idx]['weak_idxs']
# test_idxs = active_learning_iterration['classifiers'][clf_idx]['test_idxs']
# threshold = float(protocol[active_learning_iterration_idx]['classifiers'][clf_idx]['weak_label_threshold'])

# df_true_label = pd.DataFrame(df_dataset.loc[true_idxs][['id','text','true_annotation']].to_numpy(),columns=['id','text','labels'])
# df_true_label['labels'] = df_true_label['labels'].astype('int')

# df_weak_label = pd.DataFrame(df_dataset.loc[true_idxs][['id','text','weak_soft_label']].to_numpy(),columns=['id','text','labels'])
# df_weak_label.loc[df_weak_label[df_weak_label['labels'] <= threshold].index,['labels']] = 0
# df_weak_label.loc[df_weak_label[df_weak_label['labels'] > threshold].index,['labels']] = 1
# df_weak_label['labels'] = df_weak_label['labels'].astype('int')


# df_test_true_label = pd.DataFrame(df_dataset.loc[test_idxs][['id','text','true_annotation']].to_numpy(),columns=['id','text','labels'])
# df_test_true_label['labels'] = df_test_true_label['labels'].astype('int')

# df_train_mix = pd.concat([df_true_label,df_weak_label])
# clf = CorpusClassifier(path2transformers=Path('data/activeLearning/models'))
# clf.train_loop(df_train_mix, df_test_true_label)

# result = clf.eval(df_test)
# print(result)

In [None]:
# result = clf.predict_proba(df_test)
# print(result)

In [None]:
# df_test = df_dataset[df_dataset['is_test']]
# df_active_learning = df_dataset[df_dataset['is_test'] == False]
# df_test = pd.DataFrame({ 'id': df_test['id'],'text': df_test['text'],'labels': df_test['true_annotation'] } )
# df_test = oversample_minority_class(df_test,max_oversampling=1)
#result = clf.eval(df_test)

In [None]:
# Experiment without active learning: vary the weak-label threshold.
# Disabled on purpose via the always-false condition below.
if 1 == 2:
    # Number of top-scored documents to treat as weakly positive, per run.
    amount_weak_true_labels = [30,50,100]
    # Used below to derive the oversampling rate for each run.
    max_train_data_size = 10000

    # Evaluation data: test rows labelled with their true annotation.
    df_test = df_dataset[df_dataset['is_test']]
    df_test = pd.DataFrame({ 'id': df_test['id'],'text': df_test['text'],'labels': df_test['true_annotation'] } )

    for amount_weak_true_label in amount_weak_true_labels:
        print(f'##############################START {amount_weak_true_label}##############################')
        # Training data: non-test rows, initially labelled with the soft score.
        df_train = df_dataset[df_dataset['is_test']==False]
        df_train = pd.DataFrame({ 'id': df_train['id'],'text': df_train['text'],'labels': df_train['weak_soft_label'] } )

        # Threshold = the score at position amount_weak_true_label when sorted
        # in descending order; only scores strictly above it become positive.
        threshold = df_train.sort_values(by=['labels'],ascending=False)['labels'].iloc[amount_weak_true_label]
        # Order matters: the rows set to 1 here are re-tested by the next line
        # and only stay positive because 1 > threshold.
        # NOTE(review): this relies on threshold < 1 - confirm for the score range.
        df_train.loc[df_train[df_train['labels'] > threshold].index,['labels']] = 1
        df_train.loc[df_train[df_train['labels'] <= threshold].index,['labels']] = 0
        df_train['labels'] = df_train['labels'].astype('int')

        # Sanity check: print how many rows ended up positive.
        check_count = len(df_train[df_train['labels']==1])
        print(f'check:{check_count}')

        oversampling_rate = max_train_data_size// amount_weak_true_label
        #1,10,100,919(max)
        # oversample_minority_class returns [frame, effective rate].
        df_train,oversampling_rate = oversample_minority_class(df_train,oversampling_rate=oversampling_rate)
        clf = CorpusClassifier(path2transformers=Path('data/activeLearning/models'))
        clf.train_loop(df_train, df_test)



    # NOTE(review): inside an if block this bare expression displays nothing.
    len(df_train)

    # NOTE(review): this trains once more on the df_train left over from the
    # last loop iteration - confirm that the repetition is intended.
    clf = CorpusClassifier(path2transformers=Path('data/activeLearning/models'))
    clf.train_loop(df_train, df_test)


In [None]:
# Experiment without active learning: train on the true annotations with a
# fixed oversampling rate. Disabled on purpose via the always-false condition.
if 1 == 2:
    oversampling_rate = 200
    # Evaluation data: test rows labelled with their true annotation.
    df_test = df_dataset[df_dataset['is_test']]
    df_test = pd.DataFrame({ 'id': df_test['id'],'text': df_test['text'],'labels': df_test['true_annotation'] } )
    # Training data: non-test rows, also labelled with the true annotation.
    df_train = df_dataset[df_dataset['is_test']==False]
    df_train = pd.DataFrame({ 'id': df_train['id'],'text': df_train['text'],'labels': df_train['true_annotation'] } )
    # oversample_minority_class returns [frame, effective rate].
    df_train,oversampling_rate = oversample_minority_class(df_train,oversampling_rate=oversampling_rate)
    # NOTE(review): inside an if block this bare expression displays nothing.
    len(df_train)
    clf = CorpusClassifier(path2transformers=Path('data/activeLearning/models'))
    clf.train_loop(df_train, df_test)

In [None]:
#TEST NO ACTIVE LEARNING? OVERSAMPLING THRESHOLD
def buildTestSet(df_annotated,df_dataset,threshold,random_state=None):
    """Resample annotated rows to mirror the weak-label class ratio.

    The negative/positive ratio is measured on ``df_dataset`` by comparing
    ``weak_soft_label`` with ``threshold``. The less frequent class (by that
    ratio) is kept as is, and the other class is drawn with replacement from
    ``df_annotated`` so that the result shows approximately the same ratio.

    Args:
        df_annotated: annotated rows with a 0/1 ``labels`` column.
        df_dataset: frame with a ``weak_soft_label`` column.
        threshold: scores strictly above it count as positive.
        random_state: optional seed forwarded to ``DataFrame.sample`` to make
            the draw reproducible.

    Returns:
        pandas.DataFrame: positive rows followed by negative rows.

    Raises:
        ZeroDivisionError: if ``df_dataset`` has no row above the threshold,
            or no row at or below it.
    """
    n_positive = len(df_dataset[df_dataset['weak_soft_label']>threshold])
    n_negative = len(df_dataset[df_dataset['weak_soft_label']<=threshold])
    negative_ratio = n_negative/n_positive

    df_positive = df_annotated[df_annotated['labels']==1]
    df_negative = df_annotated[df_annotated['labels']==0]

    if negative_ratio > 1:
        # More negatives than positives: draw negatives up to the ratio.
        n = int(len(df_positive) * negative_ratio)
        df_negative_samples = df_negative.sample(n,replace=True,random_state=random_state)
        df_positive_samples = df_positive
    else:
        # More (or equally many) positives: draw positives up to the ratio.
        # The classes were swapped in this branch before, which resampled the
        # negatives and returned them as "positive" samples.
        n = int(len(df_negative) / negative_ratio)
        df_positive_samples = df_positive.sample(n,replace=True,random_state=random_state)
        df_negative_samples = df_negative
    return pd.concat([df_positive_samples,df_negative_samples])
    

# Disabled on purpose via the always-false condition below.
if 1 == 2:
    # Get the max F-score after the first iteration (base model).
    # Test rows labelled with their true annotation.
    # NOTE(review): df_al_test is not used again within this cell.
    df_al_test = df_dataset[df_dataset['is_test']]
    df_al_test = pd.DataFrame(df_al_test[['id','text','true_annotation']].to_numpy(),columns=['id','text','labels'])
    df_al_test['labels'] = df_al_test['labels'].astype('int')

    # Non-test rows, sorted by descending weak soft label.
    df_al = df_dataset[df_dataset['is_test']==False].sort_values(by=['weak_soft_label'],ascending=False)
    # "Annotate" the 5 highest- and the 5 lowest-scored rows.
    df_annotation = pd.concat([df_al[:5],df_al[-5:]])
    # Threshold = midpoint between the mean score of the annotated true
    # positives and the mean score of the annotated true negatives.
    threshold = np.mean([np.mean(df_annotation[df_annotation['true_annotation']==1]['weak_soft_label']),np.mean(df_annotation[df_annotation['true_annotation']==0]['weak_soft_label'])])
    df_annotation = pd.DataFrame(df_annotation[['id','text','true_annotation']].to_numpy(),columns=['id','text','labels'])
    df_annotation['labels'] = df_annotation['labels'].astype('int')
    # The evaluation set is resampled from the 10 annotated rows.
    df_test = buildTestSet(df_annotation,df_al,threshold)

    # Training data: everything except the 10 annotated rows.
    df_train = df_al[5:]
    df_train = df_train[:-5]
    df_train = pd.DataFrame(df_train[['id','text','weak_soft_label']].to_numpy(),columns=['id','text','labels'])
    # Binarise the soft scores; the rows set to 1 are re-tested by the second
    # line and only stay positive because 1 > threshold.
    # NOTE(review): this relies on threshold < 1 - confirm for the score range.
    df_train.loc[df_train[df_train['labels'] > threshold].index,['labels']] = 1
    df_train.loc[df_train[df_train['labels'] <= threshold].index,['labels']] = 0
    df_train['labels'] = df_train['labels'].astype('int')

    # oversample_minority_class returns [frame, effective rate]; the rate is
    # not needed here.
    df_train,_ = oversample_minority_class(df_train,20)

    clf = CorpusClassifier(path2transformers=Path('data/activeLearning/models'))
    clf.train_loop(df_train, df_test)

In [None]:
#TEST NO ACTIVE LEARNING? MIX WEAK LABELS AND TRUE LABELS
if 1 == 2:
    # Disabled experiment (the guard is never true): train on a small set
    # of ground-truth labels mixed with growing amounts of weakly labelled
    # data, evaluate every configuration on the held-out rows and write a
    # summary of all runs to a JSON protocol file.

    # Held-out rows, relabelled with the ground-truth annotation.
    df_al_test = df_dataset[df_dataset['is_test']]
    df_al_test = pd.DataFrame(df_al_test[['id','text','true_annotation']].to_numpy(),columns=['id','text','labels'])
    df_al_test['labels'] = df_al_test['labels'].astype('int')

    # Non-test rows ordered from highest to lowest weak score; the 5 top
    # and 5 bottom rows are the hand-annotated seed set.
    df_al = df_dataset[df_dataset['is_test']==False].sort_values(by=['weak_soft_label'],ascending=False)
    df_annotation1 = pd.concat([df_al[:5],df_al[-5:]])
    df_annotation = df_annotation1
    # Threshold = midpoint between the mean weak score of the seed
    # positives and the mean weak score of the seed negatives.
    threshold = np.mean([np.mean(df_annotation[df_annotation['true_annotation']==1]['weak_soft_label']),np.mean(df_annotation[df_annotation['true_annotation']==0]['weak_soft_label'])])
    df_annotation = pd.DataFrame(df_annotation[['id','text','true_annotation']].to_numpy(),columns=['id','text','labels'])
    df_annotation['labels'] = df_annotation['labels'].astype('int')

    # Half of the seed annotations train the model, the other half seeds
    # the test set passed to train_loop.
    df_train_true, df_test = train_test_split( df_annotation, test_size=0.5, random_state=42, stratify = df_annotation['labels'].to_numpy())

    # All remaining rows get a hard label from the weak score.
    df_train_weak = df_al[5:-5]
    df_train_weak = pd.DataFrame(df_train_weak[['id','text','weak_soft_label']].to_numpy(),columns=['id','text','labels'])
    df_train_weak.loc[df_train_weak[df_train_weak['labels'] > threshold].index,['labels']] = 1
    df_train_weak.loc[df_train_weak[df_train_weak['labels'] <= threshold].index,['labels']] = 0
    df_train_weak['labels'] = df_train_weak['labels'].astype('int')
    df_train_weak,_ = oversample_minority_class(df_train_weak,-1)

    df_train_true,_ = oversample_minority_class(df_train_true,-1)
    df_test = buildTestSet(df_test,df_al,threshold)

    cLog = []   # summary merged with everything clf.eval returned
    cLog2 = []  # summary only; this is what gets written to disk
    n_true = len(df_train_true)

    # Baseline: ground-truth labels only, no weak data.
    clf = CorpusClassifier(path2transformers=Path('data/activeLearning/models'))
    f1_train = clf.train_loop(df_train_true, df_test)
    result = clf.eval(df_al_test)
    # Same keys and value types as the records written in the loop below,
    # so every key of the protocol holds one consistent type.
    summary = {'n_true_label': n_true, 'repeat': 1, 'n_weak': 0, 'f1_train': f1_train, 'f1_eval': result['f1_cont']}
    cLog.append({**summary, **result})
    cLog2.append(summary)
    print(cLog[-1])

    for n_weak in [10,100,1000,10000,100000]:
        # Requested weak-sample count, rounded up to a multiple of n_true.
        n_weak2 = int(np.ceil(n_weak/n_true) * n_true)

        # How often the true set is repeated: once, sqrt(max) times and
        # max times. The set removes duplicates (e.g. max_repeats of 1 or
        # 3), so no configuration is trained twice.
        max_repeats = n_weak2 // n_true
        repeats = sorted({1, int(np.sqrt(max_repeats)), max_repeats})

        # The largest setting always uses the whole weak pool. Smaller
        # settings fall back to the whole pool when the request is not
        # smaller than the pool, because train_test_split rejects a float
        # test_size >= 1.
        use_whole_pool = n_weak >= 100000 or n_weak2*1.01 >= len(df_train_weak)
        if use_whole_pool:
            df_train_weak_set = df_train_weak
        else:
            # Ask for ~1% more than needed, then cut to exactly n_weak2
            # rows (presumably to absorb rounding inside the split).
            test_size = n_weak2*1.01 / len(df_train_weak)
            _, df_train_weak_set = train_test_split( df_train_weak, test_size=test_size, random_state=42, stratify = df_train_weak['labels'].to_numpy())
            df_train_weak_set = df_train_weak_set[:n_weak2]

        for repeat in repeats:
            df_train = pd.concat([df_train_true.loc[df_train_true.index.repeat(repeat)],df_train_weak_set])
            clf = CorpusClassifier(path2transformers=Path('data/activeLearning/models'))
            f1_train = clf.train_loop(df_train, df_test)
            result = clf.eval(df_al_test)
            # 'n_weak' records the requested size, not len(df_train_weak_set).
            summary = {'n_true_label': n_true, 'repeat': repeat, 'n_weak': n_weak2, 'f1_train': f1_train, 'f1_eval': result['f1_cont']}
            cLog.append({**summary, **result})
            cLog2.append(summary)
            print(cLog[-1])

    # Persist the summary records (json is imported at the top of the notebook).
    dProtocol = { 'data': cLog2 }
    protocolPath = Path('data/activeLearning/trueweakmix/trueweakmix.json')
    protocolPath.parent.mkdir(parents=True, exist_ok=True)
    with open(protocolPath,'w') as outfile:
        json.dump(dProtocol, outfile)

    # Drop the detailed log once the protocol has been written.
    cLog = []



In [None]:
#CREATE BASE MODEL
# Held-out rows, relabelled with the ground-truth annotation.
df_al_test = pd.DataFrame(
    df_dataset[df_dataset['is_test']][['id', 'text', 'true_annotation']].to_numpy(),
    columns=['id', 'text', 'labels'],
).astype({'labels': 'int'})

# Non-test rows ordered from highest to lowest weak score; the 5 top and
# 5 bottom rows are the first hand-annotated batch.
df_al = df_dataset[df_dataset['is_test'] == False].sort_values(by=['weak_soft_label'], ascending=False)
df_annotation1 = pd.concat([df_al[:5], df_al[-5:]])

# Threshold = midpoint between the mean weak score of the annotated
# positives (1) and of the annotated negatives (0).
threshold = np.mean([
    np.mean(df_annotation1.loc[df_annotation1['true_annotation'] == annotated_class, 'weak_soft_label'])
    for annotated_class in (1, 0)
])
df_annotation1 = pd.DataFrame(
    df_annotation1[['id', 'text', 'true_annotation']].to_numpy(),
    columns=['id', 'text', 'labels'],
).astype({'labels': 'int'})

# Half of the first batch trains the model, the other half seeds the test set.
df_train_true1, df_test1 = train_test_split(
    df_annotation1,
    test_size=0.5,
    random_state=42,
    stratify=df_annotation1['labels'].to_numpy(),
)

df_test = df_test1

# All rows outside the annotated batch get a hard label from the weak score.
df_train_weak = pd.DataFrame(
    df_al[5:-5][['id', 'text', 'weak_soft_label']].to_numpy(),
    columns=['id', 'text', 'labels'],
)
df_train_weak.loc[df_train_weak['labels'] > threshold, 'labels'] = 1
df_train_weak.loc[df_train_weak['labels'] <= threshold, 'labels'] = 0
df_train_weak = df_train_weak.astype({'labels': 'int'})
# oversample_minority_class and buildTestSet are defined elsewhere in the
# notebook; only the first return value of the former is kept.
df_train_weak, _ = oversample_minority_class(df_train_weak, -1)

df_train_true1, _ = oversample_minority_class(df_train_true1, -1)
df_test = buildTestSet(df_test, df_al, threshold)

In [None]:
# #TRAIN BASE MODEL WITH TRUE ANNOTATIONS
if 1 == 2:
    # Disabled step (the guard is never true): load the stored base model,
    # score it on the held-out rows, continue training on the first batch
    # of true annotations and score it again.
    models_dir = Path('data/activeLearning/models')
    clf = CorpusClassifier(path2transformers=models_dir)
    clf.load('base_model.pt')

    # Score before the additional training ...
    result = clf.eval(df_al_test)
    print(f'Result Base Model{result}')

    # ... and after it.
    clf.train_loop(df_train_true1, df_test)
    result = clf.eval(df_al_test)
    print(f'Result theoretically FIRSTROUND Model{result}')

    # Saving is switched off here; the query step loads 'FIRSTROUND.pt',
    # so that file has to exist from an earlier run.
    #clf.save('FIRSTROUND.pt')
#QUERY
if 1 == 2:
    # Disabled step (the guard is never true): load the first-round model
    # and pick the pool rows whose predicted probability is closest to 0.5.
    clf = CorpusClassifier(path2transformers=Path('data/activeLearning/models'))
    clf.load('FIRSTROUND.pt')
    result = clf.eval(df_al_test)
    print(f'Result loaded FIRSTROUND Model{result}')

    # Pool = all non-test rows except the 5 top / 5 bottom seed rows. The
    # frame is rebuilt from a NumPy array, so it gets a fresh 0..n-1 index
    # and row i here corresponds to row i of df_al[5:-5].
    df_unlabeled = df_al[5:-5]
    df_unlabeled = pd.DataFrame(df_unlabeled[['id','text','true_annotation']].to_numpy(),columns=['id','text','labels'])
    df_unlabeled['labels'] = df_unlabeled['labels'].astype('int')

    # NOTE(review): assumes predict_proba returns one probability per row
    # (a 1-D array) - confirm against CorpusClassifier.
    predictions = clf.predict_proba(df_unlabeled)
    # Smallest distance to 0.5 first; the batch size comes from the
    # notebook configuration instead of a hard-coded 10.
    query_idx = np.argsort(np.abs(predictions - 0.5))[:dConfig['query_count']]
    # Inside an `if` body a bare expression is not echoed by the notebook,
    # so the probabilities of the selected rows are displayed explicitly.
    display(clf.predict_proba(df_unlabeled.loc[query_idx]))
# #SECOND ROUND
if 1 == 2:
    # Disabled step (the guard is never true): add the queried rows as a
    # second batch of true annotations and continue training the
    # first-round model.
    #result = clf.eval(df_al_test)
    #print(result)
    df_annotation2 = df_unlabeled.loc[query_idx]

    # df_unlabeled carries a fresh 0..n-1 index, so df_annotation2.index
    # holds positions inside df_al[5:-5], not df_dataset row labels.
    # Translate them back before looking the rows up in df_dataset.
    queried_dataset_index = df_al[5:-5].index[df_annotation2.index.to_numpy()]
    df_annotation = df_dataset.loc[np.concatenate([df_al[:5].index,df_al[-5:].index,queried_dataset_index])]
    # Midpoint threshold recomputed over both annotated batches.
    threshold2 = np.mean([np.mean(df_annotation[df_annotation['true_annotation']==1]['weak_soft_label']),np.mean(df_annotation[df_annotation['true_annotation']==0]['weak_soft_label'])])

    # Stratify on the labels of the rows actually being split. A stratified
    # 50/50 split needs at least two rows per class, so fall back to a
    # plain split when the queried batch cannot provide that.
    labels2 = df_annotation2['labels'].to_numpy()
    class_counts = np.unique(labels2, return_counts=True)[1]
    stratify2 = labels2 if class_counts.min() >= 2 else None
    df_train_true2, df_test2 = train_test_split( df_annotation2, test_size=0.5, random_state=42, stratify = stratify2)

    df_train_true = pd.concat([df_train_true1,df_train_true2])
    df_train_true,_ = oversample_minority_class(df_train_true,-1)

    print(len(df_train_true))

    df_test = pd.concat([df_test1,df_test2])
    # NOTE(review): this still passes the first-round `threshold`;
    # threshold2 is only consumed by the follow-up step - confirm intended.
    df_test = buildTestSet(df_test,df_al,threshold)

    clf = CorpusClassifier(path2transformers=Path('data/activeLearning/models'))
    clf.load('FIRSTROUND.pt')
    clf.train_loop(df_train_true,df_test)
    result = clf.eval(df_al_test)
    print(f'SECONDROUND Model after annotation{result}')
if 1 == 2:
    # Disabled step (the guard is never true): relabel the weak training
    # rows with the second-round threshold and continue training `clf`
    # (left over from the previous step) on the rows affected by the change.
    # NOTE(review): df_train_weak_org and threshold1 are not defined in the
    # cells shown here; they must come from another part of the notebook.
    df_train_weak_org = df_dataset.loc[df_train_weak_org.index]
    weak_score = df_train_weak_org['weak_soft_label']
    df_train_weak_org.loc[weak_score > threshold2, 'labels'] = 1
    df_train_weak_org.loc[weak_score <= threshold2, 'labels'] = 0
    df_train_weak_org['labels'] = df_train_weak_org['labels'].astype('int')

    # Rows whose weak score lies strictly between the two thresholds. A
    # boolean row mask is used because `Index & Index` is no longer a set
    # intersection in pandas 2.x (it is an element-wise logical operation).
    between_thresholds = (weak_score > threshold1) & (weak_score < threshold2)
    df_change_train = df_train_weak_org.loc[between_thresholds]
    print(len(df_train_weak_org))
    print(len(df_change_train))

    # NOTE(review): both frames filter on labels == 1 (all positives, plus
    # 4 sampled positives); the name df0 suggests labels == 0 may have been
    # intended - confirm before enabling this step.
    df0 = df_train_weak_org[df_train_weak_org['labels']==1]
    df1 = df_train_weak_org[df_train_weak_org['labels']==1].sample(4)

    df_change_train = pd.concat([df0,df1,df_change_train])

#     clf = CorpusClassifier(path2transformers=Path('data/activeLearning/models'))
#     clf.load('FIRSTROUND.pt')
    clf.train_loop(df_change_train,df_test)
    result = clf.eval(df_al_test)
    print(f'SECONDROUND Model after threshold change{result}')
