In [73]:
import os
import sys
import pickle
import numpy as np
import pickle
from tqdm import trange
from sklearn.model_selection import KFold, StratifiedKFold, StratifiedGroupKFold
import xgboost as xgb
from matplotlib import pyplot as plt
from matplotlib.pyplot import figure
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.preprocessing import normalize, StandardScaler, MinMaxScaler
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, precision_recall_curve
from sklearn.model_selection import train_test_split
from scipy.stats import spearmanr, kendalltau

In [74]:
NUM_TASKS = 164
#NUM_TASKS = 120

In [83]:
def calc_mtd(dgms):
    dgm0 = dgms[0]
    dgm1 = dgms[1]
    
    mtd0 = np.sum(dgm0[dgm0 < np.inf])
    if dgm1.shape[0]:
        mtd1 = np.sum(dgm1[:, 1] - dgm1[:, 0])
    else:
        mtd1 = 0
    
    return mtd0, mtd1

In [158]:
num_samples_per_task = 5

In [242]:
MISSING_TASKS = {
    'lua' : [32, 38, 50],
    'go' : [12, 32, 38, 50, 90, 103, 125, 128, 137, 162],
    'rs' : [22, 32, 38, 50, 103, 125, 137, 148],
    'java' : [32, 38, 50, 94, 103, 120, 125, 137]}

In [346]:
LANG = 'go'

In [347]:
results = {}
results_code = {}
run_stat = [[] for _ in range(NUM_TASKS)]

RES_DIR = 'he_multiple_res/%s' % LANG

for i in range(NUM_TASKS):
    if i in MISSING_TASKS[LANG]:
        continue

    for num in range(5):
        results1 = pickle.load(open('/%s/%d.%d.pickle' % (RES_DIR, i, num), 'rb'))
        results_code1 = pickle.load(open('/%s/%d.%d_code.pickle' %  (RES_DIR, i, num), 'rb'))

        # prompt_len, correct_prefixes[seq], diag_sum1, diag_sum2, cross_barc(P, G), cross_barc(G, P)       first arg in cross_barcode -> 0 
    
        for k, v in results1.items():
            dgms_a, dgms_b = v[-2]['dgms'], v[-1]['dgms']
            results1[k] = v[:4] + [calc_mtd(dgms_a), calc_mtd(dgms_b)]


        #print('==========')
        #print(results1.keys())
        
        # prompt_len, correct_prefixes[seq], diag_sum1, diag_sum2, (mtd0(P, G), mtd1(P, G)), (mtd0(G, P), mtd1(G, P))       first arg in cross_barcode -> 0 

        code = int(list(results_code1.values())[-1][-1][0] == 'OK')
        run_stat[i].append(code)
        
        results.update(results1)
        results_code.update(results_code1)

KeyboardInterrupt: 

In [176]:
#pickle.dump((results, results_code, run_stat), open('/he_multiple_res/%s.pickle' % LANG, 'wb'))

In [177]:
# LOADING 

In [387]:
LANG = 'lua'

In [388]:
(results, results_code, run_stat) = pickle.load(open('/he_multiple_res/%s.pickle' % LANG, 'rb'))

### Random split

In [389]:
Y = []

for elem in run_stat:
    Y.extend(elem)

In [390]:
# pass@1

np.mean(Y)

0.0

In [352]:
def prepare_features():
    all_features = []

    for task_num in trange(NUM_TASKS):
        features = []
        task_id = task_num

        if task_num in MISSING_TASKS[LANG]:
            continue
        
        for seed in range(1):
            for seq in range(num_samples_per_task):

                # prompt_len, correct_prefixes[seq], diag_sum(prompt), diag_sum(answer), (mtd0(P, G), mtd1(P, G)), (mtd0(G, P), mtd1(G, P))  first arg in cross_barcode -> 0 

                prompt_len = results[(task_id, 0, 0, 0, 0)][0]
                answer_len = results[(task_id, seed, seq, 0, 0)][1] - prompt_len
                f_sample = [prompt_len, 0*answer_len] 
                
                for layer in range(32):
                    for head in range(32):
                        f = results[(task_id, seed, seq, layer, head)]
                        f = [f[2] / prompt_len, f[3] / answer_len, f[4][0] / answer_len, f[4][1] / answer_len, f[5][0] / prompt_len , f[5][1] / prompt_len]
                        f_sample.extend(f)
                        
                all_features.append(f_sample)
    
    return all_features

In [353]:
f_names = ['prompt_len', 'answer_len']
cnt = 2

f = ['prompt_self_att', 'answer_self_att', 'mtd_a_h0', 'mtd_a_h1', 'mtd_b_h0', 'mtd_b_h1']
        
for layer in range(32):
    for head in range(32):
        for f1 in f:
            f_names.append('%d_%s_%d_%d' % (cnt, f1, layer, head))
            cnt += 1

In [354]:
X = prepare_features()
X = np.array(X)
Y = np.array(Y)

100%|██████████| 164/164 [00:09<00:00, 17.75it/s]


In [355]:
X.shape, len(Y)

((780, 6146), 780)

In [356]:
#X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

In [357]:
data = []

def nan2zero(x):
    if np.isnan(x):
        return 0.
    else:
        return x
        
for i in range(X_train.shape[1]):
    #data.append((nan2zero(kendalltau(X_train[:, i], y_train).statistic), i, f_names[i]))
    data.append((roc_auc_score(Y, X[:, i]) - 0.5, i, f_names[i]))

In [358]:
pickle.dump(data, open('he_mupliple_%s_corr.pickle' % LANG, 'wb'))

In [359]:
sorted(data, key = lambda x : -np.abs(x[0]))

[(0.2723371818150673, 5287, '5287_mtd_b_h1_27_16'),
 (-0.2566848812908973, 528, '528_mtd_b_h0_2_23'),
 (-0.25621381342286476, 4482, '4482_mtd_b_h0_23_10'),
 (-0.2518328822501627, 4657, '4657_mtd_b_h1_24_7'),
 (-0.25022696906368835, 504, '504_mtd_b_h0_2_19'),
 (0.24636421254582208, 356, '356_prompt_self_att_1_27'),
 (-0.2456276336976258, 2964, '2964_mtd_b_h0_15_13'),
 (-0.24268560074000478, 289, '289_mtd_b_h1_1_15'),
 (-0.2421203192983658, 5436, '5436_mtd_b_h0_28_9'),
 (-0.23973500291205585, 5394, '5394_mtd_b_h0_28_2'),
 (-0.23858731028812225, 4038, '4038_mtd_b_h0_21_0'),
 (0.2385273561958272, 5101, '5101_mtd_b_h1_26_17'),
 (-0.238227585734352, 3498, '3498_mtd_b_h0_18_6'),
 (0.2369685497961561, 2030, '2030_prompt_self_att_10_18'),
 (-0.23271180924320806, 714, '714_mtd_b_h0_3_22'),
 (-0.23154698687861863, 948, '948_mtd_b_h0_4_29'),
 (-0.23072904176230774, 396, '396_mtd_b_h0_2_1'),
 (-0.23067337010517663, 3408, '3408_mtd_b_h0_17_23'),
 (0.2287634040220632, 1184, '1184_prompt_self_att_6_5'

In [365]:
from sklearn.model_selection import StratifiedGroupKFold

In [367]:
def calc_quality_group(train_idx, test_idx):

    X_train = []
    X_test = []
    y_train = []
    y_test = []
    probs_train = []
    probs_test = []

    test_info = []
    test_task_nums = set()
    
    for i in range(X.shape[0]):
        if i in train_idx:
            X_train.append(X[i])
            y_train.append(Y[i])
            #probs_train.append(all_probs[i])
        else:
            X_test.append(X[i])
            y_test.append(Y[i])
            test_info.append(i // 25)
            #probs_test.append(all_probs[i])
    
    X_train = np.array(X_train[:, [3047]])
    X_test = np.array(X_test[:, [3047]])

    #
    # scaling is optional
    #
    scaler = MinMaxScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    
    #X_train = np.array(probs_train).reshape(len(probs_train), 1)
    #X_test = np.array(probs_test).reshape(len(probs_test), 1)

    print(X_train.shape)
    print(X_test.shape)

    clf = xgb.XGBClassifier(tree_method="hist", max_bin = 64, n_estimators = 100, eta = 0.1)
    #clf = MLPClassifier(hidden_layer_sizes=(256, 256, 256), max_iter = 2000, early_stopping = True)
    #clf = LogisticRegression(max_iter=1000)
    #clf = SVC(probability=True)
    #clf = DecisionTreeClassifier(max_depth = 1)
    #thres = get_opt_threshold(X_train, y_train)
    
    clf.fit(X_train, y_train)    

    ##plot_fi(clf)
    
    y_pred = clf.predict_proba(X_test)[:,1]
    y_pred_class = clf.predict(X_test)

    # fixed features
    #y_pred = X_test[:, 2740]
    #y_pred_class = [int(x > 0.5) for x in y_pred]
    
    #y_pred = probs_test
    #y_pred_class = [int(x > 0.5) for x in probs_test]

    #y_pred = X_test[:, 0]
    #y_pred_class = [int(x > thres) for x in X_test[:, 0]]

    #
    #
    #
    task_res = {task_num : [] for task_num in set(test_info)}

    for i in range(len(test_info)):
        task_num = test_info[i]
        p = y_pred[i]
        task_res[task_num].append((p, y_test[i]))

    all_candidates = []

    for task_num in task_res:
        pred_list = task_res[task_num]
        pred_list_sorted = sorted(pred_list, key = lambda x : -x[0])
        #best_candidate = [x[1] for x in pred_list_sorted[0:1]]
    
        all_candidates.append(pred_list_sorted)

    return roc_auc_score(y_test, y_pred), f1_score(y_test, y_pred_class), all_candidates, clf

In [368]:
NUM_TASKS

164

In [369]:
groups = []

for i in range(NUM_TASKS):
    if i in MISSING_TASKS[LANG]:
        continue
    for j in range(5):
        groups.append(i)

kf = StratifiedGroupKFold(n_splits = 5, shuffle = True, random_state = 42)
res_auc = []
res_f1 = []
all_task_res = []
all_clf = []

for train_idx, test_idx in kf.split(range(X.shape[0]), Y, groups):
    #print("%s %s" % (train_idx, test_idx))
    auc, f1, task_res, clf = calc_quality_group(train_idx, test_idx)
    res_auc.append(auc)
    res_f1.append(f1)
    all_task_res.append(task_res)
    all_clf.append(clf)

TypeError: list indices must be integers or slices, not tuple

In [331]:
np.mean(res_auc), np.std(res_auc)

(0.6280132617562924, 0.05888608902722917)

In [332]:
np.mean(res_f1), np.std(res_f1)

(0.2932345745603051, 0.06617267829079845)