In [1]:
import os
import sys
import pickle
from human_eval.data import HUMAN_EVAL, read_problems, stream_jsonl, write_jsonl
import numpy as np
import pickle
from tqdm import trange
from sklearn.model_selection import KFold, StratifiedGroupKFold
import xgboost as xgb
from matplotlib import pyplot as plt
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.preprocessing import normalize, StandardScaler, MinMaxScaler
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, precision_recall_curve
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from scipy.stats import spearmanr, kendalltau

In [2]:
NUM_TASKS = 510 

In [3]:
def calc_mtd(dgms):
    dgm0 = dgms[0]
    dgm1 = dgms[1]
    
    mtd0 = np.sum(dgm0[dgm0 < np.inf])
    if dgm1.shape[0]:
        mtd1 = np.sum(dgm1[:, 1] - dgm1[:, 0])
    else:
        mtd1 = 0
    
    return mtd0, mtd1

In [4]:
(results, results_code) = pickle.load(open('/mbpp_results2/all.pickle', 'rb'))

In [5]:
#pickle.dump((results, results_code), open('/mbpp_results2/all.pickle', 'wb'))

results = {}
results_code = {}

for i in trange(10, 510):
    results1, results_code1 = pickle.load(open('/mbpp_results2/%d.pickle' % i, 'rb')), pickle.load(open('/mbpp_results2/%d_code.pickle' % i, 'rb'))

    for k, v in results1.items():
        dgms_a, dgms_b = v[-2]['dgms'], v[-1]['dgms']
        results1[k] = v[:4] + [calc_mtd(dgms_a), calc_mtd(dgms_b)]

    results.update(results1)
    results_code.update(results_code1)

In [5]:
samples = []
num_samples_per_task = 1

for task_num in trange(10, 510):        
    task_id = 'MBPP/%d' % task_num
    samples.append(dict(task_id = task_id, completions = results_code[(task_id, 0)][0]))

100%|██████████| 500/500 [00:00<00:00, 485789.21it/s]


In [6]:
write_jsonl("mbpp_samples.jsonl", samples)

In [7]:
!python evaluate_mbpp.py

defaultdict(<class 'list'>, {0: [(0, {'task_id': 0, 'passed': True, 'result': 'passed', 'completion_id': 0}), (1, {'task_id': 0, 'passed': True, 'result': 'passed', 'completion_id': 1}), (2, {'task_id': 0, 'passed': True, 'result': 'passed', 'completion_id': 2}), (3, {'task_id': 0, 'passed': True, 'result': 'passed', 'completion_id': 3}), (4, {'task_id': 0, 'passed': True, 'result': 'passed', 'completion_id': 4})]})
test_program
------------

import re
def remove_uppercase(test_str):
  pattern = re.compile(r'[A-Z]')
  return (re.sub(pattern, "", test_str))

assert remove_uppercase('cAstyoUrFavoRitETVshoWs') == 'cstyoravoitshos'
assert remove_uppercase('wAtchTheinTernEtrAdIo') == 'wtchheinerntrdo'
assert remove_uppercase('VoicESeaRchAndreComMendaTionS') == 'oiceachndreomendaion'
test_program
------------

import re
def remove_uppercase(test_string):
  result = re.sub("[A-Z]+", "", test_string)
  return result

assert remove_uppercase('cAstyoUrFavoRitETVshoWs') == 'cstyoravoitshos'
asser

In [7]:
run_stat = [[] for _ in range(NUM_TASKS)]
run_results = [[] for _ in range(NUM_TASKS)]

for elem in stream_jsonl('mbpp_evaluation.jsonl'):
    task_id = elem['task_id']
    task_num = int(task_id.split('/')[-1])

    run_stat[task_num].append([int(x[1]['passed']) for x in elem['results']])
    run_results[task_num].append([x[1]['result'] for x in elem['results']])

In [8]:
errors = set([])

for elem in run_results:
    #print(elem)
    if elem:
        errors = errors | set(elem[0])

In [46]:
errors

{'failed: AssertionError',
 'failed: AttributeError',
 'failed: IndentationError',
 'failed: IndexError',
 'failed: ModuleNotFoundError',
 'failed: NameError',
 'failed: RecursionError',
 'failed: SyntaxError',
 'failed: TypeError',
 'failed: UnboundLocalError',
 'failed: ValueError',
 'failed: ZeroDivisionError',
 'passed',
 'timed out'}

In [47]:
rare_errors

['failed: ModuleNotFoundError',
 'failed: ZeroDivisionError',
 'failed: UnboundLocalError',
 'failed: IndentationError',
 'failed: AttributeError',
 'timed out']

In [None]:
for elem in list(errors.items()):
    if elem in 

In [48]:
errors_code = {}

for i, e in enumerate(sorted(errors - set(rare_errors))):
    errors_code[e] = i

In [50]:
errors_code

{'failed: AssertionError': 0,
 'failed: IndexError': 1,
 'failed: NameError': 2,
 'failed: RecursionError': 3,
 'failed: SyntaxError': 4,
 'failed: TypeError': 5,
 'failed: ValueError': 6,
 'passed': 7}

In [53]:
for e in rare_errors:
    errors_code[e] = 8

In [79]:
errors_code

{'failed: AssertionError': 0,
 'failed: IndexError': 1,
 'failed: NameError': 2,
 'failed: RecursionError': 3,
 'failed: SyntaxError': 4,
 'failed: TypeError': 5,
 'failed: ValueError': 6,
 'passed': 7,
 'OTHER': 8,
 'failed: ModuleNotFoundError': 8,
 'failed: ZeroDivisionError': 8,
 'failed: UnboundLocalError': 8,
 'failed: IndentationError': 8,
 'failed: AttributeError': 8,
 'timed out': 8}

In [87]:
errors_code = {'failed: AssertionError': 0,
 'failed: IndexError': 1,
 'failed: NameError': 2,
 'failed: RecursionError': 3,
 'failed: SyntaxError': 4,
 'failed: TypeError': 5,
 'failed: ValueError': 6,
 'passed': 7,
 'failed: ModuleNotFoundError': 8,
 'failed: ZeroDivisionError': 8,
 'failed: UnboundLocalError': 8,
 'failed: IndentationError': 8,
 'failed: AttributeError': 8,
 'timed out': 8}

In [None]:
#problems = pickle.load(open('/hcode/mbpp_problems.pickle', 'rb'))

In [None]:
#all_code = []
#
#for elem in samples:
#    for c in elem['completions']:
#        all_code.append(problems[elem['task_id']]['prompt'] + c + '[DONE]')

In [None]:
#pickle.dump(all_code, open('/mbpp_results2/code.pickle', 'wb'))

In [None]:
#pickle.dump(Y, open('/mbpp_results2/Y.pickle', 'wb'))

In [None]:
#data = [prompt_len, correct_prefixes[seq], torch.sum(torch.diag(a_matrix)[:prompt_len]).item(), torch.sum(torch.diag(a_matrix)[prompt_len:]).item()]
#data.extend([probs, calc_mtd(a_matrix, prompt_len, kind = 0), calc_mtd(a_matrix, prompt_len, kind = 1)])
#results[(task_id, seed, seq, layer, head)] = data

### Random split

In [88]:
Y = []

for elem in stream_jsonl('mbpp_evaluation.jsonl'):
    task_id = elem['task_id']
    task_num = int(task_id.split('/')[-1])

    #Y.extend([int(x[1]['passed']) for x in elem['results']])
    Y.extend([errors_code[x[1]['result']] for x in elem['results']])

In [89]:
from collections import Counter

In [90]:
c = Counter(Y)

In [58]:
rare_errors = []

for idx, cnt in sorted(c.items(), key = lambda x : x[1]):
    if cnt < 10:
        rare_errors.append(sorted(errors)[idx])

In [59]:
rare_errors

[]

In [14]:
np.sum(Y), np.mean(Y)

(12396, 4.9584)

In [15]:
len(Y) - np.sum(Y)

-9896

In [16]:
def get_probs():
    all_probs = []

    for task_num in range(10, 510):
        features = []
        task_id = 'MBPP/%d' % task_num
        for seed in range(1):
            for seq in range(5):

                prompt_len = results[(task_id, 0, 0, 0, 0)][0]
                answer_len = results[(task_id, seed, seq, 0, 0)][1] - prompt_len
                probs = results_code[(task_id, 0)][2][seq]
                probs = probs[prompt_len:prompt_len + answer_len]
                        
                all_probs.append(np.mean(np.log(probs)))
    
    return np.array(all_probs)

In [17]:
all_probs = get_probs()

In [18]:
def prepare_features():
    all_features = []

    for task_num in trange(10, 510):
        features = []
        task_id = 'MBPP/%d' % task_num
        for seed in range(1):
            for seq in range(5):

                prompt_len = results[(task_id, 0, 0, 0, 0)][0]
                answer_len = results[(task_id, seed, seq, 0, 0)][1] - prompt_len
                f_sample = [0*prompt_len, 0*answer_len]
                                
                for layer in range(32):
                    for head in range(32):
                        f = results[(task_id, seed, seq, layer, head)]
                        f = [f[2] / prompt_len, f[3] / answer_len, f[4][0] / prompt_len, f[4][1] / prompt_len, f[5][0] / answer_len, f[5][1] / answer_len]
                        f_sample.extend(f)
                        
                all_features.append(f_sample)
    
    return all_features

In [19]:
f_names = ['prompt_len', 'answer_len']
cnt = 2

f = ['prompt_self_att', 'answer_self_att', 'mtd_a_h0', 'mtd_a_h1', 'mtd_b_h0', 'mtd_b_h1']
        
for layer in range(32):
    for head in range(32):
        for f1 in f:
            f_names.append('%d_%s_%d_%d' % (cnt, f1, layer, head))
            cnt += 1

In [20]:
len(f_names)

6146

In [21]:
X = prepare_features()
X = np.array(X)
Y = np.array(Y)

100%|██████████| 500/500 [00:35<00:00, 14.13it/s]


In [22]:
X.shape, len(Y)

((2500, 6146), 2500)

In [23]:
pickle.dump((X,Y), open('mbpp_dataset.pickle', 'wb'))

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

### XGBoost

#### dataset transfer

In [None]:
X = all_probs.reshape((X.shape[0], 1))

In [None]:
X.shape

In [None]:
X = np.concatenate((X, all_probs.reshape((X.shape[0], 1))), axis = 1)

In [None]:
X.shape

In [None]:
%%time

#clf = xgb.XGBClassifier(tree_method="hist", max_bin = 64, n_estimators = 1000, eta = 0.1).fit(X, Y)
clf = xgb.XGBClassifier(tree_method="hist").fit(X, Y)
clf.save_model('mbpp.xgb')

In [None]:
clf.load_model('he.xgb')

In [None]:
roc_auc_score(Y, clf.predict_proba(X)[:,1])

In [None]:
f1_score(Y, clf.predict(X))

In [None]:
%%time

clf = xgb.XGBClassifier(tree_method="hist").fit(X_train, y_train)

In [None]:
y_pred = clf.predict_proba(X_test)[:,1]

In [None]:
roc_auc_score(y_test, y_pred)

### Cross-validate XGBoost

In [None]:
clf_obj = xgb.XGBClassifier(tree_method="hist", max_bin = 64)
cv_res = cross_validate(clf_obj, X, Y, scoring = 'roc_auc', cv = KFold(n_splits = 5, shuffle = True, random_state = 42))

In [None]:
np.mean(cv_res['test_score'])

### Feature importance

In [None]:
for elem in sorted(list(enumerate(zip(clf.feature_importances_, f_names))), key = lambda x : -abs(x[1][0])):
    print(elem)

In [None]:
fstr = np.zeros((32, 32))

for i in range(clf.feature_importances_.shape[0]):
    if i >= 2:
        layer, head = int(f_names[i].split('_')[-2]), int(f_names[i].split('_')[-1])
        fstr[layer, head] += clf.feature_importances_[i]

fig, ax = plt.subplots()

im = ax.imshow(fstr, origin = 'lower')

#ax.set_title('Pan on the colorbar to shift the color mapping\n'
#             'Zoom on the colorbar to scale the color mapping')

fig.colorbar(im, ax=ax, label='Feature impportance')

plt.show()

In [None]:
def plot5box(num_list):
    fig, axes = plt.subplots(1, 5, figsize=(15, 3))
    
    for i in range(5):
        plot_feature_box(axes[i], num_list[i])

def plot_feature_box(ax, num):
    A = list(zip(X_train[:, num], y_train))
    ax.boxplot(x = [[z[0] for z in list(filter(lambda x : x[1] == 0, A))], [z[0] for z in list(filter(lambda x : x[1] == 1, A))]])
    ax.legend(loc='best')
    ax.set_xticklabels(['0', '1'])
    #plt.show()


In [None]:
def plot5hist(num_list):
    fig, axes = plt.subplots(1, 5, figsize=(15, 3))
    
    for i in range(5):
        plot_feature(axes[i], num_list[i])

def plot_feature(ax, num):
    A = list(zip(X_train[:, num], y_train))
    ax.hist([z[0] for z in list(filter(lambda x : x[1] == 0, A))], bins = 20, alpha = 0.5, density = True, label = '0')
    ax.hist([z[0] for z in list(filter(lambda x : x[1] == 1, A))], bins = 20, alpha = 0.5, density = True, label = '1')
    #ax.set_xscale('log')
    #ax.set_yscale('log')
    ax.set_xlabel(f_names[num])
    ax.legend(loc='best')
    #plt.show()


In [None]:
plot5hist([3134, 3039, 2192, 2156, 2739])

In [None]:
plot5hist([6028, 3330, 3949, 2633, 2739])

In [None]:
import shap
explainer = shap.TreeExplainer(clf.get_booster())
shap_values = explainer.shap_values(X_test)

In [None]:
shap.summary_plot(shap_values, X_test, plot_type="bar", feature_names = f_names)

In [None]:
shap.summary_plot(shap_values, X_test, feature_names = f_names)

In [None]:
def plot5hist(num_list):
    fig, axes = plt.subplots(1, 5, figsize=(15, 3))
    
    for i in range(5):
        plot_feature_hist(axes[i], num_list[i])

def plot_feature_hist(ax, num):
    A = list(zip(X_train[:, num], y_train))
    
    B = [z[0] for z in list(filter(lambda x : x[1] == 0, A))]
    ax.hist(B, bins = np.arange(-5, 15.5, 0.25), alpha = 0.5, density = True, label = '0')
    #ax.hist(B, alpha = 0.5, density = True, label = '0')
    
    B = [z[0] for z in list(filter(lambda x : x[1] == 1, A))]
    ax.hist(B, bins = np.arange(-5, 15.5, 0.25), alpha = 0.5, density = True, label = '0')    
    #ax.hist(B, alpha = 0.5, density = True, label = '1')
    
    #ax.set_xscale('log')
    #ax.set_yscale('log')
    ax.set_xlim(-1, 5.1)
    ax.set_xlabel(f_names[num])
    ax.legend(loc='best')
    #plt.show()

In [None]:
plot5hist([5751, 3995, 2897, 3039, 2831])

### Single feature correlations

In [None]:
data = []

for i in range(X_train.shape[1]):
    data.append((kendalltau(X_train[:, i], y_train).statistic, i, f_names[i]))

In [None]:
data = list(filter(lambda x : not np.isnan(x[0]), data))

In [None]:
sorted(data, key = lambda x : -np.abs(x[0]))

In [None]:
X_train[:, 2633]

In [None]:
plot5hist([2633, 2897, 5231, 3647, 3629])

In [None]:
plot5hist([6028, 3322, 4876, 3688, 3796])

In [None]:
plot5box([6028, 3322, 4876, 3688, 3796])

In [None]:
plot5hist([2633, 5231, 2897, 3167, 3995])

In [None]:
plot5hist([6042, 6144, 2897, 3167, 3995])

In [None]:
plot5box([2633, 5231, 2897, 3167, 3995])

In [None]:
data_he = pickle.load(open('he_kendall.pickle', 'rb'))

In [None]:
data_mbpp_f = list(filter(lambda x : abs(x[0])>.20, data))

In [None]:
data_he_f = list(filter(lambda x : abs(x[0])>.20, data_he))

In [None]:
common = set([x[2] for x in data_mbpp_f]) & set([x[2] for x in data_he_f])

In [None]:
len(common)

In [None]:
data_mbpp_f = list(filter(lambda x : x[2] in common, data_mbpp_f))
data_he_f = list(filter(lambda x : x[2] in common, data_he_f))

In [None]:
len(data_mbpp_f), len(data_he_f)

In [None]:
plt.scatter(x = [x[0] for x in data_he_f], y = [x[0] for x in data_mbpp_f], s = 2)

In [None]:
sorted([(-abs(x[0]), x[1], x[2]) for x in data_mbpp_f])

In [None]:
def plot5hist(num_list):
    fig, axes = plt.subplots(1, 5, figsize=(15, 3))
    
    for i in range(5):
        plot_feature_hist(axes[i], num_list[i])

def plot_feature_hist(ax, num):
    A = list(zip(X[:, num], Y))


    B = [z[0] for z in list(filter(lambda x : x[1] == 1, A))]
    #ax.hist(B, bins = np.arange(-15.5, 15.5, 0.25), alpha = 0.5, density = True, label = '1')
    ax.hist(B, alpha = 0.5, density = True, label = '1')
    
    B = [z[0] for z in list(filter(lambda x : x[1] == 0, A))]
    ax.hist(B, alpha = 0.5, density = True, label = '0')

    
    #ax.set_xscale('log')
    #ax.set_yscale('log')
    ax.set_xlim(-5, 5.1)
    ax.set_xlabel(f_names[num])
    ax.legend(loc='best')
    #plt.show()

In [None]:
X_train[:,3039]

In [None]:
plot5hist([3039, 5231, 3629, 4053, 3047])

In [None]:
# 3047, 'mtd_a_1_15_27' --- 3629, 'mtd_a_1_18_28' --- 5231, 'mtd_a_1_27_7'

In [None]:
plot5hist([2633, 5231, 3167, 3629, 3647])

In [None]:
plot5box([2633, 5231, 3167, 3629, 2897])

In [None]:
for i in range(X[:, 3047].shape[0]):

    task_num = 10 + i // 5
    
    if X[i, 3047] >= 1.5:
        print(i, Y[i],  X[i, 3047], run_stat[task_num], task_num, i % 5)

In [None]:
data = []

for i in range(X_train.shape[1]):
    data.append((max(roc_auc_score(y_train, X_train[:, i]), roc_auc_score(y_train, -X_train[:, i])), i, f_names[i]))

In [None]:
sorted(data, key = lambda x : -np.abs(x[0]))

In [None]:
list(filter(lambda x : x [1] == 6144, [x for x in data]))

In [None]:
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

def plot5hist(num_list):
    fig, axes = plt.subplots(1, 5, figsize=(15, 3))
    
    for i in range(5):
        plot_feature_hist(axes[i], num_list[i])

def plot_feature_hist(ax, num):
    A = list(zip(X_train[:, num], y_train))

    B = [z[0] for z in list(filter(lambda x : x[1] == 1, A))]
    ax.hist(B, bins = np.arange(0., 1., 0.05), alpha = 0.5, density = True, label = '1')
    
    B = [z[0] for z in list(filter(lambda x : x[1] == 0, A))]
    ax.hist(B, bins = np.arange(0., 1., 0.05), alpha = 0.5, density = True, label = '0')

    #ax.set_xscale('log')
    #ax.set_yscale('log')
    #ax.set_xlim(-5, 5.1)
    idx, mtd, kind, h, layer, head = f_names[num].split('_')
    h = h[1]
    if mtd == 'mtd':
        name = 'MTD$_%s$(%s),layer=%s,head=%s' % (h, "P,G" if kind == 'a' else 'G,P', layer, head)
    else:
        name = 'sum_diag(%s),layer=%s,head=%s' % ("P" if kind == 'P' else 'G', layer, head)
    ax.set_xlabel(name)
    ax.legend(loc='best')
    #plt.show()

In [None]:
plot5hist([2633, 5231, 3647, 3039, 2499])

In [None]:
plot5hist([1123, 5231, 3445, 3039, 2499])

In [None]:
plot5hist([6028, 3445, 6144, 3039, 2499])

In [None]:
plot5hist([5231, 2633, 3647, 1123, 3445])

In [None]:
accuracy_score(y_test, y_pred > 0.5)

In [None]:
precision, recall, thresholds = precision_recall_curve(y_test, y_pred)

In [None]:
from matplotlib import pyplot as plt

In [None]:
plt.plot(recall, precision)
plt.ylim(0, 1.01)
plt.xlim(0, 1.01)

### Split by task

In [148]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [153]:
def calc_quality_group(train_idx, test_idx):

    X_train = []
    X_test = []
    y_train = []
    y_test = []
    probs_train = []
    probs_test = []

    test_info = []
    test_task_nums = set()
    
    for i in range(X.shape[0]):
        if i in train_idx:
            X_train.append(X[i])
            y_train.append(Y[i])
            probs_train.append(all_probs[i])
        else:
            X_test.append(X[i])
            y_test.append(Y[i])
            test_info.append(i // 5)
            probs_test.append(all_probs[i])

    #
    # scaling is optional
    #
    #scaler = MinMaxScaler()
    #scaler.fit(X_train)
    #X_train = scaler.transform(X_train)
    #X_test = scaler.transform(X_test)

    X_train = np.array(X_train)
    X_test = np.array(X_test)
    #X_train = np.array(probs_train).reshape(len(probs_train), 1)
    #X_test = np.array(probs_test).reshape(len(probs_test), 1)
    
    print(X_train.shape)
    print(X_test.shape)

    #clf = MLPClassifier(hidden_layer_sizes=(128, 128, ), max_iter = 1000, early_stopping = True)
    #clf = xgb.XGBClassifier(tree_method="hist", max_bin = 64, n_estimators = 1000, eta = 0.1)
    clf = xgb.XGBClassifier(tree_method="hist", max_bin = 64)
    clf.fit(X_train, y_train)    
    #print(clf.n_iter_)
    y_pred = clf.predict_proba(X_test)[:,1]
    y_pred_class = clf.predict(X_test)

    cm = confusion_matrix(y_test, y_pred_class)
    #y_pred = probs_test
    #y_pred_class = [int(x > 0.5) for x in probs_test]

    return f1_score(y_test, y_pred_class, average = 'weighted'), cm, accuracy_score(y_test, y_pred_class)

In [154]:
groups = []

for i in range(500):
    for j in range(5):
        groups.append(i)

kf = StratifiedGroupKFold(n_splits = 5, shuffle = True, random_state = 42)

splits = []

for train_idx, test_idx in kf.split(range(X.shape[0]), Y, groups):
    splits.append((train_idx, test_idx))

In [155]:
#pickle.dump(splits, open('/mbpp_results2/splits.pickle', 'wb'))

In [156]:
groups = []

for i in range(500):
    for j in range(5):
        groups.append(i)

kf = StratifiedGroupKFold(n_splits = 5, shuffle = True, random_state = 42)
res_cm = []
res_f1 = []
res_acc = []
all_task_res = []

for train_idx, test_idx in kf.split(range(X.shape[0]), Y, groups):
    #print("%s %s" % (train_idx, test_idx))
    f1, cm, acc = calc_quality_group(train_idx, test_idx)
    res_f1.append(f1)
    res_cm.append(cm)
    res_acc.append(acc)

(2000, 6146)
(500, 6146)
(2000, 6146)
(500, 6146)
(2000, 6146)
(500, 6146)
(2000, 6146)
(500, 6146)
(2000, 6146)
(500, 6146)


In [158]:
np.mean(res_f1), np.std(res_f1)

(0.6236372475628997, 0.024165156149113715)

In [159]:
np.mean(res_acc), np.std(res_acc)

(0.6599999999999999, 0.019514097468240767)

In [160]:
T = res_cm[0] + res_cm[1] + res_cm[2] + res_cm[3] + res_cm[4]

In [161]:
T

array([[1089,    0,    0,    0,    1,    1,    0,  240,    0],
       [  44,    1,    0,    0,    0,    0,    0,    1,    0],
       [  52,    0,    4,    0,    0,    0,    0,   18,    0],
       [  13,    0,    0,    0,    0,    0,    0,    0,    0],
       [  13,    0,    0,    0,    0,    0,    0,    1,    1],
       [  68,    0,    0,    0,    0,    7,    0,   18,    0],
       [  21,    0,    0,    0,    0,    0,    0,    3,    0],
       [ 329,    0,    0,    0,    0,    1,    0,  549,    0],
       [  17,    0,    1,    0,    2,    0,    0,    5,    0]])

In [126]:
T_sum = np.sum(T, axis = 1)

In [132]:
np.sum(T[0]) - T[i][7]

1331

In [163]:
for i in range(9):
    if i != 7:
        print((np.sum(T[i]) - T[i][7]) / T_sum[i], list(filter(lambda x: x[1] == i, [x for x in errors_code.items()])))
    else:
        print((np.sum(T[i]) - T[i][0]) / T_sum[i], list(filter(lambda x: x[1] == i, [x for x in errors_code.items()])))

0.8196844477836214 [('failed: AssertionError', 0)]
0.9782608695652174 [('failed: IndexError', 1)]
0.7567567567567568 [('failed: NameError', 2)]
1.0 [('failed: RecursionError', 3)]
0.9333333333333333 [('failed: SyntaxError', 4)]
0.8064516129032258 [('failed: TypeError', 5)]
0.875 [('failed: ValueError', 6)]
0.6257110352673493 [('passed', 7)]
0.8 [('failed: ModuleNotFoundError', 8), ('failed: ZeroDivisionError', 8), ('failed: UnboundLocalError', 8), ('failed: IndentationError', 8), ('failed: AttributeError', 8), ('timed out', 8)]


In [139]:
list(filter(lambda x: x[1] == 5, [x for x in errors_code.items()]))

[('failed: TypeError', 5)]

In [130]:
T[1] / T_sum[1]

array([0.95652174, 0.02173913, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.02173913, 0.        ])

In [124]:
T / np.sum(T, axis = 1).T

array([[0.81818182, 0.        , 0.        , 0.        , 0.06666667,
        0.01075269, 0.        , 0.27303754, 0.        ],
       [0.03305785, 0.02173913, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.00113766, 0.        ],
       [0.03906837, 0.        , 0.05405405, 0.        , 0.        ,
        0.        , 0.        , 0.02047782, 0.        ],
       [0.00976709, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [0.00976709, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.00113766, 0.04      ],
       [0.05108941, 0.        , 0.        , 0.        , 0.        ,
        0.07526882, 0.        , 0.02047782, 0.        ],
       [0.01577761, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.00341297, 0.        ],
       [0.24718257, 0.        , 0.        , 0.        , 0.        ,
        0.01075269, 0.        , 0.62457338, 0.        ],


In [119]:
np.sum(T, axis = 1)

array([1331,   46,   74,   13,   15,   93,   24,  879,   25])

In [111]:
reduce 

NameError: name 'reduce' is not defined