In [None]:
import numpy as np
import torch

# features = np.load("bert-base-cased_in_domain_train.npy")

In [None]:
print(torch.cuda.is_available())

True


In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m96.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m110.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [None]:
def cutoff_matrix(matrix, ntokens):
    """
    Return the row-normalized top-left ``ntokens x ntokens`` submatrix.

    The input array is left untouched: slicing yields a view, so an in-place
    division would silently rewrite the caller's data (and would fail for
    integer arrays). The division is therefore done out of place.

    Args:
        matrix (np.array[float, float]): square weight matrix.
        ntokens (int): number of leading rows/columns to keep.
    Returns:
        np.array[float, float]: submatrix whose rows each sum to 1.
    """
    submatrix = matrix[:ntokens, :ntokens]
    return submatrix / submatrix.sum(axis=1, keepdims=True)

In [None]:
from copy import deepcopy
from time import sleep

import numpy as np
import networkx as nx
from sklearn.metrics.pairwise import pairwise_distances
from scipy import sparse
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm

def get_filtered_mat_list(adj_matrix, thresholds_array, ntokens):
    """
    Converts an adjacency matrix with real weights into a list of binary
    matrices, one per threshold.

    The matrix is first cut to the first `ntokens` tokens and row-normalized
    (see cutoff_matrix). For each threshold, weights below the threshold are
    set to 0 and the remaining weights are set to 1.

    Args:
        adj_matrix (np.array[float, float])
        thresholds_array (iterable[float])
        ntokens (int)
    Returns:
        filtered_matricies (list[np.array[int, int]]), dtype np.int8
    """
    # The normalized matrix does not depend on the threshold, so it is built
    # once. The copy protects the caller's array in case cutoff_matrix
    # normalizes in place.
    normalized = cutoff_matrix(adj_matrix.copy(), ntokens)
    return [(normalized >= thr).astype(np.int8) for thr in thresholds_array]

def adj_m_to_nx_list(adj_matrix, thresholds_array, ntokens, no_mat_output=False):
    """
    Converts an adjacency matrix into a list of unweighted digraphs (one per
    threshold), using the filtering done by get_filtered_mat_list.

    Args:
        adj_matrix (np.array[float, float])
        thresholds_array (iterable[float])
        ntokens (int)
        no_mat_output (bool): if True, an empty list is returned in place of
            the filtered matrices.
    Returns:
        nx_graphs_list (list[nx.MultiDiGraph])
        filt_mat_list (list[np.array[int, int]]), or [] if no_mat_output
    """
    filt_mat_list = get_filtered_mat_list(adj_matrix, thresholds_array, ntokens)
    # nx.from_numpy_matrix was removed in networkx 3.0; from_numpy_array is
    # the supported equivalent for plain ndarrays.
    nx_graphs_list = [
        nx.from_numpy_array(np.asarray(mat), create_using=nx.MultiDiGraph())
        for mat in filt_mat_list
    ]
    if no_mat_output:
        return nx_graphs_list, []
    return nx_graphs_list, filt_mat_list

def adj_ms_to_nx_lists(adj_matricies, \
                       thresholds_array, \
                       ntokens_array, \
                       verbose=True, \
                       no_mat_output=False):
    """
    Executes adj_m_to_nx_list for each matrix in adj_matricies and collects
    the results. If verbose==True, shows a progress bar.

    Args:
        adj_matricies (sequence of np.array[float, float]): one matrix per example
        thresholds_array (iterable[float])
        ntokens_array (sequence[int]): token count for each example
        verbose (bool)
        no_mat_output (bool): forwarded to adj_m_to_nx_list
    Returns:
        graph_lists (list[list[nx.MultiDiGraph]])
        filt_mat_lists (list[list[np.array[int,int]]])
    """
    graph_lists = []
    filt_mat_lists = []

    indices = range(len(adj_matricies))
    if verbose:
        indices = tqdm(indices, desc="Calc graphs list")
    for i in indices:
        g_list, filt_mat_list = adj_m_to_nx_list(adj_matricies[i],
                                                 thresholds_array,
                                                 ntokens_array[i],
                                                 no_mat_output=no_mat_output)
        graph_lists.append(g_list)
        # Append this example's matrices (the original appended the outer
        # list to itself, producing a self-referential list).
        filt_mat_lists.append(filt_mat_list)

    return graph_lists, filt_mat_lists

def count_stat(g_listt_j, function=nx.weakly_connected_components, cap=500):
    """
    Counts the items yielded by `function(g_listt_j)` (for example, the
    generator of simple cycles of a digraph), stopping early once `cap`
    items have been seen.

    Args:
        g_listt_j (nx.MultiDiGraph): graph passed to `function`
        function (function): callable returning an iterable of structures
        cap (int): counting stops as soon as this many items were counted
    Returns:
        stat_amount (int)
    """
    stat_amount = 0
    # enumerate(start=1) keeps the running count in stat_amount itself.
    for stat_amount, _ in enumerate(function(g_listt_j), start=1):
        if stat_amount >= cap:
            break
    return stat_amount

def count_weak_components(g_listt_j, cap=500):
    """Counts weakly connected components of the graph (at most `cap`)."""
    return count_stat(g_listt_j, nx.weakly_connected_components, cap)

def count_strong_components(g_listt_j, cap=500):
    """Counts strongly connected components of the graph (at most `cap`)."""
    return count_stat(g_listt_j, nx.strongly_connected_components, cap)

def count_simple_cycles(g_listt_j, cap=500):
    """Counts simple cycles of the graph (at most `cap`)."""
    return count_stat(g_listt_j, nx.simple_cycles, cap)

def dim_connected_components(graph_lists, strong=False, verbose=False, cap=500):
    """
    Counts connected components for every graph in a list of lists of
    digraphs. With strong==True strongly connected components are counted,
    otherwise weakly connected ones. With verbose==True a progress bar is
    shown.

    Args:
        graph_lists (list[list[nx.MultiDiGraph]])
        strong (bool)
        verbose (bool)
        cap (int): upper bound on each individual count
    Returns:
        w_lists (list[list[int]]), same nesting as graph_lists
    """
    counter = count_strong_components if strong else count_weak_components
    sample_ids = range(len(graph_lists))
    if verbose:
        sample_ids = tqdm(sample_ids, desc="Calc weak comp")
    return [
        [counter(graph, cap=cap) for graph in graph_lists[i]]
        for i in sample_ids
    ]

def dim_simple_cycles(graph_lists, verbose, cap=500):
    """
    Counts simple cycles for every graph in a list of lists of digraphs.
    With verbose==True a progress bar is shown.

    Args:
        graph_lists (list[list[nx.MultiDiGraph]])
        verbose (bool)
        cap (int): upper bound on each individual count
    Returns:
        c_lists (list[list[int]]), same nesting as graph_lists
    """
    sample_ids = range(len(graph_lists))
    if verbose:
        sample_ids = tqdm(sample_ids, desc="Calc cycles")
    return [
        [count_simple_cycles(graph, cap=cap) for graph in graph_lists[i]]
        for i in sample_ids
    ]

def b0_b1(graph_lists, verbose):
    """
    Computes the first two Betti numbers for every graph in a list of lists
    of digraphs. Each digraph is first turned into a simple undirected graph;
    b0 is its number of connected components and b1 = edges - nodes + b0.
    With verbose==True a progress bar is shown.

    Args:
        graph_lists (list[list[nx.MultiDiGraph]])
        verbose (bool)
    Returns:
        b0_lists (list[list[int]])
        b1_lists (list[list[int]])
    """
    sample_ids = range(len(graph_lists))
    if verbose:
        sample_ids = tqdm(sample_ids, desc="Calc b0, b1")

    b0_lists, b1_lists = [], []
    for i in sample_ids:
        b0_row, b1_row = [], []
        for digraph in graph_lists[i]:
            undirected = nx.Graph(digraph.to_undirected())
            n_components = nx.number_connected_components(undirected)
            n_edges = undirected.number_of_edges()
            n_nodes = undirected.number_of_nodes()
            b0_row.append(n_components)
            b1_row.append(n_edges - n_nodes + n_components)
        b0_lists.append(b0_row)
        b1_lists.append(b1_row)
    return b0_lists, b1_lists

def edges_f(graph_lists, verbose):
    """
    Counts the edges of every graph in a list of lists of digraphs.
    With verbose==True a progress bar is shown.

    Args:
        graph_lists (list[list[nx.MultiDiGraph]])
        verbose (bool)
    Returns:
        e_lists (list[list[int]]), same nesting as graph_lists
    """
    sample_ids = range(len(graph_lists))
    if verbose:
        sample_ids = tqdm(sample_ids, desc="Calc edges amount")
    return [
        [graph.number_of_edges() for graph in graph_lists[i]]
        for i in sample_ids
    ]

def v_degree_f(graph_lists, verbose):
    """
    Computes the average vertex degree of every graph in a list of lists of
    digraphs (sum of the degrees reported by `graph.degree()` divided by the
    number of vertices). With verbose==True a progress bar is shown.

    Args:
        graph_lists (list[list[nx.MultiDiGraph]])
        verbose (bool)
    Returns:
        v_lists (list[list[float]]), same nesting as graph_lists
    """
    sample_ids = range(len(graph_lists))
    if verbose:
        sample_ids = tqdm(sample_ids, desc="Calc average vertex degree")

    v_lists = []
    for i in sample_ids:
        averages = []
        for graph in graph_lists[i]:
            degree_values = [degree for _, degree in graph.degree()]
            averages.append(sum(degree_values) / float(len(degree_values)))
        v_lists.append(averages)
    return v_lists

def H_1_statistics_by_thresholds(thresholds_array, \
                                 c_lists, \
                                 lang_list, \
                                 layer, \
                                 head):
    """
    Shows statistics of topological invariants from c_lists for each language
    of a multi-language corpus: a grid of bar plots with one row per
    threshold and one column per language.

    Args:
        thresholds_array (iterable[float])
        c_lists: mapping indexed by (layer, head), then by language index.
            NOTE(review): each per-language entry is assumed to be shaped
            (n_examples, n_thresholds), i.e. what dim_simple_cycles returns;
            confirm against the caller.
        lang_list (list[str])
        layer (int)
        head (int)
    Returns:
        None
    """

    T = len(thresholds_array)
    L = len(lang_list)
    # squeeze=False keeps `axs` two-dimensional even when T == 1 or L == 1.
    fig, axs = plt.subplots(T, L, figsize=(16,16), squeeze=False)

    colors = ['b', 'c', 'g', 'y', 'r', 'm']

    for l in range(L):
        # Each language is assigned a color from the colors list.
        current_color = colors[l % len(colors)]
        current_data = np.asarray(c_lists[(layer, head)][l])
        max_amount = 0
        for t in range(T):
            values = current_data[:, t]
            # bar() needs explicit x positions; one bar per example.
            axs[t, l].bar(np.arange(len(values)), values, color=current_color)
            axs[t, l].set_title(lang_list[l])
            if len(values):
                max_amount = max(max_amount, values.max())
        if max_amount > 0:
            # Shared y-range per language so thresholds are comparable.
            for t in range(T):
                axs[t, l].set_ylim([0, max_amount * 1.1])

    plt.show()


def count_top_stats(adj_matricies,
                    thresholds_array,
                    ntokens_array,
                    stats_to_count={"s", "e", "c", "v", "b0b1"},
                    stats_cap=500,
                    sleep_time=0,
                    verbose=False):
    """
    The main function for calculating topological invariants. Combines the
    functions above: for every (layer, head) pair the attention matrices of
    all examples are turned into thresholded graphs and the requested
    statistics are computed on them.

    Args:
        adj_matricies (np.array): indexed as
            [example, layer, head, token, token]
        thresholds_array (list[float])
        ntokens_array (sequence[int]): token count for each example
        stats_to_count (set[str]): any of "s" (strong components), "w" (weak
            components), "e" (edges), "v" (average vertex degree), "c"
            (simple cycles), "b0b1" (first two Betti numbers, two features)
        stats_cap (int): cap for the component and cycle counts
        sleep_time (float): seconds to sleep before processing each head
        verbose (bool): currently unused; the per-layer progress bar is
            always shown
    Returns:
        stats_tuple_lists_array (np.array of np.float16): indexed as
            [layer, head, feature, example, threshold]; features appear in
            the order s, w, e, v, c, b0, b1 (requested ones only).
            NOTE(review): float16 cannot represent every integer above 2048,
            so large edge counts are rounded.
    """
    stats_tuple_lists_array = []

    for layer_of_interest in tqdm(range(adj_matricies.shape[1])):
        stats_tuple_lists_array.append([])
        for head_of_interest in range(adj_matricies.shape[2]):
            sleep(sleep_time)
            adj_ms = adj_matricies[:,layer_of_interest,head_of_interest,:,:]
            # Only the graphs are used below, so the filtered matrices are
            # not retained.
            g_lists, _ = adj_ms_to_nx_lists(adj_ms,
                                            thresholds_array=thresholds_array,
                                            ntokens_array=ntokens_array,
                                            verbose=False,
                                            no_mat_output=True)
            feat_lists = []
            if "s" in stats_to_count:
                feat_lists.append(dim_connected_components(g_lists,
                                                           strong=True,
                                                           verbose=False,
                                                           cap=stats_cap))
            if "w" in stats_to_count:
                feat_lists.append(dim_connected_components(g_lists,
                                                           strong=False,
                                                           verbose=False,
                                                           cap=stats_cap))
            if "e" in stats_to_count:
                feat_lists.append(edges_f(g_lists, verbose=False))
            if "v" in stats_to_count:
                feat_lists.append(v_degree_f(g_lists, verbose=False))
            if "c" in stats_to_count:
                feat_lists.append(dim_simple_cycles(g_lists,
                                                    verbose=False,
                                                    cap=stats_cap))
            if "b0b1" in stats_to_count:
                b0_lists, b1_lists = b0_b1(g_lists, verbose=False)
                feat_lists.append(b0_lists)
                feat_lists.append(b1_lists)
            stats_tuple_lists_array[-1].append(tuple(feat_lists))

    stats_tuple_lists_array = np.asarray(stats_tuple_lists_array,
                                         dtype=np.float16)
    return stats_tuple_lists_array

In [None]:
# Fall back to CPU so the notebook still runs on machines without a GPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
import os
import re
import numpy as np

from tqdm.notebook import tqdm
from transformers import BertTokenizer, BertModel

def grab_attention_weights(model, tokenizer, sentences, MAX_LEN, device='cuda:0'):
    """
    Runs the model on a batch of sentences and returns its attention maps.

    Each sentence is cleaned with text_preprocessing, then the batch is
    tokenized with truncation and padding to exactly MAX_LEN tokens.

    Args:
        model: callable returning a mapping with an 'attentions' entry
            (one tensor per layer), e.g. a BERT model created with
            output_attentions=True
        tokenizer: tokenizer providing batch_encode_plus
        sentences (iterable[str])
        MAX_LEN (int): length to truncate/pad to
        device (str): device the inputs are moved to; the model must already
            be on that device
    Returns:
        np.array of np.float16, indexed as
        [layer, sample, head, n_token, n_token]
    """
    inputs = tokenizer.batch_encode_plus(
        [text_preprocessing(s) for s in sentences],
        return_tensors='pt',
        add_special_tokens=True,
        max_length=MAX_LEN,        # Max length to truncate/pad
        padding='max_length',      # replaces the deprecated pad_to_max_length=True
        truncation=True,
    )
    input_ids = inputs['input_ids'].to(device)
    token_type_ids = inputs["token_type_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)
    # Inference only: without no_grad every call keeps the autograd graph
    # alive and wastes memory.
    with torch.no_grad():
        attention = model(input_ids, attention_mask, token_type_ids)['attentions']
    # layer X sample X head X n_token X n_token
    attention = np.asarray([layer.detach().cpu().numpy() for layer in attention],
                           dtype=np.float16)

    return attention

def grab_weights_for_all(reviews,
                         model_name='bert-base-multilingual-cased',
                         layer_of_interest=-1,
                         head_of_interest=0,
                         recalculate=True,
                         output_file='adj_matricies.npy',
                         max_len=128,
                         device=None
                         ):
    """
    Returns attention weights (matrices) for each sentence from reviews, for
    the chosen layer and head. If recalculate==False and output_file exists,
    the weights are loaded from that .npy file. Otherwise they are computed
    and saved into output_file.

    Args:
        reviews (list[str])
        model_name (str)
        layer_of_interest (int)
        head_of_interest (int)
        recalculate (bool)
        output_file (str)
        max_len (int): length every sentence is truncated/padded to
        device (str or None): device to run the model on; None picks
            'cuda:0' when available, otherwise 'cpu'
    Returns:
        np.array[float, float, float], indexed as [sentence, token, token]
    """
    # Check the cache first so the model is not loaded when it is not needed.
    if not recalculate and os.path.isfile(output_file):
        # NOTE(review): allow_pickle=True can execute arbitrary code when the
        # file is untrusted; only load files produced by this notebook.
        adj_matricies = np.load(output_file, allow_pickle=True)
        print("Загружены ранее вычисленные веса.")
        return adj_matricies

    if device is None:
        device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

    model = BertModel.from_pretrained(model_name, output_attentions=True).to(device)
    model.eval()
    tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=False)

    adj_matricies = []
    for review in tqdm(reviews, desc="Вычисление весов"):
        # grab_attention_weights expects a batch, hence the one-element list;
        # the result is indexed [layer][sample][head].
        attention_w = grab_attention_weights(model, tokenizer, [review], max_len, device)
        adj_matricies.append(attention_w[layer_of_interest][0][head_of_interest])

    adj_matricies = np.asarray(adj_matricies)
    np.save(output_file, adj_matricies)

    print("Результаты вычисления сохранены в файл", output_file, ".")

    return adj_matricies

def text_preprocessing(text):
    """
    Cleans a raw text before tokenization.

    Steps, in order:
    - every '@...' run that is followed by a whitespace character is
      replaced by a single space (a mention at the very end of the text
      is therefore kept);
    - '&amp;' is replaced by '&';
    - runs of whitespace are collapsed to one space and the result is
      stripped.

    Args:
        text (str): a string to be processed.
    Returns:
        text (str): the processed string.
    """
    substitutions = (
        (r'(@.*?)[\s]', ' '),
        (r'&amp;', '&'),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

In [None]:
from collections import defaultdict
import itertools
import re
import subprocess

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from transformers import BertTokenizer, BertModel, BertForSequenceClassification

In [None]:
import warnings

warnings.filterwarnings('ignore')

In [None]:
np.random.seed(42)

In [None]:
# Sequence length every text is truncated/padded to.
max_tokens_amount = 128
# Cap for the component/cycle counts. NOT applicable to Betti numbers.
stats_cap = 500

layers_of_interest = list(range(12))
stats_name = "s_e_v_c_b0b1"

thresholds_array = [0.025, 0.05, 0.1, 0.25, 0.5, 0.75]
thrs = len(thresholds_array)

model_path = "bert-base-uncased"
tokenizer_path = model_path

In [None]:
subset = "test_5k"           # .csv file with the texts, for which we count topological features
input_dir = "small_gpt_web/"  # Name of the directory with .csv file
output_dir = "small_gpt_web/" # Name of the directory with calculations results

prefix = output_dir + subset

model_tag = model_path.split("/")[-1]
n_layers = len(layers_of_interest)

attentions_dir = output_dir + 'attentions/'
features_dir = output_dir + 'features/'
# np.save does not create missing directories, so make sure they exist.
os.makedirs(attentions_dir, exist_ok=True)
os.makedirs(features_dir, exist_ok=True)

# Name prefix of the files with attention matrices weights
r_file = (f"{attentions_dir}{subset}_all_heads_{n_layers}_layers_MAX_LEN_"
          f"{max_tokens_amount}_{model_tag}")

# Name of the file for topological features array
stats_file = (f"{features_dir}{subset}_all_heads_{n_layers}_layers_{stats_name}"
              f"_lists_array_{thrs}_thrs_MAX_LEN_{max_tokens_amount}"
              f"_{model_tag}.npy")

In [None]:
stats_file

'small_gpt_web/features/test_5k_all_heads_12_layers_s_e_v_c_b0b1_lists_array_6_thrs_MAX_LEN_128_bert-base-uncased.npy'

In [None]:
r_file

'small_gpt_web/attentions/test_5k_all_heads_12_layers_MAX_LEN_128_bert-base-uncased'

In [None]:
try:
    data = pd.read_csv("/content/sample_data/test_5k.csv", nrows=100).reset_index(drop=True)
except FileNotFoundError:
    # Fall back to the .tsv in input_dir only when the Colab sample file is
    # missing; any other error (parse error, permissions, ...) should surface.
    # NOTE(review): this branch names the label column "labels", while the
    # CSV branch has "label" - confirm which one downstream code expects.
    data = pd.read_csv(input_dir + subset + ".tsv", delimiter="\t", header=None)
    data.columns = ["0", "labels", "2", "sentence"]

In [None]:
print(data.shape)

(100, 6)


In [None]:
data.head()

Unnamed: 0.1,Unnamed: 0,id,ended,length,sentence,label
0,4722,259722,True,231,The Learning Co.\n\nDeveloped by\n\nThe Learni...,natural
1,2757,257813,True,563,Bush doubles down on foreign policy on Saturda...,generated
2,2194,257194,True,62,Here are six interesting things you need to kn...,natural
3,817,255817,True,293,Introduction\n\nWe would like to thank Antec f...,natural
4,3886,258886,False,1024,"ELKRIDGE, Md.—A group called ""Muslims for Trum...",natural


In [None]:
sentences = data['sentence']
# Whitespace-separated word count per example, computed once. The previous
# expression replaced every word character with a space before splitting,
# so it did not count words.
word_counts = [len(text.split()) for text in sentences]
print("Average amount of words in example:", np.mean(word_counts))
print("Max. amount of words in example:", np.max(word_counts))
print("Min. amount of words in example:", np.min(word_counts))

Average amount of words in example: 2863.33
Max. amount of words in example: 5057
Min. amount of words in example: 130


In [None]:
def get_token_length(batch_texts):
    """
    Returns, for each text, the number of tokens before the first padding
    token after tokenizing with truncation/padding to MAX_LEN (MAX_LEN when
    the row contains no padding at all).

    Uses the module-level `tokenizer` and `MAX_LEN`.

    Args:
        batch_texts (iterable[str])
    Returns:
        n_tokens (list[int])
    """
    inputs = tokenizer.batch_encode_plus(
        list(batch_texts),
        return_tensors='pt',
        add_special_tokens=True,
        max_length=MAX_LEN,        # Max length to truncate/pad
        padding='max_length',      # replaces the deprecated pad_to_max_length=True
        truncation=True,
    )
    input_ids = inputs['input_ids'].numpy()
    is_pad = input_ids == tokenizer.pad_token_id
    # argmax over a boolean row gives the index of the first True (first pad);
    # rows without any pad fall back to MAX_LEN.
    first_pad = is_pad.argmax(axis=1)
    n_tokens = np.where(is_pad.any(axis=1), first_pad, MAX_LEN)
    return n_tokens.tolist()

In [None]:
MAX_LEN = max_tokens_amount
tokenizer = BertTokenizer.from_pretrained(tokenizer_path, do_lower_case=True)

In [None]:
data['tokenizer_length'] = get_token_length(data['sentence'].values)

In [None]:
ntokens_array = data['tokenizer_length'].values

In [None]:
data['label'] = data['label']

Unnamed: 0.1,Unnamed: 0,id,ended,length,sentence,label,tokenizer_length
0,4722,259722,True,231,The Learning Co.\n\nDeveloped by\n\nThe Learni...,natural,128
1,2757,257813,True,563,Bush doubles down on foreign policy on Saturda...,generated,128
2,2194,257194,True,62,Here are six interesting things you need to kn...,natural,71
3,817,255817,True,293,Introduction\n\nWe would like to thank Antec f...,natural,128
4,3886,258886,False,1024,"ELKRIDGE, Md.—A group called ""Muslims for Trum...",natural,128


In [None]:
from math import ceil

batch_size = 10 # batch size
# Derived from the data so that every batch holds at most batch_size
# sentences, whatever the dataset size (at least one batch is kept because
# np.array_split requires a positive number of sections).
number_of_batches = max(1, ceil(len(data['sentence']) / batch_size))
DUMP_SIZE = 10 # number of batches to be dumped - was 100
batched_sentences = np.array_split(data['sentence'].values, number_of_batches)
number_of_files = ceil(number_of_batches / DUMP_SIZE)
adj_matricies = []
adj_filenames = []
assert number_of_batches == len(batched_sentences) # sanity check

In [None]:
device='cuda'
model = BertForSequenceClassification.from_pretrained(model_path, output_attentions=True)
tokenizer = BertTokenizer.from_pretrained(tokenizer_path, do_lower_case=True)
model = model.to(device)
MAX_LEN = max_tokens_amount

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
print(device)

cuda


In [None]:
def _dump_attention_part(collected, part_number):
    """
    Concatenates the collected per-batch attention arrays and saves them as
    part `part_number` of `number_of_files`.

    Args:
        collected (list[np.array]): arrays indexed as
            [layer, sample, head, n_token, n_token]
        part_number (int): 1-based index of the file being written
    Returns:
        (filename, stacked): path of the saved file and the saved array,
        indexed as [sample, layer, head, n_token, n_token]
    """
    print(f'Saving: shape {collected[0].shape}')
    stacked = np.concatenate(collected, axis=1)
    print("Concatenated")
    stacked = np.swapaxes(stacked, axis1=0, axis2=1) # sample X layer X head X n_token X n_token
    filename = r_file + "_part" + str(part_number) + "of" + str(number_of_files) + '.npy'
    print(f"Saving weights to : {filename}")
    # np.save does not create missing directories.
    os.makedirs(os.path.dirname(filename) or ".", exist_ok=True)
    np.save(filename, stacked)
    return filename, stacked


for i in tqdm(range(number_of_batches), desc="Weights calc"):
    attention_w = grab_attention_weights(model, tokenizer, batched_sentences[i], max_tokens_amount, device)
    # layer X sample X head X n_token X n_token
    adj_matricies.append(attention_w)
    if (i+1) % DUMP_SIZE == 0: # dumping
        # (i+1) // DUMP_SIZE is the 1-based part number of a full dump.
        filename, _ = _dump_attention_part(adj_matricies, (i+1) // DUMP_SIZE)
        adj_filenames.append(filename)
        adj_matricies = []

if len(adj_matricies):
    # Leftover batches always form the last part; deriving the number from
    # the loop index could collide with the previous part's filename.
    filename, adj_matricies = _dump_attention_part(adj_matricies, number_of_files)
    adj_filenames.append(filename)

print("Results saved.")

In [None]:
print(adj_matricies)

[[[[[7.4005e-03 1.6800e-02 5.8975e-03 ... 7.3853e-03 5.6229e-03
     2.3788e-02]
    [1.0681e-02 1.1429e-02 5.5008e-03 ... 1.3832e-02 5.5847e-03
     8.7509e-03]
    [5.9891e-03 3.1776e-03 5.6038e-03 ... 8.8043e-03 8.3008e-03
     4.5319e-03]
    ...
    [6.8665e-03 4.7913e-03 7.3395e-03 ... 8.0261e-03 8.6441e-03
     1.3336e-02]
    [4.6959e-03 6.2408e-03 1.1009e-02 ... 9.8572e-03 4.4327e-03
     2.0828e-02]
    [9.8038e-03 1.0284e-02 6.5498e-03 ... 6.3591e-03 3.7766e-03
     1.7044e-02]]

   [[5.3040e-02 1.3161e-03 1.7190e-04 ... 6.0368e-04 1.1358e-03
     3.9101e-04]
    [7.1831e-03 9.4910e-03 6.9351e-03 ... 1.1551e-02 6.7558e-03
     4.5204e-03]
    [1.4925e-03 1.4868e-03 2.0172e-02 ... 2.1591e-03 2.7664e-02
     4.5967e-03]
    ...
    [2.1954e-03 1.3142e-03 3.8242e-03 ... 7.9956e-03 6.5186e-02
     3.2864e-03]
    [3.5238e-04 1.5736e-03 1.4427e-02 ... 5.4665e-03 1.5121e-02
     3.4103e-03]
    [1.9283e-03 3.1166e-03 5.4312e-04 ... 2.6352e-02 1.1215e-02
     3.7594e-03]]

   [[3.4

In [None]:
res = adj_m_to_nx_list(adj_matricies, thresholds_array, ntokens_array)

TypeError: ignored

In [None]:
stats_name.split("_")

In [None]:
import os
from multiprocessing import Pool
from tqdm import tqdm

# Collect every saved attention file whose path contains r_file and order
# the result by the part number encoded in the "..._part<k>of<n>.npy" suffix.
adj_filenames = sorted(
    (
        output_dir + 'attentions/' + entry
        for entry in os.listdir(output_dir + 'attentions/')
        if r_file in (output_dir + 'attentions/' + entry)
    ),
    key=lambda path: int(path.split('_')[-1].split('of')[0][4:].strip()),
)
adj_filenames

In [None]:
# What is calculated in "f(v)". You can add any other function from the array with vertex degrees.

def function_for_v(list_of_v_degrees_of_graph):
    """Returns the sum of sqrt(d * d) over the given vertex degrees."""
    return sum(np.sqrt(degree * degree) for degree in list_of_v_degrees_of_graph)

def split_matricies_and_lengths(adj_matricies, ntokens_array, num_of_workers):
    """
    Splits the matrices and the token counts into `num_of_workers` chunks
    (np.array_split along the first axis) and pairs them up.

    Args:
        adj_matricies (np.array): one entry per example along axis 0
        ntokens_array (np.array[int]): token count per example
        num_of_workers (int): number of chunks
    Returns:
        zip of (matrices_chunk, ntokens_chunk) pairs
    Raises:
        AssertionError: if any pair of chunks differs in length
    """
    matrix_chunks = np.array_split(adj_matricies, num_of_workers)
    length_chunks = np.array_split(ntokens_array, num_of_workers)
    chunks_match = True
    for matrices, lengths in zip(matrix_chunks, length_chunks):
        if len(matrices) != len(lengths):
            chunks_match = False
    assert chunks_match, "Split is not valid!"
    return zip(matrix_chunks, length_chunks)

In [None]:
num_of_workers = 20
pool = Pool(num_of_workers)

In [None]:
device='cuda'
model = BertForSequenceClassification.from_pretrained(model_path, num_labels=2)
tokenizer = BertTokenizer.from_pretrained(tokenizer_path, do_lower_case=True)
model = model.to(device)
MAX_LEN = max_tokens_amount

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [None]:
print("Total of model parameters", len(list(model.parameters())))

Total of model parameters 201


In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Accumulates the accuracy of the classifier's argmax predictions over all
# sentence batches.
# NOTE(review): `test_cola_labels` is not defined in any cell visible in this
# notebook, so this cell relies on hidden kernel state.
model.eval()
total_metric = 0
for i in tqdm(range(number_of_batches)):
    inputs = tokenizer.batch_encode_plus([text_preprocessing(s) for s in batched_sentences[i]],
                                       return_tensors='pt',
                                       add_special_tokens=True,
                                       max_length=MAX_LEN,             # Max length to truncate/pad
                                       pad_to_max_length=True,         # Pad sentence to max length)
                                       truncation=True
                                      )
    input_ids = inputs['input_ids'].to(device)
    # NOTE(review): token_type_ids is moved to the device but never passed to
    # the model below.
    token_type_ids = inputs["token_type_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)
    with torch.no_grad():
          logits = model(input_ids, attention_mask).logits
          pred_labels = torch.argmax(logits, dim=1)
          pred_labels = pred_labels.detach().cpu().numpy()
          # NOTE(review): `i` is the batch index, so this slice advances by one
          # row per batch rather than by batch_size - the labels probably do
          # not line up with the predictions; confirm.
          true_labels = test_cola_labels.values[i:i+batch_size]
          # total_metric is a running sum of per-batch accuracies; it is not
          # averaged in this cell.
          total_metric += accuracy_score(pred_labels, true_labels)

In [None]:
# Evaluates the classifier on pre-padded test data in slices of batch_size
# and prints the summed per-batch Matthews correlation divided by n_batch.
# NOTE(review): `padded_test_data`, `mask_test`, `test_cola_labels` and
# `matthews_corrcoef` are not defined or imported in any cell visible in this
# notebook (only accuracy_score is imported), so this cell relies on hidden
# kernel state.
model.eval()
N_test = len(padded_test_data)
# NOTE(review): floor division, while the loop below also processes a final
# partial batch when N_test is not a multiple of batch_size.
n_batch = N_test // batch_size
total_metric = 0

for i in range(0, N_test, batch_size):
      x_batch = torch.tensor(padded_test_data[i:i+batch_size]).to(device)
      mask_batch = torch.tensor(mask_test[i:i+batch_size]).to(device)
      # NOTE(review): labels_batch is built but not used below.
      labels_batch = torch.tensor(test_cola_labels.values[i:i+batch_size]).to(device)
      with torch.no_grad():
          logits = model(x_batch, attention_mask=mask_batch).logits
          pred_labels = torch.argmax(logits, dim=1)
          pred_labels = pred_labels.detach().cpu().numpy()
          true_labels = test_cola_labels.values[i:i+batch_size]
          total_metric += matthews_corrcoef(pred_labels, true_labels)
print("Average metric value on the test set:", total_metric / n_batch)