# Metric Extraction

Now I need to extract the metrics that I want to measure and check if the have any relation to the convergence speed.

In [1]:
# %pip install numpy
# %pip install pandas
# %pip install tqdm
# %pip install scipy
import os
import time
import math
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.sparse import dok_matrix

import warnings
warnings.filterwarnings("ignore")

In [26]:
FILENAME = "./matrices2_final.csv"

In [2]:
def get_number_of_edges(matrix: dok_matrix) -> int:
    """
    Gets the number of edges in a given matrix.

    :param matrix: matrix to count the edges of
    :return: the number of edges
    """
    return matrix.count_nonzero()

In [3]:
def get_number_of_nodes(matrix: dok_matrix) -> int:
    """
    Gets the number of nodes (states) in a given matrix.

    :param matrix: matrix to count the nodes of
    :return: the number of nodes
    """
    return matrix.get_shape()[0] ** 2

In [4]:
def get_in_degree(degree_dict: dict, n: int) -> tuple:
    """
    Calculate the average and maximum in-degree of a given matrix.

    :param degree_dict: in degree dictionary
    :param n: number of states o the matrix
    :return: a tuple - (average, max), in-degree of the matrix
    """
    total_sum = 0
    maximum = 0
    for key in degree_dict:
        total_sum += degree_dict[key]
        maximum = max(degree_dict[key], maximum)
    return (total_sum / n), maximum

In [5]:
def get_out_degree(degree_dict: dict, n: int) -> tuple:
    """
    Calculate the average and maximum out-degree of a given matrix.

    :param degree_dict: out degree dictionary
    :param n: number of states o the matrix
    :return: a tuple - (average, max), out-degree of the matrix
    """
    total_sum = 0
    maximum = 0
    for key in degree_dict:
        total_sum += degree_dict[key]
        maximum = max(degree_dict[key], maximum)
    return (total_sum / n), maximum

In [6]:
def get_sle(matrix: dok_matrix) -> int:
    """
    Get the second largest eigenvalue for a given matrix. We take the second largest because the largest is 1 for all of them and this does not give us any information. The second largest will also be the slowest to converge compared to the rest.

    :param matrix: matrix to get the eigenvalue of
    :return: the second largest eigenvalue
    """
    values, vectors = np.linalg.eig(matrix.toarray())
    values.sort()
    return values[-2]

In [7]:
def get_diameter_radius(from_to: dict, n: int) -> tuple:
    """
    Get the diameter and the radius of a matrix. The eccentricity is the longest hopcount between a node and any other node. The diameter is the largest eccentricity and the radius is the smallest eccentricity.

    :param from_to: dictionary with transitions
    :param n: number of states o the matrix
    :return: (diameter, radius)
    """
    eccentricity = []
    for start_node in range(n):
        max_hopcount = 0
        for destination_node in range(n):
            hop_groups = [from_to[start_node]]
            found = False
            while not found:
                if destination_node in hop_groups[-1]:
                    max_hopcount = max(max_hopcount, len(hop_groups))
                    found = True
                else:
                    hop_group = set()
                    for node in hop_groups[-1]:
                        hop_group.update(from_to[node])
                    hop_groups.append(list(hop_group))
        eccentricity.append(max_hopcount)
    return max(eccentricity), min(eccentricity)

In [8]:
def get_necessary_metrics(matrix: dok_matrix, n: int):
    """
    Helping function. Reduces the times that I need to loop through the matrix.

    :param matrix: to get the metrics for
    :param n: number of states
    :return:
    """
    from_to = dict()
    out_degree_dict = dict()
    in_degree_dict = dict()
    for i in range(n):
        for j in range(n):
            if matrix[i, j] != 0:
                if i in from_to:
                    from_to[i].append(j)
                else:
                    from_to[i] = [j]
                if j in out_degree_dict:
                    out_degree_dict[j] += 1
                else:
                    out_degree_dict[j] = 1
                if i in in_degree_dict:
                    in_degree_dict[i] += 1
                else:
                    in_degree_dict[i] = 1

    return from_to, in_degree_dict, out_degree_dict

In [9]:
def calculate_dif(vector1: list, vector2: list) -> float:
    """
    Calculate the absolute difference between the two vectors.

    :param vector1:
    :param vector2:
    :return:
    """
    sum = 0
    for i in range(len(vector1)):
        sum += abs(vector2[i] - vector1[i])
    return sum

In [10]:
def get_index(vector: list, val: complex) -> int:
    """
    Get the index of a value in a vector. Rounding after 15 decimals because of rounding error.

    :param vector:
    :param val:
    :return: the index of the value in the vector or -1 if not found
    """
    for i in range(len(vector)):
        if round(vector[i].real, 15) == val.real and vector[i].imag == val.imag:
            return i
    return 0
    return -1

In [11]:
def get_convergence_speed(matrix: dok_matrix, n: int) -> float:
    """
    Calculate the convergence speed of the matrix.

    :param matrix: matrix to calculate the convergence speed for
    :param n: number of states of the matrix
    :return: the convergence speed, iteration count
    """

    epsilon = 1 * (10 ** -4)

    new_matrix = dok_matrix(matrix - np.identity(n))

    vector = np.zeros(n)
    vector[0] = 1

    next_vector = new_matrix.dot(vector)

    starting_time = time.time()
    iter_count = 1
    while calculate_dif(vector, next_vector) >= epsilon:
        vector, next_vector = next_vector, new_matrix.dot(next_vector)
        iter_count += 1
    ending_time = time.time()
    return ending_time - starting_time, iter_count

In [24]:
def get_all_matrix_data(matrix: dok_matrix) -> list:
    n = matrix.get_shape()[0]
    from_to, in_degree_dict, out_degree_dict = get_necessary_metrics(matrix, n)
    row = []
    row.append(f"{n}x{n}")
    row.append(get_number_of_nodes(matrix))
    row.append(get_number_of_edges(matrix))
    diameter, radius = get_diameter_radius(from_to, n)
    row.append(diameter)
    row.append(radius)
    average_in, max_in = get_in_degree(in_degree_dict, n)
    average_out, max_out = get_out_degree(out_degree_dict, n)
    row.append(average_in)
    row.append(max_in)
    row.append(max_out)
    row.append(get_sle(matrix))
    time, iterations = get_convergence_speed(matrix, n)
    row.append(time)
    row.append(iterations)
    return row

In [27]:
# sle = second largest eigenvalue
COLUMNS = ['matrix', 'number_of_nodes', 'number_of_edges', 'diameter', 'radius',
           'average_deg', 'max_in_deg', 'max_out_deg',
           'sle', 'convergence_speed', 'convergence_iterations']
data = pd.DataFrame(columns=COLUMNS)
DIR = './data/generated2'

for filename in tqdm(os.listdir(DIR)):
    f = os.path.join(DIR, filename)

    if os.path.isfile(f):
        with open(f, "rb") as file:
            matrices = pickle.load(file)
            for matrix in matrices:
                row = get_all_matrix_data(matrix)
                new_data = pd.DataFrame(data=[row], columns=COLUMNS)
                data = pd.concat([data, new_data], ignore_index=True)

data.to_csv(FILENAME)

100%|██████████| 190/190 [17:32<00:00,  5.54s/it]


In [None]:
data = pd.read_csv(FILENAME)
data.drop(['Unnamed: 0'], axis=1, inplace=True)
norm_eigenvalues = []
for eigenval in data['sle']:
    num = complex(eigenval)
    norm_eigenvalues.append(math.sqrt(num.real ** 2 + num.imag ** 2))
data['norm_sle'] = norm_eigenvalues
data.to_csv(FILENAME)
data

In [15]:
data.tail(10)

Unnamed: 0,matrix,number_of_nodes,number_of_edges,diameter,radius,average_in_deg,average_out_deg,max_in_deg,max_out_deg,sle,convergence_speed,convergence_iterations
1890,99x99,9801,7425,2,2,75.0,75.0,75,85,(0.09139257596053166+0.014212153789412907j),23.700882,8229
1891,99x99,9801,6930,2,2,70.0,70.0,70,79,(0.10166385359350022+0j),20.956913,7744
1892,99x99,9801,6435,2,2,65.0,65.0,65,75,(0.10951640414446907+0j),18.323644,7275
1893,99x99,9801,5940,2,2,60.0,60.0,60,72,(0.10820041711862757+0j),17.221383,7338
1894,99x99,9801,5445,2,2,55.0,55.0,55,70,(0.11799701394008684+0.02087828516530641j),13.784979,6445
1895,99x99,9801,4950,2,2,50.0,50.0,50,63,(0.12199236082927312+0.038061319278715595j),12.893341,6564
1896,99x99,9801,4455,2,2,45.0,45.0,45,56,(0.14368227971847636+0j),10.052581,5686
1897,99x99,9801,3960,2,2,40.0,40.0,40,51,(0.14374417480060664+0.003189491423819522j),8.008495,4989
1898,99x99,9801,3465,2,2,35.0,35.0,35,47,(0.1700483210899803+0.012543751482633824j),6.791701,4740
1899,99x99,9801,2970,2,2,30.0,30.0,30,45,(0.18149022596603726+0.03194380184124332j),4.938289,3975


In [23]:
data.to_csv('./matrices2_final.csv')

## Extract the metrics for the downloaded files

In [None]:
def is_ergodic(matrix, n):
    """
    An ergodic matrix is aperiodic and irreducible. By Wielandt's theorem if when the matrix is multiplied by itself m
    times, where m = (n - 1) * (n - 1) + 1, and all its entries are positive then the matrix is ergodic. n is the number
    of sates.

    :param matrix: matrix to check
    :return: true if the matrix is ergodic, false otherwise
    """
    matrix = matrix.tocsr(copy=True)
    m = (n - 1) * (n - 1) + 1
    multiplicities = [matrix]
    for i in range(int(math.log(m, 2))):
        matrix = matrix.dot(matrix)
        multiplicities.append(matrix)
    index = len(multiplicities) - 1
    res = None
    while m > 0:
        if m & 1:
            if res is None:
                res = multiplicities[index]
            else:
                res = res.dot(multiplicities[index])
        index -= 1
        m = m >> 1
    return res.count_nonzero() == n * n

In [None]:
COLUMNS_DOWNLOAD = ['matrix_name', 'number_of_nodes', 'number_of_edges', 'diameter', 'radius', 'average_in_deg', 'average_out_deg', 'max_in_deg', 'max_out_deg', 'sle']
downloaded_data = pd.DataFrame(columns=COLUMNS_DOWNLOAD)
DIR_DOWNLOAD = './data/downloaded'

i = 1
for filename in os.listdir(DIR_DOWNLOAD):
    f = os.path.join(DIR_DOWNLOAD, filename)
    if not filename.endswith('.pickle'):
        continue
    if os.path.isfile(f):
        print(f"{i}: {filename}")
        i += 1
        with open(f, "rb") as file:
            try:
                matrix = pickle.load(file)
                n = matrix.get_shape()[0]
                if not is_ergodic(matrix, n):
                    print("Not Ergodic")
                    continue
                row = get_all_matrix_data(matrix)
            except MemoryError:
                print(f"File {filename}'s matrix is too big ({n}x{n})")
                continue
            row[0] = filename
            new_data = pd.DataFrame(data=[row], columns=COLUMNS_DOWNLOAD)
            downloaded_data = pd.concat([downloaded_data, new_data], ignore_index=True)
downloaded_data.to_csv("downloaded_with_metrics.csv")

In [None]:
downloaded_data = downloaded_data.sort_values(by=['number_of_nodes'], ignore_index=True)
downloaded_data.head()

In [None]:
downloaded_data.tail()

In [None]:
downloaded_data.to_csv("downloaded_with_metrics.csv")