# Benchmarks con el dataset de UM

El siguiente Script calculara benchmarks usando el dataset de la Universidad de Murcia, creando datos sinteticos a partir de los datos reales

## Importaciones

In [2]:
import xml.dom.minidom
import re
import uuid 
from datetime import datetime
import pickle
from pathlib import Path
from bs4 import BeautifulSoup as BS
import shutil
import os
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import json
import scipy

## Constantes

Expresiones Regulares

In [3]:
URL_REGEX = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))/"""

Constantes

In [4]:
dataNumber = [50000,250000,500000,1000000,5000000] # volumen de paquetes de datos
# dataNumber = [50000,250000,500000,1000000] # numero de datos
base_dir = './data/sintetic_data/' # directorio base de datos
GENERATE_NEW_DATA = True;
GENERATE_NEW_TDB2_MEAUSERES = True
GENERATE_NEW_BLAZEGRAPH_MEAUSERES = False

## Funciones

Leer datos semilla

In [5]:
def parseRDF(path,mode,encoding='utf8'): # Read file data
    with open(path,mode, encoding=encoding) as f:
        types = set()
        RDFs = []
        rdf = ""
        in_description = False
        for line in f: # Por cada linea
            if (line.startswith("<rdf:RDF")): # Si es el principio de cada XML
                rdf = ""
                in_description = False
            elif (line.startswith("</rdf:RDF>")): # Si es el final de cada XML
                RDFs.append(rdf)
                rdf = ""
            else:
                if (line.startswith("  <rdf:Description")>0):
                    in_description = True
                elif (line.startswith("  </rdf:Description")):
                    rdf = rdf + str(line.encode(encoding, 'replace'))[2:-1].replace("\\n","\n")
                    in_description = False
                if in_description:
                    if ("<rdf:type" in line):
                        res = re.findall(URL_REGEX,line)[0].split("/")[-1] if len(re.findall(URL_REGEX,line))>0 else None
                        types.add(res)
                    rdf = rdf + str(line.encode(encoding, 'replace'))[2:-1].replace("\\n","\n") # Si es el medio del XML
        return RDFs, types


Tamaño en bytes de un string

In [6]:
def utf8len(s):
    return len(s.encode('utf-8'))

Expresion regular para identificar UUIDs

In [7]:
def create_uuid_pattern(version): # Funcion para crear regex para identificar uuid
    return re.compile(
        (
            '[a-f0-9]{8}-' +
            '[a-f0-9]{4}-' +
            version + '[a-f0-9]{3}-' +
            '[89ab][a-f0-9]{3}-' +
            '[a-f0-9]{12}'
        ),
        re.IGNORECASE
    )

Función para generar datos sinteticos (n) a partir de datos reales (data)

In [8]:
def generateData(data,n): # generacion de datos sinteticos a partir de datos reales
    if (len(data)>n):
        return data
    else:
        sintetic_data_colection = []
        sintetic_data = data.copy()
        UUID_ALL_PATTERN = create_uuid_pattern('[1-5]')
        split_size = utf8len("".join(data))
        for i in range(len(data),n): # hasta alcanzar el numero deseado
            index = i%len(data) # index in original data
            id_uuid = UUID_ALL_PATTERN.findall(RDFs[index])[0]
            if (len(id_uuid)>0):
                sintetic_data.append(RDFs[index].replace(id_uuid,str(uuid.uuid1())))
                split_size += utf8len(RDFs[index])
            if (split_size>3900000):
                sintetic_data_colection.append(sintetic_data)
                sintetic_data = []
                split_size = 0
        sintetic_data_colection.append(sintetic_data.copy())
        return sintetic_data_colection
        

Funciones para leer y escribir objetos en ficheros, util para reusar datos sinteticos

In [9]:
def save_obj(path, obj, name ): # Salvar objeto a fichero
    with open(path +'/'+ name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(path,name ): # leer objeto desde fichero
    with open(path +'/'+ name + '.pkl', 'rb') as f:
        return pickle.load(f)

## Generación de datos sinteticos hasta máximo

Creación de datos sinteticos para **50K, 250K, 1M, 5M, 25M**

In [11]:
prefix = """<?xml version="1.0" encoding="UTF-8" ?>
    <rdf:RDF
        xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
        xmlns:j.0="http://hercules.org/um/es-ES/rec/">
    """
sufix = """</rdf:RDF>
    """

(RDFs,t) = parseRDF('./data/rdf.log','r',encoding='utf-8')

if (GENERATE_NEW_DATA):
    if (Path(base_dir).is_dir()):
        shutil.rmtree(base_dir)
    os.mkdir(base_dir)
    dataSet = {} # Estructura
    maxData = max(dataNumber)
    print('Generating sintetic data ('+str(maxData)+')')
    now = datetime.now()
    data = generateData(RDFs,maxData)
    later = datetime.now()
    print('Data ('+str(maxData)+') is generated in '+str((later - now).total_seconds()) + ' seconds')
    
    dataLen = {}
    for dn in dataNumber:
        dataLen[dn] = 0
    
    def getDataSet(dataNumber,elements):
        for dn in dataNumber:
            if dn > elements:
                return dn
    
    counter = 0
    counterElements = 0;
    for d in data:
        counter += 1
        dataSet = getDataSet(dataLen,counterElements)
        if (dataSet-counterElements)>=len(d): # Si caben todos los elementos
            file_name = f'{base_dir}dataset_{dataSet}_number_{counter}_from_{counterElements+1}_to_{(counterElements+len(d))}.txt'
            f = open(file_name, "w")
            f.write(prefix+"".join(d)+sufix)
            f.close()
            dataLen[dataSet] = dataLen[dataSet] + len(d)
            counterElements += len(d)
            print("Create file "+file_name + ' in Set ' + str(dataSet) + ' with '+ str(len(d)) + ', new elements, remaining space available: ' + str(dataLen[dataSet]))
        else:
            size_free = dataSet-counterElements
            allowed_data = d[:size_free]
            file_name = f'{base_dir}dataset_{dataSet}_number_{counter}_from_{counterElements+1}_to_{(counterElements+len(allowed_data))}.txt'
            f = open(file_name, "w")
            f.write(prefix+("".join(allowed_data))+sufix)
            f.close()
            counterElements += len(allowed_data)
            print("Create file with allowed "+file_name + ' in Set ' + str(dataSet) + ' with '+ str(len(allowed_data)) + ', new elements, remaining space available: ' + str(dataSet-counterElements))

            # Datos restantes
            remain_data = d[size_free:]
            counter += 1
            nextDataNumber = dataNumber[dataNumber.index(dataSet)+1] if dataNumber.index(dataSet)+1 < len(dataNumber) else dataNumber.index(dataSet)
            file_name = f'{base_dir}dataset_{nextDataNumber}_number_{counter}_from_{counterElements+1}_to_{(counterElements+len(remain_data))}.txt'
            f = open(file_name, "w")
            f.write(prefix+"".join(remain_data)+sufix)
            f.close()
            counterElements += len(remain_data)
            print("Create file with remain "+file_name + ' in Set ' + str(nextDataNumber) + ' with '+ str(len(remain_data)) + ', new elements, remaining space available: ' + str(nextDataNumber-counterElements) + ' int same iteration')
    

Generating sintetic data (5000000)
Data (5000000) is generated in 221.777097 seconds
Create file ./data/sintetic_data/dataset_50000_number_1_from_1_to_6502.txt in Set 50000 with 6502, new elements, remaining space available: 6502
Create file ./data/sintetic_data/dataset_50000_number_2_from_6503_to_13057.txt in Set 50000 with 6555, new elements, remaining space available: 13057
Create file ./data/sintetic_data/dataset_50000_number_3_from_13058_to_19636.txt in Set 50000 with 6579, new elements, remaining space available: 19636
Create file ./data/sintetic_data/dataset_50000_number_4_from_19637_to_26152.txt in Set 50000 with 6516, new elements, remaining space available: 26152
Create file ./data/sintetic_data/dataset_50000_number_5_from_26153_to_32731.txt in Set 50000 with 6579, new elements, remaining space available: 32731
Create file ./data/sintetic_data/dataset_50000_number_6_from_32732_to_39279.txt in Set 50000 with 6548, new elements, remaining space available: 39279
Create file ./da

Create file ./data/sintetic_data/dataset_500000_number_55_from_340613_to_347206.txt in Set 500000 with 6594, new elements, remaining space available: 91713
Create file ./data/sintetic_data/dataset_500000_number_56_from_347207_to_353726.txt in Set 500000 with 6520, new elements, remaining space available: 98233
Create file ./data/sintetic_data/dataset_500000_number_57_from_353727_to_360265.txt in Set 500000 with 6539, new elements, remaining space available: 104772
Create file ./data/sintetic_data/dataset_500000_number_58_from_360266_to_366875.txt in Set 500000 with 6610, new elements, remaining space available: 111382
Create file ./data/sintetic_data/dataset_500000_number_59_from_366876_to_373377.txt in Set 500000 with 6502, new elements, remaining space available: 117884
Create file ./data/sintetic_data/dataset_500000_number_60_from_373378_to_379929.txt in Set 500000 with 6552, new elements, remaining space available: 124436
Create file ./data/sintetic_data/dataset_500000_number_61_fr

Create file ./data/sintetic_data/dataset_1000000_number_107_from_674748_to_681329.txt in Set 1000000 with 6582, new elements, remaining space available: 176929
Create file ./data/sintetic_data/dataset_1000000_number_108_from_681330_to_687838.txt in Set 1000000 with 6509, new elements, remaining space available: 183438
Create file ./data/sintetic_data/dataset_1000000_number_109_from_687839_to_694420.txt in Set 1000000 with 6582, new elements, remaining space available: 190020
Create file ./data/sintetic_data/dataset_1000000_number_110_from_694421_to_700965.txt in Set 1000000 with 6545, new elements, remaining space available: 196565
Create file ./data/sintetic_data/dataset_1000000_number_111_from_700966_to_707489.txt in Set 1000000 with 6524, new elements, remaining space available: 203089
Create file ./data/sintetic_data/dataset_1000000_number_112_from_707490_to_714082.txt in Set 1000000 with 6593, new elements, remaining space available: 209682
Create file ./data/sintetic_data/dataset

Create file ./data/sintetic_data/dataset_5000000_number_160_from_1015412_to_1021948.txt in Set 5000000 with 6537, new elements, remaining space available: 19653
Create file ./data/sintetic_data/dataset_5000000_number_161_from_1021949_to_1028558.txt in Set 5000000 with 6610, new elements, remaining space available: 26263
Create file ./data/sintetic_data/dataset_5000000_number_162_from_1028559_to_1035057.txt in Set 5000000 with 6499, new elements, remaining space available: 32762
Create file ./data/sintetic_data/dataset_5000000_number_163_from_1035058_to_1041608.txt in Set 5000000 with 6551, new elements, remaining space available: 39313
Create file ./data/sintetic_data/dataset_5000000_number_164_from_1041609_to_1048209.txt in Set 5000000 with 6601, new elements, remaining space available: 45914
Create file ./data/sintetic_data/dataset_5000000_number_165_from_1048210_to_1054712.txt in Set 5000000 with 6503, new elements, remaining space available: 52417
Create file ./data/sintetic_data/d

Create file ./data/sintetic_data/dataset_5000000_number_213_from_1362659_to_1369177.txt in Set 5000000 with 6519, new elements, remaining space available: 366882
Create file ./data/sintetic_data/dataset_5000000_number_214_from_1369178_to_1375768.txt in Set 5000000 with 6591, new elements, remaining space available: 373473
Create file ./data/sintetic_data/dataset_5000000_number_215_from_1375769_to_1382296.txt in Set 5000000 with 6528, new elements, remaining space available: 380001
Create file ./data/sintetic_data/dataset_5000000_number_216_from_1382297_to_1388832.txt in Set 5000000 with 6536, new elements, remaining space available: 386537
Create file ./data/sintetic_data/dataset_5000000_number_217_from_1388833_to_1395425.txt in Set 5000000 with 6593, new elements, remaining space available: 393130
Create file ./data/sintetic_data/dataset_5000000_number_218_from_1395426_to_1401941.txt in Set 5000000 with 6516, new elements, remaining space available: 399646
Create file ./data/sintetic_

Create file ./data/sintetic_data/dataset_5000000_number_264_from_1696749_to_1703292.txt in Set 5000000 with 6544, new elements, remaining space available: 700997
Create file ./data/sintetic_data/dataset_5000000_number_265_from_1703293_to_1709899.txt in Set 5000000 with 6607, new elements, remaining space available: 707604
Create file ./data/sintetic_data/dataset_5000000_number_266_from_1709900_to_1716401.txt in Set 5000000 with 6502, new elements, remaining space available: 714106
Create file ./data/sintetic_data/dataset_5000000_number_267_from_1716402_to_1722957.txt in Set 5000000 with 6556, new elements, remaining space available: 720662
Create file ./data/sintetic_data/dataset_5000000_number_268_from_1722958_to_1729538.txt in Set 5000000 with 6581, new elements, remaining space available: 727243
Create file ./data/sintetic_data/dataset_5000000_number_269_from_1729539_to_1736052.txt in Set 5000000 with 6514, new elements, remaining space available: 733757
Create file ./data/sintetic_

Create file ./data/sintetic_data/dataset_5000000_number_317_from_2043980_to_2050510.txt in Set 5000000 with 6531, new elements, remaining space available: 1048215
Create file ./data/sintetic_data/dataset_5000000_number_318_from_2050511_to_2057104.txt in Set 5000000 with 6594, new elements, remaining space available: 1054809
Create file ./data/sintetic_data/dataset_5000000_number_319_from_2057105_to_2063624.txt in Set 5000000 with 6520, new elements, remaining space available: 1061329
Create file ./data/sintetic_data/dataset_5000000_number_320_from_2063625_to_2070163.txt in Set 5000000 with 6539, new elements, remaining space available: 1067868
Create file ./data/sintetic_data/dataset_5000000_number_321_from_2070164_to_2076773.txt in Set 5000000 with 6610, new elements, remaining space available: 1074478
Create file ./data/sintetic_data/dataset_5000000_number_322_from_2076774_to_2083275.txt in Set 5000000 with 6502, new elements, remaining space available: 1080980
Create file ./data/sin

Create file ./data/sintetic_data/dataset_5000000_number_369_from_2384646_to_2391227.txt in Set 5000000 with 6582, new elements, remaining space available: 1388932
Create file ./data/sintetic_data/dataset_5000000_number_370_from_2391228_to_2397736.txt in Set 5000000 with 6509, new elements, remaining space available: 1395441
Create file ./data/sintetic_data/dataset_5000000_number_371_from_2397737_to_2404318.txt in Set 5000000 with 6582, new elements, remaining space available: 1402023
Create file ./data/sintetic_data/dataset_5000000_number_372_from_2404319_to_2410863.txt in Set 5000000 with 6545, new elements, remaining space available: 1408568
Create file ./data/sintetic_data/dataset_5000000_number_373_from_2410864_to_2417387.txt in Set 5000000 with 6524, new elements, remaining space available: 1415092
Create file ./data/sintetic_data/dataset_5000000_number_374_from_2417388_to_2423980.txt in Set 5000000 with 6593, new elements, remaining space available: 1421685
Create file ./data/sin

Create file ./data/sintetic_data/dataset_5000000_number_419_from_2712194_to_2718785.txt in Set 5000000 with 6592, new elements, remaining space available: 1716490
Create file ./data/sintetic_data/dataset_5000000_number_420_from_2718786_to_2725309.txt in Set 5000000 with 6524, new elements, remaining space available: 1723014
Create file ./data/sintetic_data/dataset_5000000_number_421_from_2725310_to_2731846.txt in Set 5000000 with 6537, new elements, remaining space available: 1729551
Create file ./data/sintetic_data/dataset_5000000_number_422_from_2731847_to_2738456.txt in Set 5000000 with 6610, new elements, remaining space available: 1736161
Create file ./data/sintetic_data/dataset_5000000_number_423_from_2738457_to_2744955.txt in Set 5000000 with 6499, new elements, remaining space available: 1742660
Create file ./data/sintetic_data/dataset_5000000_number_424_from_2744956_to_2751506.txt in Set 5000000 with 6551, new elements, remaining space available: 1749211
Create file ./data/sin

Create file ./data/sintetic_data/dataset_5000000_number_474_from_3072557_to_3079075.txt in Set 5000000 with 6519, new elements, remaining space available: 2076780
Create file ./data/sintetic_data/dataset_5000000_number_475_from_3079076_to_3085666.txt in Set 5000000 with 6591, new elements, remaining space available: 2083371
Create file ./data/sintetic_data/dataset_5000000_number_476_from_3085667_to_3092194.txt in Set 5000000 with 6528, new elements, remaining space available: 2089899
Create file ./data/sintetic_data/dataset_5000000_number_477_from_3092195_to_3098730.txt in Set 5000000 with 6536, new elements, remaining space available: 2096435
Create file ./data/sintetic_data/dataset_5000000_number_478_from_3098731_to_3105323.txt in Set 5000000 with 6593, new elements, remaining space available: 2103028
Create file ./data/sintetic_data/dataset_5000000_number_479_from_3105324_to_3111839.txt in Set 5000000 with 6516, new elements, remaining space available: 2109544
Create file ./data/sin

Create file ./data/sintetic_data/dataset_5000000_number_527_from_3419798_to_3426299.txt in Set 5000000 with 6502, new elements, remaining space available: 2424004
Create file ./data/sintetic_data/dataset_5000000_number_528_from_3426300_to_3432855.txt in Set 5000000 with 6556, new elements, remaining space available: 2430560
Create file ./data/sintetic_data/dataset_5000000_number_529_from_3432856_to_3439436.txt in Set 5000000 with 6581, new elements, remaining space available: 2437141
Create file ./data/sintetic_data/dataset_5000000_number_530_from_3439437_to_3445950.txt in Set 5000000 with 6514, new elements, remaining space available: 2443655
Create file ./data/sintetic_data/dataset_5000000_number_531_from_3445951_to_3452530.txt in Set 5000000 with 6580, new elements, remaining space available: 2450235
Create file ./data/sintetic_data/dataset_5000000_number_532_from_3452531_to_3459078.txt in Set 5000000 with 6548, new elements, remaining space available: 2456783
Create file ./data/sin

Create file ./data/sintetic_data/dataset_5000000_number_579_from_3760409_to_3767002.txt in Set 5000000 with 6594, new elements, remaining space available: 2764707
Create file ./data/sintetic_data/dataset_5000000_number_580_from_3767003_to_3773522.txt in Set 5000000 with 6520, new elements, remaining space available: 2771227
Create file ./data/sintetic_data/dataset_5000000_number_581_from_3773523_to_3780061.txt in Set 5000000 with 6539, new elements, remaining space available: 2777766
Create file ./data/sintetic_data/dataset_5000000_number_582_from_3780062_to_3786671.txt in Set 5000000 with 6610, new elements, remaining space available: 2784376
Create file ./data/sintetic_data/dataset_5000000_number_583_from_3786672_to_3793173.txt in Set 5000000 with 6502, new elements, remaining space available: 2790878
Create file ./data/sintetic_data/dataset_5000000_number_584_from_3793174_to_3799725.txt in Set 5000000 with 6552, new elements, remaining space available: 2797430
Create file ./data/sin

Create file ./data/sintetic_data/dataset_5000000_number_632_from_4107635_to_4114216.txt in Set 5000000 with 6582, new elements, remaining space available: 3111921
Create file ./data/sintetic_data/dataset_5000000_number_633_from_4114217_to_4120761.txt in Set 5000000 with 6545, new elements, remaining space available: 3118466
Create file ./data/sintetic_data/dataset_5000000_number_634_from_4120762_to_4127285.txt in Set 5000000 with 6524, new elements, remaining space available: 3124990
Create file ./data/sintetic_data/dataset_5000000_number_635_from_4127286_to_4133878.txt in Set 5000000 with 6593, new elements, remaining space available: 3131583
Create file ./data/sintetic_data/dataset_5000000_number_636_from_4133879_to_4140405.txt in Set 5000000 with 6527, new elements, remaining space available: 3138110
Create file ./data/sintetic_data/dataset_5000000_number_637_from_4140406_to_4146944.txt in Set 5000000 with 6539, new elements, remaining space available: 3144649
Create file ./data/sin

Create file ./data/sintetic_data/dataset_5000000_number_684_from_4448355_to_4454853.txt in Set 5000000 with 6499, new elements, remaining space available: 3452558
Create file ./data/sintetic_data/dataset_5000000_number_685_from_4454854_to_4461404.txt in Set 5000000 with 6551, new elements, remaining space available: 3459109
Create file ./data/sintetic_data/dataset_5000000_number_686_from_4461405_to_4468005.txt in Set 5000000 with 6601, new elements, remaining space available: 3465710
Create file ./data/sintetic_data/dataset_5000000_number_687_from_4468006_to_4474508.txt in Set 5000000 with 6503, new elements, remaining space available: 3472213
Create file ./data/sintetic_data/dataset_5000000_number_688_from_4474509_to_4481077.txt in Set 5000000 with 6569, new elements, remaining space available: 3478782
Create file ./data/sintetic_data/dataset_5000000_number_689_from_4481078_to_4487644.txt in Set 5000000 with 6567, new elements, remaining space available: 3485349
Create file ./data/sin

Create file ./data/sintetic_data/dataset_5000000_number_735_from_4782455_to_4788973.txt in Set 5000000 with 6519, new elements, remaining space available: 3786678
Create file ./data/sintetic_data/dataset_5000000_number_736_from_4788974_to_4795564.txt in Set 5000000 with 6591, new elements, remaining space available: 3793269
Create file ./data/sintetic_data/dataset_5000000_number_737_from_4795565_to_4802092.txt in Set 5000000 with 6528, new elements, remaining space available: 3799797
Create file ./data/sintetic_data/dataset_5000000_number_738_from_4802093_to_4808628.txt in Set 5000000 with 6536, new elements, remaining space available: 3806333
Create file ./data/sintetic_data/dataset_5000000_number_739_from_4808629_to_4815221.txt in Set 5000000 with 6593, new elements, remaining space available: 3812926
Create file ./data/sintetic_data/dataset_5000000_number_740_from_4815222_to_4821737.txt in Set 5000000 with 6516, new elements, remaining space available: 3819442
Create file ./data/sin

## Queries

Obtener el numero total de tripletas

In [10]:
Q1 = """PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

DESCRIBE <http://hercules.org/um/es-ES/rec/AnualidadFinanciacionRegistroAyudaDefinitiva/e73fc9ee-382e-4a83-b9d5-58d3c45c5d81>"""

Q1_N = """
DESCRIBE <http://hercules.org/um/es-ES/rec/AnualidadFinanciacionRegistroAyudaDefinitiva/e73fc9ee-382e-4a83-b9d5-58d3c45c5d81>"""


# COUNT: Contar el numero de tripletas
Q2 = """PREFIX un: <http://www.w3.org/2007/ont/unit#>
PREFIX uni: <http://purl.org/weso/uni/uni.html#>
prefix univ:<http://people.brunel.ac.uk/~csstnns/university.owl#>
prefix sp:<http://www.meta-qsar.org/ontologies/sport.owl#>
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT (COUNT(*) as ?Triples) WHERE { ?s ?p ?o}"""

Q2_N = """SELECT (COUNT(*) as ?Triples) WHERE { ?s ?p ?o}"""

# FILTER: WHERE idPersona = 2211
Q3 = """PREFIX un: <http://www.w3.org/2007/ont/unit#>
PREFIX uni: <http://purl.org/weso/uni/uni.html#>
prefix univ:<http://people.brunel.ac.uk/~csstnns/university.owl#>
prefix sp:<http://www.meta-qsar.org/ontologies/sport.owl#>
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT ?s ?p ?o
WHERE {
  ?s <http://hercules.org/um/es-ES/rec/idPersona> ?o
  FILTER(?o = "2211")
}"""

Q3_N = """SELECT ?s ?p ?o
WHERE {
  ?s <http://hercules.org/um/es-ES/rec/idPersona> ?o
  FILTER(?o = "2211")
}"""

# DISTINCT: Personas distintas
Q4 = """PREFIX un: <http://www.w3.org/2007/ont/unit#>
PREFIX uni: <http://purl.org/weso/uni/uni.html#>
prefix univ:<http://people.brunel.ac.uk/~csstnns/university.owl#>
prefix sp:<http://www.meta-qsar.org/ontologies/sport.owl#>
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT DISTINCT ?o WHERE { ?s <http://hercules.org/um/es-ES/rec/idPersona> ?o }"""

Q4_N = """SELECT DISTINCT ?o WHERE { ?s <http://hercules.org/um/es-ES/rec/idPersona> ?o }"""


queries = [Q1,Q2,Q3,Q4]

queries_no_prefix = [Q1_N,Q2_N,Q3_N,Q4_N]

## General Triple Stores

Método para realizar la petición Post para 1 fichero

In [11]:
def sendDataFromFile(filePath,host='localhost',port=3030,url='/trellis/data?graph=trellis'):
    import http.client
    import mimetypes
    conn = http.client.HTTPConnection(host, port)
    dataList = []
    boundary = 'wL36Yn8afVp8Ag7AmP8qZ0SA4n1v9T'
    dataList.append('--' + boundary)
    dataList.append('Content-Disposition: form-data; name=file; filename={0}'.format(filePath))

    fileType = mimetypes.guess_type(filePath)[0] or 'application/octet-stream'
    dataList.append('Content-Type: {}'.format(fileType))
    dataList.append('')

    with open(filePath) as f:
      dataList.append(f.read())
    dataList.append('--'+boundary+'--')
    dataList.append('')
    body = '\r\n'.join(dataList)
    payload = body
    headers = {
       'Content-type': 'multipart/form-data; boundary={}'.format(boundary) 
    }
    conn.request("POST", url, payload, headers)
    res = conn.getresponse()
    data = res.read()
    return data.decode("utf-8")

In [12]:
def sendDataFromFileBlazegraph(filePath,host='localhost',port=8889,url='/bigdata/sparql'):
    import requests
    dataList = []
    with open(filePath) as f:
        dataList.append(f.read())
    body = '\r\n'.join(dataList)
    payload = body
    headers = {
      'Content-Type': 'application/rdf+xml'
    }
    response = requests.request("POST", 'http://'+host+':'+str(port)+url, headers=headers, data = payload)

    return  response.text

In [13]:
#sendDataFromFileBlazegraph(filePath='./data/sintetic_data/dataset_50000_number_1_from_1_to_6502.txt')

Método para realizar una query

In [14]:
def sendQuery(query,host='localhost',port=3030,url='/trellis/data?graph=trellis'):
    import http.client
    import mimetypes
    import urllib.parse
    conn = http.client.HTTPConnection("localhost", 3030)
    payload = f'query={query}'
    headers = {
      'Content-Type': 'application/x-www-form-urlencoded',
      'Accept': 'application/json'
    }
    conn.request("POST", "/trellis/sparql", payload, headers)
    res = conn.getresponse()
    data = res.read()
    return data.decode("utf-8")

In [15]:
def sendQueryBlazegraph(query,host='localhost',port=8889,url='/bigdata/sparql'):
    import requests

    url = f"http://{host}:{port}/bigdata/sparql?query={query}"

    payload = {}
    headers = {
      'Accept': 'application/json'
    }

    response = requests.request("POST", url, headers=headers, data = payload)

    return response.text


### FUSEKI + TDB2

In [23]:
if GENERATE_NEW_TDB2_MEAUSERES:
    updateTime = []
    queryTime = []
    for dn in dataNumber:

        nowUpdate = datetime.now() # Comienzo de momento actual para cada grupo

        # Insercion
        for f in os.listdir(base_dir):
            setData = int(re.search(r'\d+', f).group()) # Obtengo grupo

            if dn == setData: # Si pertenece al grupo
                res = sendDataFromFile(filePath=f'{base_dir}{f}') # Envio el dato
                print(dn)
        # Finalizado el grupo
        laterUpdate = datetime.now() # Tiempo final para el grupo

            # Tiempo empleado para Updates
        deltaUpdate = laterUpdate - nowUpdate
        if len(updateTime)>0: # Si procese la iteracion update anterior
            updateTime.append({'set': dn,'start': nowUpdate, 'end': laterUpdate ,'deltaTime': deltaUpdate.total_seconds(),'acumDeltaTime': deltaUpdate.total_seconds() + updateTime[len(updateTime)-1]['acumDeltaTime']})
        else:
            updateTime.append({'set': dn,'start': nowUpdate, 'end': laterUpdate ,'deltaTime': deltaUpdate.total_seconds(),'acumDeltaTime': deltaUpdate.total_seconds()})

        # Tiempo empleado para Queries
        counter_q = 0
        queryTimeObj = {'set': dn, 'queries':[] }
        for q in queries:
            counter_q += 1
            now_q = datetime.now()
            result = sendQuery(q)
            later_q = datetime.now()
            delta_q = later_q - now_q
            queryTimeObj['queries'].append({'start': now_q, 'end': later_q, 'deltaTime': delta_q.total_seconds(), 'queryId': f'Q{counter_q}', 'query': q})
        queryTime.append(queryTimeObj)

    import pandas as pd
    udf = pd.DataFrame(columns=['elements','delta_time_seg','amount_delta_time_seg'])
    for ut in updateTime:
        udf = udf.append({'elements': ut['set'],'delta_time_seg': ut['deltaTime'],'amount_delta_time_seg': ut['acumDeltaTime']}, ignore_index=True)
    udf.to_csv('./results/updateMetricFuseki.csv', index=False)
    qdf = pd.DataFrame(columns=['elements','Q1_delta_time_seg','Q2_delta_time_seg','Q3_delta_time_seg','Q4_delta_time_seg'])
    for qt in queryTime:
        qtObj = {'elements':qt['set']}
        for q in qt['queries']:
            qtObj[f"{q['queryId']}_delta_time_seg"]=q['deltaTime']
        qdf = qdf.append(qtObj, ignore_index=True)
    qdf.to_csv('./results/queriesMetricFuseki.csv', index=False)

50000
50000
50000
50000
50000
50000
50000
50000
250000
250000
250000
250000
250000
250000
250000
250000
250000
250000
250000
250000
250000
250000
250000
250000
250000
250000
250000
250000
250000
250000
250000
250000
250000
250000
250000
250000
250000
250000
250000
250000
500000
500000
500000
500000
500000
500000
500000
500000
500000
500000
500000
500000
500000
500000
500000
500000
500000
500000
500000
500000
500000
500000
500000
500000
500000
500000
500000
500000
500000
500000
500000
500000
500000
500000
500000
500000
500000
500000
500000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000

### BlazeGraph

In [47]:
if GENERATE_NEW_BLAZEGRAPH_MEAUSERES:
    updateTime = []
    queryTime = []
    for dn in dataNumber:

        nowUpdate = datetime.now() # Comienzo de momento actual para cada grupo

        # Insercion
        for f in os.listdir(base_dir):
            setData = int(re.search(r'\d+', f).group()) # Obtengo grupo

            if dn == setData: # Si pertenece al grupo
                res = sendDataFromFileBlazegraph(filePath=f'{base_dir}{f}') # Envio el dato
                print(res)
        # Finalizado el grupo
        laterUpdate = datetime.now() # Tiempo final para el grupo

            # Tiempo empleado para Updates
        deltaUpdate = laterUpdate - nowUpdate
        if len(updateTime)>0: # Si procese la iteracion update anterior
            updateTime.append({'set': dn,'start': nowUpdate, 'end': laterUpdate ,'deltaTime': deltaUpdate.total_seconds(),'acumDeltaTime': deltaUpdate.total_seconds() + updateTime[len(updateTime)-1]['acumDeltaTime']})
        else:
            updateTime.append({'set': dn,'start': nowUpdate, 'end': laterUpdate ,'deltaTime': deltaUpdate.total_seconds(),'acumDeltaTime': deltaUpdate.total_seconds()})

        # Tiempo empleado para Queries
        counter_q = 0
        queryTimeObj = {'set': dn, 'queries':[] }
        for q in queries_no_prefix:
            counter_q += 1
            now_q = datetime.now()
            result = sendQueryBlazegraph(q)
            later_q = datetime.now()
            delta_q = later_q - now_q
            queryTimeObj['queries'].append({'start': now_q, 'end': later_q, 'deltaTime': delta_q.total_seconds(), 'queryId': f'Q{counter_q}', 'query': q})
        queryTime.append(queryTimeObj)

    import pandas as pd
    udf = pd.DataFrame(columns=['elements','delta_time_seg','amount_delta_time_seg'])
    for ut in updateTime:
        udf = udf.append({'elements': ut['set'],'delta_time_seg': ut['deltaTime'],'amount_delta_time_seg': ut['acumDeltaTime']}, ignore_index=True)
    udf.to_csv('./results/updateMetricBlazegraph.csv', index=False)
    qdf = pd.DataFrame(columns=['elements','Q1_delta_time_seg','Q2_delta_time_seg','Q3_delta_time_seg','Q4_delta_time_seg'])
    for qt in queryTime:
        qtObj = {'elements':qt['set']}
        for q in qt['queries']:
            qtObj[f"{q['queryId']}_delta_time_seg"]=q['deltaTime']
        qdf = qdf.append(qtObj, ignore_index=True)
    qdf.to_csv('./results/queriesMetricBlazegraph.csv', index=False)

Leer datos

In [48]:
updateFuseki = pd.read_csv('./results/updateMetricFuseki.csv') 
updateBlazeGraph = pd.read_csv('./results/updateMetricBlazegraph.csv') 
queriesFuseki = pd.read_csv('./results/queriesMetricFuseki.csv') 
queriesBlazegraph = pd.read_csv('./results/queriesMetricBlazegraph.csv') 

Combinar datos

In [58]:
comparativeUpdates = pd.DataFrame(
    {
        'fuseki': list(updateFuseki['delta_time_seg']), 
        'blazegraph': list(updateBlazeGraph['delta_time_seg'])
    }
)

In [59]:
comparativeUpdates

Unnamed: 0,fuseki,blazegraph
0,6.593503,3.359767
1,18.553377,17.131264
2,26.531804,43.918947
3,55.428372,165.398358
4,493.131946,2552.058824


In [62]:
rankedUpdates = comparativeUpdates.eq(comparativeUpdates.where(comparativeUpdates != 0).min(1), axis=0).astype(int)

In [42]:
comparativeQueries = pd.DataFrame(
    {
        'fuseki_Q1': list(queriesFuseki['Q1_delta_time_seg']), 
        'fuseki_Q2': list(queriesFuseki['Q2_delta_time_seg']), 
        'fuseki_Q3': list(queriesFuseki['Q3_delta_time_seg']), 
        'fuseki_Q4': list(queriesFuseki['Q4_delta_time_seg']), 
        'blazegraph_Q1': list(queriesBlazegraph['Q1_delta_time_seg']), 
        'blazegraph_Q2': list(queriesBlazegraph['Q2_delta_time_seg']), 
        'blazegraph_Q3': list(queriesBlazegraph['Q3_delta_time_seg']), 
        'blazegraph_Q4': list(queriesBlazegraph['Q4_delta_time_seg']), 
    }
)

In [86]:
comparativeQueries[['fuseki_Q3','blazegraph_Q3']]

Unnamed: 0,fuseki_Q3,blazegraph_Q3
0,1.066817,0.502146
1,0.881751,0.63966
2,1.008468,1.429181
3,0.927235,2.954163
4,1.00472,16.448291


In [84]:
rankedUpdates = pd.DataFrame()
rankedUpdates[['fuseki_Q1','blazegraph_Q1']] = comparativeQueries[['fuseki_Q1','blazegraph_Q1']].eq(comparativeQueries[['fuseki_Q1','blazegraph_Q1']].min(1), axis=0).astype(int)
rankedUpdates[['fuseki_Q2','blazegraph_Q2']] = comparativeQueries[['fuseki_Q2','blazegraph_Q2']].eq(comparativeQueries[['fuseki_Q2','blazegraph_Q2']].min(1), axis=0).astype(int)
rankedUpdates[['fuseki_Q3','blazegraph_Q3']] = comparativeQueries[['fuseki_Q3','blazegraph_Q3']].eq(comparativeQueries[['fuseki_Q3','blazegraph_Q3']].min(1), axis=0).astype(int)
rankedUpdates[['fuseki_Q4','blazegraph_Q4']] = comparativeQueries[['fuseki_Q4','blazegraph_Q4']].eq(comparativeQueries[['fuseki_Q4','blazegraph_Q4']].min(1), axis=0).astype(int)

In [128]:
comparativeQueries.iloc[0][4]

0.033353

In [112]:
comparativeQueries

Unnamed: 0,fuseki_Q1,fuseki_Q2,fuseki_Q3,fuseki_Q4,blazegraph_Q1,blazegraph_Q2,blazegraph_Q3,blazegraph_Q4
0,0.0677,15.351222,1.066817,7.800106,0.033353,0.022999,0.502146,0.028001
1,0.013002,23.949527,0.881751,7.956429,0.036598,0.021998,0.63966,0.013734
2,0.010995,30.055681,1.008468,8.378793,0.075981,0.022996,1.429181,0.027963
3,0.011002,31.07324,0.927235,8.034584,0.053015,0.032986,2.954163,0.027038
4,0.010017,36.973381,1.00472,10.692471,0.085023,0.036953,16.448291,0.05903


## Enviar datos a Firebase

Preparar los resultados

In [1]:

import json

with open('./data/measures.json') as f:
  data = json.load(f)
  tables = data['benchmarks'][1]['metrics'][0]['tables'];

  
tables[0] ={
    'name': 'Tiempo (de lectura) en BlazeGraph',
    'description': 'Tiempo medio de lectura para todas las queries testedas por média aritmética',
    'table': {
        'columns': ['#Items','#RDFs (aprox.)','Query 1','Query 2','Query 3','Query 4','Média Aritmética'],
        'rows': [
            ['50k','400k',comparativeQueries.iloc[0][4],comparativeQueries.iloc[0][5],comparativeQueries.iloc[0][6],comparativeQueries.iloc[0][7],np.mean(comparativeQueries.iloc[0][4:])],
            ['250k','2M',comparativeQueries.iloc[1][4],comparativeQueries.iloc[1][5],comparativeQueries.iloc[1][6],comparativeQueries.iloc[1][7],np.mean(comparativeQueries.iloc[0][4:])],
            ['500k','4M',comparativeQueries.iloc[2][4],comparativeQueries.iloc[2][5],comparativeQueries.iloc[2][6],comparativeQueries.iloc[2][7],np.mean(comparativeQueries.iloc[0][4:])],
            ['1M','8M',comparativeQueries.iloc[3][4],comparativeQueries.iloc[3][5],comparativeQueries.iloc[3][6],comparativeQueries.iloc[3][7],np.mean(comparativeQueries.iloc[0][4:])],
            ['5M','20M',comparativeQueries.iloc[4][4],comparativeQueries.iloc[4][5],comparativeQueries.iloc[4][6],comparativeQueries.iloc[4][7],np.mean(comparativeQueries.iloc[0][4:])]
        ]
    },
}
tables[1] ={
    'name': 'Tiempo (de lectura) en TDB2',
    'description': 'Tiempo medio de lectura para todas las queries testedas por média aritmética',
    'table': {
        'columns': ['#Items','#RDFs (aprox.)','Query 1','Query 2','Query 3','Query 4','Média Aritmética'],
        'rows': [
            ['50k','400k',comparativeQueries.iloc[0][0],comparativeQueries.iloc[0][1],comparativeQueries.iloc[0][2],comparativeQueries.iloc[0][3],np.mean(comparativeQueries.iloc[0][0:4])],
            ['250k','2M',comparativeQueries.iloc[1][0],comparativeQueries.iloc[1][1],comparativeQueries.iloc[1][2],comparativeQueries.iloc[1][3],np.mean(comparativeQueries.iloc[0][0:4])],
            ['500k','4M',comparativeQueries.iloc[2][0],comparativeQueries.iloc[2][1],comparativeQueries.iloc[2][2],comparativeQueries.iloc[2][3],np.mean(comparativeQueries.iloc[0][0:4])],
            ['1M','8M',comparativeQueries.iloc[3][0],comparativeQueries.iloc[3][1],comparativeQueries.iloc[3][2],comparativeQueries.iloc[3][3],np.mean(comparativeQueries.iloc[0][0:4])],
            ['5M','20M',comparativeQueries.iloc[4][0],comparativeQueries.iloc[4][1],comparativeQueries.iloc[4][2],comparativeQueries.iloc[4][3],np.mean(comparativeQueries.iloc[0][0:4])]
        ]
    },
}
tables[2] ={
    'name': 'Agregado de Tiempo Medio (Lectura)',
    'description': 'Tiempo medio de lectura para todos los conjuntos de datos testeados y media aritmética',
    'table': {
        'columns': ['#Items','#RDFs (aprox.)','BlazeGraph','TDB2'],
        'rows': [
            ['50k','400k',np.mean(comparativeQueries.iloc[0][4:]),np.mean(comparativeQueries.iloc[0][0:4])],
            ['250k','2M',np.mean(comparativeQueries.iloc[1][4:]),np.mean(comparativeQueries.iloc[1][0:4])],
            ['500k','4M',np.mean(comparativeQueries.iloc[2][4:]),np.mean(comparativeQueries.iloc[2][0:4])],
            ['1M','8M',np.mean(comparativeQueries.iloc[3][4:]),np.mean(comparativeQueries.iloc[3][0:4])],
            ['5M','20M',np.mean(comparativeQueries.iloc[4][4:]),np.mean(comparativeQueries.iloc[4][0:4])]
        ]
    },
}
readAggregate = pd.DataFrame.from_records( tables[2]['table']['rows'] ).iloc[:,2:4]
readAggregateRanked = readAggregate.eq(readAggregate.where(readAggregate != 0).min(1), axis=0).astype(int)
tables[3] ={
    'name': 'Tiempo (Lectura): Frecuencia de ranking y puntuacion final',
    'description': 'Frecuencia de ranking y puntuacion final para triplestores analizados, mediada por tiempo medio por media aritmetica, la puntuación total se calcula multiplicando x2 la frecuencia de 1º posición y x1 la frecuencia de 2º posición',
    'table': {
        'columns': ['#Frec','BlazeGraph','TDB2'],
        'rows': [
            ['#1',len(readAggregateRanked[(readAggregateRanked[2]==1)]),len(readAggregateRanked[(readAggregateRanked[3]==1)]) ],
            ['#2',len(readAggregateRanked[(readAggregateRanked[2]==0)]),len(readAggregateRanked[(readAggregateRanked[3]==0)]) ],
            ['Puntuación',len(readAggregateRanked[(readAggregateRanked[2]==1)])*2 + len(readAggregateRanked[(readAggregateRanked[2]==0)]),len(readAggregateRanked[(readAggregateRanked[3]==1)])*2 + len(readAggregateRanked[(readAggregateRanked[3]==0)]) ],
        ]
    },
}

tables[4] ={
    'name': 'Agregado de Tiempo Medio (Escritura)',
    'description': 'Tiempo medio de escritura para todos los conjuntos de datos testeados y media aritmética',
    'table': {
        'columns': ['#Items','#RDFs (aprox.)','BlazeGraph','TDB2'],
        'rows': [
            ['50k','400k',comparativeUpdates.iloc[:,1][0],comparativeUpdates.iloc[:,0][0]],
            ['250k','2M',comparativeUpdates.iloc[:,1][1],comparativeUpdates.iloc[:,0][1]],
            ['500k','4M',comparativeUpdates.iloc[:,1][2],comparativeUpdates.iloc[:,0][2]],
            ['1M','8M',comparativeUpdates.iloc[:,1][3],comparativeUpdates.iloc[:,0][3]],
            ['5M','20M',comparativeUpdates.iloc[:,1][4],comparativeUpdates.iloc[:,0][4]]
        ]
    },
}
updateAggregate = pd.DataFrame.from_records( tables[4]['table']['rows'] ).iloc[:,2:4]
updateAggregateRanked = readAggregate.eq(readAggregate.where(readAggregate != 0).min(1), axis=0).astype(int)
tables[3] ={
    'name': 'Tiempo (Lectura): Frecuencia de ranking y puntuacion final',
    'description': 'Frecuencia de ranking y puntuacion final para triplestores analizados, mediada por tiempo medio por media aritmetica, la puntuación total se calcula multiplicando x2 la frecuencia de 1º posición y x1 la frecuencia de 2º posición',
    'table': {
        'columns': ['#Frec','BlazeGraph','TDB2'],
        'rows': [
            ['#1',len(readAggregateRanked[(readAggregateRanked[2]==1)]),len(readAggregateRanked[(readAggregateRanked[3]==1)]) ],
            ['#2',len(readAggregateRanked[(readAggregateRanked[2]==0)]),len(readAggregateRanked[(readAggregateRanked[3]==0)]) ],
            ['Puntuación',len(readAggregateRanked[(readAggregateRanked[2]==1)])*2 + len(readAggregateRanked[(readAggregateRanked[2]==0)]),len(readAggregateRanked[(readAggregateRanked[3]==1)])*2 + len(readAggregateRanked[(readAggregateRanked[3]==0)]) ],
        ]
    },
}
#updateAggregate = pd.DataFrame.from_records( tables[5]['table']['rows'] ).iloc[:,2:4]
#updateAggregateRanked = readAggregate.eq(readAggregate.where(readAggregate != 0).min(1), axis=0).astype(int)


print(tables[5])

NameError: name 'comparativeQueries' is not defined

In [285]:
updateAggregate = pd.DataFrame.from_records( tables[4]['table']['rows'] ).iloc[:,2:4]
updateAggregateRanked = readAggregate.eq(readAggregate.where(readAggregate != 0).min(1), axis=0).astype(int)

In [282]:
updateAggregateRanked

Unnamed: 0,1,2
0,2,3
1,3,2
2,7,8


In [263]:
tables[6] ={
    'name': 'Frecuencia de ranking y puntuacion final en Lectura y escritura',
    'description': 'Suma de las Frecuencias de ranking y puntuacion final anteriores',
    'table': {
        'columns': ['#Frec','BlazeGraph','TDB2'],
        'rows': [
            ['#suma',tables[5]['table']['rows'],len(readAggregateRanked[(readAggregateRanked[3]==1)]) ],
            ['#2',len(readAggregateRanked[(readAggregateRanked[2]==0)]),len(readAggregateRanked[(readAggregateRanked[3]==0)]) ],
            ['Puntuación',len(readAggregateRanked[(readAggregateRanked[2]==1)])*2 + len(readAggregateRanked[(readAggregateRanked[2]==0)]),len(readAggregateRanked[(readAggregateRanked[3]==1)])*2 + len(readAggregateRanked[(readAggregateRanked[3]==0)]) ],
        ]
    },
}

[['50k', '400k', 3.3597669999999997, 6.593503],
 ['250k', '2M', 17.131264, 18.553376999999998],
 ['500k', '4M', 43.918946999999996, 26.531803999999998],
 ['1M', '8M', 165.398358, 55.428371999999996],
 ['5M', '20M', 2552.0588239999997, 493.13194599999997]]

In [254]:
['Puntuación',len(updateAggregateRanked[(updateAggregateRanked[2]==1)])*2 + len(updateAggregateRanked[(updateAggregateRanked[2]==0)]),len(updateAggregateRanked[(updateAggregateRanked[3]==1)])*2 + len(updateAggregateRanked[(updateAggregateRanked[3]==0)]) ]

['Puntuación', 7, 8]

In [262]:
tables[7]

IndexError: list index out of range

In [190]:
readAgregate

Unnamed: 0,2,3
0,0.146625,6.071461
1,0.177998,8.200177
2,0.38903,9.863484
3,0.766801,10.011515
4,4.157324,12.170147


In [201]:
ranking = readAgregate.eq(readAgregate.where(readAgregate != 0).min(1), axis=0).astype(int)

In [199]:
readAgregate[readAgregate.columns[0]]

0    0.146625
1    0.177998
2    0.389030
3    0.766801
4    4.157324
Name: 2, dtype: float64

In [206]:

ranking[ranking.iloc[:,0]==1].count()

2    5
3    5
dtype: int64

In [212]:
np.count(ranking.iloc[:,0] == 1)

AttributeError: module 'numpy' has no attribute 'count'

In [216]:
len(ranking[(ranking[2]==1)])

5

In [218]:
len(ranking[(ranking[2]==0)])

0