In [1]:
import os
import re
import sqlite3
import csv
import pandas as pd
from datetime import datetime
from tqdm.auto import tqdm
import json

In [2]:
# --- Database layout ---------------------------------------------------------

# SQLite file holding the training data.
dbname = 'toy.db'

# Table that keeps track of the logical time of the last inserted edge,
# i.e. the total number of edges.
counterTableName = 'counter'

# Pre-loaded graph edges from which the regularity score is calculated.
edgeTableName = 'edges'

# Graph on which the regularity score is calculated.
evalEdgeTableName = 'edgestemp'

# --- Time windows --------------------------------------------------------------

totalTimeWindows = 20   # divisor used by timeWindowCalc() to size one window
timeWindow = 1          # width of one window; overwritten by timeWindowCalc()
totalTime = 0           # overwritten by timeWindowCalc(); 0 means "not loaded yet"

# --- Score cache ---------------------------------------------------------------

# Regularity scores keyed by the edge description: {'<src><edge><dest>': score}.
reScoresTable = {}

def timeWindowCalc():
    """Load the total logical time and derive the width of one time window.

    Reads the first column of the first row of ``counterTableName`` (in the
    SQLite file ``dbname``) into the global ``totalTime`` and sets the global
    ``timeWindow`` to ``int(totalTime / totalTimeWindows)``, clamped to at
    least 1.

    Raises:
        sqlite3.Error: if the database or the counter table cannot be read.
        TypeError: if the counter table has no rows.
    """
    global totalTime
    global timeWindow

    conn = sqlite3.connect(dbname)
    try:
        row = conn.execute("""select * from {};""".format(counterTableName)).fetchone()
    finally:
        # Release the connection even when the query fails.
        conn.close()

    totalTime = row[0]
    # A counter smaller than totalTimeWindows would otherwise produce a window
    # width of 0, which is not a valid step for range() in getStability().
    timeWindow = max(1, int(totalTime / totalTimeWindows))

def getStability(src, edgeType, dest):
    """Compute and cache the regularity score of one (src, edgeType, dest) triple.

    The logical timeline ``1 .. totalTime`` is walked in steps of
    ``timeWindow``. For every window two counts are taken from
    ``edgeTableName``:

    * out-side: edges with this source type and edge type whose destination
      type differs from ``dest``;
    * in-side: edges with this destination type and edge type whose source
      type differs from ``src``.

    A window with a count of zero is "stable" for that side. The score is
    ``(stable in-windows / windows) * (stable out-windows / windows)``, where a
    ratio of exactly 0 is replaced by 1e-7 so the product never becomes zero.
    The result is stored in ``reScoresTable`` under the concatenation of the
    three values; nothing is returned.

    Args:
        src: source node type.
        edgeType: edge type.
        dest: destination node type.

    Raises:
        ZeroDivisionError: if the timeline yields no window (``totalTime <= 1``).
        sqlite3.Error: if the edge table cannot be queried.
    """
    # The table and column names cannot be bound, so only they are formatted
    # into the statement. Every value is passed as a bound parameter, which
    # keeps quotes inside a type name from breaking (or altering) the query.
    countQry = """SELECT count(*) from {}
                where {} = ? and
                edge_type = ? and
                {} != ? and
                logical_time >= ? and logical_time <= ?;"""
    qryFrom = countQry.format(edgeTableName, 'src_type', 'dest_type')
    qryTo = countQry.format(edgeTableName, 'dest_type', 'src_type')

    # Bind the values as text, mirroring the quoted literals used previously.
    srcValue, edgeValue, destValue = str(src), str(edgeType), str(dest)

    # range() rejects a step of 0; fall back to windows of width 1.
    step = max(1, timeWindow)

    # Same notation as in the research paper (per the original comments).
    T_from = 0      # windows in which the out-side is stable
    T_to = 0        # windows in which the in-side is stable
    T_total = 0     # windows inspected

    conn = sqlite3.connect(dbname)
    try:
        cur = conn.cursor()
        # NOTE(review): range() excludes totalTime itself, so an edge at the
        # very last logical time may fall outside every window - confirm intent.
        for timeStart in range(1, totalTime, step):
            timeEnd = timeStart + step - 1

            countFrom = cur.execute(
                qryFrom, (srcValue, edgeValue, destValue, timeStart, timeEnd)
            ).fetchone()[0]
            countTo = cur.execute(
                qryTo, (destValue, edgeValue, srcValue, timeStart, timeEnd)
            ).fetchone()[0]

            if countFrom == 0:
                T_from += 1
            if countTo == 0:
                T_to += 1
            T_total += 1
    finally:
        # Release the connection even when a query fails.
        conn.close()

    IN_dest = T_to / T_total
    OUT_src = T_from / T_total

    # Avoid a zero stability value so that the product stays non-zero.
    if IN_dest == 0.0:
        IN_dest = 0.0000001
    if OUT_src == 0.0:
        OUT_src = 0.0000001

    # NOTE(review): the key has no separator, so different triples can map to
    # the same string; calculateRegularityScore() relies on this exact format.
    reScoresTable["{}{}{}".format(src, edgeType, dest)] = float(IN_dest * OUT_src)


def calculateRegularityScore(src_type, edge_type, dest_type):
    """Return the regularity score of a (src_type, edge_type, dest_type) triple.

    On first use (``totalTime == 0``) the time-window globals are loaded via
    ``timeWindowCalc()``. Scores are served from ``reScoresTable``; a missing
    entry is computed once by ``getStability()``.

    Args:
        src_type: source node type.
        edge_type: edge type.
        dest_type: destination node type.

    Returns:
        The score stored in ``reScoresTable`` for the triple.
    """
    if totalTime == 0:
        print('timeWindowCalc()')
        timeWindowCalc()

    # Build the key exactly as getStability() stores it. Concatenating with
    # "+" gives a different key (or a TypeError) for non-string types, so the
    # freshly computed score would never be found again.
    key = "{}{}{}".format(src_type, edge_type, dest_type)

    if reScoresTable.get(key) is None:
        getStability(src_type, edge_type, dest_type)

    return reScoresTable.get(key)

In [3]:
def evaluvateGraph(tableName, graphFile):
    """Propagate regularity scores along the edges of one graph.

    Edges are read from ``graphFile`` in file order. A source node seen for
    the first time starts at 1.0, and every edge sets its destination node to
    ``score(source) * calculateRegularityScore(edge types)``; a later edge to
    the same destination overwrites the earlier value. Finally the nodes
    returned by the non-leaf query on ``tableName`` are dropped.

    Args:
        tableName: table in ``dbname`` holding the graph's edges
            (needs ``src_id`` and ``dest_id`` columns).
        graphFile: CSV with columns ``src_id``, ``dest_id``, ``src_type``,
            ``edge_type`` and ``dest_type``.

    Returns:
        dict mapping node id (int) to its accumulated score.
    """
    # Destination ids that are not in the set of "destinations which never
    # act as a source", i.e. the non-leaf destinations.
    # The table name is an identifier and cannot be a bound parameter.
    qry = """SELECT DISTINCT dest_id from {}
            where dest_id not in
            (SELECT DISTINCT dest_id from {}
            where dest_id not in ( SELECT DISTINCT src_id from {} ))""".format(tableName, tableName, tableName)

    conn = sqlite3.connect(dbname)
    try:
        nonLeafDestNodes = conn.execute(qry).fetchall()
    finally:
        # Release the connection even when the query fails.
        conn.close()

    # The file is still read in chunks and joined back together, as before.
    df = pd.concat(pd.read_csv(graphFile, chunksize=10000))

    reScores = {}
    # Iterating the needed columns directly avoids building a Series per row.
    edges = zip(df['src_id'], df['dest_id'],
                df['src_type'], df['edge_type'], df['dest_type'])
    for src_id, dest_id, src_type, edge_type, dest_type in tqdm(edges, total=len(df.index)):
        src_id = int(src_id)
        dest_id = int(dest_id)
        reScores.setdefault(src_id, 1.0)

        reScore = calculateRegularityScore(src_type, edge_type, dest_type)
        reScores[dest_id] = reScores[src_id] * reScore

    for (nodeId,) in nonLeafDestNodes:
        # The table may contain nodes that the CSV did not; there is nothing
        # to drop for those, so a missing key is not an error.
        reScores.pop(nodeId, None)

    return reScores

In [4]:
def convertTxtToCsv(fname, srcDir='parsed-graphs', destDir='test-graphs'):
    """Convert ``<srcDir>/<fname>.txt`` into ``<destDir>/<fname>.csv``.

    Every non-blank input line is stripped and split on single spaces and
    colons; the resulting fields are written as one CSV row below a fixed
    header row.

    Args:
        fname: file name without directory or extension.
        srcDir: directory containing the ``.txt`` graph.
        destDir: directory receiving the ``.csv`` file; created when missing.

    Raises:
        FileNotFoundError: if the input file does not exist.
    """
    header = ('src_id', 'dest_id', 'src_type', 'dest_type', 'edge_type', 'logical_time')

    with open(os.path.join(srcDir, fname + '.txt'), 'r') as in_file:
        # Create the output directory only once the input is known to exist.
        if destDir:
            os.makedirs(destDir, exist_ok=True)
        # newline='' lets the csv module control line endings itself;
        # otherwise rows end in "\r\r\n" on platforms that translate "\n".
        with open(os.path.join(destDir, fname + '.csv'), 'w', newline='') as out_file:
            writer = csv.writer(out_file)
            writer.writerow(header)
            for line in in_file:
                line = line.strip()
                if line:
                    writer.writerow(re.split(' |:', line))

def populateDB(fname):
    """Load ``test-graphs/<fname>.csv`` into a table of the SQLite file ``dbname``.

    The table is named after ``fname`` with every ``-`` removed and replaces
    any existing table of that name.

    Args:
        fname: CSV file name without directory or extension.
    """
    # Load the CSV first so that a missing file does not touch the database.
    graphDf = pd.read_csv('test-graphs/' + fname + '.csv')

    # Write to the same database that the scoring functions read from,
    # instead of a separately hard-coded file name.
    conn = sqlite3.connect(dbname)
    try:
        graphDf.to_sql(''.join(fname.split('-')), conn, if_exists='replace')
    finally:
        # Release the connection even when the write fails.
        conn.close()
    

def test():
    fnames = []
    ### give range of test files here
    for i in range(100, 105):
        fnames.append('base-'+str(i))
    # for i in range(26, 31):
    #     fnames.append('base-'+str(i))
    # for i in range(300, 306):
    #     fnames.append('base-'+str(i))
    print(fnames)
    with tqdm(total=5) as pbar1:
        for f in fnames:
            convertTxtToCsv(f)
            populateDB(f)
            ### temp DB table name as fname
            tableName = ''.join(f.split('-'))
            reScores = evaluvateGraph(tableName , 'test-graphs/' + f + '.csv')
            with open('result/result-' + f + '.json', 'w') as fp:
                json.dump(reScores, fp)
            pbar1.update(1)
        
# Run the evaluation over the test graphs; the scores of every graph are
# written to result/result-<name>.json.
test()

['base-100', 'base-101', 'base-102', 'base-103', 'base-104']


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/42483 [00:00<?, ?it/s]

timeWindowCalc()


  0%|          | 0/39280 [00:00<?, ?it/s]

  0%|          | 0/34799 [00:00<?, ?it/s]

  0%|          | 0/35172 [00:00<?, ?it/s]

  0%|          | 0/32700 [00:00<?, ?it/s]