# Convert the SQL Murder Mystery to Cypher!

https://mystery.knightlab.com/

In [1]:
#%pip install kuzu pandas

In [2]:
import kuzu
import sqlite3
import re
import shutil

# Kuzu database directory: remove whatever a previous run left at this path
# before opening it, then keep one connection for all Cypher statements.
db_path = "/tmp/foobar2"
shutil.rmtree(db_path, ignore_errors=True)
db = kuzu.Database(db_path)
conn = kuzu.Connection(db)

# Source SQLite database that the tables are read from.
sq = sqlite3.connect("sql.db")

In [3]:
# Read every table's name and CREATE TABLE text from SQLite's schema table.
# "sqlite_master" is queried instead of the "sqlite_schema" alias because the
# alias only exists in SQLite 3.33.0 and later, while "sqlite_master" is
# available in every version.
t_raw = sq.execute(
    "select tbl_name, sql from sqlite_master where type = 'table'"
).fetchall()
print(t_raw[0])

('crime_scene_report', 'CREATE TABLE crime_scene_report (\n        date integer,\n        type text,\n        description text,\n        city text\n    )')


In [4]:
def sql_to_schema(tname, sql):
    """Parse a ``CREATE TABLE`` statement into a simple schema dict.

    Args:
        tname: Table name, stored under the ``'name'`` key.
        sql: Full ``CREATE TABLE ...`` text; it must contain a ``(``.

    Returns:
        A dict with the keys ``'name'``, ``'fields'`` (list of
        ``(column, declared_type)`` tuples in declaration order),
        ``'primary_key'`` (column name or ``None``) and ``'foreign_key'``
        (``(column, referenced_table, referenced_column)`` taken from the
        last ``FOREIGN KEY`` clause, or ``None``).

    Raises:
        IndexError: If *sql* has no ``(``, or a column has no declared type.

    SQL keywords are matched case-insensitively and empty segments (for
    example after a trailing comma) are skipped. The parser splits on
    commas, so composite keys and types such as ``decimal(10,2)`` are not
    supported.
    """
    # Everything after the first "(" is the column/constraint list. The
    # remaining parentheses become spaces so that
    # "FOREIGN KEY (a) REFERENCES t(b)" and "varchar(20)" split into tokens.
    body = sql.split("(", 1)[1].replace("(", " ").replace(")", " ")
    tbl = {'name': tname, 'fields': [], 'primary_key': None, 'foreign_key': None}
    for item in body.split(","):
        # str.split() with no argument splits on any whitespace run,
        # newlines included, so tokens are never glued across lines.
        tokens = item.split()
        if not tokens:
            continue
        keywords = [tok.upper() for tok in tokens]
        if keywords[0] == "FOREIGN":
            # FOREIGN KEY <col> REFERENCES <table> <col>
            tbl['foreign_key'] = (tokens[2], tokens[4], tokens[5])
            continue
        if keywords[0] == "PRIMARY" and len(tokens) > 2 and keywords[1] == "KEY":
            # Table-level constraint: PRIMARY KEY (<col>)
            tbl['primary_key'] = tokens[2]
            continue
        if 'PRIMARY' in keywords:
            # Column-level constraint: <col> <type> PRIMARY KEY
            tbl['primary_key'] = tokens[0]
        tbl['fields'].append((tokens[0], tokens[1]))
    return tbl
    
# Parse every (table name, CREATE TABLE text) row into a schema dict.
schema = [sql_to_schema(*row) for row in t_raw]
#schema

In [5]:

def cleanup(t):
    """Translate a SQL type name into the type name used for the graph tables.

    The lookup is case-insensitive: ``varchar``/``text``/``char`` become
    ``'string'`` and ``bigint``/``integer`` become ``'int64'``. Any other
    name is returned lower-cased and otherwise unchanged.
    """
    type_aliases = {
        'varchar': 'string',
        'text': 'string',
        'char': 'string',
        'bigint': 'int64',
        'integer': 'int64',
    }
    lowered = t.lower()
    return type_aliases.get(lowered, lowered)
    
# Derive the graph-side schema from the parsed SQL schema. A table without
# a primary key gets a leading "id serial" column as its key, and every
# declared type is translated with cleanup(). The dicts in `schema` are
# left untouched: each one is shallow-copied and receives a new field list.
gschema = []
for sql_table in schema:
    node_table = sql_table.copy()
    columns = node_table['fields']
    if node_table['primary_key'] is None:
        columns = [('id','serial')] + columns
        node_table['primary_key'] = 'id'
    node_table['fields'] = [(col, cleanup(sql_type)) for col, sql_type in columns]
    gschema.append(node_table)

# Create one node table per entry in gschema: the column definitions are
# followed by the primary-key clause, all comma-separated.
for node_table in gschema:
    column_defs = [f"{col} {col_type}" for col, col_type in node_table['fields']]
    column_defs.append(f"primary key ({node_table['primary_key']})")
    fieldstr = ",".join(column_defs)
    cypher = f"create node table {node_table['name']} ({fieldstr});"
    conn.execute(cypher)


In [6]:
# Scratch check: tuple() applied to a two-element sequence.
tuple((2, 23))

(2, 23)

In [7]:
### COPY IN NODE DATA
import csv

def clean_cols(t):
    """Return row *t* as a tuple with its string values sanitized.

    In every string value, newlines are replaced by spaces and double
    quotes by single quotes. Non-string values (ints, ``None``, ...) pass
    through unchanged. ``isinstance`` is used so that ``str`` subclasses are
    sanitized too.

    NOTE(review): presumably this keeps each record on one physical line
    and free of quote characters in the CSV handed to the loader; confirm.
    """
    return tuple(
        c.replace("\n", " ").replace('"', "'") if isinstance(c, str) else c
        for c in t
    )
    
# Export each SQLite table to a scratch CSV file, then bulk-load that file
# into the node table of the same position in gschema.
for st,gt in zip(schema,gschema):
    # Only the columns of the original SQL table are exported.
    cols = [cn for cn,_ in st['fields']]
    # newline="" is what the csv module requires for files passed to
    # csv.writer (otherwise line endings can be translated a second time);
    # the explicit encoding keeps the output independent of the locale.
    with open("tmp.csv", "w", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        w.writerow(cols)
        cursor = sq.execute(f"select {','.join(cols)} from {st['name']}")
        # Stream rows straight from the cursor instead of materializing
        # the whole table with fetchall() first.
        w.writerows(clean_cols(row) for row in cursor)
    # The file is closed (and flushed) before the loader reads it.
    conn.execute(f"copy {gt['name']} from 'tmp.csv' (header=true, parallel=false)")
        

In [8]:
# Sanity check: fetch every income node as a DataFrame.
(
    conn
    .execute("match (i:income) return *")
    .get_as_df()
)

Unnamed: 0,i
0,"{'_id': {'offset': 4096, 'table': 7}, '_label'..."
1,"{'_id': {'offset': 4097, 'table': 7}, '_label'..."
2,"{'_id': {'offset': 4098, 'table': 7}, '_label'..."
3,"{'_id': {'offset': 4099, 'table': 7}, '_label'..."
4,"{'_id': {'offset': 4100, 'table': 7}, '_label'..."
...,...
7506,"{'_id': {'offset': 7506, 'table': 7}, '_label'..."
7507,"{'_id': {'offset': 7507, 'table': 7}, '_label'..."
7508,"{'_id': {'offset': 7508, 'table': 7}, '_label'..."
7509,"{'_id': {'offset': 7509, 'table': 7}, '_label'..."


In [9]:
# Declare the relationship tables, each linking one pair of node tables
# (from <source table> to <target table>). All six statements are sent in a
# single execute() call.
cy = """
create rel table has_license (from person to drivers_license);
create rel table has_income (from person to income);
create rel table has_interview (from person to interview);
create rel table has_checkin (from get_fit_now_member to get_fit_now_check_in);
create rel table went_to_event (from person to facebook_event_checkin);
create rel table is_member (from person to get_fit_now_member);
"""
conn.execute(cy)

[<kuzu.query_result.QueryResult at 0x1077fe810>,
 <kuzu.query_result.QueryResult at 0x108af0450>,
 <kuzu.query_result.QueryResult at 0x1077befd0>,
 <kuzu.query_result.QueryResult at 0x129b61fd0>,
 <kuzu.query_result.QueryResult at 0x129b62290>,
 <kuzu.query_result.QueryResult at 0x129b62fd0>]

In [10]:
# Populate the relationship tables: each statement pairs the nodes of two
# tables whose key columns are equal (e.g. person.license_id =
# drivers_license.id) and creates one relationship per matching pair.
cy = """
match (p:person), (d:drivers_license) where p.license_id = d.id
create (p)-[:has_license]->(d);
match (p:person), (i:income) where p.ssn = i.ssn
create (p)-[:has_income]->(i);
match (p:person), (i:interview) where p.id = i.person_id
create (p)-[:has_interview]->(i);
match (p:person), (m:get_fit_now_member) where p.id = m.person_id
create (p)-[:is_member]->(m);
match (p:person), (e:facebook_event_checkin) where p.id = e.person_id
create (p)-[:went_to_event]->(e);
match (m:get_fit_now_member), (c:get_fit_now_check_in) where c.membership_id = m.id
create (m)-[:has_checkin]->(c);
"""
conn.execute(cy)

[<kuzu.query_result.QueryResult at 0x129b74150>,
 <kuzu.query_result.QueryResult at 0x129b741d0>,
 <kuzu.query_result.QueryResult at 0x129b74290>,
 <kuzu.query_result.QueryResult at 0x129b74310>,
 <kuzu.query_result.QueryResult at 0x129b74390>,
 <kuzu.query_result.QueryResult at 0x129b74410>]

In [11]:
def cyph(s):
    """Run Cypher text *s* on the shared connection and return a DataFrame.

    When *s* holds several ``;``-separated statements, ``conn.execute``
    returns a list of results rather than a single result. In that case the
    DataFrame of the last statement is returned instead of failing with an
    ``AttributeError`` on the list.
    """
    result = conn.execute(s)
    if isinstance(result, list):
        # Multi-statement input: report the final statement's rows.
        result = result[-1]
    return result.get_as_df()


In [14]:
# Same income query as before, this time through the cyph() helper.
cyph("match (i:income) return *")

Unnamed: 0,i
0,"{'_id': {'offset': 6144, 'table': 7}, '_label'..."
1,"{'_id': {'offset': 6145, 'table': 7}, '_label'..."
2,"{'_id': {'offset': 6146, 'table': 7}, '_label'..."
3,"{'_id': {'offset': 6147, 'table': 7}, '_label'..."
4,"{'_id': {'offset': 6148, 'table': 7}, '_label'..."
...,...
7506,"{'_id': {'offset': 2043, 'table': 7}, '_label'..."
7507,"{'_id': {'offset': 2044, 'table': 7}, '_label'..."
7508,"{'_id': {'offset': 2045, 'table': 7}, '_label'..."
7509,"{'_id': {'offset': 2046, 'table': 7}, '_label'..."
