In [1]:
import sqllineage
from sqllineage.runner import LineageRunner

In [5]:
# SQL script fed to sqllineage's LineageRunner in the cells below.
# It builds "out_project", "out_user", "out_task", "out_worklog" and
# "out_tasklist" from the "projects", "users", "issues" and "worklogs" tables,
# then patches "out_project" and "out_task" with UPDATE statements.
#
# NOTE(review): the string is deliberately left untouched because it is the
# input under analysis, but a few things look worth confirming with the SQL's
# owner:
#   * $DEF_TIMEZONE / $DEF_TIMESTAMP_FORMAT are referenced, yet no SET
#     statements for them appear in this string.
#   * the second UNION branch aliases its first column "task_list_id" while
#     the first branch uses "tasklist_id".
#   * `ot."parent_id" IS NOT NULL` appears twice in the same WHERE clause.
# The syntax (PARSE_JSON(...):path::VARCHAR, $variables) looks like Snowflake
# SQL -- TODO confirm the intended dialect.
Query = """
-- PROJECTS --
CREATE OR REPLACE TABLE "out_project" AS
SELECT
    "id" AS "project_id" -- PK
    ,"key" AS "project_key"
    ,"name" AS "project_name"
    ,"project_category_name"
    ,"archived" AS "archived"
    ,PARSE_JSON("description"):SFDC_ID::VARCHAR(255) AS "sfdc_id"
    ,PARSE_JSON("description"):BUDGET::VARCHAR(255) AS "budget"
    ,PARSE_JSON("description"):PRICE::VARCHAR(255) AS "hour_rate"
FROM
    "projects"
WHERE
    "project_category_id" IN ('10001','10004') -- Project Implementation and Internal
    AND
    "project_id" NOT IN ('10028', '10114','10035','10079','10158','10189') -- not old CSAS project, not marketing project - rubbish data
    AND
    "archived" != 'True'
ORDER BY
    "project_category_id"
    ,"name"
;

-- I am not sure about that - this is account id, not order id
UPDATE "out_project"
SET "sfdc_id" = '0011t00000GslojAAB'
WHERE "project_category_name" = 'Keboola Internal'
;

UPDATE "out_project"
SET "hour_rate" = '0'
WHERE "project_category_name" = 'Keboola Internal'
;

UPDATE "out_project"
SET "budget" = '0'
WHERE "project_category_name" = 'Keboola Internal'
;

-- USERS --
CREATE OR REPLACE TABLE "out_user" AS
SELECT
    "account_id" AS "user_id"
    ,"display_name" AS "user_name"
    ,"active" AS "active"
    ,"email_address" AS "user_email"
FROM "users"
WHERE
    "account_type" != 'app'
;

-- TASKS --
CREATE OR REPLACE TABLE "out_task" AS
SELECT
    i."id" AS "task_id" -- PK
    ,i."key" AS "task_key"
    ,i."summary" AS "task_name"
    ,i."description" AS "task_description"
    ,i."issue_type_id" AS "task_type_id"
    ,i."issue_type_name" AS "task_type_name"
--    ,i."key" AS "task_key"
    ,i."time_estimate" AS "time_estimate"
    ,i."status_name"
    ,i."priority_name" AS "priority"
    ,PARSE_JSON(i."custom_fields"):customfield_10015::VARCHAR(255) AS "start_date"
    ,TRY_TO_DATE(i."due_date") AS "due_date"
--    ,TO_CHAR(TRY_TO_TIMESTAMP(i."created",'YYYY-MM-DDTHH24:MI:SS.FF3+TZHTZM'),$DEF_TIMESTAMP_FORMAT) AS "task_created"
    ,TO_CHAR(CONVERT_TIMEZONE($DEF_TIMEZONE, TRY_TO_TIMESTAMP(i."created",'YYYY-MM-DDTHH24:MI:SS.FF3+TZHTZM')),$DEF_TIMESTAMP_FORMAT) AS "task_created"
    ,p."project_name" -- linked for testing ref integrity
    ,p."project_id"
    ,i."assignee_account_id" AS "assignee_user_id" -- user_id
--    ,u."user_name" AS "assignee_user_name" -- linked for testing ref integrity, beware unassigned tasks!
    ,i."creator_account_id" AS "created_by_user_id"
    ,PARSE_JSON(i."custom_fields"):customfield_10014::VARCHAR(255) AS "epic_name_link"
    ,i."parent_id" AS "parent_id"
    -- highly manual mapping of the tasks.
    -- three layers: issue types->dedicated project name->custom fields (customer projects)
    ,CASE
        WHEN
            i."issue_type_name" = 'Epic' THEN 'epic'::VARCHAR(255)
        WHEN
            i."issue_type_name" = 'Use Case' THEN 'epic'::VARCHAR(255)
        WHEN
            i."issue_type_name" IN ('Sub-task','Subtask') THEN 'sub-task'::VARCHAR(255)
        WHEN
            i."issue_type_name" IN ('R&D - Component creation') THEN 'research_and_development'::VARCHAR(255)
        WHEN
            p."project_name" = 'Keboola Sales' THEN 'sales'::VARCHAR(255)
        WHEN
            p."project_name" = 'Keboola Component Factory' THEN 'research_and_development'::VARCHAR(255)
        WHEN
            p."project_name" = 'Keboola Internal Projects' AND i."issue_type_name" = 'Idea' THEN 'research_and_development'::VARCHAR(255)
        WHEN
            p."project_name" = 'Keboola Team' THEN 'internal'::VARCHAR(255)
        WHEN
            PARSE_JSON(i."custom_fields"):customfield_10136[0]::VARCHAR(255) IS NOT NULL THEN PARSE_JSON(i."custom_fields"):customfield_10136[0]::VARCHAR(255)
        WHEN
            PARSE_JSON(i."custom_fields"):customfield_10150[0] IS NOT NULL THEN PARSE_JSON(i."custom_fields"):customfield_10150[0]::VARCHAR(255)
        WHEN
            PARSE_JSON(i."custom_fields"):customfield_10149[0] IS NOT NULL THEN PARSE_JSON(i."custom_fields"):customfield_10149[0]::VARCHAR(255)
        WHEN
            PARSE_JSON(i."custom_fields"):customfield_10142[0] IS NOT NULL THEN PARSE_JSON(i."custom_fields"):customfield_10142[0]::VARCHAR(255)
        ELSE 
            PARSE_JSON(i."custom_fields"):customfield_10148[0]::VARCHAR(255)
        END AS "task_type"
     ,CASE
        WHEN
            PARSE_JSON(i."custom_fields"):customfield_10147[0]:value::VARCHAR(255) IS NOT NULL THEN PARSE_JSON(i."custom_fields"):customfield_10147[0]:value::VARCHAR(255)
        WHEN
            PARSE_JSON(i."custom_fields"):customfield_10134[0]:value::VARCHAR(255) IS NOT NULL THEN PARSE_JSON(i."custom_fields"):customfield_10134[0]:value::VARCHAR(255)
        WHEN
            PARSE_JSON(i."custom_fields"):customfield_10141[0]:value::VARCHAR(255) = 'Unbillable' THEN 'No'::VARCHAR(255)
        WHEN
            PARSE_JSON(i."custom_fields"):customfield_10142[0]:value::VARCHAR(255) IS NOT NULL THEN PARSE_JSON(i."custom_fields"):customfield_10142[0]:value::VARCHAR(255)
        WHEN
            p."project_name" IN ('Keboola Internal Projects','Keboola Sales','Keboola Team') THEN 'No'::VARCHAR(255)
        ELSE
            NULL --'NA'::VARCHAR(255)
        END AS "billable"
FROM
    "issues" i
JOIN
    "out_project" p ON -- FULL JOIN FOR PS projects only
    i."project_key" = p."project_key"
;

CREATE OR REPLACE TABLE "out_worklog" AS
SELECT
    w."id" AS "worklog_id" -- PK
    ,w."issue_id" as "task_id"
    ,i."task_name"
    ,w."author_account_id" AS "user_id"
    ,w."author_display_name" AS "user_name"
    ,w."comment" AS "comment"
--    ,"created" AS "worklog_created"
    ,TO_CHAR(CONVERT_TIMEZONE($DEF_TIMEZONE, TRY_TO_TIMESTAMP(w."created",'YYYY-MM-DDTHH24:MI:SS.FF3+TZHTZM')),$DEF_TIMESTAMP_FORMAT) AS "created"
    ,TO_CHAR(CONVERT_TIMEZONE($DEF_TIMEZONE, TRY_TO_TIMESTAMP(w."started",'YYYY-MM-DDTHH24:MI:SS.FF3+TZHTZM')),$DEF_TIMESTAMP_FORMAT) AS "worklog_start"
    ,TO_CHAR(DATEADD(SECOND, w."time_spent_seconds"::INT, CONVERT_TIMEZONE($DEF_TIMEZONE, TRY_TO_TIMESTAMP(w."started",'YYYY-MM-DDTHH24:MI:SS.FF3+TZHTZM'))),$DEF_TIMESTAMP_FORMAT) AS "worklog_end"
    ,TO_DATE(TRY_TO_TIMESTAMP(w."started",'YYYY-MM-DDTHH24:MI:SS.FF3+TZHTZM')) AS "worklog_date"
    ,w."time_spent_seconds" AS "duration_s"
--    ,"time_spent" AS "duration_human"
    ,i."project_name"
    ,i."project_id"
FROM "worklogs" w
LEFT JOIN "out_task" i ON -- FULL JOIN FOR PS projects only
    w."issue_id"::VARCHAR(255) = i."task_id"::VARCHAR(255)
ORDER BY "worklog_start" DESC
;

-- EPICS aka TASKLISTS --
-- Show me all "tasks" that are actually epics - they are linked by "parent_id" column
-- best guess approach
-- Using UNION here to make sure there are no duplicities
CREATE OR REPLACE TABLE "out_tasklist" AS 
(
SELECT
    t."task_id" AS "tasklist_id"
    ,t."task_id" AS "task_id"
    ,t."task_key"
    ,t."task_name"
    ,t."task_description"
    ,t."task_type_id"
    ,t."task_type_name"
    ,t."time_estimate"
    ,t."status_name"
    ,t."priority"
    ,t."start_date"
    ,t."due_date"
    ,t."task_created"
    ,t."project_name"
    ,t."project_id"
    ,t."assignee_user_id"
    ,t."created_by_user_id"
    ,t."task_type"
    ,t."billable"
FROM "out_task" t
WHERE t."task_type_name" = 'Epic'
)
UNION
(
SELECT
    DISTINCT(ot."parent_id") AS "task_list_id"
    ,ot."parent_id" AS "task_id"
    ,o."task_key"
    ,o."task_name"
    ,o."task_description"
    ,o."task_type_id"
    ,o."task_type_name"
    ,o."time_estimate"
    ,o."status_name"
    ,o."priority"
    ,o."start_date"
    ,o."due_date"
    ,o."task_created"
    ,o."project_name"
    ,o."project_id"
    ,o."assignee_user_id"
    ,o."created_by_user_id"
    ,o."task_type"
    ,o."billable"
FROM "out_task" ot
LEFT JOIN
    "out_task" o ON
    ot."parent_id" = o."task_id"
WHERE
    ot."parent_id" IS NOT NULL
    AND
    ot."parent_id" != ''
    AND
    ot."parent_id" IS NOT NULL
    AND
    o."task_id" IS NOT NULL
    AND
    ot."task_type" != 'sub-task'
);

-- fix epics (should we omit them?)
UPDATE "out_task"
SET
    "task_type" = 'epic'
WHERE "task_id" IN (SELECT DISTINCT("task_id") FROM "out_tasklist")
;

UPDATE "out_task" t
  SET t."parent_id" = tl."task_id"
FROM "out_tasklist" tl
WHERE
    t."epic_name_link" = tl."task_key"
    AND
    t."epic_name_link" IS NOT NULL AND t."parent_id" = ''
;
"""

In [6]:
# Run sqllineage over the whole multi-statement script. With verbose=True the
# printed report lists, for every statement, the tables it reads, writes,
# defines as CTEs, drops and renames.
# NOTE(review): the SQL looks like Snowflake syntax; if the installed
# sqllineage version supports a dialect option, a Snowflake-aware parse may
# give more accurate lineage -- verify before changing.
result = LineageRunner(sql=Query, verbose=True)
print(result)

Statement #1: CREATE OR REPLACE TABLE "out_project" ASSELECT    ...
    table read: [Table: <default>.projects]
    table write: [Table: <default>.out_project]
    table cte: []
    table drop: []
    table rename: []
Statement #2: UPDATE "out_project"SET "sfdc_id" = '0011t00000Gsl...
    table read: []
    table write: [Table: <default>.out_project]
    table cte: []
    table drop: []
    table rename: []
Statement #3: UPDATE "out_project"SET "hour_rate" = '0'WHERE "pr...
    table read: []
    table write: [Table: <default>.out_project]
    table cte: []
    table drop: []
    table rename: []
Statement #4: UPDATE "out_project"SET "budget" = '0'WHERE "proje...
    table read: []
    table write: [Table: <default>.out_project]
    table cte: []
    table drop: []
    table rename: []
Statement #5: CREATE OR REPLACE TABLE "out_user" ASSELECT    "ac...
    table read: [Table: <default>.users]
    table write: [Table: <default>.out_user]
    table cte: []
    table drop: []
    table re

In [5]:
# Print column-level lineage, one chain per line in the form
# `target_column <- ... <- source_column`.
# NOTE(review): the recorded output contains `parse_json` pseudo-columns
# (e.g. `out_project.parse_json <- projects.parse_json`), which suggests the
# `PARSE_JSON(...):path::VARCHAR` expressions are not resolved to their real
# source/target columns -- treat those rows as unreliable until confirmed.
result.print_column_lineage()

<default>.out_project.archived <- <default>.projects.archived
<default>.out_project.project_category_name <- <default>.projects.project_category_name
<default>.out_project.project_key <- <default>.projects.key
<default>.out_task.parse_json <- <default>.out_project.parse_json <- <default>.projects.parse_json
<default>.out_user.active <- <default>.users.active
<default>.out_user.user_email <- <default>.users.email_address
<default>.out_user.user_id <- <default>.users.account_id
<default>.out_user.user_name <- <default>.users.display_name
<default>.out_worklog.comment <- <default>.worklogs.comment
<default>.out_worklog.created <- <default>.worklogs.created
<default>.out_worklog.duration_s <- <default>.worklogs.time_spent_seconds
<default>.out_worklog.project_id <- <default>.out_task.project_id <- <default>.out_project.project_id <- <default>.projects.id
<default>.out_worklog.project_name <- <default>.out_task.project_name <- <default>.out_project.project_name <- <default>.projects.name
<d

In [7]:
# List the tables sqllineage reports as sources of the script, one per line.
# In the recorded output these are the four raw inputs; none of the "out_*"
# tables appear.
for source_table in result.source_tables:
    print(source_table)

<default>.issues
<default>.projects
<default>.users
<default>.worklogs


In [8]:
# List the tables sqllineage reports as final targets, one per line.
# NOTE(review): "out_task" and "out_tasklist" are written by the script but
# are absent from the recorded output -- presumably because later statements
# also read them, so they are classed as intermediate tables; confirm.
for target_table in result.target_tables:
    print(target_table)

<default>.out_project
<default>.out_user
<default>.out_worklog


In [30]:
# Start sqllineage's local web visualisation of the lineage graph.
# NOTE(review): the recorded run failed -- port 5000 was already in use and
# the cell ended with an AssertionError. The URL in that output also carries
# `SET DEF_TIMESTAMP_FORMAT` / `SET DEF_TIMEZONE` statements that are not in
# `Query`, so it was produced from a different SQL string. Re-run the notebook
# top to bottom (with the port free) before trusting this cell's output.
result.draw()

 * SQLLineage Running on http://localhost:5000/?e=%0ASET+DEF_TIMESTAMP_FORMAT%3D+%27YYYY-MM-DDTHH%3AMI%3ASSTZH%27%0A%3B%0A%0ASET+DEF_TIMEZONE%3D+%27Etc%2FUTC%27%0A%3B%0A%0A--+PROJECTS+--%0ACREATE+OR+REPLACE+TABLE+%22out_project%22+AS%0ASELECT%0A++++%22id%22+AS+%22project_id%22+--+PK%0A++++%2C%22key%22+AS+%22project_key%22%0A++++%2C%22name%22+AS+%22project_name%22%0A++++%2C%22project_category_name%22%0A++++%2C%22archived%22+AS+%22archived%22%0A++++%2CPARSE_JSON%28%22description%22%29%3ASFDC_ID%3A%3AVARCHAR%28255%29+AS+%22sfdc_id%22%0A++++%2CPARSE_JSON%28%22description%22%29%3ABUDGET%3A%3AVARCHAR%28255%29+AS+%22budget%22%0A++++%2CPARSE_JSON%28%22description%22%29%3APRICE%3A%3AVARCHAR%28255%29+AS+%22hour_rate%22%0AFROM%0A++++%22projects%22%0AWHERE%0A++++%22project_category_id%22+IN+%28%2710001%27%2C%2710004%27%29+--+Project+Implementation+and+Internal%0A++++AND%0A++++%22project_id%22+NOT+IN+%28%2710028%27%2C+%2710114%27%2C%2710035%27%2C%2710079%27%2C%2710158%27%2C%2710189%27%29+--+not+old

Address already in use
Port 5000 is in use by another program. Either identify and stop that program, or start the server with a different port.
On macOS, try disabling the 'AirPlay Receiver' service from System Preferences -> Sharing.


AssertionError: 