# Getting started using parquet via DuckDB with Wintap

## Workflow:
Parquet files -> DuckDB Tables/Views -> SQL EDA/Extraction -> Pandas -> Resume typical workflow

The motivation for introducing DuckDB to the workflow for initial EDA and extraction of subsets is to allow for working with datasets larger than memory. Once the subset of interest is identified using SQL, the result can be extracted easily into pandas.

### Map parquet into DuckDB
* Initialize an in-memory database with views for all event types at an aggregation level.
    * Note that views are basically pointers to the parquet files and use no memory.
* Present a summary of current dataset
    * Tabular view with row counts and parquet file sizes

In [1]:
SUMMARY_INTERVAL='12 hours'
MIN_DAYPK=20230410
MAX_DAYPK=20230415
SIMPLE={
    'PID_HASH':'C9C6E1C87C692A13CB60AB380E069FD6',
    'DAYPK':20230519
}


In [2]:
# Define imports, functions
# This dataset_chooser() uses a .env file in the top level of this project. It needs to define DATAPATH as the top level of where your data sets are.
# You can optionally define a DEFAULT_PATH pointing to a specific dataset. This provides the convenience of not having to select the dataset when restarting the notebook.
# See .env-default for an example.
# If there is no .env or the paths are invalid, dataset_chooser() defaults to users home directory.

# To enable logging output to jupyter, uncomment the following 3 lines:
#import logging
#logger = logging.getLogger()
#logger.setLevel(logging.DEBUG)
%run notebookutil.py

w_datasets=dataset_chooser()
display(w_datasets)

FileChooser(path='/scratch/fusioncuisine/wintapv6/llnl_herd', filename='', title='<b>Select Wintap Dataset Pat…

In [3]:
# Initialize an in-memory db. Save reference in a variable and then set magic-duckdb environment. Result is ability to use the same DB instance from python code and %dql/%%dql magics.
# Also create views for every top-level type found in the current dataset.
con=ru.init_db(w_datasets.selected) # ,agg_level='rolling')
%dql -co con
# Display the list of tables/views
%dql show tables

Unnamed: 0,name
0,all_files
1,files
2,host
3,host_ip
4,process
5,process_conn_incr
6,process_exe_file_summary
7,process_file
8,process_image_load
9,process_net_conn


### Summarize event data and display in chart to help understand event distribution over time

In [4]:
# Tabular summary
display(svd.table_summary(con,w_datasets.selected))

Unnamed: 0,Table_Name,Min_DayPK,Max_DayPK,Num_Rows,Size,Files
0,all_files,20230216,20230308,653528,29.04 MB,4.0
1,files,20230216,20230308,698436,62.36 MB,4.0
2,host,20230101,20230620,1887,810.67 KB,167.0
3,host_ip,20230101,20230620,2350,407.27 KB,168.0
4,process,20230101,20230620,371118913,34.8 GB,162.0
5,process_conn_incr,20230101,20230619,306533004,27.72 GB,166.0
6,process_exe_file_summary,20230101,20230620,658902,36.56 MB,162.0
7,process_file,20230216,20230619,87141486,10.95 GB,121.0
8,process_image_load,20230101,20230308,516918939,30.57 GB,39.0
9,process_net_conn,20230101,20230619,127224661,11.05 GB,166.0


In [5]:
# Events over time. 
# To do: Dynamically adjust the bucket size based on the dataset duration for the best resolution/performance.
svd.init_db(con,SUMMARY_INTERVAL)
eventdf=svd.fetch_summary_data(con)
svd.display_event_chart(eventdf)

### EDA
* Summarize: display table schema and some statistics about its contents
* Head: list a small set of rows
* Group By: aggregate on 1-N columns
* Time partitions: Filter or Group By Days using DayPK
* Joining tables
    * Within a single day: All systems go...
    * Over multiple days: PROCESS and HOST both need to be deduped
* Specific events: highlight events of interest (puttyx/notepad++/etc)

In [6]:
# Summarize process to get a high level view of the columns and values
# Create a file with sample values per dataset.
%dql -j summarize SELECT * FROM process where daypk BETWEEN {{MIN_DAYPK}} AND {{MAX_DAYPK}}

Unnamed: 0,column_name,column_type,min,max,approx_unique,avg,std,q25,q50,q75,count,null_percentage
0,pid_hash,VARCHAR,0000021CBC8D0F8370667A11C673E91B,FFFFFEE87A666F412E3FCB3294B70FB9,6945526,,,,,,21525925,0.0%
1,os_family,VARCHAR,windows,windows,1,,,,,,21525925,0.0%
2,hostname,VARCHAR,BHANGAL1-PCL2,WL-9508338,17,,,,,,21525925,0.0%
3,os_pid,INTEGER,0,625660,150995,118875.35353263564,141092.0986754485,16149.0,46812.0,177099.0,21525925,0.0%
4,process_name,VARCHAR,/powershell.exe,zeroconfigservice.exe,1249,,,,,,21525925,0.0%
5,num_process_name,BIGINT,0,1,2,0.9999981417755568,0.0013631658287395,1.0,1.0,1.0,21525925,0.0%
6,args,VARCHAR,"""--cd=c:\users\bhangal1\documents\projects\tfs\ears\ears\ears.agent\bin\debug.""",~ -d ubuntu-18.04,572714,,,,,,21525925,47.02%
7,num_args,BIGINT,0,1,2,0.5320712582618401,0.4989703858543352,0.0,1.0,1.0,21525925,0.0%
8,user_name,VARCHAR,-,system,75,,,,,,21525925,63.67%
9,num_user_name,BIGINT,1,1,1,1.0,0.0,1.0,1.0,1.0,21525925,0.0%


In [7]:
# Get all columns for the first 10 rows
%dql select * from process limit 10

Unnamed: 0,pid_hash,os_family,hostname,os_pid,process_name,num_process_name,args,num_args,user_name,num_user_name,parent_pid_hash,num_parent_pid_hash,parent_os_pid,num_parent_os_pid,process_path,num_process_path,filename,file_id,file_md5,num_file_md5,file_sha2,num_file_sha2,process_started_seconds,process_started,first_seen,last_seen,num_start_events,process_term_seconds,process_term,cpu_cycle_count,cpu_utilization,commit_charge,commit_peak,read_operation_count,write_operation_count,read_transfer_kilobytes,write_transfer_kilobytes,hard_fault_count,token_elevation_type,exit_code,num_process_stop,dayPK
0,2D2743A8D19775522EA942DACB1C5E33,windows,ZORK,6536,microsoftedgeupdate.exe,1,,0,ZORK$,1,9487505CA45DBC2643CBE1A8977938AE,1,1352,1,c:\program files (x86)\microsoft\edgeupdate\microsoftedgeupdate.exe,1,c:\program files (x86)\microsoft\edgeupdate\microsoftedgeupdate.exe,512b7d1f22b068a3f68cd145bd59fe3f,F5801470145FE1B446E98E7709311271,1,F5801470145FE1B446E98E7709311271,1,1671807000.0,2022-12-23 14:48:34.525236,2022-12-23 14:48:34.525236,2022-12-23 14:48:34.525236,20,NaT,NaT,,,,,,,,,,,,,20230101
1,E90149BA60904792DB6B7294B8E1D2B9,windows,ZORK,5820,splunk-powershell.exe,1,,0,ZORK$,1,28C79E4933BD4C8C6A5C8E967044803B,1,3768,1,c:\program files\splunkuniversalforwarder\bin\splunk-powershell.exe,1,c:\program files\splunkuniversalforwarder\bin\splunk-powershell.exe,d1a9a47d81b44e6c3d71436fef994101,512BC840EEC1DE55DC801F64718908F6,1,512BC840EEC1DE55DC801F64718908F6,1,1671815000.0,2022-12-23 16:59:27.971467,2022-12-23 16:59:27.971467,2022-12-23 16:59:27.971467,20,NaT,NaT,,,,,,,,,,,,,20230101
2,6CD54DB61F7D74ACDD17FB4F2172FE1F,windows,ZORK,4752,mergehelper.exe,1,,0,ZORK$,1,E34C9B2C6729167B109CB8EFDBCA4214,1,940,1,c:\program files\wintap\mergertool\mergehelper.exe,1,c:\program files\wintap\mergertool\mergehelper.exe,4c52aa4676862dcfea4624cfa510e067,0CB96E1A5550CD283108066E072EB0D9,1,0CB96E1A5550CD283108066E072EB0D9,1,1671817000.0,2022-12-23 17:28:43.409158,2022-12-23 17:28:43.409158,2022-12-23 17:28:43.409158,20,NaT,NaT,,,,,,,,,,,,,20230101
3,EDEA7EE2DF2BC1A519557ADF76BD5531,windows,ZORK,2300,conhost.exe,1,,0,ZORK$,1,4C5B8E911BEC117123B850408F7E9C76,1,4316,1,c:\windows\system32\conhost.exe,1,c:\windows\system32\conhost.exe,651e1bbcd8f0b3679ea7800a529a82db,0D698AF330FD17BEE3BF90011D49251D,1,0D698AF330FD17BEE3BF90011D49251D,1,1671822000.0,2022-12-23 18:55:22.694343,2022-12-23 18:55:22.694343,2022-12-23 18:55:22.694343,19,NaT,NaT,,,,,,,,,,,,,20230101
4,1B66E7C4921DF5254571CB9DC66A57C2,windows,ZORK,7388,splunk-powershell.exe,1,,0,ZORK$,1,28C79E4933BD4C8C6A5C8E967044803B,1,3768,1,c:\program files\splunkuniversalforwarder\bin\splunk-powershell.exe,1,c:\program files\splunkuniversalforwarder\bin\splunk-powershell.exe,d1a9a47d81b44e6c3d71436fef994101,512BC840EEC1DE55DC801F64718908F6,1,512BC840EEC1DE55DC801F64718908F6,1,1671824000.0,2022-12-23 19:40:30.586427,2022-12-23 19:40:30.586427,2022-12-23 19:40:30.586427,19,NaT,NaT,,,,,,,,,,,,,20230101
5,D32B4C4332675FF1C047D7A5609252E7,windows,ZORK,5612,splunk-regmon.exe,1,,0,ZORK$,1,28C79E4933BD4C8C6A5C8E967044803B,1,3768,1,c:\program files\splunkuniversalforwarder\bin\splunk-regmon.exe,1,c:\program files\splunkuniversalforwarder\bin\splunk-regmon.exe,11caacc588975425bd50a42cbc52ac5d,EBF8793E3AB498ED6F99205276A82EE6,1,EBF8793E3AB498ED6F99205276A82EE6,1,1671820000.0,2022-12-23 18:20:30.784559,2022-12-23 18:20:30.784559,2022-12-23 18:20:30.784559,20,NaT,NaT,,,,,,,,,,,,,20230101
6,BD4C07734A8C5F1B4F66D5071FF91A3A,windows,ZORK,7328,splunk-admon.exe,1,,0,ZORK$,1,28C79E4933BD4C8C6A5C8E967044803B,1,3768,1,c:\program files\splunkuniversalforwarder\bin\splunk-admon.exe,1,c:\program files\splunkuniversalforwarder\bin\splunk-admon.exe,1f018e75cd2d74bb76a1e1dc11e20e70,D95FA05ED75F62D4A0EF9B5155AB37DE,1,D95FA05ED75F62D4A0EF9B5155AB37DE,1,1671817000.0,2022-12-23 17:42:26.703151,2022-12-23 17:42:26.703151,2022-12-23 17:42:26.703151,20,NaT,NaT,,,,,,,,,,,,,20230101
7,A620AF4CEC3067BC0E499FB9D79C3937,windows,ZORK,3020,splunk-netmon.exe,1,,0,ZORK$,1,28C79E4933BD4C8C6A5C8E967044803B,1,3768,1,c:\program files\splunkuniversalforwarder\bin\splunk-netmon.exe,1,c:\program files\splunkuniversalforwarder\bin\splunk-netmon.exe,51e8dd11d4cf2962df05310054e0ab90,D4732A279D7D42A1C7BB17308D8B74E8,1,D4732A279D7D42A1C7BB17308D8B74E8,1,1671821000.0,2022-12-23 18:47:26.421133,2022-12-23 18:47:26.421133,2022-12-23 18:47:26.421133,20,NaT,NaT,,,,,,,,,,,,,20230101
8,B0D35CFAE6451AF6A58F492702E6B10F,windows,ZORK,3192,splunk-powershell.exe,1,,0,ZORK$,1,28C79E4933BD4C8C6A5C8E967044803B,1,3768,1,c:\program files\splunkuniversalforwarder\bin\splunk-powershell.exe,1,c:\program files\splunkuniversalforwarder\bin\splunk-powershell.exe,d1a9a47d81b44e6c3d71436fef994101,512BC840EEC1DE55DC801F64718908F6,1,512BC840EEC1DE55DC801F64718908F6,1,1671822000.0,2022-12-23 19:01:31.069274,2022-12-23 19:01:31.069274,2022-12-23 19:01:31.069274,19,NaT,NaT,,,,,,,,,,,,,20230101
9,0067B2446711E3D728668A1FADB9C668,windows,ZORK,4504,splunk-powershell.exe,1,,0,ZORK$,1,28C79E4933BD4C8C6A5C8E967044803B,1,3768,1,c:\program files\splunkuniversalforwarder\bin\splunk-powershell.exe,1,c:\program files\splunkuniversalforwarder\bin\splunk-powershell.exe,d1a9a47d81b44e6c3d71436fef994101,512BC840EEC1DE55DC801F64718908F6,1,512BC840EEC1DE55DC801F64718908F6,1,1671831000.0,2022-12-23 21:32:32.196325,2022-12-23 21:32:32.196325,2022-12-23 21:32:32.196325,19,NaT,NaT,,,,,,,,,,,,,20230101


In [8]:
%%dql -j
-- Use GROUP BY to find the most and least common process_name. Jupyter helps out by displaying the first and last sets of rows.
-- Calculate a counts for some common fields also.
-- To keep it fast for demos, limit to a subset of DayPKs. Try commenting out the WHERE clause to see results over all the data.
-- Note: the cell magic (%%dql) treats the entire cell as SQL, so python (#) comments do not work 
SELECT process_name, count(distinct hostname) num_hostname, count(distinct file_md5) num_file_md5, count(distinct user_name) num_user_name, count(distinct pid_hash), count(*) num_rows
FROM process
WHERE daypk BETWEEN {{MIN_DAYPK}} AND {{MAX_DAYPK}}
GROUP BY ALL
ORDER BY num_rows

Unnamed: 0,process_name,num_hostname,num_file_md5,num_user_name,count(DISTINCT pid_hash),num_rows
0,msife74.tmp.exe,1,1,0,1,1
1,msif1c9.tmp.exe,1,1,0,1,1
2,nsf5a9d.tmp.exe,1,1,0,1,1
3,applemobiledevicehelper.exe,1,1,1,1,1
4,vscodeusersetup-x64-1.77.3.exe,1,1,1,1,1
...,...,...,...,...,...,...
1221,cmd.exe,17,2,32,395511,1114265
1222,ykman.exe,14,1,14,530885,1743218
1223,mergehelper.exe,16,1,5,555304,1847622
1224,splunk-powershell.exe,16,1,9,614606,1960737


In [9]:
# Simple count of processes per day, with result assigned to a panda
# Convert dayPK to a timestamp and altair then displays it nicely.
processes_per_day = %dql select strptime(dayPK,'%Y%m%d') dayPK, count(*) num_rows from process group by all order by daypk
# Chart that using Altair
chart = alt.Chart(processes_per_day).mark_line().encode(
        x='dayPK:T',
        y='num_rows',
        tooltip=['dayPK:T','num_rows']
    ).properties(
        width=1200,
        height=400,
        title='Processes Per Day'
    ).interactive()
display(chart)


In [None]:
%dql select pid_hash, first(process_name), first(daypk) daypk, count(*) from process where process_name = 'putty.exe' group by pid_hash order by daypk

In [10]:
# Display a single process and its network connections
# Adding the daypk filter reduces the search space to just the single day rather than ~180 that are in the set.
proc = %dql -j select * from process where pid_hash='{{SIMPLE.PID_HASH}}' and daypk={{SIMPLE.DAYPK}}
net = %dql -j select * from process_conn_incr where pid_hash='{{SIMPLE.PID_HASH}}' order by first_seen
display(proc)
display(net)

Unnamed: 0,pid_hash,os_family,hostname,os_pid,process_name,num_process_name,args,num_args,user_name,num_user_name,parent_pid_hash,num_parent_pid_hash,parent_os_pid,num_parent_os_pid,process_path,num_process_path,filename,file_id,file_md5,num_file_md5,file_sha2,num_file_sha2,process_started_seconds,process_started,first_seen,last_seen,num_start_events,process_term_seconds,process_term,cpu_cycle_count,cpu_utilization,commit_charge,commit_peak,read_operation_count,write_operation_count,read_transfer_kilobytes,write_transfer_kilobytes,hard_fault_count,token_elevation_type,exit_code,num_process_stop,dayPK
0,C9C6E1C87C692A13CB60AB380E069FD6,windows,WL-9508338,6584,putty.exe,1,,0,THE-LAB\bielejeski1,1,C35E2075ADF5DEA092B18B859E3C7AD5,1,17020,1,c:\users\bielejeski1\downloads\tools\putty.exe,1,c:\users\bielejeski1\downloads\tools\putty.exe,c928f4029c4db9de7b92595e265bcf89,E32F72E15F78347C51C4CA1B2847F667,1,341CB4515476007153B7F17212F5E4476852837A031EFEDD5A4ADEA723C0BCBE,1,1684505000.0,2023-05-19 13:57:34.981389,2023-05-19 13:57:34.981389,2023-05-19 13:57:34.981389,6,NaT,NaT,,,,,,,,,,,,,20230519


Unnamed: 0,os_family,hostname,pid_hash,conn_id,protocol,incr_start,local_ip_addr,local_ip_int,local_port,local_pg,remote_ip_addr,remote_ip_int,remote_port,remote_pg,total_events,total_size,num_raw_rows,tcp_accept_count,tcp_connect_count,tcp_disconnect_count,tcp_reconnect_count,tcp_recv_count,tcp_recv_size,tcp_retransmit_count,tcp_send_count,tcp_send_size,tcp_tcpcopy_count,tcp_tcpcopy_size,udp_recv_count,udp_recv_size,udp_send_count,udp_send_size,min_10sec_eventcount,max_10sec_eventcount,min_size,max_size,sq_size,max_tcp_recv_count,min_tcp_recv_size,max_tcp_recv_size,sq_tcp_recv_size,max_tcp_send_count,min_tcp_send_size,max_tcp_send_size,sq_tcp_send_size,max_udp_recv_count,min_udp_recv_size,max_udp_recv_size,sq_udp_recv_size,max_udp_send_count,min_udp_send_size,max_udp_send_size,sq_udp_send_size,first_seen,last_seen,dayPK
0,windows,WL-9508338,C9C6E1C87C692A13CB60AB380E069FD6,9F444F7845A54502C0F3BC760EBE5566,TCP,2023-05-19 13:57:00,128.15.170.25,2148510233,53213,,128.15.144.82,2148503634,22,,33.0,9446.0,7,,1.0,,,12.0,3629.0,,8.0,2188.0,12.0,3629.0,,,,,1,10,0,1376,6716418.0,10,21,992,2344441.0,7.0,28.0,1376.0,2027536.0,,,,,,,,,2023-05-19 13:57:29.477476,2023-05-19 13:57:39.815935,20230519
1,windows,WL-9508338,C9C6E1C87C692A13CB60AB380E069FD6,9F444F7845A54502C0F3BC760EBE5566,TCP,2023-05-19 14:02:00,128.15.170.25,2148510233,53213,,128.15.144.82,2148503634,22,,3.0,208.0,3,,,,,1.0,80.0,,1.0,48.0,1.0,80.0,,,,,1,1,48,80,15104.0,1,80,80,6400.0,1.0,48.0,48.0,2304.0,,,,,,,,,2023-05-19 14:02:39.150011,2023-05-19 14:02:39.150011,20230519
2,windows,WL-9508338,C9C6E1C87C692A13CB60AB380E069FD6,9F444F7845A54502C0F3BC760EBE5566,TCP,2023-05-19 14:03:00,128.15.170.25,2148510233,53213,,128.15.144.82,2148503634,22,,59.0,9744.0,9,,,,,21.0,4320.0,,17.0,1104.0,21.0,4320.0,,,,,1,16,64,1240,6074880.0,16,64,1240,3001472.0,14.0,64.0,80.0,71936.0,,,,,,,,,2023-05-19 14:03:41.944921,2023-05-19 14:03:55.792858,20230519
3,windows,WL-9508338,C9C6E1C87C692A13CB60AB380E069FD6,9F444F7845A54502C0F3BC760EBE5566,TCP,2023-05-19 14:08:00,128.15.170.25,2148510233,53213,,128.15.144.82,2148503634,22,,84.0,13600.0,3,,,,,30.0,6032.0,,24.0,1536.0,30.0,6032.0,,,,,24,30,64,1240,10027008.0,30,64,1240,4964352.0,24.0,64.0,64.0,98304.0,,,,,,,,,2023-05-19 14:08:52.326608,2023-05-19 14:08:59.337779,20230519
4,windows,WL-9508338,C9C6E1C87C692A13CB60AB380E069FD6,9F444F7845A54502C0F3BC760EBE5566,TCP,2023-05-19 14:13:00,128.15.170.25,2148510233,53213,,128.15.144.82,2148503634,22,,3.0,208.0,3,,,,,1.0,80.0,,1.0,48.0,1.0,80.0,,,,,1,1,48,80,15104.0,1,80,80,6400.0,1.0,48.0,48.0,2304.0,,,,,,,,,2023-05-19 14:13:59.074792,2023-05-19 14:13:59.074792,20230519
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69,windows,WL-9508338,C9C6E1C87C692A13CB60AB380E069FD6,9F444F7845A54502C0F3BC760EBE5566,TCP,2023-05-19 15:46:00,128.15.170.25,2148510233,53213,,128.15.144.82,2148503634,22,,52.0,7488.0,12,,,,,26.0,3744.0,,,,26.0,3744.0,,,,,4,6,128,160,1091584.0,6,128,160,545792.0,,,,,,,,,,,,,2023-05-19 15:46:00.947498,2023-05-19 15:46:58.355163,20230519
70,windows,WL-9508338,C9C6E1C87C692A13CB60AB380E069FD6,9F444F7845A54502C0F3BC760EBE5566,TCP,2023-05-19 15:47:00,128.15.170.25,2148510233,53213,,128.15.144.82,2148503634,22,,60.0,8640.0,12,,,,,30.0,4320.0,,,,30.0,4320.0,,,,,4,7,128,160,1259520.0,7,128,160,629760.0,,,,,,,,,,,,,2023-05-19 15:47:02.367393,2023-05-19 15:48:02.707029,20230519
71,windows,WL-9508338,C9C6E1C87C692A13CB60AB380E069FD6,9F444F7845A54502C0F3BC760EBE5566,TCP,2023-05-19 15:48:00,128.15.170.25,2148510233,53213,,128.15.144.82,2148503634,22,,329.0,71488.0,17,,,,,124.0,32976.0,,81.0,5536.0,124.0,32976.0,,,,,2,56,48,1240,57630720.0,56,48,1240,28611072.0,39.0,64.0,160.0,408576.0,,,,,,,,,2023-05-19 15:48:10.776094,2023-05-19 15:49:00.049747,20230519
72,windows,WL-9508338,C9C6E1C87C692A13CB60AB380E069FD6,9F444F7845A54502C0F3BC760EBE5566,TCP,2023-05-19 15:49:00,128.15.170.25,2148510233,53213,,128.15.144.82,2148503634,22,,490.0,107744.0,18,,,,,171.0,49008.0,,148.0,9728.0,171.0,49008.0,,,,,2,68,48,1240,71786496.0,68,48,1240,35556352.0,55.0,64.0,224.0,673792.0,,,,,,,,,2023-05-19 15:49:06.164325,2023-05-19 15:50:00.450794,20230519


### Extraction

In [15]:
globs=ru.get_glob_paths_for_dataset('/scratch/fusioncuisine/wintapv6/llnl_herd/','rolling')
stmts=ru.generate_view_sql(globs)
for sql in stmts:
    print(sql.strip()+';')


create or replace view all_files as
        select * from parquet_scan('/scratch/fusioncuisine/wintapv6/llnl_herd/rolling/all_files/*/*.parquet',hive_partitioning=1);
create or replace view files as
        select * from parquet_scan('/scratch/fusioncuisine/wintapv6/llnl_herd/rolling/files/*/*.parquet',hive_partitioning=1);
create or replace view host as
        select * from parquet_scan('/scratch/fusioncuisine/wintapv6/llnl_herd/rolling/host/*/*.parquet',hive_partitioning=1);
create or replace view host_ip as
        select * from parquet_scan('/scratch/fusioncuisine/wintapv6/llnl_herd/rolling/host_ip/*/*.parquet',hive_partitioning=1);
create or replace view process as
        select * from parquet_scan('/scratch/fusioncuisine/wintapv6/llnl_herd/rolling/process/*/*.parquet',hive_partitioning=1);
create or replace view process_conn_incr as
        select * from parquet_scan('/scratch/fusioncuisine/wintapv6/llnl_herd/rolling/process_conn_incr/*/*.parquet',hive_partitioning=1);
create o

In [24]:
globs=ru.get_glob_paths_for_dataset('/scratch/fusioncuisine/wintapv6/llnl_herd/','rolling')
con.execute("attach 'rolling_views.db' ")
con.execute('use rolling_views')
ru.create_raw_views(con, globs)
con.execute('use memory')
con.execute('detach rolling_views')

ParserException: Parser Error: syntax error at or near "rolling_views"
LINE 1: attach rolling_views.db
               ^