# Rename database columns

I found it easier to explain the different columns we had in the process_uber_summary table once I grouped those columns by type.
This notebook creates a view, `standardized_process_view`, that prepends a "type" to each column name (e.g., `Read_Bytes` becomes `file_read_bytes`, and `tcp_accept_count` becomes `network_tcp_accept_count`). The underlying table itself is not modified.

It also loads that view into a pandas DataFrame and reorders the columns so that related columns sit together and the label columns come last. The label columns are what people building supervised models will look for first.

In [None]:
import pandas as pd

In [None]:
%load_ext autoreload
%load_ext dotenv
%load_ext sql
%autoreload 1
%aimport acme4_explore

In [None]:
# Open the project database connection and hand it to the %sql magic under the
# alias "duckdb". NOTE(review): presumably a DuckDB connection, given the alias
# and the db.sql(...) call used later -- confirm in acme4_explore.connect_db.
db = acme4_explore.connect_db()
%sql db --alias duckdb
# Do not echo the connection string after each query, and have %sql return
# pandas DataFrames instead of its own result-set type.
%config SqlMagic.displaycon=False
%config SqlMagic.autopandas=True


In [None]:
# DDL for standardized_process_view: a view over process_uber_summary that
# exposes every selected column under a name prefixed by its type
# (process_*, file_*, network_*, sigma_*, ...).
#   * Columns listed under "columns_not_renamed" are aliased to themselves so
#     that the complete output schema is spelled out in one place.
#   * The remaining sections map a source column (left) to its new name (right).
# CREATE OR REPLACE means re-running this cell redefines the view instead of
# failing because it already exists.
sql='''
CREATE OR REPLACE VIEW standardized_process_view AS
SELECT

  -- columns_not_renamed
  pid_hash AS pid_hash,
  os_pid AS os_pid,
  os_family AS os_family,
  os AS os,
  os_version AS os_version,
  agent_id AS agent_id,
  num_agent_id AS num_agent_id,
  hostname AS hostname,
  process_name AS process_name,
  num_process_name AS num_process_name,
  args AS args,
  num_args AS num_args,
  user_name AS user_name,
  num_user_name AS num_user_name,
  parent_pid_hash AS parent_pid_hash,
  num_parent_pid_hash AS num_parent_pid_hash,
  parent_os_pid AS parent_os_pid,
  num_parent_os_pid AS num_parent_os_pid,
  process_path AS process_path,
  num_process_path AS num_process_path,
  process_stop_seconds AS process_stop_seconds,
  reg_totals AS reg_totals,
  reg_reads AS reg_reads,
  reg_writes AS reg_writes,
  reg_createkeys AS reg_createkeys,
  reg_deletekeys AS reg_deletekeys,
  reg_deletevalues AS reg_deletevalues,
  reg_first_seen AS reg_first_seen,
  reg_last_seen AS reg_last_seen,
  file_num_raw_rows AS file_num_raw_rows,
  file_first_seen AS file_first_seen,
  file_last_seen AS file_last_seen,
  dlls AS dlls,
  dll_num_uniq_files AS dll_num_uniq_files,
  dll_first_seen AS dll_first_seen,
  dll_last_seen AS dll_last_seen,
  lolbas_privs AS lolbas_privs,
  lolbas_cats AS lolbas_cats,
  lolbas_mitre AS lolbas_mitre,
  lolbas_num_rows AS lolbas_num_rows,
  mitre_analytic_ids AS mitre_analytic_ids,
  mitre_information_domains AS mitre_information_domains,
  mitre_subtypes AS mitre_subtypes,
  mitre_analytic_types AS mitre_analytic_types,
  mitre_num_rows AS mitre_num_rows,
  label_source AS label_source,
  label_num_sources AS label_num_sources,
  label_num_uniq_annotations AS label_num_uniq_annotations,
  label_num_hits AS label_num_hits,

  -- process_*
  filename                 AS process_filename,
  file_id                  AS process_file_id,
  file_md5                 AS process_file_md5,
  num_file_md5             AS process_num_file_md5,
  file_sha2                AS process_file_sha2,
  num_file_sha2            AS process_num_file_sha2,
  first_seen               AS process_first_seen,
  last_seen                AS process_last_seen,
  process_started_seconds  AS process_start_seconds,
  process_started          AS process_start,
  process_term             AS process_stop,
  read_operation_count     AS process_read_operation_count,
  write_operation_count    AS process_write_operation_count,
  read_transfer_kilobytes  AS process_read_transfer_kilobytes,
  write_transfer_kilobytes AS process_write_transfer_kilobytes,
  cpu_cycle_count          AS process_cpu_cycle_count,
  cpu_utilization          AS process_cpu_utilization,
  commit_charge            AS process_commit_charge,
  commit_peak              AS process_commit_peak,
  hard_fault_count         AS process_hard_fault_count,
  token_elevation_type     AS process_token_elevation_type,
  exit_code                AS process_exit_code,
  num_process_start        AS process_num_process_start,
  num_process_stop         AS process_num_process_stop,
  duration_seconds         AS process_duration_seconds,

  -- file_*
  Read_Bytes               AS file_read_bytes,
  Read_Events              AS file_read_events,
  Write_Bytes              AS file_write_bytes,
  Write_Events             AS file_write_events,
  Close_Events             AS file_close_events,
  Create_Events            AS file_create_events,
  Delete_Events            AS file_delete_events,
  Rename_Events            AS file_rename_events,
  SetInfo_Events           AS file_setinfo_events,

  num_uniq_file_hash       AS file_num_uniq_hashes,
  num_null_filename        AS file_num_null_filenames,

  -- network_*
  conn_id_count            AS network_conn_id_count,
  min_bytes                AS network_min_bytes,
  max_bytes                AS network_max_bytes,
  avg_bytes                AS network_avg_bytes,
  min_packets              AS network_min_packets,
  max_packets              AS network_max_packets,
  avg_packets              AS network_avg_packets,
  sq_size                  AS network_sq_size,
  tcp_accept_count         AS network_tcp_accept_count,
  tcp_connect_count        AS network_tcp_connect_count,
  tcp_disconnect_count     AS network_tcp_disconnect_count,
  tcp_reconnect_count      AS network_tcp_reconnect_count,
  tcp_recv_count           AS network_tcp_recv_count,
  tcp_recv_size            AS network_tcp_recv_size,
  tcp_retransmit_count     AS network_tcp_retransmit_count,
  tcp_send_count           AS network_tcp_send_count,
  tcp_send_size            AS network_tcp_send_size,
  tcp_tcpcopy_count        AS network_tcp_tcpcopy_count,
  tcp_tcpcopy_size         AS network_tcp_tcpcopy_size,
  udp_recv_count           AS network_udp_recv_count,
  udp_recv_size            AS network_udp_recv_size,
  udp_send_count           AS network_udp_send_count,
  udp_send_size            AS network_udp_send_size,
  tcp_rs_total             AS network_tcp_rs_total,
  udp_rs_total             AS network_udp_rs_total,
  udp_send_vs_recv         AS network_udp_send_vs_recv,
  tcp_send_vs_recv         AS network_tcp_send_vs_recv,

  -- sigma_*
  high_num_sigma_hits      AS sigma_hits_num_high,
  high_num_sigma_rows      AS sigma_rows_num_high,
  low_num_sigma_hits       AS sigma_hits_num_low,
  low_num_sigma_rows       AS sigma_rows_num_low,
  medium_num_sigma_hits    AS sigma_hits_num_medium,
  medium_num_sigma_rows    AS sigma_rows_num_medium,
  critical_num_sigma_hits  AS sigma_hits_num_critical,
  total_sigma_hits         AS sigma_hits_total,

  -- misc
  arch                     AS architecture,

  -- network summary block
  net_total_events         AS network_total_events,
  net_total_size           AS network_total_size,
  net_num_raw_rows         AS network_num_raw_rows,
  net_recv_size            AS network_recv_size,
  net_send_size            AS network_send_size,
  net_rs_total             AS network_rs_total,
  net_send_vs_recv         AS network_send_vs_recv,
  net_first_seen           AS network_first_seen,
  net_last_seen            AS network_last_seen
FROM process_uber_summary;
'''

# Run the DDL on the `db` connection object created earlier in the notebook
# (not through the %sql magic).
db.sql(sql)


In [None]:
%%sql df <<
-- Pull every column of the standardized view; the result is stored in `df`.
SELECT *
FROM standardized_process_view

In [None]:
# Desired column layout, written as ordered groups of related columns.
# Dict insertion order is preserved, so the group order here is the final
# column order; the label columns are deliberately the last group.
_column_groups = {
    "host_and_identity": [
        "pid_hash", "os_pid", "os_family", "os", "os_version", "architecture",
        "agent_id", "num_agent_id", "hostname",
        "process_name", "num_process_name",
        "args", "num_args",
        "user_name", "num_user_name",
        "parent_pid_hash", "num_parent_pid_hash",
        "parent_os_pid", "num_parent_os_pid",
        "process_path", "num_process_path",
    ],
    "process": [
        "process_filename", "process_file_id",
        "process_file_md5", "process_num_file_md5",
        "process_file_sha2", "process_num_file_sha2",
        "process_first_seen", "process_last_seen",
        "process_start_seconds", "process_start", "process_num_process_start",
        "process_stop_seconds", "process_stop", "process_num_process_stop",
        "process_duration_seconds",
        "process_cpu_cycle_count", "process_cpu_utilization",
        "process_commit_charge", "process_commit_peak",
        "process_read_operation_count", "process_write_operation_count",
        "process_read_transfer_kilobytes", "process_write_transfer_kilobytes",
        "process_hard_fault_count", "process_token_elevation_type",
        "process_exit_code",
    ],
    "registry": [
        "reg_totals", "reg_reads", "reg_writes",
        "reg_createkeys", "reg_deletekeys", "reg_deletevalues",
        "reg_first_seen", "reg_last_seen",
    ],
    "file": [
        "file_close_events", "file_create_events", "file_delete_events",
        "file_rename_events", "file_setinfo_events",
        "file_read_bytes", "file_read_events",
        "file_write_bytes", "file_write_events",
        "file_num_raw_rows", "file_num_uniq_hashes", "file_num_null_filenames",
        "file_first_seen", "file_last_seen",
    ],
    "network": [
        "network_conn_id_count", "network_total_events",
        "network_total_size", "network_num_raw_rows",
        "network_tcp_accept_count", "network_tcp_connect_count",
        "network_tcp_disconnect_count", "network_tcp_reconnect_count",
        "network_tcp_recv_count", "network_tcp_recv_size",
        "network_tcp_retransmit_count",
        "network_tcp_send_count", "network_tcp_send_size",
        "network_tcp_tcpcopy_count", "network_tcp_tcpcopy_size",
        "network_udp_recv_count", "network_udp_recv_size",
        "network_udp_send_count", "network_udp_send_size",
        "network_recv_size", "network_send_size",
        "network_rs_total", "network_send_vs_recv",
        "network_tcp_rs_total", "network_tcp_send_vs_recv",
        "network_udp_rs_total", "network_udp_send_vs_recv",
        "network_min_bytes", "network_max_bytes", "network_avg_bytes",
        "network_min_packets", "network_max_packets", "network_avg_packets",
        "network_sq_size",
        "network_first_seen", "network_last_seen",
    ],
    "dll": [
        "dlls", "dll_num_uniq_files", "dll_first_seen", "dll_last_seen",
    ],
    "sigma": [
        "sigma_hits_num_high", "sigma_rows_num_high",
        "sigma_hits_num_low", "sigma_rows_num_low",
        "sigma_hits_num_medium", "sigma_rows_num_medium",
        "sigma_hits_num_critical",
        "sigma_hits_total",
    ],
    "lolbas": [
        "lolbas_privs", "lolbas_cats", "lolbas_mitre", "lolbas_num_rows",
    ],
    "mitre": [
        "mitre_analytic_ids", "mitre_information_domains",
        "mitre_subtypes", "mitre_analytic_types", "mitre_num_rows",
    ],
    "label": [
        "label_source", "label_num_sources",
        "label_num_uniq_annotations", "label_num_hits",
    ],
}

# Flatten the groups into the single ordered list used to reindex the frame.
new_column_order = [
    column_name
    for group_columns in _column_groups.values()
    for column_name in group_columns
]

In [None]:
# Reorder the columns into the grouped layout. Selecting with a list raises
# KeyError if any listed column is missing from `df`, and silently drops any
# column of `df` that is not listed.
# Take an explicit copy: later cells assign into df_new, and writing into a
# column subset of `df` triggers SettingWithCopyWarning on pandas versions
# without copy-on-write.
df_new = df[new_column_order].copy()

In [None]:
# Replace missing values so the labelling step can do arithmetic on
# label_num_hits and string matching on user_name: a missing hit count
# becomes 0 and a missing user name becomes the empty string.
for column, filler in (("label_num_hits", 0), ("user_name", "")):
    df_new[column] = df[column].fillna(filler)

In [None]:
# Label every process whose user name contains "baduser".
# The mask is computed once; regex=False matches the literal substring and
# na=False treats a missing user name as "no match" instead of producing an
# NA-containing mask that .loc cannot index with.
is_baduser = df_new["user_name"].str.contains("baduser", regex=False, na=False)

# Bump both counters. A NaN counter (row with no earlier label) is treated as 0
# first; otherwise NaN + 1 stays NaN and the new label would not be counted.
for counter in ("label_num_hits", "label_num_sources"):
    df_new.loc[is_baduser, counter] = df_new.loc[is_baduser, counter].fillna(0) + 1

# NOTE(review): this overwrites any label_source already present on these rows
# -- confirm that replacing (rather than appending to) it is intended.
df_new.loc[is_baduser, "label_source"] = "baduser"
# NOTE: not idempotent -- re-running this cell increments the counters again.

In [None]:
# Sanity check: show how many rows have each label_num_hits value.
df_new["label_num_hits"].value_counts()

In [None]:
# Uncomment to write the reordered, relabeled frame to a Parquet file.
# df_new.to_parquet("process_uber_summary_new.parquet")