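# Extend

Prototype of an `extend` helper for polars DataFrames: read a (possibly nested) field addressed by a list of names and append its values to the frame as a new, possibly nested, column.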

In [2]:
import polars as pl

def get_element(data:pl.DataFrame, fields:list[str], index:int=0):
    """Select the field addressed by ``fields`` while keeping its struct nesting.

    ``fields`` is a path of names; each name is looked up among the columns of
    the frame obtained by unnesting the previous one. For a path of more than
    one name, the result is a single column named ``fields[0]`` whose value is
    a struct nested once per traversed level.

    Args:
        data: Frame to look the current path component up in.
        fields: Path of column / struct-field names, outermost first.
        index: Position in ``fields`` handled by this call. Callers should
            leave it at 0; it is advanced by the recursion.

    Returns:
        A ``pl.DataFrame``. It is empty when the current component is not a
        column of ``data``; a miss at a deeper level is re-wrapped by the
        enclosing levels like any other result.

    Raises:
        IndexError: If ``fields`` is empty.
    """
    if index == len(fields):
        # The path ended on a struct that was just unnested: pack its fields
        # back into a struct column named after the last path component.
        return pl.DataFrame({fields[index-1]: data.to_struct()})

    split = fields[index]

    if split not in data:
        return pl.DataFrame({})

    new = data.select(split)

    if len(fields) == 1:
        return new

    if not isinstance(new[split].dtype, pl.Struct):
        # Cannot descend into a non-struct column; remaining components are
        # ignored.
        if index == 0:
            # There is no parent level to wrap into. Using fields[index-1]
            # here would resolve to fields[-1] and label the column with the
            # last path name, so return the selected column unchanged.
            return new
        return pl.DataFrame({fields[index-1]: new.to_struct()})

    rec_data = get_element(new.unnest(split), fields, index + 1)

    if index == 0:
        # The level below already wrapped its result under fields[0].
        return rec_data
    return pl.DataFrame({fields[index-1]: rec_data.to_struct()})

In [3]:
# Ten sample event records, one per line; each carries a timestamp, the
# reporting computer name and an event code stored as a string.
sample = [
    {"@timestamp": "2022-10-20T18:27:17.073Z", "winlog": {"computer_name": "asarea.vxnwua.net"}, "event": {"code": "13"}},
    {"@timestamp": "2022-10-20T18:29:03.665Z", "winlog": {"computer_name": "asarea.vxnwua.net"}, "event": {"code": "7"}},
    {"@timestamp": "2022-10-20T18:29:03.540Z", "winlog": {"computer_name": "asarea.vxnwua.net"}, "event": {"code": "12"}},
    {"@timestamp": "2022-10-20T18:29:03.158Z", "winlog": {"computer_name": "asarea.vxnwua.net"}, "event": {"code": "7"}},
    {"@timestamp": "2022-10-20T18:29:03.581Z", "winlog": {"computer_name": "asarea.vxnwua.net"}, "event": {"code": "7"}},
    {"@timestamp": "2022-10-20T18:29:03.539Z", "winlog": {"computer_name": "asarea.vxnwua.net"}, "event": {"code": "12"}},
    {"@timestamp": "2022-10-20T18:29:03.567Z", "winlog": {"computer_name": "asarea.vxnwua.net"}, "event": {"code": "7"}},
    {"@timestamp": "2022-10-20T18:29:03.361Z", "winlog": {"computer_name": "asarea.vxnwua.net"}, "event": {"code": "12"}},
    {"@timestamp": "2022-10-20T18:29:03.260Z", "winlog": {"computer_name": "asarea.vxnwua.net"}, "event": {"code": "10"}},
    {"@timestamp": "2022-10-20T18:29:03.582Z", "winlog": {"computer_name": "asarea.vxnwua.net"}, "event": {"code": "7"}},
]

# Load the records into a DataFrame; the displayed schema in the cells below
# shows the nested `winlog` and `event` dicts as struct columns.
data = pl.from_dicts(sample)

In [4]:
# Manual version of the extension: pull `code` out of the `event` struct and
# append it to the frame as a top-level `EventCode` column.
df1 = data.select('event').unnest('event')
# Index by name to get an actual Series. `select` returns a one-column
# DataFrame, which would leave the constructor below to coerce a whole frame
# into a column.
series = df1['code']
df2 = pl.DataFrame({'EventCode': series})

pl.concat([data, df2], how='horizontal')

@timestamp,winlog,event,EventCode
str,struct[1],struct[1],str
"""2022-10-20T18:27:17.073Z""","{""asarea.vxnwua.net""}","{""13""}","""13"""
"""2022-10-20T18:29:03.665Z""","{""asarea.vxnwua.net""}","{""7""}","""7"""
"""2022-10-20T18:29:03.540Z""","{""asarea.vxnwua.net""}","{""12""}","""12"""
"""2022-10-20T18:29:03.158Z""","{""asarea.vxnwua.net""}","{""7""}","""7"""
"""2022-10-20T18:29:03.581Z""","{""asarea.vxnwua.net""}","{""7""}","""7"""
"""2022-10-20T18:29:03.539Z""","{""asarea.vxnwua.net""}","{""12""}","""12"""
"""2022-10-20T18:29:03.567Z""","{""asarea.vxnwua.net""}","{""7""}","""7"""
"""2022-10-20T18:29:03.361Z""","{""asarea.vxnwua.net""}","{""12""}","""12"""
"""2022-10-20T18:29:03.260Z""","{""asarea.vxnwua.net""}","{""10""}","""10"""
"""2022-10-20T18:29:03.582Z""","{""asarea.vxnwua.net""}","{""7""}","""7"""


In [5]:
def get_element_series(data:pl.DataFrame, fields:list[str], index:int=0):
    """Return the column addressed by the path ``fields`` as a Series.

    Each name in ``fields`` is looked up among the columns of the frame
    obtained by unnesting the previous one, so ``['event', 'code']`` reads the
    ``code`` field of the ``event`` struct column.

    Args:
        data: Frame to look the current path component up in.
        fields: Path of column / struct-field names, outermost first.
        index: Position in ``fields`` handled by this call. Callers should
            leave it at 0; it is advanced by the recursion.

    Returns:
        The selected column as a Series, named after the component where the
        walk stopped.

    Raises:
        Exception: If a path component is not a column at its level.
        IndexError: If ``fields`` is empty.
    """
    split = fields[index]

    if split not in data:
        raise Exception(f"Invalid field referenced {'.'.join(fields)}")

    new = data.select(split)

    # Stop on the final path component whatever its dtype. Checking
    # `len(fields) == 1` only covered one-element paths: a longer path ending
    # on a struct column was unnested again and the recursion indexed past the
    # end of `fields`.
    if index == len(fields) - 1:
        return new.to_series()

    if not isinstance(new[split].dtype, pl.Struct):
        # NOTE(review): a non-struct column reached before the path is
        # exhausted is returned as-is and the remaining components are
        # ignored; confirm whether this should raise instead.
        return new.to_series()

    return get_element_series(new.unnest(split), fields, index + 1)

# Demo: read the nested `event.code` field; the result is displayed below.
get_element_series(data, ['event', 'code'])

code
str
"""13"""
"""7"""
"""12"""
"""7"""
"""7"""
"""12"""
"""7"""
"""12"""
"""10"""
"""7"""


In [6]:
def build_element(name:list[str], data):
    """Wrap ``data`` in a one-column DataFrame nested along the path ``name``.

    The last entry of ``name`` labels ``data`` itself; every preceding entry
    adds one enclosing struct level, so ``['test', 'here']`` yields a column
    ``test`` whose struct has a single field ``here``.

    Args:
        name: Path of field names, outermost first. Must not be empty.
        data: Column values; passed to ``pl.DataFrame`` as a dict value.

    Returns:
        A ``pl.DataFrame`` with one column named ``name[0]``.

    Raises:
        ValueError: If ``name`` is empty.
    """
    if not name:
        # An empty path previously recursed without end, because slicing an
        # empty list with [1:] yields another empty list.
        raise ValueError("name must contain at least one field name")

    # Build from the innermost field outwards, adding one struct per level.
    frame = pl.DataFrame({name[-1]: data})
    for parent in reversed(name[:-1]):
        frame = pl.DataFrame({parent: frame.to_struct()})
    return frame

In [7]:
# Demo: read `event.code` (note that `df1` holds a Series here) and re-nest it
# under the path test.here; `to_dicts()` shows the resulting structure.
df1 = get_element_series(data, ['event', 'code'])
build_element(['test', 'here'], df1).to_dicts()

[{'test': {'here': '13'}},
 {'test': {'here': '7'}},
 {'test': {'here': '12'}},
 {'test': {'here': '7'}},
 {'test': {'here': '7'}},
 {'test': {'here': '12'}},
 {'test': {'here': '7'}},
 {'test': {'here': '12'}},
 {'test': {'here': '10'}},
 {'test': {'here': '7'}}]

In [8]:
def extend(data:pl.DataFrame, src:list[str], dest:list[str]):
    """Append the field at path ``src`` to ``data`` as a new column at ``dest``.

    Args:
        data: Frame to read from and extend.
        src: Path of names locating the source field (see
            ``get_element_series``).
        dest: Path of names for the new column (see ``build_element``).

    Returns:
        A new frame: ``data`` with the built column concatenated horizontally.
    """
    extracted = get_element_series(data, src)
    # Hook point: the extracted values can be transformed here before they are
    # written to the destination.
    appended = build_element(dest, extracted)
    combined = pl.concat([data, appended], how='horizontal')
    return combined

In [9]:
# Demo: copy the nested `event.code` field into a top-level `EventCode` column.
extend(data, ['event', 'code'], ['EventCode'])

@timestamp,winlog,event,EventCode
str,struct[1],struct[1],str
"""2022-10-20T18:27:17.073Z""","{""asarea.vxnwua.net""}","{""13""}","""13"""
"""2022-10-20T18:29:03.665Z""","{""asarea.vxnwua.net""}","{""7""}","""7"""
"""2022-10-20T18:29:03.540Z""","{""asarea.vxnwua.net""}","{""12""}","""12"""
"""2022-10-20T18:29:03.158Z""","{""asarea.vxnwua.net""}","{""7""}","""7"""
"""2022-10-20T18:29:03.581Z""","{""asarea.vxnwua.net""}","{""7""}","""7"""
"""2022-10-20T18:29:03.539Z""","{""asarea.vxnwua.net""}","{""12""}","""12"""
"""2022-10-20T18:29:03.567Z""","{""asarea.vxnwua.net""}","{""7""}","""7"""
"""2022-10-20T18:29:03.361Z""","{""asarea.vxnwua.net""}","{""12""}","""12"""
"""2022-10-20T18:29:03.260Z""","{""asarea.vxnwua.net""}","{""10""}","""10"""
"""2022-10-20T18:29:03.582Z""","{""asarea.vxnwua.net""}","{""7""}","""7"""


In [11]:
# Demo of transforming the extracted values: the event codes are strings in
# the sample data, so cast them to 32-bit integers.
series = get_element_series(data, ['event', 'code'])
series.cast(pl.Int32)


code
i32
13
7
12
7
7
12
7
12
10
7
