In [110]:
import os
os.listdir("../lyft_stock")

['investing.txt',
 'marketwatch.txt',
 'marketwatch_parse.txt',
 'motleyfool.txt',
 'schaeffers.txt']

In [136]:
with open('../lyft_stock/marketwatch_parse.txt') as f:
    lines = [l.strip() for l in f.readlines() if len(l.strip()) > 0]
len(lines)

98

In [137]:
assert "Entities:" in lines, "Error: did not find 'Entities:' header"
assert "Interactions:" in lines, "Error: did not find 'Interactions:' header"
idx = lines.index("Interactions:")
entity_lines = lines[1:idx]
interaction_lines = lines[idx + 1:]
# print(interaction_lines)

In [138]:
def parse_entities(entity_lines):
    ent_map = dict()
    for line in entity_lines:
        if '[' in line:
            ent, contents = line.split('[')
            contents = [c.strip() for c in contents[:-1].split(',')]
        else:
            ent, contents = line, []
        assert ent.strip() not in ent_map.keys(), f"Error: entity {ent} already exists in entity map"
        ent_map[ent.strip()] = list(set(contents))
    return ent_map
entities = parse_entities(entity_lines)
print(entities)

{'pandemic,': ['pandemic', 'COVID-19 pandemic'], 'uber,': ['company', 'Uber', 'companies', 'Uber Technologies inc.'], 'lyft,': ['Lyft', 'Lyft Int.', 'rival Lyft', 'companies'], 'consistent lane,': [''], 'earnings preview,': ['fourth quarter results'], 'ride-hailing,': ['ride-hailing environment', 'ride-hailing', 'ride-hailing industry'], 'fourth quarter,': ['fourth quarter'], 'omicron surge,': ['COVID-19 omicron surge'], 'two years': [], 'stability': [], 'Analyst data': [], 'rides': [], 'travel': [], 'US ride-hailing volume': [], 'YipitData analysis': [], 'Raymond James & Associates': ['', 'Raymond James analysts'], "Uber's fourth quarter bookings": [], "Lyft's daily active users": [], 'driver supply': [], 'New Year': [], 'incentives': ['incentives'], 'status of food delivery': [], "Uber's Eats division": ['business', 'Eats'], 'analysts': ['analysts'], 'FactSet': ['FactSet'], 'Estimize': ['Estimize'], 'hedge-fund managers': [], 'executives': [], 'Uber stock': ['Uber shares'], 'Lyft sto

In [141]:
def split_attributes(chunk):
    # chunk = chunk.strip()
    if "(" not in chunk:
        return chunk, None
    # print(chunk)
    ent, attrs = chunk.split('(')
    ent = ent.strip()
    attrs = [attr.strip() for attr in attrs[:-1].split(',')]
    return ent.strip(), attrs

def recombine_chunks(ent_list):
    # For {} chunks, recombine them
    open_idxs = []
    close_idxs = []
    for i, ent in enumerate(ent_list):
        if '{' in ent:
            open_idxs.append(i)
        if '}' in ent:
            close_idxs.append(i)
            
    assert len(open_idxs) == len(close_idxs), "Mismatch brackets in line: " + str(ent_list)
    for i, open_idx in enumerate(open_idxs):
        # If open and close are in same chunk, no need to merge
        if open_idx == close_idxs[i]:
            continue
        ent_list = ent_list[:open_idx] + ["".join(ent_list[open_idx:close_idxs[i] + 1])] + ent_list[close_idxs[i] + 1:]
        # print("list:", [ent.strip() for ent in ent_list])
    return [ent.strip() for ent in ent_list]

def parse_SVO(line):
    edges = []
    # Get subj ents
    # TODO: Handle case: {status of food delivery [changing]} [affects] Uber
    subj_ents, rest = [l.strip() for l in line.split('[', maxsplit=1)]
    subj_ents = recombine_chunks(subj_ents.split(',')) 
    subj_ents = filter(None, subj_ents)
    

    # print("subjs:", subj_ents)
                
    # Get action and obj_ents
    action, obj_ents = rest.split(']', maxsplit=1)
    obj_ents = recombine_chunks(obj_ents.split(','))
    obj_ents = filter(None, obj_ents)
    # print("objs:", obj_ents)
    
    # Get attributes
    # print(subj_ents, action, obj_ents)
    # for ent in subj_ents:
        # if 
        # split_attributes(ent)
        # subj_ents =[ for ent in subj_ents]
    action, action_attrs = split_attributes(action)
    # obj_ents = [split_attributes(ent) for ent in obj_ents]
    
    # Add edges
    for ent in subj_ents:
        if ent[0] == '{':
            # print("Recursing on ", ent.strip()[1:-1])
            new_edges, new_action = parse_SVO(ent.strip()[1:-1])
            edges.extend(new_edges)
            edges.append((new_action, action))
        else:
            ent, attrs = split_attributes(ent)
            # Add any attributes
            if attrs:
                for attr in attrs:
                    edges.append((attr, ent))
            # Add entity-action
            edges.append((ent, action))
    
    for ent in obj_ents:
        if ent[0] == '{':
            # print("Recursing on ", ent.strip()[1:-1])
            new_edges, new_action = parse_SVO(ent.strip()[1:-1])
            edges.extend(new_edges)
            edges.append((action, new_action))
        else:
            ent, attrs = split_attributes(ent)
            # Add any attributes
            if attrs:
                for attr in attrs:
                    edges.append((attr, ent))
            # Add entity-action
            edges.append((action, ent))
    
    if action_attrs:
        for attr in action_attrs:
            edges.append((attr, action))
    return edges, action

def parse_interactions(interaction_lines):
    edges = []
    for line in interaction_lines[:]:
        # line = interaction_lines[0]
        # print("Processing:", line)
        new_edges, _ = parse_SVO(line)
        edges.extend(new_edges)
    return edges

edges = parse_interactions(interaction_lines)
print(edges)

[('Uber', 'looking'), ('Lyft', 'looking'), ('consistent', 'lane'), ('looking', 'lane'), ('two years into the pandemic', 'looking'), ('omicron surge', 'knock'), ('Ride-hailing', 'bounced back'), ('thought to have in fourth quarter', 'bounced back'), ('knock', 'bounced back'), ('off course', 'knock'), ('could', 'knock'), ('Ride-hailing', 'seeking'), ('seeking', 'stability'), ('still', 'seeking'), ('nearly two years into pandemic', 'seeking'), ('Uber', 'release'), ('Lyft', 'release'), ('release', 'earnings preview'), ('scheduled', 'release'), ('next week', 'release'), ('earnings preview', 'show how'), ('companies', 'dealing with'), ('continuously changing', 'ride-hailing environment'), ('dealing with', 'ride-hailing environment'), ('show how', 'dealing with'), ('Lyft', 'report'), ('on Tuesday afternoon', 'report'), ('Uber', 'reports'), ('on Wednesday', 'reports'), ('Analyst data', 'showed'), ('continue', 'recovery'), ('showed', 'recovery'), ('showed', 'in rides & travel'), ('showed', 'end