In [11]:
import os
print("Files in Folder:")
print("\t"+"\n\t".join(os.listdir("../lyft_stock")))

Files in Folder:
	investing.txt
	marketwatch.txt
	marketwatch_parse.txt
	motleyfool.txt
	motleyfool_parse.txt
	schaeffers.txt
	schaeffers_parse.txt


In [12]:
# Open a file
DOC_NAME = "marketwatch"

with open(f'../lyft_stock/{DOC_NAME}_parse.txt') as f:
    lines = [l.strip() for l in f.readlines() if len(l.strip()) > 0]
print(f"Read {len(lines)} lines from {DOC_NAME} file")

Read 98 lines from marketwatch file


In [13]:
# Segment documents by entities and interactions
assert "Entities:" in lines, "Error: did not find 'Entities:' header"
assert "Interactions:" in lines, "Error: did not find 'Interactions:' header"
idx = lines.index("Interactions:")
entity_lines = lines[1:idx]
interaction_lines = lines[idx + 1:]
# print(interaction_lines)

In [29]:
# Parse entities
def parse_entities(entity_lines):
    ent_map = dict()
    for line in entity_lines:
        if '[' in line:
            ent, contents = line.split('[')
            contents = [c.strip() for c in contents[:-1].split(',')]
        else:
            ent, contents = line, []
        assert ent.strip() not in ent_map.keys(), f"Error: entity {ent} already exists in entity map"
        ent_map[ent.strip()] = list(set(contents))
    return ent_map
entities = parse_entities(entity_lines)
print(entities)

{'pandemic,': ['COVID-19 pandemic', 'pandemic'], 'uber,': ['Uber Technologies inc.', 'Uber', 'company', 'companies'], 'lyft,': ['rival Lyft', 'companies', 'Lyft Int.', 'Lyft'], 'consistent lane,': [''], 'earnings preview,': ['fourth quarter results'], 'ride-hailing,': ['ride-hailing industry', 'ride-hailing', 'ride-hailing environment'], 'fourth quarter,': ['fourth quarter'], 'omicron surge,': ['COVID-19 omicron surge'], 'two years': [], 'stability': [], 'Analyst data': [], 'rides': [], 'travel': [], 'US ride-hailing volume': [], 'YipitData analysis': [], 'Raymond James & Associates': ['', 'Raymond James analysts'], "Uber's fourth quarter bookings": [], "Lyft's daily active users": [], 'driver supply': [], 'New Year': [], 'incentives': ['incentives'], 'status of food delivery': [], "Uber's Eats division": ['Eats', 'business'], 'analysts': ['analysts'], 'FactSet': ['FactSet'], 'Estimize': ['Estimize'], 'hedge-fund managers': [], 'executives': [], 'Uber stock': ['Uber shares'], 'Lyft sto

In [30]:
# PARSING HELPERS

def split_attributes(chunk):
    # If no attributes
    if "(" not in chunk:
        return chunk, None
    
    # Otherwise get attributes
    ent, attrs = chunk.split('(')
    ent = ent.strip()
    attrs = [attr.strip() for attr in attrs[:-1].split(',')]
    return ent.strip(), attrs

def split_chunks(ent_string):
    # Typically, chunks are split on commas. However, nested actions need to be regarded as a chunk so they must be recombined
    # E.g. Stef, {Will, Neil [eat]} -> ["Stef", "{Will, Neil [eat]}""] and not ["Stef", "{Will", "Neil [eat]}"]
    
    # Split on commas    
    ent_list = ent_string.split(',')
    
    # For {} chunks, recombine them
    #TODO check for correct ordering and to handle nesting
    open_idxs = []
    close_idxs = []
    for i, ent in enumerate(ent_list):
        if '{' in ent:
            open_idxs.append(i)
        if '}' in ent:
            close_idxs.append(i)

    # Make sure # of open and close brackets are equal 
    assert len(open_idxs) == len(close_idxs), "Mismatch brackets in line: " + str(ent_list)
    for i, open_idx in enumerate(open_idxs):
        # If open and close are in same chunk, no need to merge
        if open_idx == close_idxs[i]:
            continue
        ent_list = ent_list[:open_idx] + ["".join(ent_list[open_idx:close_idxs[i] + 1])] + ent_list[close_idxs[i] + 1:]
    return [ent.strip() for ent in ent_list]

# Parse a line in the Subject-Verb-Object format "ent1, ent2,... [action] ent3, ent4...""
def parse_SVO(nodes, edges, line):
    # Get subj ents
    # TODO: Handle case: {status of food delivery [changing]} [affects] Uber
    subj_ents, rest = [l.strip() for l in line.split('[', maxsplit=1)]
    subj_ents = split_chunks(subj_ents) 
    subj_ents = filter(None, subj_ents) # Drop empty entries
    
    # Get action and obj_ents
    action, obj_ents = rest.split(']', maxsplit=1)
    obj_ents = split_chunks(obj_ents)
    obj_ents = filter(None, obj_ents) # Drop empty entries
    
    # Get attributes
    action, action_attrs = split_attributes(action)
    # Action is always appended first
    action_idx = len(nodes)
    nodes.append((action, 'interaction'))
    
    count_idx = len(nodes)
    # Add edges
    for ent in subj_ents:
        if ent[0] == '{':
            # Action is always appended first
            nodes, edges = parse_SVO(nodes, edges, ent.strip()[1:-1])
            
            edges.append((count_idx, action_idx))
            count_idx = len(nodes)
        else:
            ent, attrs = split_attributes(ent)
            ent_idx = count_idx
            nodes.append((ent, 'entity'))
            count_idx += 1
            # Add any attributes
            if attrs:
                for attr in attrs:
                    nodes.append((attr, 'attribute'))
                    count_idx += 1
                    edges.append((count_idx, ent_idx))
            
            # Add entity-action
            edges.append((ent_idx, action_idx))
    
    for ent in obj_ents:
        if ent[0] == '{':
            # Action is always appended first
            nodes, edges = parse_SVO(nodes, edges, ent.strip()[1:-1])
            
            edges.append((action_idx, count_idx))
            count_idx = len(nodes)
        else:
            ent, attrs = split_attributes(ent)
            ent_idx = count_idx
            nodes.append((ent, 'entity'))
            count_idx += 1
            # Add any attributes
            if attrs:
                for attr in attrs:
                    nodes.append((attr, 'attribute'))
                    count_idx += 1
                    edges.append((count_idx, ent_idx))
            # Add entity-action
            edges.append((action_idx, ent_idx))
            
    count = len(nodes)
    if action_attrs:
        for attr in action_attrs:
            nodes.append((attr, 'attribute'))
            edges.append((count, action_idx))
            count += 1
    return nodes, edges

In [31]:
# Parse all interactions
def parse_interactions(interaction_lines):
    nodes, edges = [], []
    for line in interaction_lines[:]:
        # line = interaction_lines[0]
        # print("Processing:", line)
        nodes, edges = parse_SVO(nodes, edges, line)
    return nodes, edges

nodes, edges = parse_interactions(interaction_lines)

In [32]:
from models import *

PARENT_DOC = "lyft-" + DOC_NAME

node_objs = []

for i, (name, type_) in enumerate(nodes):
    if type_ == 'entity':
        node_objs.append(Entity(str(i), name, PARENT_DOC))
    elif type_ == 'interaction':
        node_objs.append(Interaction(str(i), name, PARENT_DOC))
    elif type_ == 'attribute':
        node_objs.append(Attribute(str(i), name,  PARENT_DOC))
    else:
        print("Error, unrecognized type:", type_)

assert len(node_objs) == len(nodes), "Error: mismatched lengths of nodes and nodes objects"

# nodes = dict()
# for src, dest in edges:
#     for node in [src, dest]:
#         key, type_ = node
#         if type_ == 'entity':
#             if (key, type_) in nodes.keys():
#                 nodes[key, type_].raw_count += 1
#             else:
#                 nodes[key, type_] = Entity(key, PARENT_DOC)
#         elif type_ == 'interaction':
#             if (key, type_) in nodes.keys():
#                 nodes[key, type_].raw_count += 1
#             else:
#                 nodes[key, type_] = Interaction(key, PARENT_DOC)
#         elif type_ == 'attribute':
#             if (key, type_) in nodes.keys():
#                 nodes[key, type_].raw_count += 1
#             else:
#                 nodes[key, type_] = Attribute(key, PARENT_DOC)
#         else:
#             print("Error, unrecognized type:", type_)

AssertionError: Error: key must be a string

In [28]:
# Upload to DB
from graph_driver import GraphDBDriver
from models import *

driver = GraphDBDriver(remote=False)
print("Uploading nodes")
driver.upload_nodes(node_objs)
# driver.upload_nodes([node for node in nodes.values()])

print("Adding Edges")
edge_objs = []
for source, dest in edges:
    edge_objs.append(Edge("relation", node_objs[source], node_objs[dest]))
driver.upload_edges(edge_objs)
# print("Cleaning up")
# driver.raw_query("MATCH (n:entity) WHERE n.key=\"entity1\" DETACH DELETE n")
# driver.raw_query("MATCH (n:interaction) WHERE n.key=\"interacted\" DETACH DELETE n")
# print("Deleted nodes")
driver.close()
print("Finished")

Uploading nodes
Uploaded looking
Uploaded Uber
Uploaded Lyft
Uploaded lane
Uploaded consistent
Uploaded two years into the pandemic
Uploaded knock
Uploaded omicron surge
Uploaded bounced back
Uploaded Ride-hailing
Uploaded thought to have in fourth quarter
Uploaded off course
Uploaded could
Uploaded seeking
Ride-hailing already exists in database
Uploaded stability
Uploaded still
Uploaded nearly two years into pandemic
Uploaded release
Uber already exists in database
Lyft already exists in database
Uploaded earnings preview
Uploaded scheduled
Uploaded next week
Uploaded show how
earnings preview already exists in database
Uploaded dealing with
Uploaded companies
Uploaded ride-hailing environment
Uploaded continuously changing
Uploaded report
Lyft already exists in database
Uploaded on Tuesday afternoon
Uploaded reports
Uber already exists in database
Uploaded on Wednesday
Uploaded showed
Uploaded Analyst data
Uploaded recovery
Uploaded continue
Uploaded in rides & travel
Uploaded end o

In [156]:
len(nodes.values())

109