In [90]:
!make install # Install the required modules from requirements.txt
!make format # Format the code in scrapy/sayariproject/ with Black
!make lint # Lint scrapy/sayariproject/ and ./webcrawling.ipynb with Flake8

pip install -r requirements.txt
black scrapy/sayariproject/
[1mAll done! ✨ 🍰 ✨[0m
[34m7 files [0mleft unchanged.
flake8 --ignore=E121,E501,E265,F821 scrapy/sayariproject/ ./webcrawling.ipynb


In [None]:
# Run the crawler through the Makefile's `crawl` target
!make crawl

In [None]:
import json

# Load the JSON data from the file created by 'scrapy'.
# The encoding is passed explicitly: JSON text is UTF-8, whereas open() would
# otherwise fall back to the platform's locale encoding and could fail or
# mis-decode non-ASCII company/person names on some systems.
with open('scrapy/output.json', 'r', encoding='utf-8') as file:
    companies_data = json.load(file)

companies_data

In [98]:
import networkx as nx
# Shared graph filled in by create_graphs(): nodes are companies and people,
# and an edge joins a company to each person who is its agent or owner.
G = nx.Graph()

def _first_line(text):
    """Return the first line of ``text``; that first line is used as the node name."""
    return text.split("\n")[0]


def create_graphs(companies=None, graph=None):
    """
    Populates a graph with two different types of nodes: company and person.

    Every record contributes one "Company" node (from its "Company" key) and
    one "Person" node for each agent or owner it lists. An edge is added
    between the company and each of those people.

    Parameters:
        companies: iterable of record dicts. Defaults to the module-level
            ``companies_data`` when omitted.
        graph: object exposing ``add_node`` and ``add_edges_from`` (e.g. a
            networkx graph). Defaults to the module-level ``G`` when omitted.

    Raises:
        KeyError: if a record has no "Company" key.
    """
    if companies is None:
        companies = companies_data
    if graph is None:
        graph = G

    for data in companies:
        company_name = data["Company"]
        graph.add_node(company_name, type="Company")

        # Both agent fields are handled the same way: only the first line
        # of the value is used as the person's node name.
        for agent_key in ("Commercial Registered Agent", "Registered Agent"):
            if agent_key in data:
                agent = _first_line(data[agent_key])
                graph.add_node(agent, type="Person")
                graph.add_edges_from([(company_name, agent)])

        if "Owner Name" in data:
            owner = data["Owner Name"]
            graph.add_node(owner, type="Person")
            graph.add_edges_from([(company_name, owner)])
        elif "Owners" in data:
            owners = [_first_line(data["Owners"])]
            # NOTE(review): the second owner is read from an empty-string key,
            # presumably because the scraped row has no label - confirm against
            # the scraper. It is optional here so that a record listing a
            # single owner no longer raises KeyError and aborts the build.
            if "" in data:
                owners.append(_first_line(data[""]))
            for owner in owners:
                graph.add_node(owner, type="Person")
            graph.add_edges_from([(owner, company_name) for owner in owners])

# Fill the module-level graph G from companies_data
create_graphs()

In [94]:
# Controls whether visualize_graph() draws a text label on every node.
# Set this to False if the labels on the graph are too distracting.
graph_show_labels = True

In [None]:
import matplotlib.pyplot as plt

def visualize_graph(type_colors):
    """
    Draws the company / owner-agent network held in the module-level graph G.

    Node positions are computed once for the whole graph with Graphviz's
    "neato" program, then each connected component is drawn on the same figure.

    Parameters:
        type_colors: mapping from a node's 'type' attribute to the color
            used to draw that node.
    """
    plt.figure(1, figsize=(16, 16))
    plt.title('Company and Owner/Agent Network')
    layout = nx.nx_agraph.graphviz_layout(G, prog="neato")

    for members in nx.connected_components(G):
        piece = G.subgraph(members)
        # One color per node, in the same order the subgraph iterates its nodes
        palette = []
        for name in piece.nodes():
            node_type = G.nodes[name]['type']
            palette.append(type_colors[node_type])
        nx.draw(
            piece,
            layout,
            node_size=40,
            node_color=palette,
            vmin=0.0,
            vmax=1.0,
            with_labels=graph_show_labels,
        )

# Color used for each node 'type' value that create_graphs() assigns
colors = {'Company': 'lightblue', 'Person': 'lightgreen'}

# Draw the network with those colors
visualize_graph(colors)

In [None]:
def print_connected_component_data():
    """
    Prints the nodes, with their attribute data, of every connected
    component of the module-level graph G (one print call per component).
    """
    for members in nx.connected_components(G):
        component_view = G.subgraph(members)
        print(component_view.nodes(data=True))

# Print the nodes of each connected component of G
print_connected_component_data()