## Obtain the HTML of each company's Market Index page

In [None]:
import re
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup as soup
from urllib.request import Request, urlopen
import numpy as np
import time

# Read the ASX200 table from disk and collect its "Code" column as a plain list
ASX200 = pd.read_csv("ASX200.csv")
ASX200_companies = [code for code in ASX200["Code"]]

def html_get(ticker, max_retries=None, delay=3, timeout=30):
    """Download and parse the Market Index page of one ASX ticker.

    Args:
        ticker: ASX code appended to the ``marketindex.com.au/asx/`` URL.
        max_retries: Number of retries allowed after a failed request.
            ``None`` (the default) keeps retrying until the request succeeds.
        delay: Seconds to sleep between attempts.
        timeout: Socket timeout in seconds handed to ``urlopen`` so a stalled
            connection fails (and is retried) instead of blocking forever.

    Returns:
        The page parsed by BeautifulSoup using the ``html.parser`` backend.

    Raises:
        OSError, http.client.HTTPException: The last network error once
            ``max_retries`` retries have been used up.
    """
    from http.client import HTTPException

    url = f"https://www.marketindex.com.au/asx/{ticker}"
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})

    failures = 0
    # Retry in a loop rather than by recursion: a long outage can no longer
    # exhaust the interpreter's recursion limit.
    while True:
        try:
            # The context manager closes the HTTP response after reading it
            with urlopen(req, timeout=timeout) as response:
                webpage = response.read()
            break
        # Only network failures are retried (URLError/HTTPError/timeouts are
        # OSError subclasses); KeyboardInterrupt and programming errors
        # propagate instead of being swallowed.
        except (OSError, HTTPException) as error:
            failures += 1
            if max_retries is not None and failures > max_retries:
                raise
            print(f"{ticker}: request failed ({error}); retrying in {delay}s")
            time.sleep(delay)

    # Created html for scraping
    return soup(webpage, "html.parser")

import json

# Fetch every company's page, printing the ticker as a progress indicator,
# and keep the page source as text keyed by ticker.
html_dict = {}
for ticker in ASX200_companies:
    print(ticker)
    html = html_get(ticker)
    html_dict.update({ticker: str(html)})

# Persist the collected pages so later cells can work without re-downloading
with open('html_ticker.json', 'w') as fp:
    fp.write(json.dumps(html_dict))

## Obtain Top 20 Shareholders and Insider Trading tables

In [None]:
import json
import pandas as pd
import numpy as np
 
# Opening JSON file
from io import StringIO

# Column layouts that identify the two tables of interest on each page
INSIDER_RELATIONSHIP_COLUMNS = ['Name', 'Title', 'Since', 'Bio']
INSIDER_TRADE_COLUMNS = ['Date', 'Director', 'Type', 'Amount', 'Price', 'Value', 'Notes']

# The context manager closes the file handle once the content is loaded
with open('html_ticker.json') as f:
    # returns JSON object as
    # a dictionary
    data = json.load(f)

# Insider dataframe stored here
insider_df_list = []
# Iterating through the json dictionary
for ticker, html in data.items():

    try:
        # Wrapped in StringIO because recent pandas releases deprecate
        # passing literal HTML text straight to read_html
        director = pd.read_html(StringIO(str(html)), attrs={'class': 'mi-table mb-4'})
    except ValueError:
        # read_html raises ValueError when no table matches the attrs
        director = []

    # Reset for every ticker so a missing table can never fall back to the
    # table found for the previous ticker
    insider_relationship = None
    insider = None
    # Going through tables in html code
    for table in director:
        columns = list(table.columns)

        if columns == INSIDER_RELATIONSHIP_COLUMNS:
            insider_relationship = table

        if columns == INSIDER_TRADE_COLUMNS:
            insider = table

    if insider_relationship is None:
        print(f"{ticker} does not have insider relationship")

    if insider is None:
        print(f"{ticker} does not have insider trades")
        # Without a trades table there is nothing to annotate or store
        continue

    # Split name in insider_relationship for ease of finding names.
    # Maps each name to (name parts, title, bio, since) of its first row;
    # stays empty when the page has no relationship table, in which case
    # every trade below is annotated with NaN.
    people = {}
    if insider_relationship is not None:
        for row in insider_relationship.itertuples(index=False):
            # Missing names are read as NaN (a float) and cannot be split
            if isinstance(row.Name, str) and row.Name not in people:
                people[row.Name] = (set(row.Name.split(" ")), row.Title, row.Bio, row.Since)

    title_list, bio_list, since_list = [], [], []
    for person in insider.Director:
        title, bio, since = np.nan, np.nan, np.nan

        if isinstance(person, str):
            # Splitting name into a set of its parts
            person_name_set = set(person.split(" "))

            # Seeing if every part of the director's name occurs in a name
            # from insider_relationship
            for name_parts, match_title, match_bio, match_since in people.values():
                if person_name_set.issubset(name_parts):
                    title, bio, since = match_title, match_bio, match_since
                    # Keep the first match only: exactly one value per trade
                    # row, so the new columns always match the table length
                    break

        title_list.append(title)
        bio_list.append(bio)
        since_list.append(since)

    insider["Title"] = title_list
    insider["Bio"] = bio_list
    insider["Since"] = since_list
    insider["Ticker"] = ticker

    insider_df_list.append(insider)

if insider_df_list:
    pd.concat(insider_df_list, axis=0).to_csv("insider.csv")
else:
    # pd.concat raises on an empty list; still write a header-only file so
    # readers of insider.csv find the expected columns
    pd.DataFrame(
        columns=INSIDER_TRADE_COLUMNS + ["Title", "Bio", "Since", "Ticker"]
    ).to_csv("insider.csv")

## Generate graph for insider trades

In [None]:
def graph_insider(ticker_chosen):
    """Show a bar chart of the insider trades recorded for one ticker.

    Reads ``insider.csv``, keeps the rows whose ``Ticker`` equals
    ``ticker_chosen``, converts the textual ``Value`` column into signed
    floats stored in a new ``Bought/Sold`` column, and displays an hvplot
    bar chart indexed by ``Date``.

    Args:
        ticker_chosen: Ticker code to filter ``insider.csv`` by.
    """
    import re
    import hvplot.pandas  # imported for its side effect: enables DataFrame.hvplot

    def _dollars_to_float(value):
        """Convert text such as '$1,234' or '($1,234)' to a signed float."""
        if not isinstance(value, str):
            # Already numeric, or NaN for an empty cell
            return float(value)
        digits = re.sub(r"[^0-9.]", "", value)
        if not digits:
            return float("nan")
        amount = float(digits)
        # Parentheses (accounting notation) or a leading minus mark a
        # negative amount; keep the sign instead of discarding it
        is_negative = "(" in value or value.strip().startswith("-")
        return -amount if is_negative else amount

    insider_df = pd.read_csv("insider.csv")

    # Filter for selected ticker
    insider_of_ticker_chosen_df = insider_df[insider_df["Ticker"] == ticker_chosen].set_index("Date")

    # Change number from string type to float
    insider_of_ticker_chosen_df["Bought/Sold"] = [
        _dollars_to_float(value) for value in insider_of_ticker_chosen_df["Value"]
    ]

    display(insider_of_ticker_chosen_df.hvplot.bar(y = "Bought/Sold", hover_cols = ["Director", "Price", "Value", "Type"]))
    
# Example run: display the insider-trade chart for the ticker "AAA"
graph_insider("AAA")

## Setting up shareholders csv for network 

In [None]:
def extender_color_list(color_list, length_required):
    """Repeat ``color_list`` cyclically until it has ``length_required`` items.

    Args:
        color_list: Iterable of colors to cycle through.
        length_required: Number of colors wanted.

    Returns:
        A new list of exactly ``length_required`` colors; an empty list when
        ``length_required`` is zero or negative.

    Raises:
        ValueError: If colors are requested but ``color_list`` is empty
            (there is nothing to cycle through).
    """
    palette = list(color_list)
    if length_required <= 0:
        return []
    if not palette:
        raise ValueError("color_list must not be empty")
    # The modulo wraps the index around the palette
    return [palette[index % len(palette)] for index in range(length_required)]

def shareholders_connection_graph(ticker_chosen):
    """Draw a star-shaped network of one ticker and its listed shareholders.

    Reads ``shareholders.csv``, creates one node per shareholder row of
    ``ticker_chosen`` plus one node for the ticker itself, links every
    shareholder to the ticker with an edge weighted by the ``Capital``
    percentage, and displays the result written to ``list_of_nodes.html``.

    Args:
        ticker_chosen: Ticker code to filter ``shareholders.csv`` by.
    """
    import pandas as pd
    from pyvis.network import Network

    top20 = pd.read_csv("shareholders.csv")

    color_list = ["#FF0000", "#FFFFFF", "#00FFFF", "#C0C0C0","#0000FF","#808080","#00008B","#000000"
                  ,"#ADD8E6","#FFA500","#800080","#A52A2A","#FFFF00","#800000","#00FF00","#008000"
                  ,"#FF00FF","#808000","#FFC0CB"]

    # Ticker and their nodes created with their weights (Capital)
    holdings = top20[top20["Ticker"] == ticker_chosen]
    list_of_shareholders = [str(name) for name in holdings["Name"].tolist()]
    list_of_weights = holdings["Capital"].tolist()
    list_of_node_num = list(range(len(list_of_shareholders)))

    # The ticker's own node takes the next free id; derived from the count so
    # it is also valid when the ticker has no shareholder rows
    ticker_chosen_node_num = len(list_of_shareholders)

    extended_color_list = extender_color_list(color_list, len(list_of_node_num) + 1)

    net = Network(notebook = True, bgcolor="#222222", font_color="white")

    # Adding an additional node for ticker chosen
    net.add_nodes(list_of_node_num + [ticker_chosen_node_num], label=list_of_shareholders + [ticker_chosen], color = extended_color_list)

    # Adding edges with weights to display top shareholders
    for node, weight in zip(list_of_node_num, list_of_weights):
        if pd.isna(weight):
            # No percentage available: still connect the holder, unweighted
            net.add_edge(node, ticker_chosen_node_num)
            continue
        # str() lets the conversion work whether Capital was read as text
        # ("12.3%") or as a number
        weight = float(str(weight).replace("%", ""))
        net.add_edge(node, ticker_chosen_node_num, value = weight)

    net.repulsion(node_distance=300, spring_length=200)
    net.show_buttons(filter_=True)
    display(net.show('list_of_nodes.html'))
    
# Example run: display the shareholder network for the ticker "SUN"
shareholders_connection_graph("SUN")

## Full Network

In [None]:
def extender_color_list(color_list, length_required):
    """Repeat ``color_list`` cyclically until it has ``length_required`` items.

    Args:
        color_list: Iterable of colors to cycle through.
        length_required: Number of colors wanted.

    Returns:
        A new list of exactly ``length_required`` colors; an empty list when
        ``length_required`` is zero or negative.

    Raises:
        ValueError: If colors are requested but ``color_list`` is empty
            (there is nothing to cycle through).
    """
    palette = list(color_list)
    if length_required <= 0:
        return []
    if not palette:
        raise ValueError("color_list must not be empty")
    # The modulo wraps the index around the palette
    return [palette[index % len(palette)] for index in range(length_required)]
                
def string_cleaner(string):
    """Strip ``<...>`` and ``(...)`` annotations and collapse whitespace.

    Each bracketed group is removed on its own, so text standing between two
    separate groups is kept (a greedy match would delete everything from the
    first opening bracket to the last closing one). Nested groups are removed
    from the inside out.

    Args:
        string: Text to clean.

    Returns:
        The text without bracketed groups, with runs of whitespace reduced to
        single spaces and no leading or trailing whitespace.
    """
    # Matches only innermost groups; repeating until nothing changes peels
    # nested brackets layer by layer
    innermost_group = r"<[^<>]*>|\([^()]*\)"
    previous = None
    while previous != string:
        previous = string
        string = re.sub(innermost_group, "", string)
    return " ".join(string.split())

def clean_up_df(df):
    """Return the rows of ``df`` that are complete and have non-zero shares.

    Args:
        df: DataFrame with at least a ``Shares`` column.

    Returns:
        A new, independent DataFrame; ``df`` itself is left untouched.
    """
    # Drop rows containing empty cells
    df_drop = df.dropna()

    # Drop rows which have either:
        # No shareholder information
        # No Top 20 shareholder information
    # These are present in the Name column
    # But also present with 0 shares
    #
    # .copy() hands back an independent frame, so callers can add columns to
    # the result without pandas raising SettingWithCopyWarning
    return df_drop[df_drop["Shares"] != 0].copy()

import pandas as pd
import re

# Load the shareholder table and drop unusable rows. The extra .copy()
# guarantees an independent frame, so adding the Name_clean column below
# cannot trigger pandas' SettingWithCopyWarning.
shareholders_df_raw = pd.read_csv("shareholders.csv")
shareholders_df = clean_up_df(shareholders_df_raw).copy()

# Cleaned holder names (string_cleaner strips bracketed annotations)
shareholders_clean = [string_cleaner(str(shareholder)) for shareholder in shareholders_df["Name"]]
shareholders_df["Name_clean"] = shareholders_clean

# dict.fromkeys removes duplicates while keeping first-seen order; a set
# would reorder the entries from run to run and with them the node ids and
# colors. .tolist() yields plain Python values rather than NumPy scalars.
ticker_asx200 = list(dict.fromkeys(shareholders_df["Ticker"].tolist()))
unique_shareholders_asx200 = list(dict.fromkeys(shareholders_df["Name_clean"].tolist()))

# Node ids: tickers first, then shareholders
tickers_and_shareholders = ticker_asx200 + unique_shareholders_asx200
nodes_tickers_and_shareholders = list(range(len(tickers_and_shareholders)))

color_list = ["#FF0000", "#FFFFFF", "#00FFFF"
              , "#C0C0C0","#0000FF","#808080"
              ,"#00008B","#ADD8E6","#FFA500"
              ,"#800080","#A52A2A","#FFFF00"
              ,"#800000","#00FF00","#008000"
              ,"#FF00FF","#808000","#FFC0CB"]

extended_color_list = extender_color_list(color_list, len(tickers_and_shareholders))

# Group the rows once instead of re-filtering the whole frame for every node
# and every edge; sort=False keeps the groups in first-seen order.
rows_by_ticker = {key: group for key, group in shareholders_df.groupby("Ticker", sort=False)}
rows_by_shareholder = {key: group for key, group in shareholders_df.groupby("Name_clean", sort=False)}


def _holdings_title(header, names, capitals):
    """Build a node's hover text: header plus one numbered line per holding."""
    return header + "".join(
        f'<br>{num}) {name}: {capital}'
        for num, (name, capital) in enumerate(zip(names, capitals), start=1)
    )


# Creating the network
from pyvis.network import Network

net = Network(notebook = True, bgcolor="#222222", font_color="white")

# Ticker nodes, drawn with the company's logo image. Tickers and
# shareholders are handled in separate loops so a shareholder whose cleaned
# name happens to equal a ticker code is not mistaken for that ticker.
ticker_nodes_dict = {}
for node, label in enumerate(ticker_asx200):
    ticker_rows = rows_by_ticker[label]
    title_node = _holdings_title(
        ' Top 20 Shareholders:<br>',
        ticker_rows["Name_clean"].tolist(),
        ticker_rows["Capital"].tolist(),
    )
    img = f"https://files.marketindex.com.au/xasx/96x96-png/{label.lower()}.png"

    # Creating the node
    net.add_node(n_id = node, label = label, title = title_node, image = img, shape = 'image')
    ticker_nodes_dict[label] = node

# Shareholder nodes, colored from the cycled palette by node id
shareholder_nodes_dict = {}
for node, label in enumerate(unique_shareholders_asx200, start=len(ticker_asx200)):
    shareholder_rows = rows_by_shareholder[label]
    title_node = _holdings_title(
        ' Investments:<br>',
        shareholder_rows["Ticker"].tolist(),
        shareholder_rows["Capital"].tolist(),
    )

    # Creating the node
    net.add_node(n_id = node, color = extended_color_list[node], label = label, title = title_node)
    shareholder_nodes_dict[label] = node

# Creating a dictionary to know which company is which node
company_nodes_dict = {**ticker_nodes_dict, **shareholder_nodes_dict}

# Adding network edges for each company, weighted by the number of shares
for shareholder in unique_shareholders_asx200:
    shareholder_rows = rows_by_shareholder[shareholder]
    shareholder_node = shareholder_nodes_dict[shareholder]

    # .tolist() hands pyvis plain Python numbers (NumPy integers are not
    # JSON-serialisable)
    for company, shares in zip(shareholder_rows["Ticker"].tolist(), shareholder_rows["Shares"].tolist()):
        net.add_edge(shareholder_node, ticker_nodes_dict[company], value = shares)


net.repulsion(node_distance=1500, spring_length=1000)
net.save_graph("ASX200_network.html")
display(net.show('ASX200.html'))