## Obtain the HTML of each company's Market Index page

In [None]:
import re
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup as soup
from urllib.request import Request, urlopen
import numpy as np
import time

# Read the ASX200 constituents file and keep the values of its "Code"
# column as a plain Python list of tickers.
ASX200 = pd.read_csv("ASX200.csv")
ASX200_companies = list(ASX200["Code"])

def html_get(ticker, max_retries=None, retry_delay=3):
    """Download and parse the Market Index page for ``ticker``.

    The request is retried after network/HTTP failures (the site
    intermittently rejects requests), sleeping ``retry_delay`` seconds
    between attempts.

    Args:
        ticker: ASX code appended to the Market Index URL.
        max_retries: Maximum number of retries after the first failed
            attempt. ``None`` (the default) retries indefinitely.
        retry_delay: Seconds to wait before each retry.

    Returns:
        The page parsed with BeautifulSoup's ``html.parser``.

    Raises:
        OSError: (including ``urllib.error.URLError``/``HTTPError``) or
        http.client.HTTPException: the last network error, once
            ``max_retries`` retries have been used up.
    """
    import http.client

    # URL for the ticker's Market Index page
    url = f"https://www.marketindex.com.au/asx/{ticker}"

    failures = 0
    # Loop instead of recursing so a long outage cannot exhaust the call stack.
    while True:
        try:
            # A browser-like User-Agent header is sent with every request
            req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
            with urlopen(req) as response:
                webpage = response.read()
        # Only network-level failures are retried; KeyboardInterrupt and
        # programming errors propagate instead of being swallowed.
        except (OSError, http.client.HTTPException):
            failures += 1
            if max_retries is not None and failures > max_retries:
                raise
            time.sleep(retry_delay)
        else:
            # Parsing happens outside the retried section: a parse failure
            # would not be fixed by downloading the page again.
            return soup(webpage, "html.parser")

import json

# Ticker -> page HTML (stored as text so it can be serialised to JSON)
html_dict = {}
for ticker in ASX200_companies:
    # Progress indicator: one line per ticker being downloaded
    print(ticker)
    html = html_get(ticker)
    html_dict[ticker] = str(html)

# Persist the whole mapping as a single JSON document
with open('html_ticker.json', 'w') as fp:
    fp.write(json.dumps(html_dict))

## Obtain insider trading tables

In [6]:
import json
import pandas as pd
import numpy as np
 
from io import StringIO

# Load the ticker -> page HTML mapping written by the scraping step.
# The context manager closes the file once it has been parsed.
with open('html_ticker.json') as f:
    data = json.load(f)

# Column layouts that identify the two tables of interest
RELATIONSHIP_COLUMNS = ['Name', 'Title', 'Since', 'Bio']
TRADE_COLUMNS = ['Date', 'Director', 'Type', 'Amount', 'Price', 'Value', 'Notes']

# One enriched insider-trades DataFrame per ticker is collected here
insider_df_list = []
for ticker, html in data.items():
    try:
        # Wrapped in StringIO because passing literal HTML text to
        # read_html is deprecated in recent pandas versions.
        director = pd.read_html(StringIO(str(html)), attrs={'class': 'mi-table mb-4'})
    except ValueError:
        # read_html raises ValueError when no matching table exists
        director = []

    # Pick out the tables by their exact column layout
    insider_relationship = None
    insider = None
    for table in director:
        columns = list(table.columns)
        if columns == RELATIONSHIP_COLUMNS:
            insider_relationship = table
        if columns == TRADE_COLUMNS:
            insider = table

    # Report which tables are missing for this ticker
    if insider_relationship is None:
        print(f"{ticker} does not have insider relationship")
    if insider is None:
        print(f"{ticker} does not have insider trades")
    if insider_relationship is None or insider is None:
        continue

    # (set of name words, relationship row) for each distinct person,
    # keeping the first row when a name is listed more than once.
    candidates = []
    seen_names = set()
    for row in insider_relationship.itertuples(index=False):
        if pd.isna(row.Name) or row.Name in seen_names:
            continue
        seen_names.add(row.Name)
        candidates.append((set(str(row.Name).split()), row))

    title_list, bio_list, since_list = [], [], []
    for person in insider.Director:
        match = None
        if not pd.isna(person):
            words = set(str(person).split())
            if words:
                # A trade is attributed to the first person whose full name
                # contains every word of the director's name. Exactly one
                # entry is appended per trade so the lists always line up
                # with the rows of `insider`.
                match = next(
                    (row for name_words, row in candidates if words <= name_words),
                    None,
                )

        if match is None:
            # Person's information cannot be found: leave the cells empty
            title_list.append(np.nan)
            bio_list.append(np.nan)
            since_list.append(np.nan)
        else:
            title_list.append(match.Title)
            bio_list.append(match.Bio)
            since_list.append(match.Since)

    insider["Title"] = title_list
    insider["Bio"] = bio_list
    insider["Since"] = since_list
    # Ticker column records which company each trade relates to
    insider["Ticker"] = ticker

    insider_df_list.append(insider)

if insider_df_list:
    pd.concat(insider_df_list, axis=0).to_csv("insider.csv")
else:
    # pd.concat raises on an empty list, so report instead of crashing
    print("No insider trades tables found; insider.csv was not written")

        


AAA does not have insider trades
BWP does not have insider trades
FBU does not have insider trades
IOO does not have insider trades
IOZ does not have insider trades
IVV does not have insider trades
MGOC does not have insider trades
PMGOLD does not have insider trades
RMD does not have insider trades
SPK does not have insider trades
STW does not have insider trades
VAP does not have insider trades
VAS does not have insider trades
VEU does not have insider trades
VGS does not have insider trades
VTS does not have insider trades
VUK does not have insider trades
ZIM does not have insider trades


## Top 20 Shareholders

In [5]:
import json
import pandas as pd
import numpy as np
 
from io import StringIO

# Load the ticker -> page HTML mapping written by the scraping step.
# The context manager closes the file once it has been parsed.
with open('html_ticker.json') as f:
    data = json.load(f)

top_20_list = []
for ticker, html in data.items():
    try:
        # Wrapped in StringIO because passing literal HTML text to
        # read_html is deprecated in recent pandas versions.
        director = pd.read_html(StringIO(str(html)), attrs={'class': 'mi-table mb-4'})
    except ValueError:
        # read_html raises ValueError when no matching table exists
        director = []

    # Reset once per ticker (not once per table) so that a match is not
    # forgotten when further tables follow it, and so the flag is always
    # defined even when the page has no tables at all.
    check = False
    for table in director:
        if list(table.columns) == ['Name', 'Shares', 'Capital']:
            top_20 = table
            # Ticker column records which company each row relates to
            top_20["Ticker"] = ticker
            top_20_list.append(top_20)
            check = True

    if not check:
        print(f"{ticker} does not have top 20 shareholders table")

if top_20_list:
    pd.concat(top_20_list, axis=0).to_csv("shareholders.csv")
else:
    # pd.concat raises on an empty list, so report instead of crashing
    print("No top 20 shareholders tables found; shareholders.csv was not written")

AAA does not have top 20 shareholders table
IOO does not have top 20 shareholders table
IOZ does not have top 20 shareholders table
IVV does not have top 20 shareholders table
MGOC does not have top 20 shareholders table
PMGOLD does not have top 20 shareholders table
STW does not have top 20 shareholders table
VAP does not have top 20 shareholders table
VAS does not have top 20 shareholders table
VEU does not have top 20 shareholders table
VGS does not have top 20 shareholders table
VTS does not have top 20 shareholders table


## Generate graph for insider trades

In [32]:
def graph_insider(ticker_chosen):
    """Build one insider-trade bar chart per ticker and combine them.

    Reads ``insider.csv``, converts the textual ``Value`` column to floats in
    a new ``Bought/Sold`` column, and for every ticker plots that column
    against the trade date.

    Args:
        ticker_chosen: Iterable of ticker codes to plot.

    Returns:
        The individual hvplot bar charts combined with ``+``, in the order
        the tickers were given.

    Raises:
        ValueError: If ``ticker_chosen`` is empty.
    """
    # Imported for its side effect: the ``.hvplot`` accessor used below
    import hvplot.pandas  # noqa: F401

    tickers = list(ticker_chosen)
    if not tickers:
        raise ValueError("ticker_chosen must contain at least one ticker")

    insider_df = pd.read_csv("insider.csv")
    insider_df["Date"] = pd.to_datetime(insider_df["Date"])

    def parse_value(raw):
        """Convert text such as '$1,234' or '($1,234)' to a float."""
        if pd.isna(raw):
            return float("nan")
        text = str(raw).replace(",", "")
        if "(" in text:
            # Drops the leading '($' and the trailing ')'.
            # NOTE(review): the parenthesised value is stored as a positive
            # magnitude, exactly as before -- confirm whether these trades
            # should be plotted as negative numbers instead.
            return float(text[2:-1])
        # Drops the leading '$'
        return float(text[1:])

    insider_df["Bought/Sold"] = [parse_value(value) for value in insider_df["Value"]]

    # Build the plots directly rather than assembling source code and
    # passing it to eval(), which broke on (or executed) unexpected ticker text.
    plots = []
    for ticker in tickers:
        ticker_trades = (
            insider_df[insider_df["Ticker"] == ticker]
            .sort_values(by=["Date"])
            .set_index("Date")
        )
        plots.append(
            ticker_trades.hvplot.bar(
                y="Bought/Sold",
                hover_cols=["Director", "Price", "Value", "Type"],
                rot=90,
                shared_axes=False,
                title=f"{ticker} Insider trades",
            )
        )

    # Equivalent to plot_1 + plot_2 + ... evaluated left to right
    layout = plots[0]
    for plot in plots[1:]:
        layout = layout + plot
    return layout
        

    
# Example: build the insider-trade bar charts for two tickers.
graph_insider(["AAA", "SUN"])

## User created network

In [11]:
import pandas as pd
import re

def extender_color_list(color_list, length_required):
    """Repeat ``color_list`` cyclically until it has ``length_required`` items.

    Args:
        color_list: Sequence of colours to cycle through.
        length_required: Number of colours wanted (non-negative).

    Returns:
        A new list of exactly ``length_required`` colours; the input is not
        modified.

    Raises:
        ValueError: If ``length_required`` is negative, or if colours are
            requested from an empty ``color_list`` (both cases previously
            looped forever).
    """
    from itertools import cycle, islice

    if length_required < 0:
        raise ValueError("length_required must not be negative")
    if length_required == 0:
        return []
    if len(color_list) == 0:
        raise ValueError("color_list must not be empty")

    return list(islice(cycle(color_list), length_required))
                
def string_cleaner(string, words_to_remove=None):
    """Normalise a name so differently written variants compare equal.

    The value is converted to text and lower-cased, any ``<...>`` or
    parenthesised ``(...)`` span is removed, the text is split on spaces,
    words listed in ``words_to_remove`` are dropped and the remaining words
    are capitalised and re-joined with single spaces.

    Args:
        string: Value to clean; converted with ``str()`` first.
        words_to_remove: Lower-case words to drop. ``None`` (the default)
            drops nothing.

    Returns:
        The cleaned string (possibly empty).
    """
    # None instead of a shared mutable list as the default argument
    excluded = () if words_to_remove is None else words_to_remove

    lowered = str(string).lower()

    # Raw string so the escaped parentheses are regex escapes rather than
    # invalid string escapes. Both alternatives are greedy: they remove
    # everything from the first opening to the last closing delimiter.
    stripped = re.sub(r"<.+>|\(.+\)", "", lowered)

    kept = [
        word.capitalize()
        for word in stripped.split(" ")
        if word != "" and word not in excluded
    ]
    return " ".join(kept)

def clean_up_df(df):
    """Return the rows of ``df`` that are complete and have non-zero Shares.

    Rows containing a missing value in any column are discarded first; of
    the remainder, rows whose ``Shares`` value equals 0 are discarded too
    (placeholder rows for missing shareholder information carry 0 shares).
    """
    complete_rows = df.dropna()
    has_shares = complete_rows["Shares"].ne(0)
    return complete_rows[has_shares]

def shareholders_connection_graph(ticker_chosen):
    """Draw a pyvis network around the chosen tickers and shareholders.

    ``shareholders.csv`` is loaded and cleaned, every shareholder name is
    normalised into a ``Name_clean`` column, and a node is created for each
    chosen entry plus everything directly linked to it: the listed
    shareholders of a chosen ticker, or the tickers held by a chosen
    shareholder. Edges are weighted by the ``Shares`` value.

    Args:
        ticker_chosen: Iterable of tickers and/or shareholder names. An
            entry that is not a ticker present in the data is treated as a
            shareholder name and normalised the same way as ``Name_clean``,
            so both raw and already-cleaned names are accepted.

    Returns:
        Whatever ``net.show('list_of_nodes.html')`` returns.
    """
    from pyvis.network import Network

    # Words that only differ between spellings of the same shareholder
    words_to_remove = ['custody', 'nominees', 'limited', 'pty', 'ltd']

    top20_raw = pd.read_csv("shareholders.csv")

    # Copy the filtered rows so the new column is added to an independent
    # frame rather than to a slice of the raw one.
    shareholders_df = clean_up_df(top20_raw).copy()
    shareholders_df["Name_clean"] = [
        string_cleaner(str(shareholder), words_to_remove)
        for shareholder in shareholders_df["Name"]
    ]

    # Tickers that have usable shareholder information
    ticker_asx200 = set(shareholders_df["Ticker"])

    def normalise(entry):
        """Leave tickers as they are; clean shareholder names."""
        if entry in ticker_asx200:
            return entry
        return string_cleaner(entry, words_to_remove)

    def linked_rows(label):
        """Return (rows for ``label``, column holding the linked labels)."""
        if label in ticker_asx200:
            return shareholders_df[shareholders_df["Ticker"] == label], "Name_clean"
        return shareholders_df[shareholders_df["Name_clean"] == label], "Ticker"

    def tooltip(heading, rows, other_column):
        """Numbered '<br>'-separated list shown when hovering over a node."""
        entries = zip(rows[other_column], rows["Capital"])
        return heading + "".join(
            f'<br>{num}) {name}: {capital}'
            for num, (name, capital) in enumerate(entries, start=1)
        )

    # dict.fromkeys de-duplicates while keeping first-seen order, so node
    # ids and colours are the same on every run.
    chosen = list(dict.fromkeys(normalise(entry) for entry in ticker_chosen))

    # Every chosen entry plus everything directly linked to it
    node_label = []
    for entry in chosen:
        rows, other_column = linked_rows(entry)
        node_label.append(entry)
        node_label.extend(rows[other_column])
    node_label = list(dict.fromkeys(node_label))

    color_list = ["#FF0000", "#FFFFFF", "#00FFFF", "#C0C0C0", "#0000FF", "#808080", "#00008B",
                  "#ADD8E6", "#FFA500", "#800080", "#A52A2A", "#FFFF00", "#800000", "#00FF00", "#008000",
                  "#FF00FF", "#808000", "#FFC0CB"]

    # One colour per node, cycling through the palette as needed
    extended_color_list = extender_color_list(color_list, len(node_label))

    net = Network(notebook=True, bgcolor="#222222", font_color="white")

    # Label -> node id, used when wiring up the edges
    company_nodes_dict = {}
    for node, (color, label) in enumerate(zip(extended_color_list, node_label)):
        company_nodes_dict[label] = node
        rows, other_column = linked_rows(label)

        if label in ticker_asx200:
            # Ticker nodes are drawn with the company's logo image
            title_node = tooltip(' Top 20 Shareholders:<br>', rows, other_column)
            img = f"https://files.marketindex.com.au/xasx/96x96-png/{label.lower()}.png"
            net.add_node(n_id=node, label=label, title=title_node, image=img, shape='image')
        else:
            # Shareholder nodes are plain coloured nodes
            title_node = tooltip(' Investments:<br>', rows, other_column)
            net.add_node(n_id=node, color=color, label=label, title=title_node)

    # Connect every chosen entry to each of its linked nodes
    for entry in chosen:
        rows, other_column = linked_rows(entry)
        entry_node = company_nodes_dict[entry]
        for linked_label, shares in zip(rows[other_column], rows["Shares"]):
            net.add_edge(entry_node, company_nodes_dict[linked_label], value=shares)

    net.repulsion(node_distance=300, spring_length=200)
    net.show_buttons(filter_=True)
    return net.show('list_of_nodes.html')
    
# Example: two ticker codes plus one shareholder passed by name.
shareholders_connection_graph(["WOW", "COL", "HSBC Custody Nominees Limited"])

## Supporting table with network

In [None]:
def display_tables(ticker_chosen):
    """Return the shareholder rows behind the chosen tickers/shareholders.

    Args:
        ticker_chosen: Iterable of tickers and/or shareholder names. An
            entry that is not a ticker present in the data is treated as a
            shareholder name and compared after normalisation with
            ``string_cleaner``.

    Returns:
        A DataFrame with the matching rows of the cleaned
        ``shareholders.csv`` data, concatenated in the order the entries
        were given; an empty frame with the same columns when nothing was
        chosen.
    """
    # Words that only differ between spellings of the same shareholder
    words_to_remove = ['custody', 'nominees', 'limited', 'pty', 'ltd']

    top20_raw = pd.read_csv("shareholders.csv")

    # Drop rows with no information
    shareholders_df = clean_up_df(top20_raw)

    # Tickers that have usable shareholder information
    ticker_asx200 = set(shareholders_df["Ticker"])

    # The frame read from shareholders.csv has no "Name_clean" column, so
    # the normalised names are computed here and kept in a separate Series
    # (index-aligned with the frame) to leave the returned columns unchanged.
    name_clean = shareholders_df["Name"].map(
        lambda name: string_cleaner(str(name), words_to_remove)
    )

    displayed_list = []
    for ticker in ticker_chosen:
        if ticker in ticker_asx200:
            # Entry is a company: show its listed shareholders
            displayed_list.append(shareholders_df[shareholders_df["Ticker"] == ticker])
        else:
            # Entry is a shareholder: show the companies it appears in
            wanted = string_cleaner(ticker, words_to_remove)
            displayed_list.append(shareholders_df[name_clean == wanted])

    if not displayed_list:
        # pd.concat raises on an empty list; return an empty table instead
        return shareholders_df.iloc[0:0]

    return pd.concat(displayed_list, axis=0)

## Full Network

In [12]:
def extender_color_list(color_list, length_required):
    """Repeat ``color_list`` cyclically until it has ``length_required`` items.

    Args:
        color_list: Sequence of colours to cycle through.
        length_required: Number of colours wanted (non-negative).

    Returns:
        A new list of exactly ``length_required`` colours; the input is not
        modified.

    Raises:
        ValueError: If ``length_required`` is negative, or if colours are
            requested from an empty ``color_list`` (both cases previously
            looped forever).
    """
    from itertools import cycle, islice

    if length_required < 0:
        raise ValueError("length_required must not be negative")
    if length_required == 0:
        return []
    if len(color_list) == 0:
        raise ValueError("color_list must not be empty")

    return list(islice(cycle(color_list), length_required))
                
def string_cleaner(string, words_to_remove=None):
    """Normalise a name so differently written variants compare equal.

    The value is converted to text and lower-cased, any ``<...>`` or
    parenthesised ``(...)`` span is removed, the text is split on spaces,
    words listed in ``words_to_remove`` are dropped and the remaining words
    are capitalised and re-joined with single spaces.

    Args:
        string: Value to clean; converted with ``str()`` first.
        words_to_remove: Lower-case words to drop. ``None`` (the default)
            drops nothing.

    Returns:
        The cleaned string (possibly empty).
    """
    # None instead of a shared mutable list as the default argument
    excluded = () if words_to_remove is None else words_to_remove

    lowered = str(string).lower()

    # Raw string so the escaped parentheses are regex escapes rather than
    # invalid string escapes. Both alternatives are greedy: they remove
    # everything from the first opening to the last closing delimiter.
    stripped = re.sub(r"<.+>|\(.+\)", "", lowered)

    kept = [
        word.capitalize()
        for word in stripped.split(" ")
        if word != "" and word not in excluded
    ]
    return " ".join(kept)

def clean_up_df(df):
    """Return the rows of ``df`` that are complete and have non-zero Shares.

    Rows containing a missing value in any column are discarded first; of
    the remainder, rows whose ``Shares`` value equals 0 are discarded too
    (placeholder rows for missing shareholder information carry 0 shares).
    """
    complete_rows = df.dropna()
    has_shares = complete_rows["Shares"].ne(0)
    return complete_rows[has_shares]

import pandas as pd
import re

from pyvis.network import Network


def _node_tooltip(heading, names, capitals):
    """Numbered '<br>'-separated list shown when hovering over a node."""
    return heading + "".join(
        f'<br>{num}) {name}: {capital}'
        for num, (name, capital) in enumerate(zip(names, capitals), start=1)
    )


shareholders_df_raw = pd.read_csv("shareholders.csv")

# Copy the filtered rows so the new column is added to an independent frame
# rather than to a slice of the raw one.
shareholders_df = clean_up_df(shareholders_df_raw).copy()

# Normalised shareholder names: words that only differ between spellings of
# the same shareholder are removed.
shareholders_df["Name_clean"] = [
    string_cleaner(str(shareholder), ['custody', 'nominees', 'limited', 'pty', 'ltd'])
    for shareholder in shareholders_df["Name"]
]

# dict.fromkeys de-duplicates while keeping first-seen order, so node ids and
# colours are the same on every run.
ticker_asx200 = list(dict.fromkeys(shareholders_df["Ticker"]))
unique_shareholders_asx200 = list(dict.fromkeys(shareholders_df["Name_clean"]))

# One node per ticker followed by one node per distinct shareholder
tickers_and_shareholders = ticker_asx200 + unique_shareholders_asx200
nodes_tickers_and_shareholders = list(range(len(tickers_and_shareholders)))

color_list = ["#FF0000", "#FFFFFF", "#00FFFF",
              "#C0C0C0", "#0000FF", "#808080",
              "#00008B", "#ADD8E6", "#FFA500",
              "#800080", "#A52A2A", "#FFFF00",
              "#800000", "#00FF00", "#008000",
              "#FF00FF", "#808000", "#FFC0CB"]

# One colour per node, cycling through the palette as needed
extended_color_list = extender_color_list(color_list, len(tickers_and_shareholders))

# Split the frame once per ticker / per shareholder instead of re-filtering
# the whole frame for every node and every edge.
ticker_lookup = set(ticker_asx200)
rows_by_ticker = {
    key: rows for key, rows in shareholders_df.groupby("Ticker", sort=False)
}
rows_by_shareholder = {
    key: rows for key, rows in shareholders_df.groupby("Name_clean", sort=False)
}

# Creating the network
net = Network(notebook=True, bgcolor="#222222", font_color="white")

for node, color, label in zip(nodes_tickers_and_shareholders, extended_color_list, tickers_and_shareholders):
    if label in ticker_lookup:
        # Ticker node: tooltip lists its shareholders, drawn as a logo image
        rows = rows_by_ticker[label]
        title_node = _node_tooltip(' Top 20 Shareholders:<br>', rows["Name_clean"], rows["Capital"])
        img = f"https://files.marketindex.com.au/xasx/96x96-png/{label.lower()}.png"
        net.add_node(n_id=node, label=label, title=title_node, image=img, shape='image')
    else:
        # Shareholder node: tooltip lists the tickers it holds
        rows = rows_by_shareholder[label]
        title_node = _node_tooltip(' Investments:<br>', rows["Ticker"], rows["Capital"])
        net.add_node(n_id=node, color=color, label=label, title=title_node)

# Label -> node id, used when wiring up the edges
company_nodes_dict = dict(zip(tickers_and_shareholders, nodes_tickers_and_shareholders))

# One edge per (shareholder, ticker) row, weighted by the Shares value
for shareholder in unique_shareholders_asx200:
    rows = rows_by_shareholder[shareholder]
    shareholder_node = company_nodes_dict[shareholder]
    for company, shares in zip(rows["Ticker"], rows["Shares"]):
        net.add_edge(shareholder_node, company_nodes_dict[company], value=shares)

net.repulsion(node_distance=1500, spring_length=1000)
net.save_graph("ASX200_network.html")
display(net.show('ASX200.html'))