# Install and import necessary packages

In [None]:
# Install packages
!pip install mrtparse
!pip install networkx
!pip install neo4j

Collecting mrtparse
  Downloading mrtparse-2.2.0-py2.py3-none-any.whl (19 kB)
Installing collected packages: mrtparse
Successfully installed mrtparse-2.2.0
Collecting neo4j
  Downloading neo4j-5.14.0.tar.gz (192 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m192.4/192.4 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: neo4j
  Building wheel for neo4j (pyproject.toml) ... [?25l[?25hdone
  Created wheel for neo4j: filename=neo4j-5.14.0-py3-none-any.whl size=265476 sha256=a4256a1ce57f9da6d608d3414f56d59cc686221dfb8423ca376a609545955573
  Stored in directory: /root/.cache/pip/wheels/53/e0/d7/603097e3fed62f821523433801c09e04cd7a7610c7565bd5a3
Successfully built neo4j
Installing collected packages: neo4j


In [2]:
import mrtparse
import networkx as nx
import os
import requests
import gzip
import shutil
from neo4j import GraphDatabase

# Create the Bviews Database

In [3]:
# Define a function to create the graph in Neo4j
def create_graph(tx, from_node_id, to_node_id, node_name):
  node_label = f"Node_{node_name}"
  query = (
      f"MERGE (fromNode:{node_label} {{id: $from_node_id}}) "
      f"MERGE (toNode:{node_label} {{id: $to_node_id}}) "
      f"MERGE (fromNode)-[:CONNECTS_{node_name}]->(toNode)"
  )
  print(query)
  tx.run(query, from_node_id=from_node_id, to_node_id=to_node_id,node_name=node_name)

In [4]:
# Connect to the Neo4j database
uri = "neo4j+s://28e5b287.databases.neo4j.io"  # Change to match your Neo4j server settings
username = "neo4j"      # Change to your Neo4j username
password = "z9J3DPCCxGYELn99XDdFbFIBnWOwR5fdn4MiG_Nvdck"      # Change to your Neo4j password

# Create a Neo4j driver
driver = GraphDatabase.driver(uri, auth=(username, password))

In [8]:
server_url = "https://data.ris.ripe.net"
rrc_name = "rrc00"
year = "2017"
month = "09"
date = "01"
time = "0800"

node_name = "{}{}{}{}".format(year,month,date,time)
zip_file_name = "bview.{}{}{}.{}.gz".format(year,month,date,time)
filename = "bview.{}{}{}.{}".format(year,month,date,time)

remote_url = server_url + "/" + rrc_name + "/" + year + "." + month + "/" + zip_file_name
print(remote_url)

https://data.ris.ripe.net/rrc00/2017.09/bview.20170901.0800.gz


In [12]:
r = requests.get(remote_url, allow_redirects=True)

if(r.status_code==200):

    ## Removing the zip file if it exists
    if os.path.isfile(zip_file_name):
        os.remove(zip_file_name)

    ## Removing content file if it exists
    if os.path.isfile(filename):
        os.remove(filename)

    ## Obtaining the zip file
    with open(zip_file_name, 'wb') as f:
        f.write(r.content)

    ## Obtaining the content file
    with gzip.open(zip_file_name, 'rb') as f_in:
        with open(filename, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

    ## Removing the zip file after extraction
    if os.path.isfile(zip_file_name):
        os.remove(zip_file_name)
else:
    print('File does not exist')

In [13]:
check_list = []
check_json = []

with driver.session() as session:
    i = 0
    for entry in Reader(filename):


        curr_json = entry.data
        #check_json.append(curr_json)

        if list(curr_json['subtype'].values())[0]== "RIB_IPV4_UNICAST":

            if 'rib_entries' in curr_json.keys():

                entry_count = curr_json.get('entry_count',0)
                for j in range(entry_count):
                    curr_list = curr_json['rib_entries'][j]['path_attributes'][1]['value'][0]['value']
                    #check_list.append(curr_list)

                    N = len(curr_list)
                    for k in range(N-1):
                      if(curr_list[k+1]!=curr_list[k]): ## Avoiding same node cycles
                        session.write_transaction(create_graph, curr_list[k+1], curr_list[k], node_name)
            else:
                curr_list = curr_json['path_attributes'][1]['value'][0]['value']
                #check_list.append(curr_list)

                N = len(curr_list)
                for k in range(N-1):
                  if(curr_list[k+1]!=curr_list[k]):
                    session.write_transaction(create_graph, curr_list[k+1], curr_list[k], node_name)


            i += 1

        if(i==1):
            break

## Removing content file after reading
if os.path.isfile(filename):
    os.remove(filename)

  session.write_transaction(create_graph, curr_list[k+1], curr_list[k])


# Add Attributes

In [None]:
# Parse ASN report from url
from bs4 import BeautifulSoup
import pandas as pd
import re
from tqdm import tqdm

In [11]:
def fetch_all_ids(driver):
    """
    Fetch all node IDs from a Neo4j database.

    This function queries the database to retrieve all IDs of the nodes
    and prepends 'AS' to each ID before returning them in a list.

    Args:
    driver: A Neo4j driver instance used to connect to the database.

    Returns:
    A list of strings, each representing a node ID with 'AS' prepended.
    """
    with driver.session() as session:
        result = session.run("MATCH (n) RETURN n.id AS id")
        return ["AS" + record["id"] for record in result]

# Fetch and print all IDs
try:
    ids = fetch_all_ids(driver)
    ids = list(set(ids))
except:
    print('Cannot fetch the ids from graphdatabase')

In [None]:
def clean_as_number(as_number):
    return re.sub(r'\D', '', as_number) if as_number else None

# URL from which to scrape content
url = 'https://bgp.potaroo.net/cidr/autnums.html'

# Send a GET request to the URL
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the <pre> tag
    pre_tag = soup.find('pre')
    if pre_tag:
        # Find all <a> tags within the <pre> tag
        a_tags = pre_tag.find_all('a')

        # List to store each row of data
        data = []

        # Extract the href attribute and text from each <a> tag
        for a_tag in a_tags:
            href = a_tag.get('href')
            text = a_tag.get_text(strip=True)
            sibling_text = a_tag.next_sibling

            # Append as a tuple to the data list
            data.append((href, text, sibling_text))

        # Create a DataFrame
        df_url = pd.DataFrame(data, columns=['Link', 'Text', 'Sibling Text'])[1:]

        # Optionally, save the DataFrame to a file, e.g., CSV
        # df_url.to_csv('output.csv', index=False)

    else:
        print("<pre> tag not found in the HTML.")

else:
    print("Failed to retrieve the webpage")


In [None]:
base_url = 'https://bgp.potaroo.net'
# Check the raw parsing data
df_filter = df_url[df_url['Text'].isin(ids)]

dic = {}
for url in df_filter['Link']:
    cur_url = base_url + url

    response = requests.get(cur_url)

    # Check if the request was successful
    if response.status_code == 200:
    # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')

        ul_tags = soup.find_all('ul')
        text = ""
        for ul in ul_tags:
          text += ul.get_text()
    # Regular expressions to extract the data
    as_number_pattern = r"ASNumber:\s+(\d+)|aut-num:\s+AS(\d+)"
    as_name_pattern = r"ASName:\s+(.+)|as-name:\s+(.+)"
    org_name_pattern = r"OrgName:\s+(.+)|org-name:\s+(.+)"
    country_pattern = r"(?i)Country:\s+(.+)"
    city_pattern = r"City:\s+(.+)"
    state_pattern = r"StateProv:\s+(.+)|State:\s+(.+)"
    adjacent_asn_pattern = r"Upstream Adjacent AS list\n(.+)"
    upstream_pattern = r"Upstream:\s+(\d+)"
    downstream_pattern = r"Downstream:\s+(\d+)"
    rank_pattern = r"Rank\s+AS\s+Type\s+Originate Addr Space\s+\(pfx\)\s+Transit Addr space\s+\(pfx\)\s+Description\n(\d+)"
    if re.search(as_number_pattern, text):
        # Extracting data using regex with checks
        as_number = re.search(as_number_pattern, text).group()
        as_number = clean_as_number(as_number)
        as_name = re.search(as_name_pattern, text).group(1) if re.search(as_name_pattern, text).group(1) else re.search(as_name_pattern, text).group(2)
        org_name_search = re.search(org_name_pattern, text)
        if org_name_search:
            org_name = org_name_search.group(1) if org_name_search.group(1) else org_name_search.group(2)
        else:
            org_name = None
        #org_name = re.search(org_name_pattern, text).group(1) if re.search(org_name_pattern, text).group(1) else re.search(org_name_pattern, text).group(2)
        country = re.search(country_pattern, text).group(1) if re.search(country_pattern, text) else None
        city = re.search(city_pattern, text).group(1) if re.search(city_pattern, text) else None
        state = re.search(state_pattern, text).group(1) if re.search(state_pattern, text) else None
        upstream = re.search(upstream_pattern, text).group(1) if re.search(upstream_pattern, text) else None
        downstream = re.search(downstream_pattern, text).group(1) if re.search(downstream_pattern, text) else None
        rank = re.search(rank_pattern, text).group(1) if re.search(rank_pattern, text) else None

        dic[as_number] = [as_name,org_name, country, city,state,upstream,downstream, rank]

data_tuples = [(key, *values) for key, values in dic.items()]
df = pd.DataFrame(data_tuples,columns=["ASN","as_name","org_name", "country", "city","state", "upstream","downstream", "rank"] )
# Save it into csv file
df.to_csv('data/attri.csv', index = False)

In [None]:
# Update the Neo4j Aura database
def update_node(driver, attributes):
    with driver.session() as session:
        session.run(
            """
            MATCH (n) WHERE n.id = $ASN
            SET n.as_name = $as_name, n.org_name = $org_name, n.country = $country,
                n.city = $city, n.state = $state,n.upstream = $upstream,
                n.downstream = $downstream, n.rank = $rank
            """,
            **attributes
        )

try:
    for _, row in df.iterrows():
        update_node(driver, row.to_dict())
finally:
    driver.close()

# Optional(Add BGP update Message to another GraphDatabase)


In [None]:
def download_gz_file(url, filename):
    """
    Download a .gz file from the given URL and save it as filename.
    """
    response = requests.get(url, stream=True)
    # Check if the response was successful
    if response.status_code == 200:
        # Save the file to disk
        with open(filename, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded: {filename}")
    else:
        # Print an error message if the download failed
        print(f"Failed to download. Status code: {response.status_code}")

In [None]:
import datetime

now = datetime.datetime.now()
today = datetime.date.today()
month = now.month
year = now.year
hour = now.hour

base_url = "https://data.ris.ripe.net/rrc11/"
url = base_url + str(year) + "." + str(month)
part1 = now.strftime("%Y%m%d")
part2 = now.strftime("%H00")
url += "/updates." + part1 + "." + part2 + ".gz"

download_gz_file(url, f'{today}.gz')

In [None]:
# unzip the file and saved the data in a list

with gzip.open(f'{today}.gz', 'rb') as f_in:
  with open(f'{today}.txt', 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

message_list = []
# Reader is from mrtparse package, it works as unstructure the mrt format. 
for entry in Reader(f'{today}.txt'):
    message_list.append(entry.data)

### Parse the data from the each message in List

There are four types of patterns: keepalive, update, open, notification

In [None]:
# Function to process each OrderedDict
def process_ordered_dict(tx, bgp_data,message_id):
    # Convert all keys to strings and check for 'bgp_message'
    bgp_data = {str(key): value for key, value in bgp_data.items()}
    if 'bgp_message' not in bgp_data:
        # Skip this row if 'bgp_message' is not present
        return

    local_as = bgp_data.get('local_as')
    peer_as = bgp_data.get('peer_as')
    timestamp = list(bgp_data['timestamp'].values())[0]

    bgp_message = bgp_data['bgp_message']
    message_type = list(bgp_message['type'].values())[0]
    message_length = bgp_message.get('length')

    # Create or merge Local AS and Peer AS nodes
    tx.run("MERGE (localAS:AS {id: $local_as})", local_as=local_as)
    tx.run("MERGE (peerAS:AS {id: $peer_as})", peer_as=peer_as)

    if message_type == 'UPDATE':
      withdrawn_routes_length = len(bgp_message.get('withdrawn_routes', []))
      nlri_length = len(bgp_message.get('nlri', []))
      is_withdrawal = withdrawn_routes_length > 0
      is_announcement = nlri_length > 0

      announcements_history = {}
      # Create relationship based on message type
      if is_withdrawal:
        withdrawn_list = [x['prefix'] for x in bgp_message['withdrawn_routes']]
        tx.run("""
              MATCH (localAS:AS {id: $local_as}), (peerAS:AS {id: $peer_as})
              CREATE (localAS)-[r:WITHDRAWS_ROUTE_TO {m_id: $message_id}]->(peerAS)
              SET r.timestamp = $timestamp, r.message_type = $message_type, r.message_length = $message_length, r.routes_length = $withdrawn_routes_length, r.withdrawn_list = $withdrawn_list
              """, local_as=local_as, peer_as=peer_as,timestamp=timestamp,
               message_type=message_type, message_length=message_length,
               withdrawn_routes_length=withdrawn_routes_length,message_id=message_id, withdrawn_list=withdrawn_list)
      if is_announcement:
        path_seq = list(bgp_message['path_attributes'][1]['value'][0]['value'])
        prefix = bgp_message['nlri'][0]['prefix']
        announcement_key = (local_as, peer_as, prefix)
        attributes = [str(x) for x in list(bgp_message.get('path_attributes',[]))]
        # Check for new, duplicate, or implicit withdrawal
        if announcement_key not in announcements_history:
            category = 'new_announcement'
        elif announcements_history[announcement_key] == attributes:
            category = 'duplicate_announcement'
        else:
            category = 'implicit_withdrawal'
        tx.run("""
              MATCH (localAS:AS {id: $local_as}), (peerAS:AS {id: $peer_as})
              CREATE (localAS)-[r:ANNOUNCES_ROUTE_TO {m_id: $message_id, category:$category}]->(peerAS)
              SET r.timestamp = $timestamp, r.message_type = $message_type, r.message_length = $message_length,
              r.routes_length = $nlri_length, r.path_seq = $path_seq
              """, local_as=local_as, peer_as=peer_as, timestamp=timestamp,
               message_type=message_type, message_length=message_length,
               nlri_length=nlri_length,message_id=message_id, path_seq=path_seq, category=category)
    elif message_type == 'OPEN':
      # Create relationship based on message type
      open_version = bgp_message.get('version')
      open_local_as = bgp_message.get('local_as')
      open_holdtime = bgp_message.get('holdtime')
      open_bgp_id = bgp_message.get('bgp_id')
      # Handle OPEN message
      tx.run("""
            MATCH (localAS:AS {id: $local_as}), (peerAS:AS {id: $peer_as})
            CREATE (localAS)-[r:OPENS_CONNECTION_TO {m_id: $message_id}]->(peerAS)
            SET r.timestamp = $timestamp, r.message_length = $message_length,
                r.version = $version, r.open_local_as = $open_local_as,
                r.holdtime = $holdtime, r.bgp_id = $bgp_id
            """, local_as=local_as, peer_as=peer_as, timestamp=timestamp,
                message_length=message_length, version=open_version,
                open_local_as=open_local_as, holdtime=open_holdtime, bgp_id=open_bgp_id,message_id=message_id)
    elif message_type == 'NOTIFICATION':
      # Handle NOTIFICATION message
      error_code = list(bgp_message['error_code'].values())[0]
      error_subcode = list(bgp_message.get('error_subcode'))[0]
      # Handle NOTIFICATION message
      tx.run("""
            MATCH (localAS:AS {id: $local_as}), (peerAS:AS {id: $peer_as})
            CREATE (localAS)-[r:SENDS_NOTIFICATION_TO {m_id: $message_id}]->(peerAS)
            SET r.timestamp = $timestamp, r.message_length = $message_length,
                r.error_code = $error_code, r.error_subcode = $error_subcode
            """, local_as=local_as, peer_as=peer_as, timestamp=timestamp,
                message_length=message_length, error_code=error_code,
                error_subcode=error_subcode,message_id=message_id)
    elif message_type == 'KEEPALIVE':
      # Handle KEEPALIVE message
      tx.run("""
            MATCH (localAS:AS {id: $local_as}), (peerAS:AS {id: $peer_as})
            CREATE (localAS)-[r:SENDS_KEEPALIVE_TO {m_id: $message_id}]->(peerAS)
            SET r.timestamp = $timestamp, r.message_length = $message_length
            """, local_as=local_as, peer_as=peer_as, timestamp=timestamp, message_length=message_length,message_id=message_id)
# Connect to Neo4j
driver = GraphDatabase.driver("neo4j+s://896675cf.databases.neo4j.io", auth=("neo4j", "vDR6onHDPNATBPsRKm4Crvk7EDSii3AuNTKxX_DZgdM"))

# Process each OrderedDict
with driver.session() as session:
  for i, bgp_dict in enumerate(tqdm(message_list, desc="Processing BGP Messages")):
        session.execute_write(process_ordered_dict, bgp_dict, message_id=i)
driver.close()