In [1]:
import pandas as pd
from ipwhois import IPWhois

## Helper Functions

In [2]:
# Resolve the organisation that a given IP address is registered to
def get_organisation(ip_addr):
    """Return the registered network name for ``ip_addr`` using an RDAP lookup.

    Parameters
    ----------
    ip_addr
        Address to look up; handed unchanged to ``IPWhois``. In this notebook
        it is an address string taken from the ``src_IP`` / ``dst_IP`` columns.

    Returns
    -------
    The ``name`` entry of the ``network`` section of the lookup result, or
    ``None`` when the result has no ``network`` section or that section has
    no ``name``.

    Notes
    -----
    Every call performs its own lookup (nothing is cached here), and any
    exception raised by ``IPWhois`` or ``lookup_rdap`` propagates to the caller.
    """
    rdap_record = IPWhois(ip_addr).lookup_rdap()
    # Fall back to an empty mapping so a missing 'network' key yields None
    network_section = rdap_record.get('network', {})
    return network_section.get('name')

# Commonly used port numbers mapped to a human-readable service name.
# Defined once at module level so the table is not rebuilt on every call
# (get_service_name is invoked once per row through Series.apply).
_PORT_SERVICE_NAMES = {
    20: "FTP Data",
    21: "FTP Control",
    22: "SSH",
    23: "Telnet",
    25: "SMTP",
    53: "DNS",
    67: "DHCP Server",
    68: "DHCP Client",
    69: "TFTP",
    80: "HTTP",
    110: "POP3",
    123: "NTP",
    143: "IMAP",
    161: "SNMP",
    194: "IRC",
    389: "LDAP",
    443: "HTTPS",
    465: "SMTPS",
    514: "Syslog",
    515: "LPD",
    587: "SMTP (Submission)",
    636: "LDAPS",
    993: "IMAPS",
    995: "POP3S",
    1433: "MS SQL",
    1521: "Oracle",
    1723: "PPTP",
    3306: "MySQL",
    3389: "RDP",
    5060: "SIP",
    5432: "PostgreSQL",
    5900: "VNC",
    6379: "Redis",
    8080: "HTTP-Alt",
    8443: "HTTPS-Alt",
    8888: "Alternative HTTP",
    9000: "Custom/Development",
    25565: "Minecraft",
    27017: "MongoDB",
}

# Label returned for any port that is absent from the table above
_UNKNOWN_PORT_LABEL = "Dynamic/Unknown Port"


# Identify common ports
def get_service_name(port):
    """Return the service name conventionally associated with ``port``.

    Parameters
    ----------
    port
        Port number to look up. Any hashable value is accepted; a float that
        compares equal to a listed integer (e.g. ``443.0``) matches that entry
        because dict lookup uses hash/equality.

    Returns
    -------
    str
        The mapped service name, or ``"Dynamic/Unknown Port"`` when the port
        is not in the table (this includes ``None`` and NaN).
    """
    return _PORT_SERVICE_NAMES.get(port, _UNKNOWN_PORT_LABEL)

## Read in Data

In [3]:
# Names assigned, in order, to the 20 fields of each headerless CSV record.
# Passed as `names=` to pd.read_csv when the data file is loaded.
columns = [
    # record type and reporting agent
    "Type", "sflow_agent_address",
    # switch interfaces the sample was seen on
    "inputPort", "outputPort",
    # link layer: MAC addresses, EtherType, VLANs
    "src_MAC", "dst_MAC", "ethernet_type", "in_vlan", "out_vlan",
    # IP header fields
    "src_IP", "dst_IP", "IP_protocol", "ip_tos", "ip_ttl",
    # transport-layer fields
    "src_port", "dst_port", "tcp_flags",
    # sizes and sampling
    "packet_size", "IP_size", "sampling_rate",
]

In [4]:
# Load the headerless CSV, label its fields, and keep only the "FLOW" records
df = (
    pd.read_csv("Data_2.csv", header=None, names=columns)
    .loc[lambda records: records["Type"] == "FLOW"]
)
print(df.dtypes)  # dtypes as inferred by pandas for each column
df.head()  # preview the first rows

Type                    object
sflow_agent_address     object
inputPort                int64
outputPort               int64
src_MAC                 object
dst_MAC                 object
ethernet_type           object
in_vlan                float64
out_vlan                 int64
src_IP                  object
dst_IP                  object
IP_protocol              int64
ip_tos                  object
ip_ttl                   int64
src_port               float64
dst_port                 int64
tcp_flags               object
packet_size              int64
IP_size                  int64
sampling_rate            int64
dtype: object


Unnamed: 0,Type,sflow_agent_address,inputPort,outputPort,src_MAC,dst_MAC,ethernet_type,in_vlan,out_vlan,src_IP,dst_IP,IP_protocol,ip_tos,ip_ttl,src_port,dst_port,tcp_flags,packet_size,IP_size,sampling_rate
0,FLOW,203.30.38.251,193,130,0031466b23cf,00239cd087c1,0x0800,919.0,919,74.125.10.59,137.132.38.238,17,0x00,63,443.0,62758,0x00,1396,1378,2048
1,FLOW,203.30.38.251,193,130,0031466b23cf,00239cd087c1,0x0800,919.0,919,173.194.22.215,137.132.228.29,6,0x00,63,443.0,39740,0x10,1438,1420,2048
2,FLOW,203.30.38.251,199,131,544b8cf9a7df,001cb0c88e40,0x0800,600.0,43,193.62.193.9,202.130.56.153,6,0x00,246,80.0,35794,0x18,272,250,2048
3,FLOW,203.30.38.251,193,131,0031466b23cf,001cb0c88e40,0x0800,919.0,43,74.125.130.132,192.122.131.33,6,0x00,52,443.0,5287,0x10,1418,1400,2048
4,FLOW,203.30.38.251,129,193,00135f21bc80,0031466b23cf,0x0800,11.0,919,155.69.160.69,74.125.68.139,17,0x60,59,48361.0,443,0x00,89,67,2048


## Top 5 Talkers

In [5]:
# Rank source addresses by the number of flow records they appear in.
# Rows missing either endpoint address are excluded before counting.
top_talker = (
    df.dropna(subset=['src_IP', 'dst_IP'])
    .loc[:, 'src_IP']
    .value_counts()   # sorted by frequency, most frequent first
    .head(5)
    .reset_index()    # Series -> two-column DataFrame
)
top_talker.columns = ['src_IP', 'count']  # explicit names for both columns

# Attach the owning organisation (one lookup per listed address)
top_talker['organization'] = top_talker['src_IP'].apply(get_organisation)

In [6]:
# Display the ranked source addresses with their organisations
top_talker.head(n=5)

Unnamed: 0,src_IP,count,organization
0,13.107.4.50,5960,MSFT
1,130.14.250.7,4034,NLM-ETHER
2,155.69.160.38,3866,NTUNET1
3,171.67.77.19,2656,NETBLK-SUNET
4,155.69.199.255,2587,NTUNET1


## Top 5 Listeners

In [11]:
# Rank destination addresses by the number of flow records they appear in.
# Rows missing either endpoint address are excluded before counting.
# NOTE(review): value_counts tallies rows of `df`, so the "Number of Packets"
# column holds record counts — confirm the label is what is intended.
top_listener = (
    df.dropna(subset=['src_IP', 'dst_IP'])
    .loc[:, 'dst_IP']
    .value_counts()   # sorted by frequency, most frequent first
    .head(5)
    .reset_index()    # Series -> two-column DataFrame
)
top_listener.columns = ['dst_IP', 'Number of Packets']  # explicit column names

# Attach the owning organisation (one lookup per listed address)
top_listener['organization'] = top_listener['dst_IP'].apply(get_organisation)

In [12]:
# Display the ranked destination addresses with their organisations
top_listener.head(n=5)

Unnamed: 0,dst_IP,Number of Packets,organization
0,137.132.228.33,5908,NUSNET
1,192.122.131.36,4662,A-STAR-AS-AP
2,202.51.247.133,4288,NUSGP
3,137.132.228.29,4022,NUSNET
4,103.37.198.100,3741,A-STAR-AS-AP


## Top 5 Applications

In [9]:
# Rank destination ports by the number of flow records that use them.
# NOTE(review): only dst_port is counted; src_port is not considered here.
top_apps = (
    df['dst_port']
    .dropna()         # skip records without a destination port
    .value_counts()   # sorted by frequency, most frequent first
    .head(5)
    .reset_index()    # Series -> two-column DataFrame
)
top_apps.columns = ['dst_port', 'Number of Packets']  # explicit column names

# Translate each port number into a service label
top_apps["Service Name"] = top_apps['dst_port'].apply(get_service_name)

In [10]:
# Display the ranked destination ports with their service labels
top_apps.head(n=5)

Unnamed: 0,dst_port,count,Service Name
0,443,43208,HTTPS
1,80,11018,HTTP
2,50930,2450,Dynamic/Unknown Port
3,15000,2103,Dynamic/Unknown Port
4,8160,1354,Dynamic/Unknown Port


## Total Traffic

## Proportion of TCP and UDP packets

## Top 5 Communication Pairs

## Visualizing the communication between different IP hosts.