In [1]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import requests

In [2]:
df = pd.read_csv('SFLow_data_sample.csv.csv', names=['Type', 'SFlowAgentAddr', 'InputPort', 'OutputPort', 'srcMAC', 'dstMAC', 'EthernetType', 'InVLAN', 'OutVLAN', 'SrcIP', 'DstIP', 'IPProtocol', 'IpTOS', 'IpTTL', 'UDPSrcPort', 'UDPDstPort', 'TCPFlags', 'PacketSize', 'IPSize', 'SamplingRate'], index_col=False)
df

Unnamed: 0,Type,SFlowAgentAddr,InputPort,OutputPort,srcMAC,dstMAC,EthernetType,InVLAN,OutVLAN,SrcIP,DstIP,IPProtocol,IpTOS,IpTTL,UDPSrcPort,UDPDstPort,TCPFlags,PacketSize,IPSize,SamplingRate
0,FLOW,aa.aa.aa.aa,258,20,204e71cf1b0f,002438aa0a00,0x0800,3012.0,3012,152.3.219.19,198.71.44.98,50,0x00,245,0.0,0,0x00,1518,1496,2048
1,FLOW,aa.aa.aa.aa,258,20,204e71cf1b0f,002438aa0a00,0x0800,3012.0,3012,152.3.219.19,198.71.44.98,50,0x00,245,0.0,0,0x00,1518,1496,2048
2,FLOW,aa.aa.aa.aa,258,20,204e71cf1b0f,002438aa0a00,0x0800,3012.0,3012,152.3.219.19,198.71.44.98,50,0x00,245,0.0,0,0x00,1518,1496,2048
3,FLOW,aa.aa.aa.aa,258,20,204e71cf1b0f,002438aa0a00,0x0800,3012.0,3012,152.3.219.19,198.71.44.98,50,0x00,245,0.0,0,0x00,1518,1496,2048
4,FLOW,aa.aa.aa.aa,258,20,204e71cf1b0f,002438aa0a00,0x0800,3012.0,3012,152.3.219.19,198.71.44.98,50,0x00,245,0.0,0,0x00,1518,1496,2048
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1195,FLOW,aa.aa.aa.aa,19,21,e00eda07c0ff,28c0da0117ca,0x0800,102.0,2121,202.21.158.254,175.156.15.109,47,0x00,124,32665.0,443,0x10,1459,1437,2048
1196,FLOW,aa.aa.aa.aa,3,14,00235ed9b680,8cb64fe6b643,0x0800,691.0,691,123.136.64.58,165.21.83.88,17,0x00,121,42128.0,53,0x10,91,69,2048
1197,FLOW,aa.aa.aa.aa,3,14,00235ed9b680,8cb64fe6b643,0x0800,691.0,691,202.6.242.63,198.143.33.121,6,0x00,124,443.0,42215,0x10,1522,1500,2048
1198,FLOW,aa.aa.aa.aa,258,130,204e71cf1b0f,00239cd087c1,0x0800,537.0,919,171.67.76.38,137.132.3.10,6,0x00,56,80.0,2182,0x10,1522,1500,2048


# EXERCISE 4A: TOP TALKERS AND LISTENERS

In [3]:
import json

def get_org(ip):
    url = "https://ipapi.co/" + ip + "/json/"
    resp = requests.get(url).text

    try:
        jsonObj = json.loads(resp)
        return jsonObj['org']
    except:
        return ''

In [4]:
top_talkers = pd.DataFrame(df['SrcIP'].value_counts()).reset_index()
top_talkers.columns = ['SrcIP', 'SrcIP_Packet_Count']
top_talkers = top_talkers.nlargest(5, 'SrcIP_Packet_Count')
top_talkers['Organisation'] = top_talkers['SrcIP'].apply(get_org)
top_talkers

Unnamed: 0,SrcIP,SrcIP_Packet_Count,Organisation
0,152.3.219.19,126,DUKE-INTERCHANGE
1,207.241.228.157,66,INTERNET-ARCHIVE
2,130.14.250.13,63,NLM-GW
3,193.62.192.8,46,Jisc Services Limited
4,192.122.131.36,40,A-STAR


In [6]:
top_listeners = pd.DataFrame(df['DstIP'].value_counts()).reset_index()
top_listeners.columns = ['DstIP', 'DstIP_Packet_Count']
top_listeners = top_listeners.nlargest(5, 'DstIP_Packet_Count')
top_listeners['Organisation'] = top_talkers['SrcIP'].apply(get_org)
top_listeners

Unnamed: 0,DstIP,DstIP_Packet_Count,Organisation
0,198.71.44.98,126,DUKE-INTERCHANGE
1,103.37.198.100,98,INTERNET-ARCHIVE
2,210.48.222.9,66,NLM-GW
3,137.132.228.15,50,Jisc Services Limited
4,202.21.159.244,38,A-STAR


# EXERCISE 4B: TRANSPORT PROTOCOL

In [10]:
allProtocols = df.IPProtocol.unique()
allProtocols

array([ 50,   6,  17,   0,  47,  41,   1, 381], dtype=int64)

In [8]:
ip_protocol_dict = {
    50: "ESP",
    6: "TCP",
    17: "UDP",
    0: "HOPOPT",
    47: "GRE",
    41: "IPv6",
    1: "ICMP",
    381: "Reserved"
}

selected_protocols = ["TCP", "UDP"]

In [11]:
total_count = len(df)
for protocol in allProtocols:
    if ip_protocol_dict[protocol] in selected_protocols:
        print(ip_protocol_dict[protocol])
        packetsWithSameProtocol = df[df["IPProtocol"] == protocol]
        count = len(packetsWithSameProtocol)
        percentage = count / total_count * 100
        print("Count: ", count)
        print("Percentage: ", percentage)

TCP
Count:  879
Percentage:  73.25
UDP
Count:  135
Percentage:  11.25


# EXERCISE 4C: APPLICATIONS PROTOCOL

In [13]:
destination_df = pd.DataFrame(df["UDPDstPort"].value_counts()).reset_index()
destination_df.columns = ["Destination_IP_Port_Number", "Packet_Count"]
destination_df.sort_values(by=["Packet_Count"])
destination_df

Unnamed: 0,Destination_IP_Port_Number,Packet_Count
0,443,234
1,56152,82
2,0,82
3,43930,45
4,80,37
...,...,...
285,993,1
286,41133,1
287,59747,1
288,49317,1


# EXERCISE 4D: TRAFFIC

In [16]:
sampling_rate = 1/1000
total_size_bits = df['IPSize'].sum()
total_size_megabyte = total_size_bits / 1024 / 1024 / 8
total_traffic = total_size_megabyte / sampling_rate

print(f"Total Traffic: {total_traffic} MB")

Total Traffic: 128.7320852279663 MB


# EXERCISE 4E: ADDITIONAL ANALYSIS