# Objective: 

The aim is to analyze network traffic to detect any anomalies or suspicious activities.

## Data: 

CICIDS 2017 dataset, which includes a wide range of attacks and normal traffic. It's available here:

https://www.unb.ca/cic/datasets/ids-2017.html

## Procedure: 

Acquire from https://www.unb.ca/cic/datasets/ids-2017.html.

Preprocess

Exploratory data analysis

Classification machine learning algorithms:
- Logistic Regression
- Random Forest
- Gradient Boosting
- XGBoost


## Resources:

### Websites:

https://www.studytonight.com/network-programming-in-python/analyzing-network-traffic
https://plainenglish.io/blog/network-traffic-analysis-with-python-f95ed4e76c28

#### pcap files into DataFrames:

https://www.automox.com/blog/visualizing-network-data-using-python-part-1

https://www.automox.com/blog/visualizing-network-data-using-python-part-2

https://www.automox.com/blog/visualizing-network-data-using-python-part-3

#### Network Traffic Visualization (Geolocation):
https://medium.com/vinsloev-academy/python-cybersecurity-network-tracking-using-wireshark-and-google-maps-2adf3e497a93

#### Examples for malware traffic analysis:
https://www.malware-traffic-analysis.net/2021/index.html

#### Specific indicator of compromise:
https://cylab.be/blog/245/network-traffic-analysis-with-python-scapy-and-some-machine-learning

### YouTube:
https://www.youtube.com/watch?v=oA7QhYOhW_0
https://www.youtube.com/watch?v=xuNuy8n8u-Y

### LinkedIn Learning:
https://www.linkedin.com/learning/applied-ai-for-it-operations-aiops/network-traffic-analysis

### Books:
https://www.techtarget.com/searchnetworking/feature/Learn-how-to-master-network-traffic-analysis-with-Python

Research paper:
https://www.scitepress.org/papers/2018/66398/66398.pdf

### Current Tasks
- Practise reading pcap files.
- Figure out a way to capture pcap files from the notebook.
- Identify indicators of compromise.
- Parallel processing?

## Code:

In [1]:
#Imports
from scapy.all import *
from scapy.utils import PcapReader
import os, psutil, statistics, multiprocessing
import pandas as pd
import numpy as np
from prepare import *
from analize import *


In [2]:
#Preferences
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: f'{x:.3f}')

In [3]:
#Get traffic data in CSV format:
load_data = False
if load_data == True:
    mon0 = fixing_col_names(pd.read_csv("csv_files/Monday-WorkingHours.pcap_ISCX.csv"))
    tues0 = fixing_col_names(pd.read_csv("csv_files/Tuesday-WorkingHours.pcap_ISCX.csv"))
    wed0 = fixing_col_names(pd.read_csv("csv_files/Wednesday-WorkingHours.pcap_ISCX.csv"))
    thur0 = fixing_col_names(pd.read_csv("csv_files/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv"))
    thur1 = fixing_col_names(pd.read_csv("csv_files/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv"))
    fri0 = fixing_col_names(pd.read_csv("csv_files/Friday-WorkingHours-Morning.pcap_ISCX.csv"))
    fri1 = fixing_col_names(pd.read_csv("csv_files/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv"))
    fri2 = fixing_col_names(pd.read_csv("csv_files/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv"))

In [4]:
#Measuring size of pcap files:
for file in os.listdir("pcap_files"):
    print(f"Pcap file size: {os.path.getsize(f'pcap_files//{file}')/1_000_000_000:.3f} GB")
#Monday has 11,709,971 packets and is 10.823 GB.
#Tuesday has 11,551,954 packets and is 11.048 GB
#Wednesday has 13,788,878 packets and is 13.421 GB.
#Thursday has 9,322,025 packets and is 8.3 GB.
#Friday has 9,997,874 packets and is 8.839 GB.
#test0 has 67036 packets and is 47256430 bites
#System memory:
print(f"Available memory: {psutil.virtual_memory()[1]/1_000_000_000:.3f} GB")

Pcap file size: 8.839 GB
Pcap file size: 10.823 GB
Pcap file size: 0.047 GB
Pcap file size: 8.303 GB
Pcap file size: 11.048 GB
Pcap file size: 13.421 GB
Available memory: 28.187 GB


In [5]:
#`PcapReader()` iterates though the pcap entries. Creates generator object.
# You must create a new generator object everytime it is used.
#`rdpcap()` loads all pcap entries in memory at once. Takes a very long time and can fill up memory.

In [6]:
print('Physical Cores=', psutil.cpu_count(logical=False))
print('Logical Cores=',  psutil.cpu_count(logical=True))

Physical Cores= 6
Logical Cores= 12


In [7]:
#Generator object for pcap file:
pcap_file = "pcap_files/Friday-WorkingHours.pcap"

In [8]:
i = 0
for item in PcapReader(pcap_file):
    i += 1
print(i)

9997874


In [9]:
#file_size and chunk_size are not being measured with the same units...
file_size = os.path.getsize(pcap_file)

In [10]:
if (os.path.getsize(pcap_file)/1_000_000_000) > 4:
    chunk_size = 2_500_000
else:
    chunk_size = 5_000_000

In [11]:
print(chunk_size)

2500000


In [12]:
file_size

8839309056

In [13]:
print(chunk_size)

2500000


In [14]:
pool = multiprocessing.Pool(4)

In [15]:
#Breaking data into chunks and applying function.
chunks = []
chunk = []
for packet in PcapReader(pcap_file):
    chunk.append(packet)
    if len(chunk) >= chunk_size:
        print(len(chunk))
        chunks.append(chunk)
        chunk = []
if chunk:
    chunks.append(chunk)
print(len(chunk))
print(len(chunks))
print(f"Available memory: {psutil.virtual_memory()[1]/1_000_000_000:.3f} GB")
results = pool.map(get_ioc_counts0, chunks)

KeyboardInterrupt: 

In [None]:
pool.close()
pool.join()

In [None]:
"""
Returning a list of three items (dns_counts, ip_counts, seq_counts).
Verified that the returned lists done in parallel are the same as if they were done in series.
"""
dns_counts = results[0][0]
ip_counts = results[0][1]
seq_counts = results[0][2]

In [None]:
print(f"DNS counts: {len(dns_counts)}")
print(f"IP counts: {len(ip_counts)}")
print(f"SEQ counts: {len(seq_counts)}")

In [None]:
#Establishes threshhold to flag packets
dns_threshold = set_threshold(dns_counts)
ip_threshold = set_threshold(ip_counts)
seq_threshold = set_threshold(seq_counts, sigma_value=4)

In [None]:
#Returns sets of packet attributes that exceed threshold
sus_dns = eval_packets(dns_threshold, dns_counts)
sus_ip = eval_packets(ip_threshold, ip_counts)
sus_seq = eval_packets(seq_threshold, seq_counts)

In [None]:
sus_packets = []

In [None]:
j = 0

for packet in PcapReader(pcap_file):
    #Add section to collect suspicious DNS packets
    
    #Add section to collect suspicious IP packets

    #Collect packets with suspicious TCP sequence numbers:
    if packet.haslayer(TCP) and (int(packet[TCP].seq) in sus_seq):
        sus_packets.append(packet)

    #Progress updates:
    j += 1
    if j % 1_000_000 == 0:
        print(f"{j} packets processed")

In [None]:
sus_df = packets_to_dataframe(sus_packets)

In [None]:
sus_df.iloc[0]

In [None]:
sus_df.Src_MAC.value_counts()

In [None]:
sus_df.Dst_MAC.value_counts()

In [None]:
sus_df.Src_IP.value_counts()

In [None]:
sus_df.Dst_IP.value_counts()

In [None]:
#View traffic from csv files
#mon0.head()
#tues0.head()
#wed0.head()
#thur0.head()
#thur1.head()
#fri0.head()
#fri1.head()
#fri2.head()

In [None]:
#tues0[tues0["Label"] == "FTP-Patator"].describe().T

In [None]:
#mon0["Label"].value_counts()

In [None]:
#tues0["Label"].value_counts()

In [None]:
#wed0["Label"].value_counts()

In [None]:
#thur0["Label"].value_counts()

In [None]:
#thur1["Label"].value_counts()

In [None]:
#fri0["Label"].value_counts()

In [None]:
#fri1["Label"].value_counts()

In [None]:
#fri2["Label"].value_counts()