# 3. Extract data from logs

### In data engineering, log files and log messages are very common. Sometimes you need to parse them to extract valuable information, for example when debugging.

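Read `data/network.log` and, for each log entry, extract the source IP, destination IP, protocol and number of bytes transferred. Then print the total bytes transferred per protocol.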

In [5]:
import re
from collections import defaultdict

# Matches one log entry and captures, in order: source IP, destination IP,
# protocol name and byte count. Every octet is `\d+`, so addresses whose last
# octet has more than one digit (e.g. 10.0.0.12) are matched as well. The
# ` .* ` between fields skips whatever separator text the log places there.
LOG_LINE_PATTERN = re.compile(
    r'Source: (\d+\.\d+\.\d+\.\d+) .* '
    r'Destination: (\d+\.\d+\.\d+\.\d+) .* '
    r'Protocol: (\w+) .* '
    r'Bytes: (\d+)'
)

# Total bytes transferred, keyed by protocol name. Rebuilt from scratch on
# every run of the cell, so re-running does not double-count.
protocol_totals = defaultdict(int)

# Read all log lines up front; the file is closed when the `with` block exits.
with open("data/network.log", "r") as file:
    lines = file.readlines()

# Process each line
for line in lines:
    match = LOG_LINE_PATTERN.search(line)
    if not match:
        # Lines that do not contain a complete transfer record are skipped.
        continue

    source_ip, destination_ip, protocol, byte_count = match.groups()
    bytes_transferred = int(byte_count)

    # Print each record in the desired format
    print(f"Source: {source_ip} | Destination: {destination_ip} | Protocol: {protocol} | Bytes: {bytes_transferred}")

    # Update totals for the protocol
    protocol_totals[protocol] += bytes_transferred

# Print summary
print("\nData Transfer summary:")
for protocol, total_bytes in protocol_totals.items():
    print(f"{protocol}: {total_bytes} bytes")


Source: 10.0.0.1 | Destination: 10.0.0.2 | Protocol: TCP | Bytes: 1024
Source: 10.0.0.2 | Destination: 10.0.0.3 | Protocol: UDP | Bytes: 2048
Source: 10.0.0.3 | Destination: 10.0.0.1 | Protocol: TCP | Bytes: 512

Data Transfer summary:
TCP: 1536 bytes
UDP: 2048 bytes
