## Network Data Analysis for Cybersecurity Threat Detection with Pandas

### This project centers on analyzing network traffic data to identify attacks and assess their severity.

## Installing and importing pandas

In [1]:
# Import pandas
import pandas as pd

In [2]:
# Load the dataset from the host site (GitHub) and assigning it to a dataframe (df)
df = pd.read_csv('https://raw.githubusercontent.com/ritaafrica/data/refs/heads/main/network_traffic_data.csv')

In [3]:
# Displaying the first five rows
df.head()

Unnamed: 0,Timestamp,Source_IP,Destination_IP,Protocol,Port,Bytes_Sent,Bytes_Received,Status,Threat_Level
0,2025-03-19 13:04:10,10.0.0.15,192.168.1.20,TCP,,5411,8989,Blocked,Low
1,2025-03-19 13:03:40,192.168.1.13,172.217.169.46,ICMP,443.0,4999,11808,Allowed,Medium
2,2025-03-19 13:03:10,10.0.0.5,203.0.113.99,HTTP,443.0,6360,10852,Allowed,Medium
3,2025-03-19 13:02:40,10.0.0.9,192.168.1.20,TCP,,4011,14314,Blocked,Low
4,2025-03-19 13:02:10,192.168.1.4,172.217.169.46,FTP,,5254,8718,Blocked,Medium


In [4]:
# Analyzing the structure of the dataset (number of rows and columns)
df.shape

(1000, 9)

In [5]:
# Extracting column headers
df.columns

Index(['Timestamp', 'Source_IP', 'Destination_IP', 'Protocol', 'Port',
       'Bytes_Sent', 'Bytes_Received', 'Status', 'Threat_Level'],
      dtype='object')

In [6]:
# Displaying basic info about the dataset
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Timestamp       1000 non-null   object 
 1   Source_IP       1000 non-null   object 
 2   Destination_IP  1000 non-null   object 
 3   Protocol        1000 non-null   object 
 4   Port            874 non-null    float64
 5   Bytes_Sent      1000 non-null   int64  
 6   Bytes_Received  1000 non-null   int64  
 7   Status          1000 non-null   object 
 8   Threat_Level    1000 non-null   object 
dtypes: float64(1), int64(2), object(6)
memory usage: 70.4+ KB
None


In [7]:
# Display the last five rows
df.tail()

Unnamed: 0,Timestamp,Source_IP,Destination_IP,Protocol,Port,Bytes_Sent,Bytes_Received,Status,Threat_Level
995,2025-03-19 04:46:40,10.0.0.46,172.217.169.46,DNS,53.0,2290,6246,Blocked,Low
996,2025-03-19 04:46:10,10.0.0.3,8.8.8.8,HTTP,443.0,3470,12474,Allowed,Low
997,2025-03-19 04:45:40,10.0.0.3,192.168.1.20,UDP,21.0,6655,13170,Blocked,Low
998,2025-03-19 04:45:10,192.168.1.30,172.217.169.46,DNS,,7308,13117,Blocked,Low
999,2025-03-19 04:44:40,192.168.1.34,8.8.8.8,ICMP,3389.0,726,279,Allowed,Low


In [8]:
# Displaying summary statistics
df.describe()

Unnamed: 0,Port,Bytes_Sent,Bytes_Received
count,874.0,1000.0,1000.0
mean,1819.73913,5143.572,7562.659
std,2899.374632,2808.256143,4240.206295
min,21.0,106.0,102.0
25%,22.0,2857.0,4025.5
50%,80.0,5224.0,7584.5
75%,3389.0,7487.75,11147.75
max,8080.0,9984.0,14977.0


In [9]:
# Extracting important columns: Timestamp, Source_IP, Destination_IP, and Status
selected_columns = df[["Timestamp", "Source_IP", "Destination_IP", "Status"]]

In [10]:
# Dispalying the selected columns
selected_columns

Unnamed: 0,Timestamp,Source_IP,Destination_IP,Status
0,2025-03-19 13:04:10,10.0.0.15,192.168.1.20,Blocked
1,2025-03-19 13:03:40,192.168.1.13,172.217.169.46,Allowed
2,2025-03-19 13:03:10,10.0.0.5,203.0.113.99,Allowed
3,2025-03-19 13:02:40,10.0.0.9,192.168.1.20,Blocked
4,2025-03-19 13:02:10,192.168.1.4,172.217.169.46,Blocked
...,...,...,...,...
995,2025-03-19 04:46:40,10.0.0.46,172.217.169.46,Blocked
996,2025-03-19 04:46:10,10.0.0.3,8.8.8.8,Allowed
997,2025-03-19 04:45:40,10.0.0.3,192.168.1.20,Blocked
998,2025-03-19 04:45:10,192.168.1.30,172.217.169.46,Blocked


In [11]:
# Display the first five rows
selected_columns.head()

Unnamed: 0,Timestamp,Source_IP,Destination_IP,Status
0,2025-03-19 13:04:10,10.0.0.15,192.168.1.20,Blocked
1,2025-03-19 13:03:40,192.168.1.13,172.217.169.46,Allowed
2,2025-03-19 13:03:10,10.0.0.5,203.0.113.99,Allowed
3,2025-03-19 13:02:40,10.0.0.9,192.168.1.20,Blocked
4,2025-03-19 13:02:10,192.168.1.4,172.217.169.46,Blocked


In [12]:
# Selecting and displaying specific columns: Source_IP, Destination_IP, and Threat_Level
df[["Source_IP", "Destination_IP", "Threat_Level"]]

Unnamed: 0,Source_IP,Destination_IP,Threat_Level
0,10.0.0.15,192.168.1.20,Low
1,192.168.1.13,172.217.169.46,Medium
2,10.0.0.5,203.0.113.99,Medium
3,10.0.0.9,192.168.1.20,Low
4,192.168.1.4,172.217.169.46,Medium
...,...,...,...
995,10.0.0.46,172.217.169.46,Low
996,10.0.0.3,8.8.8.8,Low
997,10.0.0.3,192.168.1.20,Low
998,192.168.1.30,172.217.169.46,Low


In [13]:
# Extracting specific columns (Timestamp, Source_IP, Destination_IP, Status) and storing them in 'sd'
sd = df[["Timestamp", "Source_IP", "Destination_IP", "Status"]]

In [14]:
# Converting the "Source_IP" column to a pandas Series
source_ip_series = df["Source_IP"]

In [15]:
# Displaying the content of the 'source_ip_series' variable
source_ip_series

0         10.0.0.15
1      192.168.1.13
2          10.0.0.5
3          10.0.0.9
4       192.168.1.4
           ...     
995       10.0.0.46
996        10.0.0.3
997        10.0.0.3
998    192.168.1.30
999    192.168.1.34
Name: Source_IP, Length: 1000, dtype: object

## Selecting and filtering data from the Network Traffic Database

In [16]:
# Selecting blocked traffic
blocked_traffic = df[df["Status"] == "Blocked"]

In [17]:
# Extracting essential columns for analysis and storing them in 'blocked_summary'
blocked_summary = blocked_traffic[["Timestamp", "Source_IP", "Destination_IP", "Threat_Level", "Status"]]

In [18]:
# Displaying the first five rows of the blocked_summary DataFrame
blocked_summary.head()

Unnamed: 0,Timestamp,Source_IP,Destination_IP,Threat_Level,Status
0,2025-03-19 13:04:10,10.0.0.15,192.168.1.20,Low,Blocked
3,2025-03-19 13:02:40,10.0.0.9,192.168.1.20,Low,Blocked
4,2025-03-19 13:02:10,192.168.1.4,172.217.169.46,Medium,Blocked
9,2025-03-19 12:59:40,10.0.0.43,10.0.0.5,Low,Blocked
10,2025-03-19 12:59:10,10.0.0.33,203.0.113.99,Medium,Blocked


In [19]:
# Filtering the dataset to include only rows where the Status is "Blocked"
blocked_traffic = df[df['Status'] == "Blocked"]

In [20]:
# Displaying the blocked traffic
blocked_traffic

Unnamed: 0,Timestamp,Source_IP,Destination_IP,Protocol,Port,Bytes_Sent,Bytes_Received,Status,Threat_Level
0,2025-03-19 13:04:10,10.0.0.15,192.168.1.20,TCP,,5411,8989,Blocked,Low
3,2025-03-19 13:02:40,10.0.0.9,192.168.1.20,TCP,,4011,14314,Blocked,Low
4,2025-03-19 13:02:10,192.168.1.4,172.217.169.46,FTP,,5254,8718,Blocked,Medium
9,2025-03-19 12:59:40,10.0.0.43,10.0.0.5,ICMP,3389.0,3305,6621,Blocked,Low
10,2025-03-19 12:59:10,10.0.0.33,203.0.113.99,UDP,3389.0,3700,11297,Blocked,Medium
...,...,...,...,...,...,...,...,...,...
992,2025-03-19 04:48:10,10.0.0.11,203.0.113.99,HTTP,,2839,2939,Blocked,Medium
993,2025-03-19 04:47:40,192.168.1.39,192.168.1.20,ICMP,22.0,4178,8307,Blocked,Low
995,2025-03-19 04:46:40,10.0.0.46,172.217.169.46,DNS,53.0,2290,6246,Blocked,Low
997,2025-03-19 04:45:40,10.0.0.3,192.168.1.20,UDP,21.0,6655,13170,Blocked,Low


In [21]:
# Displaying the number of rows and columns of the blocked traffic DataFrame 
blocked_traffic.shape

(532, 9)

In [22]:
# Filtering the blocked_traffic DataFrame to include key columns for analysis: Timestamp, Source_IP, Destination_IP, and Threat_Level
blocked_traffic = blocked_traffic[["Timestamp", "Source_IP", "Destination_IP", "Threat_Level"]]

In [23]:
# Filtering the dataset to include only rows with a Threat_Level of "High"
blocked_traffic = df[df["Threat_Level"] == "High"]

In [24]:
# Displaying the blocked_traffic
blocked_traffic

Unnamed: 0,Timestamp,Source_IP,Destination_IP,Protocol,Port,Bytes_Sent,Bytes_Received,Status,Threat_Level
6,2025-03-19 13:01:10,10.0.0.26,10.0.0.5,ICMP,53.0,3431,2826,Allowed,High
19,2025-03-19 12:54:40,10.0.0.30,203.0.113.99,FTP,80.0,324,7694,Blocked,High
28,2025-03-19 12:50:10,192.168.1.12,8.8.8.8,HTTP,443.0,3757,11559,Allowed,High
35,2025-03-19 12:46:40,10.0.0.18,203.0.113.99,FTP,3389.0,3543,192,Blocked,High
44,2025-03-19 12:42:10,10.0.0.8,192.168.1.20,DNS,3389.0,7157,14676,Allowed,High
...,...,...,...,...,...,...,...,...,...
927,2025-03-19 05:20:40,10.0.0.8,203.0.113.99,TCP,8080.0,3821,14694,Blocked,High
932,2025-03-19 05:18:10,10.0.0.35,192.168.1.20,TCP,,7858,8169,Blocked,High
957,2025-03-19 05:05:40,10.0.0.3,10.0.0.5,HTTP,53.0,4995,7655,Blocked,High
977,2025-03-19 04:55:40,10.0.0.45,8.8.8.8,HTTP,,9221,11194,Blocked,High


In [25]:
# Filtering out suspicious traffic with the Threat Level as critical
high_risk_traffic = df[df["Threat_Level"] == "Critical"].head()

In [26]:
# Dispalying the high_risk_traffic DataFrame 
high_risk_traffic

Unnamed: 0,Timestamp,Source_IP,Destination_IP,Protocol,Port,Bytes_Sent,Bytes_Received,Status,Threat_Level
59,2025-03-19 12:34:40,10.0.0.47,192.168.1.20,ICMP,,5885,463,Allowed,Critical
96,2025-03-19 12:16:10,192.168.1.35,203.0.113.99,FTP,8080.0,9371,7189,Allowed,Critical
134,2025-03-19 11:57:10,192.168.1.17,172.217.169.46,DNS,22.0,6714,13124,Blocked,Critical
150,2025-03-19 11:49:10,192.168.1.42,10.0.0.5,HTTP,53.0,2702,634,Allowed,Critical
209,2025-03-19 11:19:40,10.0.0.17,203.0.113.99,TCP,3389.0,5085,10014,Blocked,Critical


In [27]:
# Filtering the dataset to extract rows where Bytes_Sent is greater than 5000
high_data_transfer = df[df["Bytes_Sent"] > 5000]

In [28]:
# Displaying the result
high_data_transfer

Unnamed: 0,Timestamp,Source_IP,Destination_IP,Protocol,Port,Bytes_Sent,Bytes_Received,Status,Threat_Level
0,2025-03-19 13:04:10,10.0.0.15,192.168.1.20,TCP,,5411,8989,Blocked,Low
2,2025-03-19 13:03:10,10.0.0.5,203.0.113.99,HTTP,443.0,6360,10852,Allowed,Medium
4,2025-03-19 13:02:10,192.168.1.4,172.217.169.46,FTP,,5254,8718,Blocked,Medium
5,2025-03-19 13:01:40,10.0.0.43,172.217.169.46,DNS,53.0,6915,12981,Allowed,Low
7,2025-03-19 13:00:40,192.168.1.36,192.168.1.20,TCP,21.0,5655,119,Allowed,Medium
...,...,...,...,...,...,...,...,...,...
986,2025-03-19 04:51:10,192.168.1.8,203.0.113.99,FTP,3389.0,7565,1259,Blocked,Medium
988,2025-03-19 04:50:10,192.168.1.48,10.0.0.5,DNS,80.0,8548,13088,Allowed,High
990,2025-03-19 04:49:10,192.168.1.8,10.0.0.5,DNS,22.0,7759,5876,Allowed,Low
997,2025-03-19 04:45:40,10.0.0.3,192.168.1.20,UDP,21.0,6655,13170,Blocked,Low


## Splitting the dataset into X (Features) and y (Target)

In [29]:
# Extracting features (X) while excluding the target (y)
X = df.drop(columns = ["Threat_Level"])  # The .drop() removes specified rows or columns for this case (Threat_Level)

In [30]:
# Displaying the result of the Feature (X)
X

Unnamed: 0,Timestamp,Source_IP,Destination_IP,Protocol,Port,Bytes_Sent,Bytes_Received,Status
0,2025-03-19 13:04:10,10.0.0.15,192.168.1.20,TCP,,5411,8989,Blocked
1,2025-03-19 13:03:40,192.168.1.13,172.217.169.46,ICMP,443.0,4999,11808,Allowed
2,2025-03-19 13:03:10,10.0.0.5,203.0.113.99,HTTP,443.0,6360,10852,Allowed
3,2025-03-19 13:02:40,10.0.0.9,192.168.1.20,TCP,,4011,14314,Blocked
4,2025-03-19 13:02:10,192.168.1.4,172.217.169.46,FTP,,5254,8718,Blocked
...,...,...,...,...,...,...,...,...
995,2025-03-19 04:46:40,10.0.0.46,172.217.169.46,DNS,53.0,2290,6246,Blocked
996,2025-03-19 04:46:10,10.0.0.3,8.8.8.8,HTTP,443.0,3470,12474,Allowed
997,2025-03-19 04:45:40,10.0.0.3,192.168.1.20,UDP,21.0,6655,13170,Blocked
998,2025-03-19 04:45:10,192.168.1.30,172.217.169.46,DNS,,7308,13117,Blocked


In [31]:
# Extracting the target (y)
y = df["Threat_Level"]

In [32]:
# Displaying the result of the Target (y)
y

0         Low
1      Medium
2      Medium
3         Low
4      Medium
        ...  
995       Low
996       Low
997       Low
998       Low
999       Low
Name: Threat_Level, Length: 1000, dtype: object

## Removing a column

In [33]:
# Removing the "Bytes_Sent" column from the DataFrame
df = df.drop(columns = ["Bytes_Sent"])

In [34]:
# Displaying the first five rows of the DataFrame to verify its structure and contents
df.head()

Unnamed: 0,Timestamp,Source_IP,Destination_IP,Protocol,Port,Bytes_Received,Status,Threat_Level
0,2025-03-19 13:04:10,10.0.0.15,192.168.1.20,TCP,,8989,Blocked,Low
1,2025-03-19 13:03:40,192.168.1.13,172.217.169.46,ICMP,443.0,11808,Allowed,Medium
2,2025-03-19 13:03:10,10.0.0.5,203.0.113.99,HTTP,443.0,10852,Allowed,Medium
3,2025-03-19 13:02:40,10.0.0.9,192.168.1.20,TCP,,14314,Blocked,Low
4,2025-03-19 13:02:10,192.168.1.4,172.217.169.46,FTP,,8718,Blocked,Medium
