# Exploratory Data Analysis - Cybersecurity Attacks Dataset

## Overview
This notebook provides a comprehensive exploratory data analysis of the cybersecurity attacks dataset.

## Objectives
1. Understand the dataset structure and characteristics
2. Identify missing values and data quality issues
3. Explore distributions of variables
4. Identify patterns and relationships
5. Generate insights for further analysis


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import missingno as msno
from datetime import datetime
import os

# Silence library warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')

# Plot styling shared by every figure in the notebook.
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 10

# Show every column and up to 100 rows when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# The analysis cells save their figures under this directory, and
# plt.savefig does not create missing directories, so make sure it exists.
os.makedirs('../../visualizations', exist_ok=True)


## 1. Data Loading


In [None]:
# Read the raw CSV into the working DataFrame
df = pd.read_csv('../../data/Cybersecurity_attacks.csv')

# Report the dimensions of what was loaded
print(f"Dataset Shape: {df.shape}")
print(f"Number of rows: {len(df):,}")
print(f"Number of columns: {len(df.columns)}")

# Preview the first ten records (rendered as the cell's output)
df.head(10)


## 2. Data Overview and Missing Values Analysis


In [None]:
# Per-column null counts and their share of the total number of rows
missing_values = df.isna().sum()
missing_percentage = missing_values.div(len(df)).mul(100)

missing_df = pd.DataFrame({
    'Column': missing_values.index,
    'Missing Count': missing_values.values,
    'Missing Percentage': missing_percentage.values,
})

# Keep only the columns that actually contain gaps, worst offenders first
missing_df = missing_df[missing_df['Missing Count'].gt(0)]
missing_df = missing_df.sort_values(by='Missing Count', ascending=False)
print("Missing Values Summary:")
print(missing_df)

# Draw and save the missingno matrix only when something is missing
if not missing_df.empty:
    msno.matrix(df)
    plt.title('Missing Values Matrix')
    plt.tight_layout()
    plt.savefig('../../visualizations/missing_values_matrix.png', dpi=300, bbox_inches='tight')
    plt.show()


## 3. Data Cleaning and Preprocessing


In [None]:
# Clean column names: stray whitespace in the CSV header would make the
# column-name checks used throughout the notebook miss those columns.
df.columns = df.columns.str.strip()

# Remove the '.' column if it exists
if '.' in df.columns:
    df = df.drop(columns=['.'])

# Clean Attack category (strip surrounding whitespace from the labels)
if 'Attack category' in df.columns:
    df['Attack category'] = df['Attack category'].str.strip()

# Parse Time column
if 'Time' in df.columns:
    def parse_time(time_str):
        """Split a raw ``Time`` cell into integer ``(start, end)`` values.

        Args:
            time_str: Raw cell value; either a single integer-like value or
                two integers joined by ``-`` (``"start-end"``).

        Returns:
            A ``(start, end)`` tuple of ints. A single value is used for both
            elements. ``(None, None)`` is returned for missing values and for
            anything that cannot be parsed.
        """
        if pd.isna(time_str):
            return None, None
        text = str(time_str)
        try:
            if '-' in text:
                start, end = text.split('-')
                return int(start), int(end)
            moment = int(time_str)
            return moment, moment
        except (ValueError, TypeError, OverflowError):
            # ValueError: non-numeric text or a wrong number of '-' parts;
            # TypeError: a value int() cannot convert;
            # OverflowError: an infinite float.
            # Anything else (e.g. KeyboardInterrupt) is left to propagate.
            return None, None

    time_parsed = df['Time'].apply(parse_time)

    # pd.to_numeric leaves integer data untouched and turns None into NaN, so
    # the subtraction below still works when no value could be parsed (an
    # all-None column would otherwise be object dtype and raise TypeError).
    df['Time_Start'] = pd.to_numeric(
        pd.Series([t[0] for t in time_parsed], index=df.index), errors='coerce'
    )
    df['Time_End'] = pd.to_numeric(
        pd.Series([t[1] for t in time_parsed], index=df.index), errors='coerce'
    )
    df['Time_Duration'] = df['Time_End'] - df['Time_Start']

    # Convert to datetime; the values are interpreted as seconds (unit='s')
    # and unparseable entries become NaT instead of raising.
    df['Datetime_Start'] = pd.to_datetime(df['Time_Start'], unit='s', errors='coerce')
    df['Datetime_End'] = pd.to_datetime(df['Time_End'], unit='s', errors='coerce')

    # Extract temporal features from the start timestamp
    df['Date'] = df['Datetime_Start'].dt.date
    df['Hour'] = df['Datetime_Start'].dt.hour
    df['DayOfWeek'] = df['Datetime_Start'].dt.day_name()
    df['Month'] = df['Datetime_Start'].dt.month

    print("Time column parsed successfully!")
    print(f"Date range: {df['Datetime_Start'].min()} to {df['Datetime_Start'].max()}")


## 4. Categorical Variables Analysis


In [None]:
# Attack category frequencies: printed table, bar chart, and top-10 pie chart
if 'Attack category' in df.columns:
    attack_category_counts = df['Attack category'].value_counts()

    print("Attack Category Distribution:")
    print(attack_category_counts)
    # value_counts() and nunique() both ignore missing values, so the number
    # of rows in the counts table equals the number of unique categories.
    print(f"\nTotal unique categories: {len(attack_category_counts)}")

    # Bar chart of every category
    plt.figure(figsize=(14, 8))
    attack_category_counts.plot(kind='bar')
    plt.xlabel('Attack Category', fontsize=12)
    plt.ylabel('Count', fontsize=12)
    plt.title('Attack Category Distribution', fontsize=16, fontweight='bold')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig('../../visualizations/attack_category_distribution.png', dpi=300, bbox_inches='tight')
    plt.show()

    # Pie chart restricted to the ten most frequent categories
    top_categories = attack_category_counts.iloc[:10]
    plt.figure(figsize=(12, 8))
    plt.pie(
        top_categories.values,
        labels=top_categories.index,
        autopct='%1.1f%%',
        startangle=90,
    )
    plt.title('Top 10 Attack Categories Distribution', fontsize=16, fontweight='bold')
    plt.tight_layout()
    plt.savefig('../../visualizations/attack_category_pie.png', dpi=300, bbox_inches='tight')
    plt.show()


In [None]:
# Protocol frequencies: printed table and bar chart of the 20 most common
if 'Protocol' in df.columns:
    protocol_counts = df['Protocol'].value_counts().iloc[:20]

    print("Top 20 Protocols:")
    print(protocol_counts)
    print(f"\nTotal unique protocols: {df['Protocol'].nunique()}")

    plt.figure(figsize=(14, 8))
    protocol_counts.plot(kind='bar')
    plt.xlabel('Protocol', fontsize=12)
    plt.ylabel('Count', fontsize=12)
    plt.title('Top 20 Protocol Distribution', fontsize=16, fontweight='bold')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig('../../visualizations/protocol_distribution.png', dpi=300, bbox_inches='tight')
    plt.show()


## 5. Numerical, Temporal, and Network Variables Analysis


In [None]:
# Port analysis: side-by-side histograms plus summary statistics
if {'Source Port', 'Destination Port'}.issubset(df.columns):
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    # One histogram per port column: source on the left, destination on the right
    port_panels = (
        ('Source Port', 'Source Port Distribution'),
        ('Destination Port', 'Destination Port Distribution'),
    )
    for ax, (port_col, panel_title) in zip(axes, port_panels):
        df[port_col].hist(bins=50, ax=ax, edgecolor='black')
        ax.set_title(panel_title, fontsize=14, fontweight='bold')
        ax.set_xlabel(port_col)
        ax.set_ylabel('Frequency')

    plt.tight_layout()
    plt.savefig('../../visualizations/port_distribution.png', dpi=300, bbox_inches='tight')
    plt.show()

    # Descriptive statistics for both port columns
    print("Source Port Statistics:")
    print(df['Source Port'].describe())
    print("\nDestination Port Statistics:")
    print(df['Destination Port'].describe())

    # Ten most frequently targeted destination ports
    print("\nTop 10 Destination Ports:")
    print(df['Destination Port'].value_counts().iloc[:10])


In [None]:
# Temporal analysis: attack counts by hour of day, then by day of week
if 'Hour' in df.columns:
    # Counts per hour, ordered by hour rather than by frequency
    hourly_counts = df['Hour'].value_counts().sort_index()

    plt.figure(figsize=(14, 6))
    hourly_counts.plot(kind='bar', color='steelblue', edgecolor='black')
    plt.xlabel('Hour of Day', fontsize=12)
    plt.ylabel('Number of Attacks', fontsize=12)
    plt.title('Attack Distribution by Hour of Day', fontsize=16, fontweight='bold')
    plt.xticks(rotation=0)
    plt.grid(axis='y', alpha=0.3)
    plt.tight_layout()
    plt.savefig('../../visualizations/attacks_by_hour.png', dpi=300, bbox_inches='tight')
    plt.show()

# The day-of-week chart is only produced when the hour column exists as well
if 'Hour' in df.columns and 'DayOfWeek' in df.columns:
    day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    day_counts = df['DayOfWeek'].value_counts()
    # Put the days in calendar order, leaving out days with no records
    present_days = [day for day in day_order if day in day_counts.index]
    day_counts = day_counts.reindex(present_days)

    plt.figure(figsize=(12, 6))
    day_counts.plot(kind='bar', color='coral', edgecolor='black')
    plt.xlabel('Day of Week', fontsize=12)
    plt.ylabel('Number of Attacks', fontsize=12)
    plt.title('Attack Distribution by Day of Week', fontsize=16, fontweight='bold')
    plt.xticks(rotation=45, ha='right')
    plt.grid(axis='y', alpha=0.3)
    plt.tight_layout()
    plt.savefig('../../visualizations/attacks_by_day.png', dpi=300, bbox_inches='tight')
    plt.show()


In [None]:
# IP address analysis: the 20 most frequent source and destination addresses
if 'Source IP' in df.columns and 'Destination IP' in df.columns:
    top_source_ips = df['Source IP'].value_counts().head(20)
    top_dest_ips = df['Destination IP'].value_counts().head(20)

    # (counts, bar colour, title, y-axis label, output file) for each chart;
    # the source chart is drawn first, then the destination chart.
    ip_panels = (
        (
            top_source_ips,
            'teal',
            'Top 20 Source IPs by Attack Count',
            'Source IP',
            '../../visualizations/top_source_ips.png',
        ),
        (
            top_dest_ips,
            'orange',
            'Top 20 Destination IPs by Attack Count',
            'Destination IP',
            '../../visualizations/top_destination_ips.png',
        ),
    )
    for ip_counts, bar_color, chart_title, y_label, out_path in ip_panels:
        plt.figure(figsize=(14, 8))
        ip_counts.plot(kind='barh', color=bar_color)
        plt.title(chart_title, fontsize=16, fontweight='bold')
        plt.xlabel('Number of Attacks', fontsize=12)
        plt.ylabel(y_label, fontsize=12)
        plt.tight_layout()
        plt.savefig(out_path, dpi=300, bbox_inches='tight')
        plt.show()

    print(f"\nUnique Source IPs: {df['Source IP'].nunique():,}")
    print(f"Unique Destination IPs: {df['Destination IP'].nunique():,}")


## 6. Summary Statistics and Insights


In [None]:
# Generate comprehensive summary: a plain-text report of the dataset's size,
# dominant categories/protocols, network endpoints, temporal peaks and
# missing-value counts. Each section is skipped when its column is absent.
print("=" * 60)
print("COMPREHENSIVE DATASET SUMMARY")
print("=" * 60)

print("\n1. Dataset Overview:")
print(f"   - Total Records: {len(df):,}")
print(f"   - Total Columns: {len(df.columns)}")
if 'Datetime_Start' in df.columns:
    print(f"   - Date Range: {df['Datetime_Start'].min()} to {df['Datetime_Start'].max()}")

print("\n2. Attack Categories:")
if 'Attack category' in df.columns:
    print(f"   - Unique Categories: {df['Attack category'].nunique()}")
    for category, count in df['Attack category'].value_counts().head(5).items():
        print(f"   - {category}: {count:,} ({count/len(df)*100:.2f}%)")

print("\n3. Protocol Distribution:")
if 'Protocol' in df.columns:
    print(f"   - Unique Protocols: {df['Protocol'].nunique()}")
    for protocol, count in df['Protocol'].value_counts().head(5).items():
        print(f"   - {protocol}: {count:,} ({count/len(df)*100:.2f}%)")

print("\n4. Network Information:")
if 'Source IP' in df.columns and 'Destination IP' in df.columns:
    print(f"   - Unique Source IPs: {df['Source IP'].nunique():,}")
    print(f"   - Unique Destination IPs: {df['Destination IP'].nunique():,}")
if 'Destination Port' in df.columns:
    port_mode = df['Destination Port'].mode()
    # mode() is empty when the column holds only missing values; skip the
    # line rather than fail on the [0] lookup.
    if not port_mode.empty:
        top_port_count = df['Destination Port'].value_counts().iloc[0]
        print(f"   - Most Common Destination Port: {port_mode[0]} ({top_port_count:,} attacks)")

print("\n5. Temporal Patterns:")
if 'Hour' in df.columns:
    hour_mode = df['Hour'].mode()
    if not hour_mode.empty:
        # Hour is a float column whenever a timestamp failed to parse (NaN
        # present); cast so the label reads "14:00" instead of "14.0:00".
        peak_hour = int(hour_mode[0])
        print(f"   - Peak Attack Hour: {peak_hour}:00")
if 'DayOfWeek' in df.columns:
    day_mode = df['DayOfWeek'].mode()
    if not day_mode.empty:
        peak_day = day_mode[0]
        print(f"   - Peak Attack Day: {peak_day}")

print("\n6. Data Quality:")
missing_values = df.isnull().sum()
# Pass plain arrays for every field so the frame gets a default integer
# index, matching the missing-values table built earlier in the notebook.
missing_df = pd.DataFrame({
    'Column': missing_values.index,
    'Missing Count': missing_values.values,
    'Missing Percentage': ((missing_values / len(df)) * 100).values
})
missing_df = missing_df[missing_df['Missing Count'] > 0].sort_values('Missing Count', ascending=False)
if len(missing_df) > 0:
    print("   Missing Values:")
    # Plain tuples in column order: (Column, Missing Count, Missing Percentage)
    for column_name, missing_count, missing_pct in missing_df.itertuples(index=False, name=None):
        print(f"   - {column_name}: {missing_count:,} ({missing_pct:.2f}%)")
else:
    print("   - No missing values found")

print("\n" + "=" * 60)
